
Commit e7026ab

Firmament2: tokenizer & AST NDJSON emitters (filename/source_id + envelope)
1 parent d2ad0d2 commit e7026ab

File tree

3 files changed: +452 -122 lines changed

Parser/asdl_c.py

Lines changed: 103 additions & 24 deletions
@@ -22,6 +22,66 @@
     "constant": "PyBaseObject_Type",
 }
 
+AST_EVENT_HELPER_C = r"""
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "firmament2.h"
+
+/* Emit one AST event line, gated by FIRMAMENT2_ENABLE. */
+static void
+emit_ast_event_json(const char *kind,
+                    int lineno, int col_offset,
+                    int end_lineno, int end_col_offset)
+{
+    if (!_firm2_enabled()) {
+        return;
+    }
+
+    /* Envelope */
+    unsigned long long eid = _firm2_next_eid();
+    unsigned long pid = _firm2_pid();
+    unsigned long long tid = _firm2_tid();
+    long long ts = _firm2_now_ns();
+
+    /* Source scope */
+    const char *filename = _firm2_current_filename();
+    const char *source_id = _firm2_current_source_id_hex();
+    if (!filename) filename = "<unknown>";
+    if (!source_id) source_id = "";
+
+    char json_buf[640];
+    (void)snprintf(
+        json_buf,
+        sizeof(json_buf),
+        "{"
+          "\"type\":\"ast\","
+          "\"envelope\":{"
+            "\"event_id\":%llu,"
+            "\"pid\":%lu,"
+            "\"tid\":%llu,"
+            "\"ts_ns\":%lld"
+          "},"
+          "\"payload\":{"
+            "\"kind\":\"%s\","
+            "\"lineno\":%d,"
+            "\"col_offset\":%d,"
+            "\"end_lineno\":%d,"
+            "\"end_col_offset\":%d,"
+            "\"filename\":\"%s\","
+            "\"source_id\":\"%s\""
+          "}"
+        "}",
+        eid, pid, tid, ts,
+        kind,
+        lineno, col_offset, end_lineno, end_col_offset,
+        filename, source_id
+    );
+    printf("%s\n", json_buf);
+    fflush(stdout);
+}
+"""
+
 def get_c_type(name):
     """Return a string for the C name of the type.
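
Note (illustrative, not part of the diff): each call to emit_ast_event_json() prints one standalone JSON object per line. A minimal Python sketch of the resulting shape, with made-up envelope and payload values but key names taken from the format string above:

import json

# Illustrative "ast" event as the helper would print it (values are invented).
sample = (
    '{"type":"ast",'
    '"envelope":{"event_id":42,"pid":1234,"tid":1,"ts_ns":1700000000000000000},'
    '"payload":{"kind":"BinOp","lineno":3,"col_offset":4,'
    '"end_lineno":3,"end_col_offset":9,'
    '"filename":"example.py","source_id":"deadbeef"}}'
)

event = json.loads(sample)   # every stdout line is a self-contained JSON object
assert event["type"] == "ast"
print(event["payload"]["kind"], event["envelope"]["event_id"])
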
@@ -407,61 +467,78 @@ def visitProduct(self, prod, name):
                            self.get_args(prod.attributes),
                            union=False)
 
-
 class FunctionVisitor(PrototypeVisitor):
     """Visitor to generate constructor functions for AST."""
 
     def emit_function(self, name, ctype, args, attrs, union=True):
         def emit(s, depth=0, reflow=True):
             self.emit(s, depth, reflow)
-        argstr = ", ".join(["%s %s" % (atype, aname)
-                            for atype, aname, opt in args + attrs])
+
+        # Build full C argument list (fields + attributes)
+        all_args = args + attrs
+        argstr = ", ".join(f"{atype} {aname}" for atype, aname, opt in all_args)
         if argstr:
             argstr += ", PyArena *arena"
         else:
             argstr = "PyArena *arena"
-        self.emit("%s" % ctype, 0)
-        emit("%s(%s)" % (ast_func_name(name), argstr))
+
+        # Function signature
+        self.emit(f"{ctype}", 0)
+        emit(f"{ast_func_name(name)}({argstr})")
         emit("{")
-        emit("%s p;" % ctype, 1)
+        emit(f"{ctype} p;", 1)
+
+        # Required argument checks (non-optional, non-int)
         for argtype, argname, opt in args:
             if not opt and argtype != "int":
-                emit("if (!%s) {" % argname, 1)
+                emit(f"if (!{argname}) {{", 1)
                 emit("PyErr_SetString(PyExc_ValueError,", 2)
-                msg = "field '%s' is required for %s" % (argname, name)
-                emit(' "%s");' % msg,
-                     2, reflow=False)
-                emit('return NULL;', 2)
-                emit('}', 1)
+                msg = f"field '{argname}' is required for {name}"
+                emit(f' "{msg}");', 2, reflow=False)
+                emit("return NULL;", 2)
+                emit("}", 1)
 
-        emit("p = (%s)_PyArena_Malloc(arena, sizeof(*p));" % ctype, 1);
+        # Allocate node
+        emit(f"p = ({ctype})_PyArena_Malloc(arena, sizeof(*p));", 1)
         emit("if (!p)", 1)
-        emit("return NULL;", 2)
+        emit(" return NULL;", 2)
+
+        # Initialize node fields and attributes
         if union:
             self.emit_body_union(name, args, attrs)
         else:
             self.emit_body_struct(name, args, attrs)
+
+        # Emit JSON event for nodes with location info
+        attr_names = {aname for _, aname, _ in attrs}
+        if "lineno" in attr_names and "col_offset" in attr_names:
+            end_lineno_expr = "end_lineno" if "end_lineno" in attr_names else "lineno"
+            end_col_expr = "end_col_offset" if "end_col_offset" in attr_names else "col_offset"
+            emit(
+                f'emit_ast_event_json("{name}", lineno, col_offset, {end_lineno_expr}, {end_col_expr});',
+                1, reflow=False
+            )
+
         emit("return p;", 1)
         emit("}")
         emit("")
 
     def emit_body_union(self, name, args, attrs):
         def emit(s, depth=0, reflow=True):
             self.emit(s, depth, reflow)
-        emit("p->kind = %s_kind;" % name, 1)
+        emit(f"p->kind = {name}_kind;", 1)
         for argtype, argname, opt in args:
-            emit("p->v.%s.%s = %s;" % (name, argname, argname), 1)
+            emit(f"p->v.{name}.{argname} = {argname};", 1)
         for argtype, argname, opt in attrs:
-            emit("p->%s = %s;" % (argname, argname), 1)
+            emit(f"p->{argname} = {argname};", 1)
 
     def emit_body_struct(self, name, args, attrs):
         def emit(s, depth=0, reflow=True):
             self.emit(s, depth, reflow)
         for argtype, argname, opt in args:
-            emit("p->%s = %s;" % (argname, argname), 1)
+            emit(f"p->{argname} = {argname};", 1)
         for argtype, argname, opt in attrs:
-            emit("p->%s = %s;" % (argname, argname), 1)
-
+            emit(f"p->{argname} = {argname};", 1)
 
 class PickleVisitor(EmitVisitor):
 
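Note (illustrative, not part of the diff): the guard added above only emits events for constructors whose attributes include lineno and col_offset, and falls back to the start position when end positions are absent. The same decision logic, extracted into a standalone Python sketch:

def ast_event_call(name, attr_names):
    # Only nodes that carry location attributes get an event call.
    if "lineno" not in attr_names or "col_offset" not in attr_names:
        return None
    end_lineno = "end_lineno" if "end_lineno" in attr_names else "lineno"
    end_col = "end_col_offset" if "end_col_offset" in attr_names else "col_offset"
    return f'emit_ast_event_json("{name}", lineno, col_offset, {end_lineno}, {end_col});'

# Example: a node with full location info vs. one without any.
print(ast_event_call("BinOp", {"lineno", "col_offset", "end_lineno", "end_col_offset"}))
print(ast_event_call("arguments", set()))  # -> None, no event emitted
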
@@ -1009,7 +1086,7 @@ def visitModule(self, mod):
         else {
             if (PyErr_WarnFormat(
                     PyExc_DeprecationWarning, 1,
-                    "Field %R is missing from %.400s._field_types. "
+                    "Field '%U' is missing from %.400s._field_types. "
                     "This will become an error in Python 3.15.",
                     name, Py_TYPE(self)->tp_name
                 ) < 0) {
@@ -1044,7 +1121,7 @@ def visitModule(self, mod):
         // simple field (e.g., identifier)
         if (PyErr_WarnFormat(
                 PyExc_DeprecationWarning, 1,
-                "%.400s.__init__ missing 1 required positional argument: %R. "
+                "%.400s.__init__ missing 1 required positional argument: '%U'. "
                 "This will become an error in Python 3.15.",
                 Py_TYPE(self)->tp_name, name
             ) < 0) {
@@ -2249,7 +2326,6 @@ def generate_ast_state(module_state, f):
         f.write('    PyObject *' + s + ';\n')
     f.write('};')
 
-
 def generate_ast_fini(module_state, f):
     f.write(textwrap.dedent("""
             void _PyAST_Fini(PyInterpreterState *interp)
@@ -2266,7 +2342,6 @@ def generate_ast_fini(module_state, f):
 
     """))
 
-
 def generate_module_def(mod, metadata, f, internal_h):
     # Gather all the data needed for ModuleSpec
     state_strings = {
@@ -2326,6 +2401,9 @@ def generate_module_def(mod, metadata, f, internal_h):
     }
     """).strip(), file=f)
 
+    # Firmament2: helper used by generated _PyAST_* constructors.
+    f.write(AST_EVENT_HELPER_C)
+
     generate_ast_fini(module_state, f)
 
     f.write('static int init_identifiers(struct ast_state *state)\n')
@@ -2337,6 +2415,7 @@ def generate_module_def(mod, metadata, f, internal_h):
     f.write('    return 0;\n')
     f.write('};\n\n')
 
+
 def write_header(mod, metadata, f):
     f.write(textwrap.dedent("""
         #ifndef Py_INTERNAL_AST_H

Parser/lexer/lexer.c

Lines changed: 136 additions & 15 deletions
@@ -2,9 +2,144 @@
 #include "pycore_token.h"
 #include "pycore_unicodeobject.h"
 #include "errcode.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
 
 #include "state.h"
 #include "../tokenizer/helpers.h"
+#include "firmament2.h"   /* gate + current source info */
+
+/* The internal lexer function is defined later in this file */
+static int tok_get(struct tok_state *tok, struct token *token);
+
+/* Optional envelope helpers (decls may also live in firmament2.h) */
+extern unsigned long long _firm2_next_eid(void);
+extern unsigned long _firm2_pid(void);
+extern unsigned long long _firm2_tid(void);
+extern long long _firm2_now_ns(void);
+
+/* Emit one tokenizer event as JSON (guarded by FIRMAMENT2_ENABLE). */
+static void
+emit_tokenizer_event_json(struct tok_state *tok, struct token *token, int type)
+{
+    if (!_firm2_enabled()) {
+        return;
+    }
+
+    /* Envelope */
+    unsigned long long eid = _firm2_next_eid();
+    unsigned long pid = _firm2_pid();
+    unsigned long long tid = _firm2_tid();
+    long long ts = _firm2_now_ns();
+
+    /* Scope (compilation unit) */
+    const char *filename = _firm2_current_filename();
+    const char *source_id = _firm2_current_source_id_hex();
+    if (!filename) filename = "<unknown>";
+    if (!source_id) source_id = "";
+
+    char kind_buf[32];
+    char value_buf[256];
+    char json_buf[800];
+
+    /* Render token type as a string. */
+    snprintf(kind_buf, sizeof(kind_buf), "%d", type);
+
+    /* Compute token text. */
+    const char *start = token->start;
+    const char *end = token->end;
+    if (start == NULL || end == NULL || end <= start) {
+        value_buf[0] = '\0';
+    } else {
+        int src_len = (int)(end - start);
+        int out_idx = 0;
+        for (int i = 0; i < src_len && out_idx < (int)sizeof(value_buf) - 1; i++) {
+            unsigned char c = (unsigned char)start[i];
+            if (c == '"' || c == '\\') {
+                if (out_idx < (int)sizeof(value_buf) - 2) {
+                    value_buf[out_idx++] = '\\';
+                    value_buf[out_idx++] = (char)c;
+                } else {
+                    break;
+                }
+            }
+            else if (c == '\n' || c == '\r' || c == '\t') {
+                if (out_idx < (int)sizeof(value_buf) - 2) {
+                    value_buf[out_idx++] = '\\';
+                    value_buf[out_idx++] = (c == '\n') ? 'n' : (c == '\r' ? 'r' : 't');
+                } else {
+                    break;
+                }
+            }
+            else if (c < 0x20) {
+                continue;  /* skip other control chars */
+            }
+            else {
+                value_buf[out_idx++] = (char)c;
+            }
+        }
+        value_buf[out_idx] = '\0';
+    }
+
+    /* Line/column */
+    int lineno = 0;
+    int col_offset = 0;
+    if (tok != NULL) {
+        lineno = tok->lineno;
+        if (token->start != NULL && tok->line_start != NULL) {
+            col_offset = (int)(token->start - tok->line_start);
+            if (col_offset < 0) col_offset = 0;
+        }
+    }
+
+    /* Build NDJSON line */
+    (void)snprintf(
+        json_buf,
+        sizeof(json_buf),
+        "{"
+          "\"type\":\"tokenizer\","
+          "\"envelope\":{"
+            "\"event_id\":%llu,"
+            "\"pid\":%lu,"
+            "\"tid\":%llu,"
+            "\"ts_ns\":%lld"
+          "},"
+          "\"payload\":{"
+            "\"kind\":\"%s\","
+            "\"value\":\"%s\","
+            "\"lineno\":%d,"
+            "\"col_offset\":%d,"
+            "\"filename\":\"%s\","
+            "\"source_id\":\"%s\""
+          "}"
+        "}",
+        eid, pid, tid, ts,
+        kind_buf, value_buf, lineno, col_offset, filename, source_id
+    );
+
+    printf("%s\n", json_buf);
+    fflush(stdout);
+}
+
+/* Interpose on token production to emit JSON per token. */
+int
+_PyTokenizer_Get(struct tok_state *tok, struct token *token)
+{
+    /* Call the real lexer */
+    int result = tok_get(tok, token);
+    if (tok->decoding_erred) {
+        result = ERRORTOKEN;
+        tok->done = E_DECODE;
+    }
+
+    /* Emit JSON event for every token we successfully produced (when enabled). */
+    if (token != NULL && token->start != NULL && token->end != NULL) {
+        emit_tokenizer_event_json(tok, token, result);
+    }
+    return result;
+}
+
 
 /* Alternate tab spacing */
 #define ALTTABSIZE 1
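
Note (illustrative, not part of the diff): the value_buf loop above escapes double quotes and backslashes, rewrites newline/CR/tab as \n, \r and \t, drops other control characters, and stops once the 256-byte buffer would overflow. An approximate Python model of that behaviour:

def escape_token_value(text, limit=255):
    # Rough Python equivalent of the C escaping loop (bytes vs. str differences ignored).
    out = []
    used = 0
    for ch in text:
        if ch in ('"', '\\'):
            piece = '\\' + ch
        elif ch == '\n':
            piece = '\\n'
        elif ch == '\r':
            piece = '\\r'
        elif ch == '\t':
            piece = '\\t'
        elif ord(ch) < 0x20:
            continue  # other control characters are skipped
        else:
            piece = ch
        if used + len(piece) > limit:
            break     # the C code stops once the fixed buffer would overflow
        out.append(piece)
        used += len(piece)
    return ''.join(out)

print(escape_token_value('print("hi")\n'))   # -> print(\"hi\")\n
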
@@ -539,9 +674,6 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                 return MAKE_TOKEN(ERRORTOKEN);
             }
         }
-        else if (c == EOF && PyErr_Occurred()) {
-            return MAKE_TOKEN(ERRORTOKEN);
-        }
         else {
             break;
         }
@@ -1379,7 +1511,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
     }
 
-    if( c == '=' && INSIDE_FSTRING_EXPR_AT_TOP(current_tok)) {
+    if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
         current_tok->in_debug = 1;
     }
 
@@ -1622,14 +1754,3 @@ tok_get(struct tok_state *tok, struct token *token)
         return tok_get_fstring_mode(tok, current_tok, token);
     }
 }
-
-int
-_PyTokenizer_Get(struct tok_state *tok, struct token *token)
-{
-    int result = tok_get(tok, token);
-    if (tok->decoding_erred) {
-        result = ERRORTOKEN;
-        tok->done = E_DECODE;
-    }
-    return result;
-}
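
Note (illustrative, not part of the diff): because both emitters print to stdout, event lines are interleaved with ordinary program output. A minimal consumer sketch, assuming the interpreter's stdout has been captured to a file passed as the first argument; lines that do not parse as tokenizer/ast events are skipped:

import json
import sys
from collections import defaultdict

events_by_source = defaultdict(list)

with open(sys.argv[1], encoding="utf-8") as stream:   # captured stdout
    for line in stream:
        line = line.strip()
        if not line.startswith('{'):
            continue                       # normal program output, not an event
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue
        if event.get("type") not in ("tokenizer", "ast"):
            continue
        payload = event["payload"]
        key = (payload["filename"], payload["source_id"])
        events_by_source[key].append(event)

for (filename, source_id), events in events_by_source.items():
    ast_count = sum(1 for e in events if e["type"] == "ast")
    print(f"{filename} ({source_id or 'no id'}): {len(events)} events, {ast_count} ast nodes")
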
