diff --git a/NEWS.md b/NEWS.md
index 5a828805..a95237bb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,29 @@
 
 ## Lrama 0.8.0 (2026-xx-xx)
 
+### Support `%define api.token.raw` directive
+
+Support the `%define api.token.raw` directive for Bison compatibility.
+When enabled, external token numbers are unified with internal symbol numbers, eliminating the `yytranslate[]` array from the generated code.
+
+This feature is useful for:
+- Reducing generated code size
+- Simplifying token handling (`YYTRANSLATE` becomes an identity function)
+- Embedded systems with memory constraints
+- Debugging with unified token numbers
+
+```yacc
+%define api.token.raw
+
+%token NUM PLUS MINUS
+%%
+expr: NUM
+    | expr PLUS expr
+    ;
+```
+
+Note: Character literals (`'+'`, `';'`, etc.) cannot be used with this directive.
+
 ## Lrama 0.7.1 (2025-12-24)
 
 ### Optimize IELR
diff --git a/lib/lrama/context.rb b/lib/lrama/context.rb
index eb068c1b..71212bbc 100644
--- a/lib/lrama/context.rb
+++ b/lib/lrama/context.rb
@@ -25,9 +25,15 @@ def initialize(states)
 
     # enum yytokentype
     def yytokentype
-      @states.terms.reject do |term|
-        0 < term.token_id && term.token_id < 128
-      end.map do |term|
+      terms = if api_token_raw?
+        @states.terms
+      else
+        @states.terms.reject do |term|
+          0 < term.token_id && term.token_id < 128
+        end
+      end
+
+      terms.map do |term|
         [term.id.s_value, term.token_id, term.display_name]
       end.unshift(["YYEMPTY", -2, nil])
     end
@@ -73,6 +79,11 @@ def yymaxutok
       @states.terms.map(&:token_id).max
     end
 
+    # Check if api.token.raw is enabled
+    def api_token_raw?
+      @states.api_token_raw?
+    end
+
     # YYTRANSLATE
     #
     # yytranslate is a mapping from token id to symbol number
diff --git a/lib/lrama/grammar.rb b/lib/lrama/grammar.rb
index 95a80bb0..8150db08 100644
--- a/lib/lrama/grammar.rb
+++ b/lib/lrama/grammar.rb
@@ -277,6 +277,7 @@ def validate!
       validate_no_precedence_for_nterm!
       validate_rule_lhs_is_nterm!
       validate_duplicated_precedence!
+      validate_api_token_raw!
     end
 
     # @rbs (Grammar::Symbol sym) -> Array[Rule]
@@ -304,6 +305,16 @@ def ielr_defined?
       @define.key?('lr.type') && @define['lr.type'] == 'ielr'
     end
 
+    # @rbs () -> bool
+    def api_token_raw?
+      return false unless @define.key?('api.token.raw')
+
+      value = @define['api.token.raw']
+      # When value is nil, empty string, or "true", it's enabled
+      # When value is "false", it's disabled
+      value != 'false'
+    end
+
     private
 
     # @rbs () -> void
@@ -529,7 +540,7 @@ def fill_default_precedence
 
     # @rbs () -> Array[Grammar::Symbol]
     def fill_symbols
-      fill_symbol_number
+      fill_symbol_number(api_token_raw: api_token_raw?)
       fill_nterm_type(@types)
       fill_printer(@printers)
       fill_destructor(@destructors)
@@ -595,6 +606,23 @@ def validate_duplicated_precedence!
       raise errors.join("\n")
     end
 
+    # @rbs () -> void
+    def validate_api_token_raw!
+      return unless api_token_raw?
+
+      errors = [] #: Array[String]
+
+      terms.each do |term|
+        next unless term.id.is_a?(Lrama::Lexer::Token::Char)
+
+        errors << "character literal #{term.id.s_value} cannot be used with %define api.token.raw (line: #{term.id.first_line})"
+      end
+
+      return if errors.empty?
+
+      raise errors.join("\n")
+    end
+
     # @rbs () -> void
     def set_locations
       @locations = @locations || @rules.any? {|rule| rule.contains_at_reference? }
diff --git a/lib/lrama/grammar/symbols/resolver.rb b/lib/lrama/grammar/symbols/resolver.rb
index 085a835d..05cd6af3 100644
--- a/lib/lrama/grammar/symbols/resolver.rb
+++ b/lib/lrama/grammar/symbols/resolver.rb
@@ -18,7 +18,7 @@ class Resolver
         # def token_to_symbol: (Lexer::Token::Base token) -> Grammar::Symbol
         # def find_symbol_by_s_value!: (::String s_value) -> Grammar::Symbol
         # def fill_nterm_type: (Array[Grammar::Type] types) -> void
-        # def fill_symbol_number: () -> void
+        # def fill_symbol_number: (?api_token_raw: bool) -> void
         # def fill_printer: (Array[Grammar::Printer] printers) -> void
         # def fill_destructor: (Array[Destructor] destructors) -> (Destructor | bot)
         # def fill_error_token: (Array[Grammar::ErrorToken] error_tokens) -> void
@@ -130,13 +130,14 @@ def find_symbol_by_number!(number)
           sym
         end
 
-        # @rbs () -> void
-        def fill_symbol_number
+        # @rbs (?api_token_raw: bool) -> void
+        def fill_symbol_number(api_token_raw: false)
           # YYEMPTY = -2
           # YYEOF = 0
           # YYerror = 1
           # YYUNDEF = 2
           @number = 3
+          @api_token_raw = api_token_raw
           fill_terms_number
           fill_nterms_number
         end
@@ -231,6 +232,34 @@ def find_nterm_by_id!(id)
 
         # @rbs () -> void
         def fill_terms_number
+          if @api_token_raw
+            fill_terms_number_raw
+          else
+            fill_terms_number_normal
+          end
+        end
+
+        # @rbs () -> void
+        def fill_terms_number_raw
+          @terms.each do |sym|
+            while used_numbers[@number] do
+              @number += 1
+            end
+
+            if sym.number.nil?
+              sym.number = @number
+              used_numbers[@number] = true
+              @number += 1
+            end
+
+            if sym.token_id.nil?
+              sym.token_id = sym.number
+            end
+          end
+        end
+
+        # @rbs () -> void
+        def fill_terms_number_normal
           # Character literal in grammar file has
           # token id corresponding to ASCII code by default,
           # so start token_id from 256.
diff --git a/lib/lrama/output.rb b/lib/lrama/output.rb
index d527be8b..a8c1f99e 100644
--- a/lib/lrama/output.rb
+++ b/lib/lrama/output.rb
@@ -13,7 +13,7 @@ class Output
     def_delegators "@context", :yyfinal, :yylast, :yyntokens, :yynnts, :yynrules, :yynstates,
                    :yymaxutok, :yypact_ninf, :yytable_ninf
 
-    def_delegators "@grammar", :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol
+    def_delegators "@grammar", :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :api_token_raw?
 
     def initialize(
       out:, output_file_path:, template_name:, grammar_file_path:,
diff --git a/lib/lrama/states.rb b/lib/lrama/states.rb
index ddce627d..d319282e 100644
--- a/lib/lrama/states.rb
+++ b/lib/lrama/states.rb
@@ -36,7 +36,7 @@ class States
     include Lrama::Tracer::Duration
 
     def_delegators "@grammar", :symbols, :terms, :nterms, :rules, :precedences,
-                   :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined?
+                   :accept_symbol, :eof_symbol, :undef_symbol, :find_symbol_by_s_value!, :ielr_defined?, :api_token_raw?
 
     attr_reader :states #: Array[State]
     attr_reader :reads_relation #: Hash[State::Action::Goto, Array[State::Action::Goto]]
diff --git a/sig/generated/lrama/grammar.rbs b/sig/generated/lrama/grammar.rbs
index faab4f04..cb68dc2c 100644
--- a/sig/generated/lrama/grammar.rbs
+++ b/sig/generated/lrama/grammar.rbs
@@ -227,6 +227,9 @@ module Lrama
     # @rbs () -> bool
     def ielr_defined?: () -> bool
 
+    # @rbs () -> bool
+    def api_token_raw?: () -> bool
+
     private
 
     # @rbs () -> void
@@ -283,6 +286,9 @@ module Lrama
     #
     # @rbs () -> void
     def validate_duplicated_precedence!: () -> untyped
 
+    # @rbs () -> void
+    def validate_api_token_raw!: () -> void
+
     # @rbs () -> void
     def set_locations: () -> void
   end
diff --git a/sig/generated/lrama/grammar/symbols/resolver.rbs b/sig/generated/lrama/grammar/symbols/resolver.rbs
index 2e5f2ebf..85231d7a 100644
--- a/sig/generated/lrama/grammar/symbols/resolver.rbs
+++ b/sig/generated/lrama/grammar/symbols/resolver.rbs
@@ -25,7 +25,7 @@ module Lrama
 
         def fill_nterm_type: (Array[Grammar::Type] types) -> void
 
-        def fill_symbol_number: () -> void
+        def fill_symbol_number: (?api_token_raw: bool) -> void
 
         def fill_printer: (Array[Grammar::Printer] printers) -> void
 
@@ -82,8 +82,8 @@ module Lrama
         # @rbs (Integer number) -> Grammar::Symbol
         def find_symbol_by_number!: (Integer number) -> Grammar::Symbol
 
-        # @rbs () -> void
-        def fill_symbol_number: () -> void
+        # @rbs (?api_token_raw: bool) -> void
+        def fill_symbol_number: (?api_token_raw: bool) -> void
 
         # @rbs (Array[Grammar::Type] types) -> void
         def fill_nterm_type: (Array[Grammar::Type] types) -> void
@@ -111,6 +111,12 @@ module Lrama
         # @rbs () -> void
         def fill_terms_number: () -> void
 
+        # @rbs () -> void
+        def fill_terms_number_raw: () -> void
+
+        # @rbs () -> void
+        def fill_terms_number_normal: () -> void
+
         # @rbs () -> void
         def fill_nterms_number: () -> void
diff --git a/spec/fixtures/integration/api_token_raw.l b/spec/fixtures/integration/api_token_raw.l
new file mode 100644
index 00000000..16dfb97b
--- /dev/null
+++ b/spec/fixtures/integration/api_token_raw.l
@@ -0,0 +1,60 @@
+%option noinput nounput noyywrap never-interactive bison-bridge bison-locations
+
+%{
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "api_token_raw.h"
+
+%}
+
+NUMBER [0-9]+
+
+%%
+
+{NUMBER} {
+    ((void) yylloc);
+    yylval->val = atoi(yytext);
+    return NUM;
+}
+
+"+" {
+    return PLUS;
+}
+
+"-" {
+    return MINUS;
+}
+
+"*" {
+    return STAR;
+}
+
+"/" {
+    return SLASH;
+}
+
+"(" {
+    return LPAREN;
+}
+
+")" {
+    return RPAREN;
+}
+
+[\n|\r\n] {
+    return(YYEOF);
+}
+
+[[:space:]] {}
+
+<<EOF>> {
+    return(YYEOF);
+}
+
+. {
+    fprintf(stderr, "Illegal character '%s'\n", yytext);
+    return(YYEOF);
+}
+
+%%
diff --git a/spec/fixtures/integration/api_token_raw.y b/spec/fixtures/integration/api_token_raw.y
new file mode 100644
index 00000000..85e582bb
--- /dev/null
+++ b/spec/fixtures/integration/api_token_raw.y
@@ -0,0 +1,55 @@
+%{
+
+#include <stdio.h>
+#include "api_token_raw.h"
+#include "api_token_raw-lexer.h"
+
+static int yyerror(YYLTYPE *loc, const char *str);
+
+%}
+
+%define api.token.raw
+
+%union {
+    int val;
+}
+
+%token <val> NUM
+%token PLUS MINUS STAR SLASH LPAREN RPAREN
+%type <val> expr
+%left PLUS MINUS
+%left STAR SLASH
+
+%locations
+
+%%
+
+program : /* empty */
+        | expr { printf("=> %d", $1); }
+        ;
+expr    : NUM
+        | expr PLUS expr { $$ = $1 + $3; }
+        | expr MINUS expr { $$ = $1 - $3; }
+        | expr STAR expr { $$ = $1 * $3; }
+        | expr SLASH expr { $$ = $1 / $3; }
+        | LPAREN expr RPAREN { $$ = $2; }
+        ;
+
+%%
+
+static int yyerror(YYLTYPE *loc, const char *str) {
+    fprintf(stderr, "parse error: %s\n", str);
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    if (argc == 2) {
+        yy_scan_string(argv[1]);
+    }
+
+    if (yyparse()) {
+        fprintf(stderr, "syntax error\n");
+        return 1;
+    }
+    return 0;
+}
diff --git a/spec/lrama/context_spec.rb b/spec/lrama/context_spec.rb
index 1a00d34e..bfe7d720 100644
--- a/spec/lrama/context_spec.rb
+++ b/spec/lrama/context_spec.rb
@@ -263,4 +263,84 @@
       end
     end
   end
+
+  describe "api.token.raw" do
+    it "assigns token_id equal to symbol number" do
+      y = <<~INPUT
+        %{
+        // Prologue
+        %}
+
+        %define api.token.raw
+
+        %union {
+          int i;
+        }
+
+        %token NUMBER
+        %token PLUS "+"
+        %token MINUS "-"
+
+        %%
+
+        program: expr ;
+
+        expr: NUMBER
+            | expr PLUS expr
+            | expr MINUS expr
+            ;
+
+        %%
+      INPUT
+
+      grammar = Lrama::Parser.new(y, "parse.y").parse
+      grammar.prepare
+      grammar.validate!
+      states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new))
+      states.compute
+      context = Lrama::Context.new(states)
+
+      expect(context.api_token_raw?).to be true
+
+      number_term = states.terms.find { |t| t.id.s_value == 'NUMBER' }
+      plus_term = states.terms.find { |t| t.id.s_value == 'PLUS' }
+      minus_term = states.terms.find { |t| t.id.s_value == 'MINUS' }
+
+      expect(number_term.token_id).to eq(number_term.number)
+      expect(plus_term.token_id).to eq(plus_term.number)
+      expect(minus_term.token_id).to eq(minus_term.number)
+
+      expect([number_term.number, plus_term.number, minus_term.number].min).to be >= 3
+    end
+
+    it "does not have api_token_raw when directive is not specified" do
+      y = <<~INPUT
+        %{
+        // Prologue
+        %}
+
+        %union {
+          int i;
+        }
+
+        %token NUMBER
+
+        %%
+
+        program: NUMBER ;
+
+        %%
+      INPUT
+
+      grammar = Lrama::Parser.new(y, "parse.y").parse
+      grammar.prepare
+      grammar.validate!
+      states = Lrama::States.new(grammar, Lrama::Tracer.new(Lrama::Logger.new))
+      states.compute
+      context = Lrama::Context.new(states)
+      expect(context.api_token_raw?).to be false
+      number_term = states.terms.find { |t| t.id.s_value == 'NUMBER' }
+      expect(number_term.token_id).to be >= 256
+    end
+  end
 end
diff --git a/spec/lrama/grammar_spec.rb b/spec/lrama/grammar_spec.rb
index 3be8eab4..b0bc812d 100644
--- a/spec/lrama/grammar_spec.rb
+++ b/spec/lrama/grammar_spec.rb
@@ -243,4 +243,93 @@
       end
     end
   end
+
+  describe '#api_token_raw?' do
+    context 'when api.token.raw is not defined' do
+      let(:grammar) { described_class.new(rule_counter, false, {}) }
+
+      it 'returns false' do
+        expect(grammar.api_token_raw?).to be false
+      end
+    end
+
+    context 'when api.token.raw is defined without value' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => nil }) }
+
+      it 'returns true' do
+        expect(grammar.api_token_raw?).to be true
+      end
+    end
+
+    context 'when api.token.raw is defined with empty string' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => '' }) }
+
+      it 'returns true' do
+        expect(grammar.api_token_raw?).to be true
+      end
+    end
+
+    context 'when api.token.raw is defined with "true"' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => 'true' }) }
+
+      it 'returns true' do
+        expect(grammar.api_token_raw?).to be true
+      end
+    end
+
+    context 'when api.token.raw is defined with "false"' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => 'false' }) }
+
+      it 'returns false' do
+        expect(grammar.api_token_raw?).to be false
+      end
+    end
+  end
+
+  describe '#validate! with api.token.raw' do
+    let(:grammar_file) { Lrama::Lexer::GrammarFile.new('parse.y', '') }
+
+    context 'when api.token.raw is enabled with character literal' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => 'true' }) }
+
+      before do
+        location = Lrama::Lexer::Location.new(grammar_file: grammar_file, first_line: 5, first_column: 0, last_line: 5, last_column: 3)
+        grammar.add_term(id: Lrama::Lexer::Token::Char.new(s_value: "'+'", location: location))
+        grammar.fill_symbol_number
+      end
+
+      it 'raises error about character literal' do
+        expect { grammar.validate! }
+          .to raise_error(RuntimeError, /character literal '\+' cannot be used with %define api\.token\.raw/)
+      end
+    end
+
+    context 'when api.token.raw is enabled without character literal' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => 'true' }) }
+
+      before do
+        location = Lrama::Lexer::Location.new(grammar_file: grammar_file, first_line: 5, first_column: 0, last_line: 5, last_column: 4)
+        grammar.add_term(id: Lrama::Lexer::Token::Ident.new(s_value: 'PLUS', location: location))
+        grammar.fill_symbol_number
+      end
+
+      it 'does not raise error' do
+        expect { grammar.validate! }.not_to raise_error
+      end
+    end
+
+    context 'when api.token.raw is disabled with character literal' do
+      let(:grammar) { described_class.new(rule_counter, false, { 'api.token.raw' => 'false' }) }
+
+      before do
+        location = Lrama::Lexer::Location.new(grammar_file: grammar_file, first_line: 5, first_column: 0, last_line: 5, last_column: 3)
+        grammar.add_term(id: Lrama::Lexer::Token::Char.new(s_value: "'+'", location: location))
+        grammar.fill_symbol_number
+      end
+
+      it 'does not raise error' do
+        expect { grammar.validate! }.not_to raise_error
+      end
+    end
+  end
 end
diff --git a/spec/lrama/integration_spec.rb b/spec/lrama/integration_spec.rb
index 52922a0f..ba22f258 100644
--- a/spec/lrama/integration_spec.rb
+++ b/spec/lrama/integration_spec.rb
@@ -70,6 +70,39 @@ def generate_object(grammar_file_path, c_path, obj_path, command_args: [])
     end
   end
 
+  describe "api.token.raw" do
+    it "generates YYTRANSLATE as identity function without yytranslate array" do
+      tmpdir = Dir.tmpdir
+      grammar_file_path = fixture_path("integration/api_token_raw.y")
+      parser_c_path = tmpdir + "/api_token_raw_check.c"
+
+      Lrama::Command.new(%W[-o#{parser_c_path} #{grammar_file_path}]).run
+      generated_code = File.read(parser_c_path)
+
+      expect(generated_code).to include('#define YYTRANSLATE(YYX) ((yysymbol_kind_t) (YYX))')
+      expect(generated_code).not_to match(/static const .* yytranslate\[\]/)
+    end
+
+    it "assigns token IDs starting from 3 (after YYEOF=0, YYerror=1, YYUNDEF=2)" do
+      tmpdir = Dir.tmpdir
+      grammar_file_path = fixture_path("integration/api_token_raw.y")
+      parser_c_path = tmpdir + "/api_token_raw_check.c"
+
+      Lrama::Command.new(%W[-o#{parser_c_path} #{grammar_file_path}]).run
+      generated_code = File.read(parser_c_path)
+
+      expect(generated_code).to match(/NUM\s*=\s*3/)
+      expect(generated_code).to match(/PLUS\s*=\s*4/)
+      expect(generated_code).to match(/MINUS\s*=\s*5/)
+      expect(generated_code).to match(/STAR\s*=\s*6/)
+      expect(generated_code).to match(/SLASH\s*=\s*7/)
+    end
+
+    it "works correctly as a functional parser" do
+      test_parser("api_token_raw", "( 1 + 2 ) * 3", "=> 9")
+    end
+  end
+
   it "prologue and epilogue are optional" do
     test_parser("prologue_epilogue_optional", "", "")
   end
diff --git a/template/bison/yacc.c b/template/bison/yacc.c
index 6edd59a0..5efbb534 100644
--- a/template/bison/yacc.c
+++ b/template/bison/yacc.c
@@ -461,6 +461,10 @@ union yyalloc
 
 #define YYMAXUTOK <%= output.yymaxutok %>
 
+<%- if output.api_token_raw? -%>
+/* api.token.raw: Token numbers are used directly without translation. */
+#define YYTRANSLATE(YYX) ((yysymbol_kind_t) (YYX))
+<%- else -%>
 /* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM
    as returned by yylex, with out-of-bounds checking.  */
 #define YYTRANSLATE(YYX) \
@@ -474,6 +478,7 @@ static const <%= output.int_type_for(output.context.yytranslate) %> yytranslate[
 {
   <%= output.yytranslate %>
 };
+<%- end -%>
 
 <%- if output.error_recovery -%>
 /* YYTRANSLATE_INVERTED[SYMBOL-NUM] -- Token number corresponding to SYMBOL-NUM */
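
For context, here is a minimal sketch of the kind of input that the new `validate_api_token_raw!` check rejects; the grammar below is hypothetical and not part of this patch, only the quoted error message comes from it:

```yacc
%define api.token.raw

%token NUM
%%
/* '+' is a character literal; with api.token.raw there is no yytranslate[]
   table, so its fixed ASCII code cannot double as an internal symbol number.
   Lrama reports:
   "character literal '+' cannot be used with %define api.token.raw" */
expr: NUM '+' NUM ;
```

Declaring a named token such as `PLUS` instead, as the `api_token_raw.y` fixture does, avoids the error.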