diff --git a/darray.h b/darray.h index dc9d282be28e7f..10fd5e4ccc7e59 100644 --- a/darray.h +++ b/darray.h @@ -4,6 +4,7 @@ #include #include #include +#include "ruby/ruby.h" // Type for a dynamic array. Use to declare a dynamic array. // It is a pointer so it fits in st_table nicely. Designed @@ -147,6 +148,9 @@ rb_darray_size(const void *ary) return meta ? meta->size : 0; } +/* Estimate of the amount of memory used by this darray. + * Useful for TypedData objects. */ +#define rb_darray_memsize(ary) (sizeof(*(ary)) + (rb_darray_size(ary) * sizeof((ary)->data[0]))) static inline void rb_darray_pop(void *ary, size_t count) diff --git a/depend b/depend index e7ad0b9f8ba508..cfafc77703b9fa 100644 --- a/depend +++ b/depend @@ -17138,6 +17138,7 @@ symbol.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h symbol.$(OBJEXT): {$(VPATH)}builtin.h symbol.$(OBJEXT): {$(VPATH)}config.h symbol.$(OBJEXT): {$(VPATH)}constant.h +symbol.$(OBJEXT): {$(VPATH)}darray.h symbol.$(OBJEXT): {$(VPATH)}debug_counter.h symbol.$(OBJEXT): {$(VPATH)}defines.h symbol.$(OBJEXT): {$(VPATH)}encoding.h diff --git a/gc/default/default.c b/gc/default/default.c index f6628fe9fd7104..be5385f166fde1 100644 --- a/gc/default/default.c +++ b/gc/default/default.c @@ -6872,12 +6872,6 @@ gc_is_moveable_obj(rb_objspace_t *objspace, VALUE obj) case T_ZOMBIE: return FALSE; case T_SYMBOL: - // TODO: restore original behavior - // if (RSYMBOL(obj)->id & ~ID_SCOPE_MASK) { - // return FALSE; - // } - return false; - /* fall through */ case T_STRING: case T_OBJECT: case T_FLOAT: diff --git a/internal/imemo.h b/internal/imemo.h index 7192631e92a5fc..31cc0be35ae9c3 100644 --- a/internal/imemo.h +++ b/internal/imemo.h @@ -256,8 +256,8 @@ struct rb_fields { #define OBJ_FIELD_HEAP ROBJECT_HEAP STATIC_ASSERT(imemo_fields_flags, OBJ_FIELD_HEAP == IMEMO_FL_USER0); STATIC_ASSERT(imemo_fields_embed_offset, offsetof(struct RObject, as.ary) == offsetof(struct rb_fields, as.embed.fields)); -STATIC_ASSERT(imemo_fields_embed_offset, offsetof(struct RObject, as.heap.fields) == offsetof(struct rb_fields, as.external.ptr)); -STATIC_ASSERT(imemo_fields_embed_offset, offsetof(struct RObject, as.heap.fields) == offsetof(struct rb_fields, as.complex.table)); +STATIC_ASSERT(imemo_fields_external_offset, offsetof(struct RObject, as.heap.fields) == offsetof(struct rb_fields, as.external.ptr)); +STATIC_ASSERT(imemo_fields_complex_offset, offsetof(struct RObject, as.heap.fields) == offsetof(struct rb_fields, as.complex.table)); #define IMEMO_OBJ_FIELDS(fields) ((struct rb_fields *)fields) diff --git a/lib/prism/translation/ripper/lexer.rb b/lib/prism/translation/ripper/lexer.rb index ed02e965747d81..787181b5a7fc42 100644 --- a/lib/prism/translation/ripper/lexer.rb +++ b/lib/prism/translation/ripper/lexer.rb @@ -6,7 +6,7 @@ module Prism module Translation class Ripper - class Lexer # :nodoc: + class Lexer < Ripper # :nodoc: # :stopdoc: class State @@ -39,6 +39,92 @@ def allbits?(i) to_int.allbits?(i) end def anybits?(i) to_int.anybits?(i) end def nobits?(i) to_int.nobits?(i) end end + + class Elem + attr_accessor :pos, :event, :tok, :state, :message + + def initialize(pos, event, tok, state, message = nil) + @pos = pos + @event = event + @tok = tok + @state = State.new(state) + @message = message + end + + def [](index) + case index + when 0, :pos + @pos + when 1, :event + @event + when 2, :tok + @tok + when 3, :state + @state + when 4, :message + @message + else + nil + end + end + + def inspect + "#<#{self.class}: #{event}@#{pos[0]}:#{pos[1]}:#{state}: #{tok.inspect}#{": " if message}#{message}>" + end + + alias to_s inspect + + def pretty_print(q) + q.group(2, "#<#{self.class}:", ">") { + q.breakable + q.text("#{event}@#{pos[0]}:#{pos[1]}") + q.breakable + state.pretty_print(q) + q.breakable + q.text("token: ") + tok.pretty_print(q) + if message + q.breakable + q.text("message: ") + q.text(message) + end + } + end + + def to_a + if @message + [@pos, @event, @tok, @state, @message] + else + [@pos, @event, @tok, @state] + end + end + end + + def initialize(...) + super + @lex_compat = Prism.lex_compat(@source, filepath: filename, line: lineno) + end + + # Returns the lex_compat result wrapped in `Elem`. Errors are omitted. + # Since ripper is a streaming parser, tokens are expected to be emitted in the order + # that the parser encounters them. This is not implemented. + def parse(raise_errors: false) + if @lex_compat.failure? && raise_errors + raise SyntaxError, @lex_compat.errors.first.message + else + @lex_compat.value.map do |position, event, token, state| + Elem.new(position, event, token, state.to_int) + end + end + end + + # Similar to parse but ripper sorts the elements by position in the source. Also + # includes errors. Since prism does error recovery, in cases of syntax errors + # the result may differ greatly compared to ripper. + def scan(...) + parse(...) + end + # :startdoc: end end diff --git a/prism/prism.c b/prism/prism.c index b36a6da20493ba..b158e505b2dc82 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -12438,7 +12438,7 @@ expect1_opening(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t di pm_parser_err(parser, opening->start, opening->end, diag_id); - parser->previous.start = opening->end; + parser->previous.start = parser->previous.end; parser->previous.type = PM_TOKEN_MISSING; } diff --git a/symbol.c b/symbol.c index 11602ee33b7c5d..f0316ddd609838 100644 --- a/symbol.c +++ b/symbol.c @@ -9,6 +9,7 @@ **********************************************************************/ +#include "darray.h" #include "internal.h" #include "internal/concurrent_set.h" #include "internal/error.h" @@ -87,12 +88,6 @@ Init_op_tbl(void) static const int ID_ENTRY_UNIT = 512; -enum id_entry_type { - ID_ENTRY_STR, - ID_ENTRY_SYM, - ID_ENTRY_SIZE -}; - typedef struct { rb_atomic_t next_id; VALUE sym_set; @@ -169,6 +164,62 @@ sym_set_cmp(VALUE a, VALUE b) return rb_str_hash_cmp(sym_set_sym_get_str(a), sym_set_sym_get_str(b)) == false; } +struct sym_id_entry { + VALUE sym; + VALUE str; +}; + +static void +sym_id_entry_list_mark(void *ptr) +{ + rb_darray(struct sym_id_entry) ary = ptr; + + struct sym_id_entry *entry; + rb_darray_foreach(ary, i, entry) { + // sym must be pinned because it may be used in places that don't + // support compaction + rb_gc_mark(entry->sym); + rb_gc_mark_movable(entry->str); + } +} + +static void +sym_id_entry_list_free(void *ptr) +{ + rb_darray(struct sym_id_entry) ary = ptr; + + rb_darray_free(ary); +} + +static size_t +sym_id_entry_list_memsize(const void *ptr) +{ + const rb_darray(struct sym_id_entry) ary = ptr; + + return rb_darray_memsize(ary); +} + +static void +sym_id_entry_list_compact(void *ptr) +{ + rb_darray(struct sym_id_entry) ary = ptr; + + struct sym_id_entry *entry; + rb_darray_foreach(ary, i, entry) { + entry->str = rb_gc_location(entry->str); + } +} + +static const rb_data_type_t sym_id_entry_list_type = { + "symbol_id_entry_list", + { + sym_id_entry_list_mark, + sym_id_entry_list_free, + sym_id_entry_list_memsize, + sym_id_entry_list_compact, + }, + 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED +}; static int sym_check_asciionly(VALUE str, bool fake_str) @@ -231,14 +282,24 @@ set_id_entry(rb_symbols_t *symbols, rb_id_serial_t num, VALUE str, VALUE sym) size_t idx = num / ID_ENTRY_UNIT; - VALUE ary, ids = symbols->ids; - if (idx >= (size_t)RARRAY_LEN(ids) || NIL_P(ary = rb_ary_entry(ids, (long)idx))) { - ary = rb_ary_hidden_new(ID_ENTRY_UNIT * ID_ENTRY_SIZE); - rb_ary_store(ids, (long)idx, ary); + VALUE id_entry_list, ids = symbols->ids; + rb_darray(struct sym_id_entry) entries; + if (idx >= (size_t)RARRAY_LEN(ids) || NIL_P(id_entry_list = rb_ary_entry(ids, (long)idx))) { + rb_darray_make(&entries, ID_ENTRY_UNIT); + id_entry_list = TypedData_Wrap_Struct(0, &sym_id_entry_list_type, entries); + rb_ary_store(ids, (long)idx, id_entry_list); + } + else { + entries = RTYPEDDATA_GET_DATA(id_entry_list); } - idx = (num % ID_ENTRY_UNIT) * ID_ENTRY_SIZE; - rb_ary_store(ary, (long)idx + ID_ENTRY_STR, str); - rb_ary_store(ary, (long)idx + ID_ENTRY_SYM, sym); + + idx = num % ID_ENTRY_UNIT; + struct sym_id_entry *entry = rb_darray_ref(entries, idx); + RUBY_ASSERT(entry->str == 0); + RUBY_ASSERT(entry->sym == 0); + + RB_OBJ_WRITE(id_entry_list, &entry->str, str); + RB_OBJ_WRITE(id_entry_list, &entry->sym, sym); } static VALUE @@ -394,7 +455,7 @@ rb_free_global_symbol_table(void) } WARN_UNUSED_RESULT(static ID lookup_str_id(VALUE str)); -WARN_UNUSED_RESULT(static VALUE lookup_id_str(ID id)); +WARN_UNUSED_RESULT(static VALUE get_id_str(ID id)); ID rb_id_attrset(ID id) @@ -419,7 +480,7 @@ rb_id_attrset(ID id) return id; default: { - VALUE str = lookup_id_str(id); + VALUE str = get_id_str(id); if (str != 0) { rb_name_error(id, "cannot make unknown type ID %d:%"PRIsVALUE" attrset", scope, str); @@ -434,7 +495,7 @@ rb_id_attrset(ID id) bool error = false; /* make new symbol and ID */ - VALUE str = lookup_id_str(id); + VALUE str = get_id_str(id); if (str) { str = rb_str_dup(str); rb_str_cat(str, "=", 1); @@ -705,75 +766,60 @@ rb_enc_symname2_p(const char *name, long len, rb_encoding *enc) return rb_enc_symname_type(name, len, enc, IDSET_ATTRSET_FOR_SYNTAX) != -1; } -static VALUE -get_id_serial_entry(rb_id_serial_t num, ID id, const enum id_entry_type t) +static struct sym_id_entry * +get_id_serial_entry(rb_id_serial_t num) { - VALUE result = 0; + struct sym_id_entry *entry = NULL; GLOBAL_SYMBOLS_LOCKING(symbols) { if (num && num < RUBY_ATOMIC_LOAD(symbols->next_id)) { size_t idx = num / ID_ENTRY_UNIT; VALUE ids = symbols->ids; - VALUE ary; - if (idx < (size_t)RARRAY_LEN(ids) && !NIL_P(ary = rb_ary_entry(ids, (long)idx))) { - long pos = (long)(num % ID_ENTRY_UNIT) * ID_ENTRY_SIZE; - result = rb_ary_entry(ary, pos + t); + VALUE id_entry_list; + if (idx < (size_t)RARRAY_LEN(ids) && !NIL_P(id_entry_list = rb_ary_entry(ids, (long)idx))) { + rb_darray(struct sym_id_entry) entries = RTYPEDDATA_GET_DATA(id_entry_list); - if (NIL_P(result)) { - result = 0; - } - else if (CHECK_ID_SERIAL) { - if (id) { - VALUE sym = result; - if (t != ID_ENTRY_SYM) - sym = rb_ary_entry(ary, pos + ID_ENTRY_SYM); - if (STATIC_SYM_P(sym)) { - if (STATIC_SYM2ID(sym) != id) result = 0; - } - else { - if (RSYMBOL(sym)->id != id) result = 0; - } - } - } + size_t pos = (size_t)(num % ID_ENTRY_UNIT); + RUBY_ASSERT(pos < rb_darray_size(entries)); + entry = rb_darray_ref(entries, pos); } } } - if (result) { - switch (t) { - case ID_ENTRY_STR: - RUBY_ASSERT_BUILTIN_TYPE(result, T_STRING); - break; - case ID_ENTRY_SYM: - RUBY_ASSERT_BUILTIN_TYPE(result, T_SYMBOL); - break; - default: - break; - } - } + return entry; +} - return result; +static VALUE +get_id_sym(ID id) +{ + struct sym_id_entry *entry = get_id_serial_entry(rb_id_to_serial(id)); + return entry ? entry->sym : 0; } static VALUE -get_id_entry(ID id, const enum id_entry_type t) +get_id_str(ID id) { - return get_id_serial_entry(rb_id_to_serial(id), id, t); + struct sym_id_entry *entry = get_id_serial_entry(rb_id_to_serial(id)); + return entry ? entry->str : 0; } int rb_static_id_valid_p(ID id) { - return STATIC_ID2SYM(id) == get_id_entry(id, ID_ENTRY_SYM); + return STATIC_ID2SYM(id) == get_id_sym(id); } static inline ID rb_id_serial_to_id(rb_id_serial_t num) { if (is_notop_id((ID)num)) { - VALUE sym = get_id_serial_entry(num, 0, ID_ENTRY_SYM); - if (sym) return SYM2ID(sym); - return ((ID)num << ID_SCOPE_SHIFT) | ID_INTERNAL | ID_STATIC_SYM; + struct sym_id_entry *entry = get_id_serial_entry(num); + if (entry && entry->sym != 0) { + return SYM2ID(entry->sym); + } + else { + return ((ID)num << ID_SCOPE_SHIFT) | ID_INTERNAL | ID_STATIC_SYM; + } } else { return (ID)num; @@ -836,12 +882,6 @@ lookup_str_id(VALUE str) return (ID)0; } -static VALUE -lookup_id_str(ID id) -{ - return get_id_entry(id, ID_ENTRY_STR); -} - ID rb_intern3(const char *name, long len, rb_encoding *enc) { @@ -974,7 +1014,7 @@ VALUE rb_id2sym(ID x) { if (!DYNAMIC_ID_P(x)) return STATIC_ID2SYM(x); - return get_id_entry(x, ID_ENTRY_SYM); + return get_id_sym(x); } /* @@ -1008,7 +1048,7 @@ rb_sym2str(VALUE sym) VALUE rb_id2str(ID id) { - return lookup_id_str(id); + return get_id_str(id); } const char * diff --git a/test/prism/errors_test.rb b/test/prism/errors_test.rb index 706b7395574e05..aa264ae5b7a08f 100644 --- a/test/prism/errors_test.rb +++ b/test/prism/errors_test.rb @@ -82,6 +82,11 @@ def test_regexp_encoding_option_mismatch_error assert_empty result.errors end + def test_incomplete_def_closing_loc + statement = Prism.parse_statement("def f; 123") + assert_empty(statement.end_keyword) + end + private def assert_errors(filepath, version) diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb index bbd85585a9237e..2bd9c2fe4af5eb 100644 --- a/test/prism/ruby/ripper_test.rb +++ b/test/prism/ruby/ripper_test.rb @@ -38,7 +38,7 @@ class RipperTest < TestCase end # Skip these tests that we haven't implemented yet. - omitted = [ + omitted_sexp_raw = [ "dos_endings.txt", "heredocs_with_fake_newlines.txt", "heredocs_with_ignored_newlines.txt", @@ -59,8 +59,29 @@ class RipperTest < TestCase "whitequark/slash_newline_in_heredocs.txt" ] - Fixture.each_for_current_ruby(except: incorrect | omitted) do |fixture| - define_method(fixture.test_name) { assert_ripper(fixture.read) } + omitted_lexer_parse = [ + "comments.txt", + "heredoc_percent_q_newline_delimiter.txt", + "heredoc_with_escaped_newline_at_start.txt", + "heredocs_with_fake_newlines.txt", + "indented_file_end.txt", + "seattlerb/TestRubyParserShared.txt", + "seattlerb/class_comments.txt", + "seattlerb/module_comments.txt", + "seattlerb/parse_line_block_inline_comment_leading_newlines.txt", + "seattlerb/parse_line_block_inline_multiline_comment.txt", + "spanning_heredoc_newlines.txt", + "strings.txt", + "whitequark/dedenting_heredoc.txt", + "whitequark/procarg0.txt", + ] + + Fixture.each_for_current_ruby(except: incorrect | omitted_sexp_raw) do |fixture| + define_method("#{fixture.test_name}_sexp_raw") { assert_ripper_sexp_raw(fixture.read) } + end + + Fixture.each_for_current_ruby(except: incorrect | omitted_lexer_parse) do |fixture| + define_method("#{fixture.test_name}_lexer_parse") { assert_ripper_lexer_parse(fixture.read) } end # Check that the hardcoded values don't change without us noticing. @@ -76,8 +97,27 @@ def test_internals private - def assert_ripper(source) + def assert_ripper_sexp_raw(source) assert_equal Ripper.sexp_raw(source), Prism::Translation::Ripper.sexp_raw(source) end + + def assert_ripper_lexer_parse(source) + prism = Translation::Ripper::Lexer.new(source).parse + ripper = Ripper::Lexer.new(source).parse + ripper.reject! { |elem| elem.event == :on_sp } # Prism doesn't emit on_sp + ripper.sort_by!(&:pos) # Prism emits tokens by their order in the code, not in parse order + + [prism.size, ripper.size].max.times do |i| + expected = ripper[i].to_a + actual = prism[i].to_a + # Since tokens related to heredocs are not emitted in the same order, + # the state also doesn't line up. + if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end + expected[3] = actual[3] = nil + end + + assert_equal(expected, actual) + end + end end end