8 changes: 3 additions & 5 deletions lib/prism.rb
@@ -61,8 +61,7 @@ def initialize(version)
# Prism::lex_compat(source, **options) -> LexCompat::Result
#
# Returns a parse result whose value is an array of tokens that closely
# resembles the return value of Ripper::lex. The main difference is that the
# `:on_sp` token is not emitted.
# resembles the return value of Ripper::lex.
#
# For supported options, see Prism::parse.
def self.lex_compat(source, **options)
@@ -72,9 +71,8 @@ def self.lex_compat(source, **options)
# :call-seq:
# Prism::lex_ripper(source) -> Array
#
# This lexes with the Ripper lex. It drops any space events but otherwise
# returns the same tokens. Raises SyntaxError if the syntax in source is
# invalid.
# This wraps the result of Ripper.lex. It produces almost exactly the
# same tokens. Raises SyntaxError if the syntax in source is invalid.
def self.lex_ripper(source)
LexRipper.new(source).result # steep:ignore
end
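
With the note about dropped :on_sp tokens removed, lex_compat's output is meant to line up token-for-token with Ripper.lex. A minimal sketch of that comparison, assuming a simple single-line source with no heredocs or line continuations:

require "prism"
require "ripper"

source = "1 + 2"

# Both sides should now produce the same sequence of events, including the
# whitespace tokens (states can still differ for some token kinds).
prism_events  = Prism.lex_compat(source).value.map { |token| token[1] }
ripper_events = Ripper.lex(source).map { |token| token[1] }

prism_events == ripper_events # => expected to be true for this input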
101 changes: 93 additions & 8 deletions lib/prism/lex_compat.rb
@@ -226,7 +226,7 @@ def state
end

# Tokens where state should be ignored
# used for :on_comment, :on_heredoc_end, :on_embexpr_end
# used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
class IgnoreStateToken < Token
def ==(other) # :nodoc:
self[0...-1] == other[0...-1]
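
For context on why :on_sp joins this list: the == above compares every element except the trailing lexer state, so two whitespace tokens that differ only in state still count as equal. A self-contained sketch of that comparison, using plain arrays in place of the token class (the state symbols are illustrative):

token_a = [[1, 0], :on_sp, " ", :EXPR_BEG]
token_b = [[1, 0], :on_sp, " ", :EXPR_ARG]

# Mirrors IgnoreStateToken#==: drop the state (last element) before comparing.
token_a[0...-1] == token_b[0...-1] # => true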
@@ -611,10 +611,10 @@ def self.build(opening)
BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
private_constant :BOM_FLUSHED

attr_reader :source, :options
attr_reader :options

def initialize(source, **options)
@source = source
def initialize(code, **options)
@code = code
@options = options
end

@@ -624,12 +624,14 @@ def result
state = :default
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

result = Prism.lex(source, **options)
result = Prism.lex(@code, **options)
source = result.source
result_value = result.value
previous_state = nil #: State?
last_heredoc_end = nil #: Integer?
eof_token = nil

bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
bom = source.slice(0, 3) == "\xEF\xBB\xBF"

result_value.each_with_index do |(token, lex_state), index|
lineno = token.location.start_line
@@ -741,6 +743,7 @@ def result

Token.new([[lineno, column], event, value, lex_state])
when :on_eof
eof_token = token
previous_token = result_value[index - 1][0]

# If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@
end_offset += 3
end

tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
end
end

@@ -857,7 +860,89 @@ def result
# We sort by location to compare against Ripper's output
tokens.sort_by!(&:location)

Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
# Add :on_sp tokens
tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)

Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
end

def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
new_tokens = []

prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
prev_token_end = bom ? 3 : 0

tokens.each do |token|
line, column = token.location
start_offset = source.line_to_byte_offset(line) + column
# Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
start_offset += 3 if line == 1 && bom

if start_offset > prev_token_end
sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
sp_line = source.line(prev_token_end)
sp_column = source.column(prev_token_end)
# Ripper reports columns on line 1 without counting the BOM
sp_column -= 3 if sp_line == 1 && bom
continuation_index = sp_value.byteindex("\\")

# Ripper emits up to three :on_sp tokens when line continuations are used
if continuation_index
next_whitespace_index = continuation_index + 1
next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
next_whitespace_index += 1
first_whitespace = sp_value[0...continuation_index]
continuation = sp_value[continuation_index...next_whitespace_index]
second_whitespace = sp_value[next_whitespace_index..]

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
first_whitespace,
prev_token_state
]) unless first_whitespace.empty?

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column + continuation_index],
:on_sp,
continuation,
prev_token_state
])

new_tokens << IgnoreStateToken.new([
[sp_line + 1, 0],
:on_sp,
second_whitespace,
prev_token_state
]) unless second_whitespace.empty?
else
new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
sp_value,
prev_token_state
])
end
end

new_tokens << token
prev_token_state = token.state
prev_token_end = start_offset + token.value.bytesize
end

unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
end_offset = eof_token.location.end_offset
if prev_token_end < end_offset
new_tokens << IgnoreStateToken.new([
[source.line(prev_token_end), source.column(prev_token_end)],
:on_sp,
source.slice(prev_token_end, end_offset - prev_token_end),
prev_token_state
])
end
end

new_tokens
end
end
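
The line-continuation branch in add_on_sp_tokens mirrors how Ripper itself splits the whitespace around a backslash continuation into up to three :on_sp tokens. A sketch of what that looks like from Ripper directly (the output shape shown is expected, not verified here):

require "ripper"

Ripper.lex("1  \\\n  + 2").select { |token| token[1] == :on_sp }
# Expected: three :on_sp tokens --
#   [[1, 1], :on_sp, "  ", ...]    spaces before the backslash
#   [[1, 3], :on_sp, "\\\n", ...]  the continuation itself
#   [[2, 0], :on_sp, "  ", ...]    leading spaces on the next line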

2 changes: 0 additions & 2 deletions lib/prism/lex_ripper.rb
@@ -19,8 +19,6 @@ def result

lex(source).each do |token|
case token[1]
when :on_sp
# skip
when :on_tstring_content
if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
previous[2] << token[2]
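With the skip removed, lex_ripper now passes Ripper's :on_sp tokens straight through instead of dropping them. A quick sketch of the expected effect, assuming the change is applied:

require "prism"

Prism.lex_ripper("a = 1").map { |token| token[1] }
# Expected: [:on_ident, :on_sp, :on_op, :on_sp, :on_int]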
1 change: 1 addition & 0 deletions rakelib/typecheck.rake
@@ -51,6 +51,7 @@ namespace :typecheck do
--ignore=rakelib/
--ignore=Rakefile
--ignore=top-100-gems/
#{Dir.glob("*.rb").map { |f| "--ignore=/#{f}" }.join("\n")}
Collaborator: This is nice, thanks

# Treat all files as "typed: true" by default
--typed=true
# Use the typed-override file to revert some files to "typed: false"
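The interpolated glob adds one --ignore flag per .rb file sitting in the repository root, presumably so ad-hoc scratch scripts there don't break the type check. A sketch of what the expression expands to (the file names are hypothetical):

Dir.glob("*.rb").map { |f| "--ignore=/#{f}" }.join("\n")
# => "--ignore=/scratch.rb\n--ignore=/debug.rb"  (hypothetical listing)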
32 changes: 32 additions & 0 deletions snapshots/bom_leading_space.txt
@@ -0,0 +1,32 @@
@ ProgramNode (location: (1,4)-(1,10))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (1,4)-(1,10))
├── flags: ∅
└── body: (length: 1)
└── @ CallNode (location: (1,4)-(1,10))
├── flags: newline, ignore_visibility
├── receiver: ∅
├── call_operator_loc: ∅
├── name: :p
├── message_loc: (1,4)-(1,5) = "p"
├── opening_loc: ∅
├── arguments:
│ @ ArgumentsNode (location: (1,6)-(1,10))
│ ├── flags: ∅
│ └── arguments: (length: 1)
│ └── @ ParenthesesNode (location: (1,6)-(1,10))
│ ├── flags: ∅
│ ├── body:
│ │ @ StatementsNode (location: (1,7)-(1,9))
│ │ ├── flags: ∅
│ │ └── body: (length: 1)
│ │ └── @ IntegerNode (location: (1,7)-(1,9))
│ │ ├── flags: newline, static_literal, decimal
│ │ └── value: 42
│ ├── opening_loc: (1,6)-(1,7) = "("
│ └── closing_loc: (1,9)-(1,10) = ")"
├── closing_loc: ∅
├── equal_loc: ∅
└── block: ∅
32 changes: 32 additions & 0 deletions snapshots/bom_spaces.txt
@@ -0,0 +1,32 @@
@ ProgramNode (location: (1,3)-(1,11))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (1,3)-(1,11))
├── flags: ∅
└── body: (length: 1)
└── @ CallNode (location: (1,3)-(1,11))
├── flags: newline, ignore_visibility
├── receiver: ∅
├── call_operator_loc: ∅
├── name: :p
├── message_loc: (1,3)-(1,4) = "p"
├── opening_loc: ∅
├── arguments:
│ @ ArgumentsNode (location: (1,5)-(1,11))
│ ├── flags: ∅
│ └── arguments: (length: 1)
│ └── @ ParenthesesNode (location: (1,5)-(1,11))
│ ├── flags: ∅
│ ├── body:
│ │ @ StatementsNode (location: (1,7)-(1,9))
│ │ ├── flags: ∅
│ │ └── body: (length: 1)
│ │ └── @ IntegerNode (location: (1,7)-(1,9))
│ │ ├── flags: newline, static_literal, decimal
│ │ └── value: 42
│ ├── opening_loc: (1,5)-(1,6) = "("
│ └── closing_loc: (1,10)-(1,11) = ")"
├── closing_loc: ∅
├── equal_loc: ∅
└── block: ∅
1 change: 1 addition & 0 deletions test/prism/fixtures/bom_leading_space.txt
@@ -0,0 +1 @@
 p (42)
Collaborator: Committed the two snapshots as well. Basically they get created when running bundle exec rake.

1 change: 1 addition & 0 deletions test/prism/fixtures/bom_spaces.txt
@@ -0,0 +1 @@
p ( 42 )
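
These two fixtures exercise the BOM handling in lex_compat.rb: Ripper reports line-1 columns without counting the three BOM bytes, while Prism's locations are byte offsets that include them, which is why the bom_leading_space snapshot records the call starting at (1,4). A rough illustration, assuming a BOM-prefixed string is treated the same way as a file:

require "ripper"
require "prism"

source = "\xEF\xBB\xBF p (42)"  # BOM plus the bom_leading_space fixture

# Ripper: the "p" is expected at column 1, because the BOM bytes are not counted.
Ripper.lex(source).find { |token| token[1] == :on_ident }

# Prism: byte offsets include the BOM, so the call node starts at column 4.
Prism.parse(source).value.statements.body.first.location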
12 changes: 8 additions & 4 deletions test/prism/ruby/ripper_test.rb
@@ -39,6 +39,8 @@ class RipperTest < TestCase

# Skip these tests that we haven't implemented yet.
omitted_sexp_raw = [
"bom_leading_space.txt",
"bom_spaces.txt",
"dos_endings.txt",
"heredocs_with_fake_newlines.txt",
"heredocs_with_ignored_newlines.txt",
@@ -92,7 +94,7 @@ def test_lexer
assert_equal(expected, lexer.parse[0].to_a)
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)

assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
end

@@ -121,15 +123,17 @@ def assert_ripper_sexp_raw(source)
def assert_ripper_lex(source)
prism = Translation::Ripper.lex(source)
ripper = Ripper.lex(source)
ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order

# Prism emits tokens by their order in the code, not in parse order
ripper.sort_by! { |elem| elem[0] }

[prism.size, ripper.size].max.times do |i|
expected = ripper[i]
actual = prism[i]

# Since tokens related to heredocs are not emitted in the same order,
# the state also doesn't line up.
if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
expected[3] = actual[3] = nil
end

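For reference, the updated assertion above reflects the translation layer now reporting whitespace. A sketch, assuming this change is applied:

require "prism"

Prism::Translation::Ripper::Lexer.new("1 +").lex.map(&:event)
# => [:on_int, :on_sp, :on_op]  (per the updated assertion)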