8 changes: 3 additions & 5 deletions lib/prism.rb
@@ -61,8 +61,7 @@ def initialize(version)
# Prism::lex_compat(source, **options) -> LexCompat::Result
#
# Returns a parse result whose value is an array of tokens that closely
# resembles the return value of Ripper::lex. The main difference is that the
# `:on_sp` token is not emitted.
# resembles the return value of Ripper::lex.
#
# For supported options, see Prism::parse.
def self.lex_compat(source, **options)
@@ -72,9 +71,8 @@ def self.lex_compat(source, **options)
# :call-seq:
# Prism::lex_ripper(source) -> Array
#
# This lexes with the Ripper lex. It drops any space events but otherwise
# returns the same tokens. Raises SyntaxError if the syntax in source is
# invalid.
# This wraps the result of Ripper.lex. It produces almost exactly the
# same tokens. Raises SyntaxError if the syntax in source is invalid.
def self.lex_ripper(source)
LexRipper.new(source).result # steep:ignore
end
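
With the note about dropped :on_sp tokens removed, lex_compat's output is meant to line up token-for-token with Ripper.lex. A minimal sketch of that comparison, assuming a simple single-line source with no heredocs or line continuations:

require "prism"
require "ripper"

source = "1 + 2"

# Both sides should now produce the same sequence of events, including the
# whitespace tokens (states can still differ for some token kinds).
prism_events  = Prism.lex_compat(source).value.map { |token| token[1] }
ripper_events = Ripper.lex(source).map { |token| token[1] }

prism_events == ripper_events # => expected to be true for this input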
101 changes: 93 additions & 8 deletions lib/prism/lex_compat.rb
@@ -226,7 +226,7 @@ def state
end

# Tokens where state should be ignored
# used for :on_comment, :on_heredoc_end, :on_embexpr_end
# used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
class IgnoreStateToken < Token
def ==(other) # :nodoc:
self[0...-1] == other[0...-1]
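
For context on why :on_sp joins this list: the == above compares every element except the trailing lexer state, so two whitespace tokens that differ only in state still count as equal. A self-contained sketch of that comparison, using plain arrays in place of the token class (the state symbols are illustrative):

token_a = [[1, 0], :on_sp, " ", :EXPR_BEG]
token_b = [[1, 0], :on_sp, " ", :EXPR_ARG]

# Mirrors IgnoreStateToken#==: drop the state (last element) before comparing.
token_a[0...-1] == token_b[0...-1] # => true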
@@ -611,10 +611,10 @@ def self.build(opening)
BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
private_constant :BOM_FLUSHED

attr_reader :source, :options
attr_reader :options

def initialize(source, **options)
@source = source
def initialize(code, **options)
@code = code
@options = options
end

@@ -624,12 +624,14 @@ def result
state = :default
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

result = Prism.lex(source, **options)
result = Prism.lex(@code, **options)
source = result.source
result_value = result.value
previous_state = nil #: State?
last_heredoc_end = nil #: Integer?
eof_token = nil

bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
bom = source.slice(0, 3) == "\xEF\xBB\xBF"

result_value.each_with_index do |(token, lex_state), index|
lineno = token.location.start_line
@@ -741,6 +743,7 @@ def result

Token.new([[lineno, column], event, value, lex_state])
when :on_eof
eof_token = token
previous_token = result_value[index - 1][0]

# If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@
end_offset += 3
end

tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
end
end

@@ -857,7 +860,89 @@ def result
# We sort by location to compare against Ripper's output
tokens.sort_by!(&:location)

Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
# Add :on_sp tokens
tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)

Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
end

def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
new_tokens = []

prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
prev_token_end = bom ? 3 : 0

tokens.each do |token|
line, column = token.location
start_offset = source.line_to_byte_offset(line) + column
# Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
start_offset += 3 if line == 1 && bom

if start_offset > prev_token_end
sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
sp_line = source.line(prev_token_end)
sp_column = source.column(prev_token_end)
# Ripper reports columns on line 1 without counting the BOM
sp_column -= 3 if sp_line == 1 && bom
continuation_index = sp_value.byteindex("\\")

# Ripper emits up to three :on_sp tokens when line continuations are used
if continuation_index
next_whitespace_index = continuation_index + 1
next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
next_whitespace_index += 1
first_whitespace = sp_value[0...continuation_index]
continuation = sp_value[continuation_index...next_whitespace_index]
second_whitespace = sp_value[next_whitespace_index..]

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
first_whitespace,
prev_token_state
]) unless first_whitespace.empty?

new_tokens << IgnoreStateToken.new([
[sp_line, sp_column + continuation_index],
:on_sp,
continuation,
prev_token_state
])

new_tokens << IgnoreStateToken.new([
[sp_line + 1, 0],
:on_sp,
second_whitespace,
prev_token_state
]) unless second_whitespace.empty?
else
new_tokens << IgnoreStateToken.new([
[sp_line, sp_column],
:on_sp,
sp_value,
prev_token_state
])
end
end

new_tokens << token
prev_token_state = token.state
prev_token_end = start_offset + token.value.bytesize
end

unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
end_offset = eof_token.location.end_offset
if prev_token_end < end_offset
new_tokens << IgnoreStateToken.new([
[source.line(prev_token_end), source.column(prev_token_end)],
:on_sp,
source.slice(prev_token_end, end_offset - prev_token_end),
prev_token_state
])
end
end

new_tokens
end
end
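
The line-continuation branch in add_on_sp_tokens mirrors how Ripper itself splits the whitespace around a backslash continuation into up to three :on_sp tokens. A sketch of what that looks like from Ripper directly (the output shape shown is expected, not verified here):

require "ripper"

Ripper.lex("1  \\\n  + 2").select { |token| token[1] == :on_sp }
# Expected: three :on_sp tokens --
#   [[1, 1], :on_sp, "  ", ...]    spaces before the backslash
#   [[1, 3], :on_sp, "\\\n", ...]  the continuation itself
#   [[2, 0], :on_sp, "  ", ...]    leading spaces on the next line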

2 changes: 0 additions & 2 deletions lib/prism/lex_ripper.rb
@@ -19,8 +19,6 @@ def result

lex(source).each do |token|
case token[1]
when :on_sp
# skip
when :on_tstring_content
if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
previous[2] << token[2]
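With the skip removed, lex_ripper now passes Ripper's :on_sp tokens straight through instead of dropping them. A quick sketch of the expected effect, assuming the change is applied:

require "prism"

Prism.lex_ripper("a = 1").map { |token| token[1] }
# Expected: [:on_ident, :on_sp, :on_op, :on_sp, :on_int]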
1 change: 1 addition & 0 deletions rakelib/typecheck.rake
@@ -51,6 +51,7 @@ namespace :typecheck do
--ignore=rakelib/
--ignore=Rakefile
--ignore=top-100-gems/
#{Dir.glob("*.rb").map { |f| "--ignore=/#{f}" }.join("\n")}
Collaborator: This is nice, thanks

# Treat all files as "typed: true" by default
--typed=true
# Use the typed-override file to revert some files to "typed: false"
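The interpolated glob adds one --ignore flag per .rb file sitting in the repository root, presumably so ad-hoc scratch scripts there don't break the type check. A sketch of what the expression expands to (the file names are hypothetical):

Dir.glob("*.rb").map { |f| "--ignore=/#{f}" }.join("\n")
# => "--ignore=/scratch.rb\n--ignore=/debug.rb"  (hypothetical listing)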
32 changes: 32 additions & 0 deletions snapshots/bom_leading_space.txt
@@ -0,0 +1,32 @@
@ ProgramNode (location: (1,4)-(1,10))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (1,4)-(1,10))
├── flags: ∅
└── body: (length: 1)
└── @ CallNode (location: (1,4)-(1,10))
├── flags: newline, ignore_visibility
├── receiver: ∅
├── call_operator_loc: ∅
├── name: :p
├── message_loc: (1,4)-(1,5) = "p"
├── opening_loc: ∅
├── arguments:
│ @ ArgumentsNode (location: (1,6)-(1,10))
│ ├── flags: ∅
│ └── arguments: (length: 1)
│ └── @ ParenthesesNode (location: (1,6)-(1,10))
│ ├── flags: ∅
│ ├── body:
│ │ @ StatementsNode (location: (1,7)-(1,9))
│ │ ├── flags: ∅
│ │ └── body: (length: 1)
│ │ └── @ IntegerNode (location: (1,7)-(1,9))
│ │ ├── flags: newline, static_literal, decimal
│ │ └── value: 42
│ ├── opening_loc: (1,6)-(1,7) = "("
│ └── closing_loc: (1,9)-(1,10) = ")"
├── closing_loc: ∅
├── equal_loc: ∅
└── block: ∅
32 changes: 32 additions & 0 deletions snapshots/bom_spaces.txt
@@ -0,0 +1,32 @@
@ ProgramNode (location: (1,3)-(1,11))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (1,3)-(1,11))
├── flags: ∅
└── body: (length: 1)
└── @ CallNode (location: (1,3)-(1,11))
├── flags: newline, ignore_visibility
├── receiver: ∅
├── call_operator_loc: ∅
├── name: :p
├── message_loc: (1,3)-(1,4) = "p"
├── opening_loc: ∅
├── arguments:
│ @ ArgumentsNode (location: (1,5)-(1,11))
│ ├── flags: ∅
│ └── arguments: (length: 1)
│ └── @ ParenthesesNode (location: (1,5)-(1,11))
│ ├── flags: ∅
│ ├── body:
│ │ @ StatementsNode (location: (1,7)-(1,9))
│ │ ├── flags: ∅
│ │ └── body: (length: 1)
│ │ └── @ IntegerNode (location: (1,7)-(1,9))
│ │ ├── flags: newline, static_literal, decimal
│ │ └── value: 42
│ ├── opening_loc: (1,5)-(1,6) = "("
│ └── closing_loc: (1,10)-(1,11) = ")"
├── closing_loc: ∅
├── equal_loc: ∅
└── block: ∅
1 change: 1 addition & 0 deletions test/prism/fixtures/bom_leading_space.txt
@@ -0,0 +1 @@
 p (42)
Collaborator: Committed the two snapshots as well. Basically they get created when running bundle exec rake.

1 change: 1 addition & 0 deletions test/prism/fixtures/bom_spaces.txt
@@ -0,0 +1 @@
p ( 42 )
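
These two fixtures exercise the BOM handling in lex_compat.rb: Ripper reports line-1 columns without counting the three BOM bytes, while Prism's locations are byte offsets that include them, which is why the bom_leading_space snapshot records the call starting at (1,4). A rough illustration, assuming a BOM-prefixed string is treated the same way as a file:

require "ripper"
require "prism"

source = "\xEF\xBB\xBF p (42)"  # BOM plus the bom_leading_space fixture

# Ripper: the "p" is expected at column 1, because the BOM bytes are not counted.
Ripper.lex(source).find { |token| token[1] == :on_ident }

# Prism: byte offsets include the BOM, so the call node starts at column 4.
Prism.parse(source).value.statements.body.first.location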
12 changes: 8 additions & 4 deletions test/prism/ruby/ripper_test.rb
@@ -39,6 +39,8 @@ class RipperTest < TestCase

# Skip these tests that we haven't implemented yet.
omitted_sexp_raw = [
"bom_leading_space.txt",
"bom_spaces.txt",
"dos_endings.txt",
"heredocs_with_fake_newlines.txt",
"heredocs_with_ignored_newlines.txt",
@@ -92,7 +94,7 @@ def test_lexer
assert_equal(expected, lexer.parse[0].to_a)
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)

assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
end

@@ -121,15 +123,17 @@ def assert_ripper_sexp_raw(source)
def assert_ripper_lex(source)
prism = Translation::Ripper.lex(source)
ripper = Ripper.lex(source)
ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order

# Prism emits tokens by their order in the code, not in parse order
ripper.sort_by! { |elem| elem[0] }

[prism.size, ripper.size].max.times do |i|
expected = ripper[i]
actual = prism[i]

# Since tokens related to heredocs are not emitted in the same order,
# the state also doesn't line up.
if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
expected[3] = actual[3] = nil
end

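For reference, the updated assertion above reflects the translation layer now reporting whitespace. A sketch, assuming this change is applied:

require "prism"

Prism::Translation::Ripper::Lexer.new("1 +").lex.map(&:event)
# => [:on_int, :on_sp, :on_op]  (per the updated assertion)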