Skip to content

Commit 27c24fd

Browse files
Earlopain authored and kddnewton committed
Fix lexing for unterminated strings/heredocs etc.
When we hit EOF and still have lex modes left, it means some content was unterminated. Heredocs specifically have logic that needs to run once the body has finished lexing. If we don't reset the mode back to how it was before, the lexer will not continue at the correct place. Follow-up to #3918. We can't call into `parser_lex` since it resets token locations.
1 parent eb29d03 commit 27c24fd

File tree

5 files changed

+107
-4
lines changed

5 files changed

+107
-4
lines changed

src/prism.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9783,6 +9783,12 @@ parser_lex(pm_parser_t *parser) {
97839783
unsigned int semantic_token_seen = parser->semantic_token_seen;
97849784
parser->semantic_token_seen = true;
97859785

9786+
// We'll jump to this label when we are about to encounter an EOF.
9787+
// If we still have lex_modes on the stack, we pop them so that cleanup
9788+
// can happen. For example, we should still continue parsing after a heredoc
9789+
// identifier, even if the heredoc body was syntactically invalid.
9790+
switch_lex_modes:
9791+
97869792
switch (parser->lex_modes.current->mode) {
97879793
case PM_LEX_DEFAULT:
97889794
case PM_LEX_EMBEXPR:
@@ -9856,6 +9862,14 @@ parser_lex(pm_parser_t *parser) {
98569862
// We'll check if we're at the end of the file. If we are, then we
98579863
// need to return the EOF token.
98589864
if (parser->current.end >= parser->end) {
9865+
// We may be missing closing tokens. We should pop modes one by one
9866+
// to do the appropriate cleanup like moving next_start for heredocs.
9867+
// Only when no mode is remaining will we actually emit the EOF token.
9868+
if (parser->lex_modes.current->mode != PM_LEX_DEFAULT) {
9869+
lex_mode_pop(parser);
9870+
goto switch_lex_modes;
9871+
}
9872+
98599873
// If we hit EOF, but the EOF came immediately after a newline,
98609874
// set the start of the token to the newline. This way any EOF
98619875
// errors will be reported as happening on that line rather than
@@ -15433,7 +15447,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) {
1543315447
pm_token_t opening = parser->previous;
1543415448
pm_statements_node_t *statements = NULL;
1543515449

15436-
if (!match1(parser, PM_TOKEN_EMBEXPR_END)) {
15450+
if (!match3(parser, PM_TOKEN_EMBEXPR_END, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
1543715451
pm_accepts_block_stack_push(parser, true);
1543815452
statements = parse_statements(parser, PM_CONTEXT_EMBEXPR, (uint16_t) (depth + 1));
1543915453
pm_accepts_block_stack_pop(parser);
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<<A+B
2+
^ unterminated heredoc; can't find string "A" anywhere before EOF
3+
^ unexpected '+', ignoring it
4+
^ unterminated heredoc; can't find string "A" anywhere before EOF
5+
#{C
6+
^ unexpected heredoc ending; expected an argument
7+
^ unexpected heredoc ending, expecting end-of-input
8+
^ unexpected heredoc ending, ignoring it
9+
^ unexpected end-of-input, assuming it is closing the parent top level context
10+
^ expected a `}` to close the embedded expression
11+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<<A+B
2+
^ unterminated heredoc; can't find string "A" anywhere before EOF
3+
#{C + "#{"}
4+
^ unterminated string meets end of file
5+
^ unexpected end-of-input, assuming it is closing the parent top level context
6+
^ expected a `}` to close the embedded expression
7+
^ unterminated string; expected a closing delimiter for the interpolated string
8+
^ expected a `}` to close the embedded expression
9+

test/prism/errors_test.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,28 @@ def test_incomplete_def_closing_loc
8787
assert_nil(statement.end_keyword)
8888
end
8989

90+
def test_unclosed_interpolation
91+
statement = Prism.parse_statement("\"\#{")
92+
assert_equal('"', statement.opening)
93+
assert_nil(statement.closing)
94+
95+
assert_equal(1, statement.parts.count)
96+
assert_equal('#{', statement.parts[0].opening)
97+
assert_equal("", statement.parts[0].closing)
98+
assert_nil(statement.parts[0].statements)
99+
end
100+
101+
def test_unclosed_heredoc_and_interpolation
102+
statement = Prism.parse_statement("<<D\n\#{")
103+
assert_equal("<<D", statement.opening)
104+
assert_nil(statement.closing)
105+
106+
assert_equal(1, statement.parts.count)
107+
assert_equal('#{', statement.parts[0].opening)
108+
assert_equal("", statement.parts[0].closing)
109+
assert_nil(statement.parts[0].statements)
110+
end
111+
90112
private
91113

92114
def assert_errors(filepath, version)

test/prism/lex_test.rb

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,58 @@ def test_parse_lex_file
4848
end
4949

5050
if RUBY_VERSION >= "3.3"
51-
def test_lex_compare
52-
prism = Prism.lex_compat(File.read(__FILE__), version: "current").value
53-
ripper = Ripper.lex(File.read(__FILE__))
51+
def test_lex_compat
52+
source = "foo bar"
53+
prism = Prism.lex_compat(source, version: "current").value
54+
ripper = Ripper.lex(source)
5455
assert_equal(ripper, prism)
5556
end
5657
end
58+
59+
def test_lex_interpolation_unterminated
60+
assert_equal(
61+
%i[STRING_BEGIN EMBEXPR_BEGIN EOF],
62+
token_types('"#{')
63+
)
64+
65+
assert_equal(
66+
%i[STRING_BEGIN EMBEXPR_BEGIN IGNORED_NEWLINE EOF],
67+
token_types('"#{' + "\n")
68+
)
69+
end
70+
71+
def test_lex_interpolation_unterminated_with_content
72+
# FIXME: Emits EOF twice.
73+
assert_equal(
74+
%i[STRING_BEGIN EMBEXPR_BEGIN CONSTANT EOF EOF],
75+
token_types('"#{C')
76+
)
77+
78+
assert_equal(
79+
%i[STRING_BEGIN EMBEXPR_BEGIN CONSTANT NEWLINE EOF],
80+
token_types('"#{C' + "\n")
81+
)
82+
end
83+
84+
def test_lex_heredoc_unterminated
85+
code = <<~'RUBY'.strip
86+
<<A+B
87+
#{C
88+
RUBY
89+
90+
assert_equal(
91+
%i[HEREDOC_START EMBEXPR_BEGIN CONSTANT HEREDOC_END PLUS CONSTANT NEWLINE EOF],
92+
token_types(code)
93+
)
94+
95+
assert_equal(
96+
%i[HEREDOC_START EMBEXPR_BEGIN CONSTANT NEWLINE HEREDOC_END PLUS CONSTANT NEWLINE EOF],
97+
token_types(code + "\n")
98+
)
99+
end
100+
101+
def token_types(code)
102+
Prism.lex(code).value.map { |token, _state| token.type }
103+
end
57104
end
58105
end

0 commit comments

Comments
 (0)