Skip to content

Commit 8bfbfe3

Browse files
committed
Fix str.splitlines to recognise unicode line terminators
1 parent 21615cb commit 8bfbfe3

File tree

5 files changed

+70
-6
lines changed

5 files changed

+70
-6
lines changed

Lib/test/test_codecs.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,6 @@ def check_partial(self, input, partialresults):
149149
"".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
150150
)
151151

152-
# TODO: RUSTPYTHON
153-
@unittest.expectedFailure
154152
def test_readline(self):
155153
def getreader(input):
156154
stream = io.BytesIO(input.encode(self.encoding))
@@ -463,6 +461,12 @@ class UTF32Test(ReadTest, unittest.TestCase):
463461
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
464462
b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
465463

464+
# TODO: RUSTPYTHON
465+
@unittest.expectedFailure
466+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
467+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
468+
469+
466470
# TODO: RUSTPYTHON
467471
@unittest.expectedFailure
468472
def test_only_one_bom(self):
@@ -593,6 +597,11 @@ class UTF32LETest(ReadTest, unittest.TestCase):
593597
encoding = "utf-32-le"
594598
ill_formed_sequence = b"\x80\xdc\x00\x00"
595599

600+
# TODO: RUSTPYTHON
601+
@unittest.expectedFailure
602+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
603+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
604+
596605
# TODO: RUSTPYTHON
597606
@unittest.expectedFailure
598607
def test_partial(self):
@@ -677,6 +686,11 @@ class UTF32BETest(ReadTest, unittest.TestCase):
677686
encoding = "utf-32-be"
678687
ill_formed_sequence = b"\x00\x00\xdc\x80"
679688

689+
# TODO: RUSTPYTHON
690+
@unittest.expectedFailure
691+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
692+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
693+
680694
# TODO: RUSTPYTHON
681695
@unittest.expectedFailure
682696
def test_partial(self):
@@ -1048,6 +1062,11 @@ def test_incremental_errors(self):
10481062
class UTF7Test(ReadTest, unittest.TestCase):
10491063
encoding = "utf-7"
10501064

1065+
# TODO: RUSTPYTHON
1066+
@unittest.expectedFailure
1067+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
1068+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
1069+
10511070
# TODO: RUSTPYTHON
10521071
@unittest.expectedFailure
10531072
def test_ascii(self):
@@ -2546,6 +2565,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
25462565
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
25472566
super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
25482567

2568+
# TODO: RUSTPYTHON
2569+
@unittest.expectedFailure
2570+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
2571+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
2572+
25492573
def test_empty(self):
25502574
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
25512575
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2683,6 +2707,11 @@ class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
26832707
def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
26842708
super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
26852709

2710+
# TODO: RUSTPYTHON
2711+
@unittest.expectedFailure
2712+
def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
2713+
super().test_readline() # TODO: RUSTPYTHON, remove when this passes
2714+
26862715
def test_empty(self):
26872716
self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
26882717
self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

extra_tests/snippets/builtin_str.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,8 @@
171171
assert 'hello\nhallo\nHallo\n'.splitlines() == ['hello', 'hallo', 'Hallo']
172172
assert 'hello\nhallo\nHallo'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo']
173173
assert 'hello\nhallo\nHallo\n'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo\n']
174+
assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines() == ['hello', 'hallo', 'Hallo', 'HELLO', 'hoho', 'haha', 'another', 'yetanother', 'last', '.']
175+
assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines(keepends=True) == ['hello\x0b', 'hallo\x0c', 'Hallo\x1c', 'HELLO\x1d', 'hoho\x1e', 'haha\x85', 'another\u2028', 'yetanother\u2029', 'last\r\n', '.']
174176
assert 'abc\t12345\txyz'.expandtabs() == 'abc 12345 xyz'
175177
assert '-'.join(['1', '2', '3']) == '1-2-3'
176178
assert 'HALLO'.isupper()

vm/src/anystr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ pub trait AnyStr {
376376
}
377377
}
378378

379-
fn py_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
379+
fn py_bytes_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
380380
where
381381
FW: Fn(&Self) -> W,
382382
{

vm/src/builtins/str.rs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -893,8 +893,41 @@ impl PyStr {
893893

894894
#[pymethod]
895895
fn splitlines(&self, args: anystr::SplitLinesArgs, vm: &VirtualMachine) -> Vec<PyObjectRef> {
896-
self.as_str()
897-
.py_splitlines(args, |s| self.new_substr(s.to_owned()).to_pyobject(vm))
896+
let into_wrapper = |s: &str| self.new_substr(s.to_owned()).to_pyobject(vm);
897+
let mut elements = Vec::new();
898+
let mut last_i = 0;
899+
let self_str = self.as_str();
900+
let mut enumerated = self_str.char_indices().peekable();
901+
while let Some((i, ch)) = enumerated.next() {
902+
let end_len = match ch {
903+
'\n' => 1,
904+
'\r' => {
905+
let is_rn = enumerated.peek().map_or(false, |(_, ch)| *ch == '\n');
906+
if is_rn {
907+
let _ = enumerated.next();
908+
2
909+
} else {
910+
1
911+
}
912+
}
913+
'\x0b' | '\x0c' | '\x1c' | '\x1d' | '\x1e' | '\u{0085}' | '\u{2028}'
914+
| '\u{2029}' => ch.len_utf8(),
915+
_ => {
916+
continue;
917+
}
918+
};
919+
let range = if args.keepends {
920+
last_i..i + end_len
921+
} else {
922+
last_i..i
923+
};
924+
last_i = i + end_len;
925+
elements.push(into_wrapper(&self_str[range]));
926+
}
927+
if last_i != self_str.len() {
928+
elements.push(into_wrapper(&self_str[last_i..]));
929+
}
930+
elements
898931
}
899932

900933
#[pymethod]

vm/src/bytesinner.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -716,7 +716,7 @@ impl PyBytesInner {
716716
where
717717
FW: Fn(&[u8]) -> W,
718718
{
719-
self.elements.py_splitlines(options, into_wrapper)
719+
self.elements.py_bytes_splitlines(options, into_wrapper)
720720
}
721721

722722
pub fn zfill(&self, width: isize) -> Vec<u8> {

0 commit comments

Comments
 (0)