Fix str.splitlines to recognise Unicode line terminators

r3m0t · r3m0t · commit 8bfbfe3e4672 · 2023-03-18T01:10:59.000Z
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
@@ -149,8 +149,6 @@ def check_partial(self, input, partialresults):
             "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
         )
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_readline(self):
         def getreader(input):
             stream = io.BytesIO(input.encode(self.encoding))
@@ -463,6 +461,12 @@ class UTF32Test(ReadTest, unittest.TestCase):
               b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
               b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_only_one_bom(self):
@@ -593,6 +597,11 @@ class UTF32LETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-le"
     ill_formed_sequence = b"\x80\xdc\x00\x00"
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_partial(self):
@@ -677,6 +686,11 @@ class UTF32BETest(ReadTest, unittest.TestCase):
     encoding = "utf-32-be"
     ill_formed_sequence = b"\x00\x00\xdc\x80"
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_partial(self):
@@ -1048,6 +1062,11 @@ def test_incremental_errors(self):
 class UTF7Test(ReadTest, unittest.TestCase):
     encoding = "utf-7"
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
     # TODO: RUSTPYTHON
     @unittest.expectedFailure
     def test_ascii(self):
@@ -2546,6 +2565,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
     def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
         super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
     def test_empty(self):
         self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
         self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2683,6 +2707,11 @@ class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
     def test_incremental_surrogatepass(self): # TODO: RUSTPYTHON, remove when this passes
         super().test_incremental_surrogatepass() # TODO: RUSTPYTHON, remove when this passes
 
+    # TODO: RUSTPYTHON
+    @unittest.expectedFailure
+    def test_readline(self): # TODO: RUSTPYTHON, remove when this passes
+        super().test_readline() # TODO: RUSTPYTHON, remove when this passes
+
     def test_empty(self):
         self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
         self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py
@@ -171,6 +171,8 @@
 assert 'hello\nhallo\nHallo\n'.splitlines() == ['hello', 'hallo', 'Hallo']
 assert 'hello\nhallo\nHallo'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo']
 assert 'hello\nhallo\nHallo\n'.splitlines(keepends=True) == ['hello\n', 'hallo\n', 'Hallo\n']
+assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines() == ['hello', 'hallo', 'Hallo', 'HELLO', 'hoho', 'haha', 'another', 'yetanother', 'last', '.']
+assert 'hello\vhallo\x0cHallo\x1cHELLO\x1dhoho\x1ehaha\x85another\u2028yetanother\u2029last\r\n.'.splitlines(keepends=True) == ['hello\x0b', 'hallo\x0c', 'Hallo\x1c', 'HELLO\x1d', 'hoho\x1e', 'haha\x85', 'another\u2028', 'yetanother\u2029', 'last\r\n', '.']
 assert 'abc\t12345\txyz'.expandtabs() == 'abc     12345   xyz'
 assert '-'.join(['1', '2', '3']) == '1-2-3'
 assert 'HALLO'.isupper()
diff --git a/vm/src/anystr.rs b/vm/src/anystr.rs
@@ -376,7 +376,7 @@ pub trait AnyStr {
         }
     }
 
-    fn py_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
+    fn py_bytes_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>
     where
         FW: Fn(&Self) -> W,
     {
diff --git a/vm/src/builtins/str.rs b/vm/src/builtins/str.rs
@@ -893,8 +893,41 @@ impl PyStr {
 
     #[pymethod]
     fn splitlines(&self, args: anystr::SplitLinesArgs, vm: &VirtualMachine) -> Vec<PyObjectRef> {
-        self.as_str()
-            .py_splitlines(args, |s| self.new_substr(s.to_owned()).to_pyobject(vm))
+        let into_wrapper = |s: &str| self.new_substr(s.to_owned()).to_pyobject(vm);
+        let mut elements = Vec::new();
+        let mut last_i = 0;
+        let self_str = self.as_str();
+        let mut enumerated = self_str.char_indices().peekable();
+        while let Some((i, ch)) = enumerated.next() {
+            let end_len = match ch {
+                '\n' => 1,
+                '\r' => {
+                    let is_rn = enumerated.peek().map_or(false, |(_, ch)| *ch == '\n');
+                    if is_rn {
+                        let _ = enumerated.next();
+                        2
+                    } else {
+                        1
+                    }
+                }
+                '\x0b' | '\x0c' | '\x1c' | '\x1d' | '\x1e' | '\u{0085}' | '\u{2028}'
+                | '\u{2029}' => ch.len_utf8(),
+                _ => {
+                    continue;
+                }
+            };
+            let range = if args.keepends {
+                last_i..i + end_len
+            } else {
+                last_i..i
+            };
+            last_i = i + end_len;
+            elements.push(into_wrapper(&self_str[range]));
+        }
+        if last_i != self_str.len() {
+            elements.push(into_wrapper(&self_str[last_i..]));
+        }
+        elements
     }
 
     #[pymethod]
diff --git a/vm/src/bytesinner.rs b/vm/src/bytesinner.rs
@@ -716,7 +716,7 @@ impl PyBytesInner {
     where
         FW: Fn(&[u8]) -> W,
     {
-        self.elements.py_splitlines(options, into_wrapper)
+        self.elements.py_bytes_splitlines(options, into_wrapper)
     }
 
     pub fn zfill(&self, width: isize) -> Vec<u8> {

Original file line number	Diff line number	Diff line change
`@@ -376,7 +376,7 @@ pub trait AnyStr {`
`376`	`376`	`}`
`377`	`377`	`}`
`378`	`378`
`379`		`- fn py_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>`
	`379`	`+ fn py_bytes_splitlines<FW, W>(&self, options: SplitLinesArgs, into_wrapper: FW) -> Vec<W>`
`380`	`380`	`where`
`381`	`381`	`FW: Fn(&Self) -> W,`
`382`	`382`	`{`
Original file line number	Diff line number	Diff line change
`@@ -716,7 +716,7 @@ impl PyBytesInner {`
`716`	`716`	`where`
`717`	`717`	`FW: Fn(&[u8]) -> W,`
`718`	`718`	`{`
`719`		`- self.elements.py_splitlines(options, into_wrapper)`
	`719`	`+ self.elements.py_bytes_splitlines(options, into_wrapper)`
`720`	`720`	`}`
`721`	`721`
`722`	`722`	`pub fn zfill(&self, width: isize) -> Vec<u8> {`