Fix word boundary handling

lkollar · lkollar · commit 17ef56b92589 · 2025-11-22T13:29:08.000Z
In emacs mode _ is a word boundary, but in vi mode it is a word character (part of the word).
diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py
@@ -57,6 +57,10 @@ def make_default_syntax_table() -> dict[str, int]:
     return st
 
 
+def _is_vi_word_char(c: str) -> bool:  # vi word class: alphanumerics plus '_'
+    return c.isalnum() or c == '_'
+
+
 def make_default_commands() -> dict[CommandName, type[Command]]:
     result: dict[CommandName, type[Command]] = {}
     all_commands = itertools.chain(vars(commands).values(), vars(vi_commands).values())
@@ -512,24 +516,23 @@ def vi_eow(self, p: int | None = None) -> int:
         following p most immediately (vi 'e' semantics).
 
         Unlike eow(), this returns the position ON the last word character,
-        not past it. p defaults to self.pos; word boundaries are determined
-        using self.syntax_table."""
+        not past it. p defaults to self.pos; word boundaries use vi rules
+        (alphanumeric + underscore)."""
         if p is None:
             p = self.pos
-        st = self.syntax_table
         b = self.buffer
 
         # If we're already at the end of a word, move past it
-        if (p < len(b) and st.get(b[p], SYNTAX_WORD) == SYNTAX_WORD and
-            (p + 1 >= len(b) or st.get(b[p + 1], SYNTAX_WORD) != SYNTAX_WORD)):
+        if (p < len(b) and _is_vi_word_char(b[p]) and
+            (p + 1 >= len(b) or not _is_vi_word_char(b[p + 1]))):
             p += 1
 
         # Skip non-word characters to find the start of next word
-        while p < len(b) and st.get(b[p], SYNTAX_WORD) != SYNTAX_WORD:
+        while p < len(b) and not _is_vi_word_char(b[p]):
             p += 1
 
         # Move to the last character of this word (not past it)
-        while p + 1 < len(b) and st.get(b[p + 1], SYNTAX_WORD) == SYNTAX_WORD:
+        while p + 1 < len(b) and _is_vi_word_char(b[p + 1]):
             p += 1
 
         # Clamp to valid buffer range
@@ -540,24 +543,41 @@ def vi_forward_word(self, p: int | None = None) -> int:
         (vi 'w' semantics).
 
         Unlike eow(), this lands ON the first character of the next word,
-        not past it. p defaults to self.pos; word boundaries are determined
-        using self.syntax_table."""
+        not past it. p defaults to self.pos; word boundaries use vi rules
+        (alphanumeric + underscore)."""
         if p is None:
             p = self.pos
-        st = self.syntax_table
         b = self.buffer
 
         # Skip the rest of the current word if we're on one
-        while p < len(b) and st.get(b[p], SYNTAX_WORD) == SYNTAX_WORD:
+        while p < len(b) and _is_vi_word_char(b[p]):
             p += 1
 
         # Skip non-word characters to find the start of next word
-        while p < len(b) and st.get(b[p], SYNTAX_WORD) != SYNTAX_WORD:
+        while p < len(b) and not _is_vi_word_char(b[p]):
             p += 1
 
         # Clamp to valid buffer range
         return min(p, len(b) - 1) if b else 0
 
+    def vi_bow(self, p: int | None = None) -> int:
+        """Return the 0-based index of the beginning of the word preceding p
+        (vi 'b' semantics), or 0 when no word character precedes p.
+
+        p defaults to self.pos; a word is a run of characters accepted by
+        _is_vi_word_char (alphanumeric + underscore)."""
+        if p is None:
+            p = self.pos
+        b = self.buffer
+        p -= 1  # start scanning at the character just before p
+        # Walk left over any non-word characters between p and the word
+        while p >= 0 and not _is_vi_word_char(b[p]):
+            p -= 1
+        # Walk left over the word itself; the loop stops one before its start
+        while p >= 0 and _is_vi_word_char(b[p]):
+            p -= 1
+        return p + 1  # p is one before the word start (or -1), so add 1
+
     def bol(self, p: int | None = None) -> int:
         """Return the 0-based index of the line break preceding p most
         immediately.
diff --git a/Lib/test/test_pyrepl/test_reader.py b/Lib/test/test_pyrepl/test_reader.py
@@ -911,7 +911,7 @@ def test_first_non_whitespace_character(self):
         self.assertEqual(reader2.buffer[reader2.pos], 't')
 
     def test_word_motion_edge_cases(self):
-        # Test with punctuation - underscore should be a word boundary
+        # Test with underscore - in vi mode, underscore IS a word character
         events = itertools.chain(
             code_to_events("hello_world"),
             [
@@ -921,8 +921,9 @@ def test_word_motion_edge_cases(self):
             ],
         )
         reader, _ = self._run_vi(events)
-        # 'w' moves to next word, underscore is not alphanumeric so treated as boundary
-        self.assertIn(reader.pos, [5, 6])  # Could be on '_' or 'w' depending on implementation
+        # In vi mode, underscore is part of word, so 'w' goes past end of "hello_world"
+        # which clamps to end of buffer (pos 10, on 'd')
+        self.assertEqual(reader.pos, 10)
 
         # Test 'e' at end of buffer stays in bounds
         events2 = itertools.chain(
@@ -977,6 +978,43 @@ def test_repeat_count_with_word_motions(self):
         # Should be at end of "beta"
         self.assertEqual(reader2.buffer[reader2.pos], 'a')  # Last 'a' of "beta"
 
+    def test_vi_word_boundaries(self):
+        """Test vi 'w' and 'e' motions treat underscore as a word character.
+
+        Word characters here are alphanumeric + underscore; all else is skipped.
+        """
+        # Test cases: (text, start_key_sequence, expected_pos, description)
+        test_cases = [
+            # Underscore is part of word in vi, unlike emacs mode
+            ("function_name", "0w", 12, "underscore is word char, w clamps to end"),
+            ("hello_world test", "0w", 12, "underscore word, then to next word"),
+            ("get_value(x)", "0w", 10, "underscore word, skip ( to x"),  # NOTE(review): vim's 'w' stops on '(' - confirm
+
+            # Basic word motion
+            ("hello world", "0w", 6, "basic word jump"),
+            ("one  two", "0w", 5, "double space handled"),
+            ("abc def ghi", "0ww", 8, "two w's"),
+
+            # End of word (e) - lands ON last char
+            ("function_name", "0e", 12, "e lands on last char of underscore word"),
+            ("foo bar", "0e", 2, "e lands on last char of foo"),
+            ("one two three", "0ee", 6, "two e's land on end of two"),
+        ]
+
+        for text, keys, expected_pos, desc in test_cases:
+            with self.subTest(text=text, keys=keys, desc=desc):
+                key_events = []
+                for k in keys:  # one key event per character of the key sequence
+                    key_events.append(Event(evt="key", data=k, raw=bytearray(k.encode())))
+                events = itertools.chain(
+                    code_to_events(text),
+                    [Event(evt="key", data="\x1b", raw=bytearray(b"\x1b"))],  # ESC
+                    key_events,
+                )
+                reader, _ = self._run_vi(events)
+                self.assertEqual(reader.pos, expected_pos,
+                    f"Expected pos {expected_pos} but got {reader.pos} for '{text}' with keys '{keys}'")
+
 
 @force_not_colorized_test_class
 class TestHistoricalReaderBindings(TestCase):