Handle punctuation as separate words in vi motions

lkollar · lkollar · commit 23babd6a9ec1 · 2025-11-22T13:29:25.000Z
Vi distinguishes three character classes: word characters (alphanumerics
and underscore), punctuation (any non-word, non-whitespace character), and
whitespace. The w, e, and b motions now treat a run of punctuation as a
word of its own, matching Vim's behavior.
diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py
@@ -521,46 +521,79 @@ def vi_eow(self, p: int | None = None) -> int:
         """Return the 0-based index of the last character of the word
         following p most immediately (vi 'e' semantics).
 
-        Unlike eow(), this returns the position ON the last word character,
-        not past it. p defaults to self.pos; word boundaries use vi rules
-        (alphanumeric + underscore)."""
+        Vi has three character classes: word chars (alnum + _), punctuation
+        (non-word, non-whitespace), and whitespace. 'e' moves to the end
+        of the current or next word/punctuation sequence."""
         if p is None:
             p = self.pos
         b = self.buffer
 
-        # If we're already at the end of a word, move past it
-        if (p < len(b) and _is_vi_word_char(b[p]) and
-            (p + 1 >= len(b) or not _is_vi_word_char(b[p + 1]))):
-            p += 1
+        if not b:
+            return 0
+
+        # Helper to check if at end of current sequence
+        def at_sequence_end(pos: int) -> bool:
+            if pos >= len(b) - 1:
+                return True
+            curr_is_word = _is_vi_word_char(b[pos])
+            next_is_word = _is_vi_word_char(b[pos + 1])
+            curr_is_space = b[pos].isspace()
+            next_is_space = b[pos + 1].isspace()
+            if curr_is_word:
+                return not next_is_word
+            elif not curr_is_space:
+                # Punctuation - at end if next is word or whitespace
+                return next_is_word or next_is_space
+            return True
 
-        # Skip non-word characters to find the start of next word
-        while p < len(b) and not _is_vi_word_char(b[p]):
+        # If already at end of a word/punctuation, move forward
+        if p < len(b) and at_sequence_end(p):
             p += 1
 
-        # Move to the last character of this word (not past it)
-        while p + 1 < len(b) and _is_vi_word_char(b[p + 1]):
+        # Skip whitespace
+        while p < len(b) and b[p].isspace():
             p += 1
 
-        # Clamp to valid buffer range
-        return min(p, len(b) - 1) if b else 0
+        if p >= len(b):
+            return len(b) - 1
+
+        # Move to end of current word or punctuation sequence
+        if _is_vi_word_char(b[p]):
+            while p + 1 < len(b) and _is_vi_word_char(b[p + 1]):
+                p += 1
+        else:
+            # Punctuation sequence
+            while p + 1 < len(b) and not _is_vi_word_char(b[p + 1]) and not b[p + 1].isspace():
+                p += 1
+
+        return min(p, len(b) - 1)
 
     def vi_forward_word(self, p: int | None = None) -> int:
         """Return the 0-based index of the first character of the next word
         (vi 'w' semantics).
 
-        Unlike eow(), this lands ON the first character of the next word,
-        not past it. p defaults to self.pos; word boundaries use vi rules
-        (alphanumeric + underscore)."""
+        Vi has three character classes: word chars (alnum + _), punctuation
+        (non-word, non-whitespace), and whitespace. 'w' moves to the start
+        of the next word or punctuation sequence."""
         if p is None:
             p = self.pos
         b = self.buffer
 
-        # Skip the rest of the current word if we're on one
-        while p < len(b) and _is_vi_word_char(b[p]):
-            p += 1
-
-        # Skip non-word characters to find the start of next word
-        while p < len(b) and not _is_vi_word_char(b[p]):
+        if not b or p >= len(b):
+            return max(0, len(b) - 1) if b else 0
+
+        # Skip current word or punctuation sequence
+        if _is_vi_word_char(b[p]):
+            # On a word char - skip word chars
+            while p < len(b) and _is_vi_word_char(b[p]):
+                p += 1
+        elif not b[p].isspace():
+            # On punctuation - skip punctuation
+            while p < len(b) and not _is_vi_word_char(b[p]) and not b[p].isspace():
+                p += 1
+
+        # Skip whitespace to find next word or punctuation
+        while p < len(b) and b[p].isspace():
             p += 1
 
         # Clamp to valid buffer range
@@ -570,19 +603,35 @@ def vi_bow(self, p: int | None = None) -> int:
         """Return the 0-based index of the beginning of the word preceding p
         (vi 'b' semantics).
 
-        p defaults to self.pos; word boundaries use vi rules
-        (alphanumeric + underscore)."""
+        Vi has three character classes: word chars (alnum + _), punctuation
+        (non-word, non-whitespace), and whitespace. 'b' moves to the start
+        of the current or previous word/punctuation sequence."""
         if p is None:
             p = self.pos
         b = self.buffer
+
+        if not b or p <= 0:
+            return 0
+
         p -= 1
-        # Skip non-word characters
-        while p >= 0 and not _is_vi_word_char(b[p]):
-            p -= 1
-        # Skip word characters to find beginning of word
-        while p >= 0 and _is_vi_word_char(b[p]):
+
+        # Skip whitespace going backward
+        while p >= 0 and b[p].isspace():
             p -= 1
-        return p + 1
+
+        if p < 0:
+            return 0
+
+        # Now skip the word or punctuation sequence we landed in
+        if _is_vi_word_char(b[p]):
+            while p > 0 and _is_vi_word_char(b[p - 1]):
+                p -= 1
+        else:
+            # Punctuation sequence
+            while p > 0 and not _is_vi_word_char(b[p - 1]) and not b[p - 1].isspace():
+                p -= 1
+
+        return p
 
     def bol(self, p: int | None = None) -> int:
         """Return the 0-based index of the line break preceding p most
diff --git a/Lib/test/test_pyrepl/test_reader.py b/Lib/test/test_pyrepl/test_reader.py
@@ -979,16 +979,26 @@ def test_repeat_count_with_word_motions(self):
         self.assertEqual(reader2.buffer[reader2.pos], 'a')  # Last 'a' of "beta"
 
     def test_vi_word_boundaries(self):
-        """Test vi word motions match vim behavior for word characters.
+        """Test vi word motions match vim behavior.
 
-        In vi, word characters are alphanumeric + underscore.
+        Vi has three character classes:
+        1. Word chars: alphanumeric + underscore
+        2. Punctuation: non-word, non-whitespace (forms separate words)
+        3. Whitespace: delimiters
         """
         # Test cases: (text, start_key_sequence, expected_pos, description)
         test_cases = [
             # Underscore is part of word in vi, unlike emacs mode
             ("function_name", "0w", 12, "underscore is word char, w clamps to end"),
-            ("hello_world test", "0w", 12, "underscore word, then to next word"),
-            ("get_value(x)", "0w", 10, "underscore word, skip ( to x"),
+            ("hello_world test", "0w", 12, "underscore word to end"),
+
+            # Punctuation is a separate word
+            ("foo.bar", "0w", 3, "w stops at dot (punctuation)"),
+            ("foo.bar", "0ww", 4, "second w goes to bar"),
+            ("foo..bar", "0w", 3, "w stops at first dot"),
+            ("foo..bar", "0ww", 5, "second w skips dot sequence to bar"),
+            ("get_value(x)", "0w", 9, "underscore word stops at ("),
+            ("get_value(x)", "0ww", 10, "second w goes to x"),
 
             # Basic word motion
             ("hello world", "0w", 6, "basic word jump"),
@@ -998,7 +1008,9 @@ def test_vi_word_boundaries(self):
             # End of word (e) - lands ON last char
             ("function_name", "0e", 12, "e lands on last char of underscore word"),
             ("foo bar", "0e", 2, "e lands on last char of foo"),
-            ("one two three", "0ee", 6, "two e's land on end of two"),
+            ("foo.bar", "0e", 2, "e lands on last o of foo"),
+            ("foo.bar", "0ee", 3, "second e lands on dot"),
+            ("foo.bar", "0eee", 6, "third e lands on last r of bar"),
         ]
 
         for text, keys, expected_pos, desc in test_cases: