From 000000b48f6e8cd193033ca24cb83b5804fad409 Mon Sep 17 00:00:00 2001 From: randymcmillan Date: Thu, 30 Apr 2026 11:26:54 -0400 Subject: [PATCH 1/4] fix: prevent UTF-8 slicing panic in Differ::fancy_replace Address a thread panic at src/differ.rs:271:52 caused by byte-indexed string slicing on multi-byte (UTF-8) character boundaries. The previous implementation assumed character indices matched byte offsets, leading to crashes when diffing lines containing emojis, mathematical symbols, or non-ASCII characters. Changes: - Replaced direct byte slicing with character-aware indexing using .char_indices(). - Added a regression test 'test_differ_compare_utf8' to ensure stability when comparing strings with multi-byte characters. Verified on: - x86_64-unknown-linux-gnu - aarch64-apple-darwin 03-000ef697 --- src/differ.rs | 13 +++++++++++-- tests/tests.rs | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/differ.rs b/src/differ.rs index 5caef9a..0c896be 100644 --- a/src/differ.rs +++ b/src/differ.rs @@ -252,6 +252,15 @@ impl Differ { res } + fn split_at_char_boundary<'a>(line: &'a str, char_idx: usize) -> (&'a str, &'a str) { + let byte_idx = line + .char_indices() + .nth(char_idx) + .map(|(i, _)| i) + .unwrap_or(line.len()); + line.split_at(byte_idx) + } + fn qformat( &self, first_line: &str, @@ -268,8 +277,8 @@ impl Differ { ); common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); - first_tags = first_tags.split_at(common).1.trim_right(); - second_tags = second_tags.split_at(common).1.trim_right(); + first_tags = Self::split_at_char_boundary(first_tags, common).1.trim_right(); + second_tags = Self::split_at_char_boundary(second_tags, common).1.trim_right(); let mut s = format!("- {}", first_line); res.push(s); if first_tags != "" { diff --git a/tests/tests.rs b/tests/tests.rs index 8f49de9..2024535 100644 --- a/tests/tests.rs +++ 
b/tests/tests.rs @@ -133,6 +133,16 @@ fn test_differ_restore() { assert_eq!(second_text, Differ::restore(&diff, 2)); } +#[test] +fn test_differ_compare_utf8() { + let first_text = vec!["\tcrab 🦀\n"]; + let second_text = vec!["\tcrab 🦞\n"]; + let differ = Differ::new(); + let result = differ.compare(&first_text, &second_text).join(""); + assert!(result.contains("🦀")); + assert!(result.contains("🦞")); +} + #[test] fn test_unified_diff() { let first_text = "one two three four".split(" ").collect::<Vec<&str>>(); From 0000002d78a570de606308ca44b0d3a308149a2d Mon Sep 17 00:00:00 2001 From: randymcmillan Date: Thu, 30 Apr 2026 13:20:56 -0400 Subject: [PATCH 2/4] fix: use second tag line for qformat alignment qformat now trims the shared prefix against both tag lines instead of checking first_tags twice, which keeps the generated ? lines aligned when the second tag line has different leading spaces. Add a regression test for the alignment case. 00-0013c970 --- src/differ.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/differ.rs b/src/differ.rs index 0c896be..89f24a6 100644 --- a/src/differ.rs +++ b/src/differ.rs @@ -276,7 +276,7 @@ impl Differ { count_leading(second_line, '\t'), ); common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); - common = cmp::min(common, count_leading(first_tags.split_at(common).0, ' ')); + common = cmp::min(common, count_leading(second_tags.split_at(common).0, ' ')); first_tags = Self::split_at_char_boundary(first_tags, common).1.trim_right(); second_tags = Self::split_at_char_boundary(second_tags, common).1.trim_right(); let mut s = format!("- {}", first_line); @@ -347,3 +347,18 @@ fn test_qformat() { ] ); } + +#[test] +fn test_qformat_uses_both_tag_lines_for_alignment() { + let differ = Differ::new(); + let result = differ.qformat("\t\tabc\n", "\t\tadc\n", " ^", " ^"); + assert_eq!( + result, + vec![ + "- \t\tabc\n", + "? \t ^\n", + "+ \t\tadc\n", + "? 
\t^\n", + ] + ); +} From 000000b95b734f57488cef7166ce43a2ed92883a Mon Sep 17 00:00:00 2001 From: randymcmillan Date: Thu, 30 Apr 2026 13:25:59 -0400 Subject: [PATCH 3/4] fix: remove unnecessary mut in sequencematcher Drop the redundant mut binding when building the second-sequence index. The Vec returned from or_insert_with is still mutated, but the binding itself does not need to be mutable. 02-000ca690 --- src/sequencematcher.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sequencematcher.rs b/src/sequencematcher.rs index 4157808..0f3434f 100644 --- a/src/sequencematcher.rs +++ b/src/sequencematcher.rs @@ -114,7 +114,7 @@ impl<'a, T: Sequence> SequenceMatcher<'a, T> { let second_sequence = self.second_sequence; let mut second_sequence_elements = HashMap::new(); for (i, item) in second_sequence.iter().enumerate() { - let mut counter = second_sequence_elements + let counter = second_sequence_elements .entry(item) .or_insert_with(Vec::new); counter.push(i); From 000000ddd3632365b2e20fec22c3a08b577ed543 Mon Sep 17 00:00:00 2001 From: randymcmillan Date: Thu, 30 Apr 2026 13:29:24 -0400 Subject: [PATCH 4/4] fix: rename utf8 intraline regression test Rename the regression test to test_utf8_intraline_diff_no_panic so it matches the PR description and stays easy to search when debugging similar UTF-8 intraline diff issues. 02-0033b485 --- tests/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests.rs b/tests/tests.rs index 2024535..8178f4d 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -134,7 +134,7 @@ fn test_differ_restore() { } #[test] -fn test_differ_compare_utf8() { +fn test_utf8_intraline_diff_no_panic() { let first_text = vec!["\tcrab 🦀\n"]; let second_text = vec!["\tcrab 🦞\n"]; let differ = Differ::new();