Fix: address Copilot feedback - custom scoring, efficiency, and edge cases.

AliAlimohammadi · web-flow · commit 2eea7a411e85 · 2025-12-05T15:18:00.000-08:00
diff --git a/src/dynamic_programming/smith_waterman.rs b/src/dynamic_programming/smith_waterman.rs
@@ -1,11 +1,11 @@
 //! This module contains the Smith-Waterman algorithm implementation for local sequence alignment.
-//!
+//! 
 //! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining
 //! similar regions between two sequences (nucleotide or protein sequences). It is particularly
 //! useful in bioinformatics for identifying optimal local alignments.
 //!
 //! # Algorithm Overview
-//!
+//! 
 //! The algorithm works by:
 //! 1. Creating a scoring matrix where each cell represents the maximum alignment score
 //!    ending at that position
@@ -14,15 +14,15 @@
 //! 4. Tracing back from the highest scoring position to reconstruct the alignment
 //!
 //! # Time Complexity
-//!
+//! 
 //! O(m * n) where m and n are the lengths of the two sequences
 //!
 //! # Space Complexity
-//!
+//! 
 //! O(m * n) for the scoring matrix
 //!
 //! # References
-//!
+//! 
 //! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5)
 //! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm)
 
@@ -46,7 +46,7 @@ use std::cmp::max;
 ///
 /// ```
 /// use the_algorithms_rust::dynamic_programming::score_function;
-///
+/// 
 /// let score = score_function('A', 'A', 1, -1, -2);
 /// assert_eq!(score, 1); // Match
 ///
@@ -93,7 +93,7 @@ pub fn score_function(
 ///
 /// ```
 /// use the_algorithms_rust::dynamic_programming::smith_waterman;
-///
+/// 
 /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
 /// assert_eq!(score_matrix.len(), 5); // query length + 1
 /// assert_eq!(score_matrix[0].len(), 3); // subject length + 1
@@ -107,37 +107,36 @@ pub fn smith_waterman(
 ) -> Vec<Vec<i32>> {
     let query_upper: Vec<char> = query.to_uppercase().chars().collect();
     let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
-
+    
     let m = query_upper.len();
     let n = subject_upper.len();
-
+    
     // Initialize scoring matrix with zeros
     let mut score = vec![vec![0; n + 1]; m + 1];
-
+    
     // Fill the scoring matrix using dynamic programming
     for i in 1..=m {
         for j in 1..=n {
             // Calculate score for match/mismatch
-            let match_or_mismatch = score[i - 1][j - 1]
-                + score_function(
-                    query_upper[i - 1],
-                    subject_upper[j - 1],
-                    match_score,
-                    mismatch_score,
-                    gap_score,
-                );
-
+            let match_or_mismatch = score[i - 1][j - 1] + score_function(
+                query_upper[i - 1],
+                subject_upper[j - 1],
+                match_score,
+                mismatch_score,
+                gap_score,
+            );
+            
             // Calculate score for deletion (gap in subject)
             let delete = score[i - 1][j] + gap_score;
-
+            
             // Calculate score for insertion (gap in query)
             let insert = score[i][j - 1] + gap_score;
-
+            
             // Take maximum of all options, but never go below 0 (local alignment)
             score[i][j] = max(0, max(match_or_mismatch, max(delete, insert)));
         }
     }
-
+    
     score
 }
 
@@ -152,6 +151,9 @@ pub fn smith_waterman(
 /// * `score` - The score matrix from the Smith-Waterman algorithm
 /// * `query` - Original query sequence used in alignment
 /// * `subject` - Original subject sequence used in alignment
+/// * `match_score` - Score for matching characters (should match smith_waterman call)
+/// * `mismatch_score` - Score for mismatching characters (should match smith_waterman call)
+/// * `gap_score` - Penalty for gaps (should match smith_waterman call)
 ///
 /// # Returns
 ///
@@ -162,19 +164,26 @@ pub fn smith_waterman(
 ///
 /// ```
 /// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback};
-///
+/// 
 /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
-/// let alignment = traceback(&score_matrix, "ACAC", "CA");
+/// let alignment = traceback(&score_matrix, "ACAC", "CA", 1, -1, -2);
 /// assert_eq!(alignment, "CA\nCA");
 /// ```
-pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String {
+pub fn traceback(
+    score: &[Vec<i32>],
+    query: &str,
+    subject: &str,
+    match_score: i32,
+    mismatch_score: i32,
+    gap_score: i32,
+) -> String {
     let query_upper: Vec<char> = query.to_uppercase().chars().collect();
     let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
-
+    
     // Find the cell with maximum score
-    let mut max_value = i32::MIN;
+    let mut max_value = 0;
     let (mut i_max, mut j_max) = (0, 0);
-
+    
     for (i, row) in score.iter().enumerate() {
         for (j, &value) in row.iter().enumerate() {
             if value > max_value {
@@ -184,46 +193,57 @@ pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String {
             }
         }
     }
-
+    
     // If no significant alignment found, return empty string
-    if i_max == 0 || j_max == 0 {
+    if max_value <= 0 {
         return String::new();
     }
-
+    
     // Traceback from the maximum scoring cell
     let (mut i, mut j) = (i_max, j_max);
-    let mut align1 = String::new();
-    let mut align2 = String::new();
-
+    let mut align1 = Vec::new();
+    let mut align2 = Vec::new();
+    
     // Continue tracing back until we hit a cell with score 0
     while i > 0 && j > 0 && score[i][j] > 0 {
         let current_score = score[i][j];
-
+        
         // Check if we came from diagonal (match/mismatch)
-        if current_score
-            == score[i - 1][j - 1]
-                + score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2)
-        {
-            align1.insert(0, query_upper[i - 1]);
-            align2.insert(0, subject_upper[j - 1]);
+        if current_score == score[i - 1][j - 1] + score_function(
+            query_upper[i - 1],
+            subject_upper[j - 1],
+            match_score,
+            mismatch_score,
+            gap_score,
+        ) {
+            align1.push(query_upper[i - 1]);
+            align2.push(subject_upper[j - 1]);
             i -= 1;
             j -= 1;
-        }
+        } 
         // Check if we came from above (deletion/gap in subject)
-        else if current_score == score[i - 1][j] - 2 {
-            align1.insert(0, query_upper[i - 1]);
-            align2.insert(0, '-');
+        else if current_score == score[i - 1][j] + gap_score {
+            align1.push(query_upper[i - 1]);
+            align2.push('-');
             i -= 1;
-        }
+        } 
         // Otherwise we came from left (insertion/gap in query)
         else {
-            align1.insert(0, '-');
-            align2.insert(0, subject_upper[j - 1]);
+            align1.push('-');
+            align2.push(subject_upper[j - 1]);
             j -= 1;
         }
     }
-
-    format!("{align1}\n{align2}")
+    
+    // Reverse the sequences (we built them backwards)
+    align1.reverse();
+    align2.reverse();
+    
+    format!(
+        "{}\n{}",
+        align1.into_iter().collect::<String>(),
+        align2.into_iter().collect::<String>()
+    )
 }
 
 #[cfg(test)]
@@ -247,8 +267,8 @@ mod tests {
             $(
                 #[test]
                 fn $name() {
-                    let (score, query, subject, expected) = $test_cases;
-                    assert_eq!(traceback(&score, query, subject), expected);
+                    let (score, query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases;
+                    assert_eq!(traceback(&score, query, subject, match_score, mismatch_score, gap_score), expected);
                 }
             )*
         }
@@ -300,6 +320,7 @@ mod tests {
             ],
             "ACAC",
             "CA",
+            1, -1, -2,
             "CA\nCA",
         ),
         test_traceback_agt_agt: (
@@ -311,9 +332,23 @@ mod tests {
             ],
             "AGT",
             "AGT",
+            1, -1, -2,
             "AGT\nAGT",
         ),
-        test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""),
+        test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", 1, -1, -2, ""),
+        test_traceback_custom_scoring: (
+            vec![
+                vec![0, 0, 0],
+                vec![0, 0, 2],
+                vec![0, 2, 0],
+                vec![0, 0, 4],
+                vec![0, 2, 0],
+            ],
+            "ACAC",
+            "CA",
+            2, -2, -3,
+            "CA\nCA",
+        ),
     }
 
     #[test]
@@ -339,8 +374,43 @@ mod tests {
         let result1 = smith_waterman("acac", "CA", 1, -1, -2);
         let result2 = smith_waterman("ACAC", "ca", 1, -1, -2);
         let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2);
-
+        
         assert_eq!(result1, result2);
         assert_eq!(result2, result3);
     }
+
+    #[test]
+    fn test_custom_scoring_end_to_end() {
+        // Test with custom scoring parameters (match=2, mismatch=-2, gap=-3)
+        let query = "ACGT";
+        let subject = "ACGT";
+        let match_score = 2;
+        let mismatch_score = -2;
+        let gap_score = -3;
+        
+        // Generate score matrix with custom parameters
+        let score_matrix = smith_waterman(query, subject, match_score, mismatch_score, gap_score);
+        
+        // Traceback using the same custom parameters
+        let alignment = traceback(&score_matrix, query, subject, match_score, mismatch_score, gap_score);
+        
+        // With perfect match and match_score=2, we expect alignment "ACGT\nACGT"
+        assert_eq!(alignment, "ACGT\nACGT");
+        
+        // Verify the score is correct (4 matches × 2 = 8)
+        assert_eq!(score_matrix[4][4], 8);
+    }
+
+    #[test]
+    fn test_alignment_at_boundary() {
+        // Test case where optimal alignment might be at row/column 0
+        let query = "A";
+        let subject = "A";
+        let score_matrix = smith_waterman(query, subject, 1, -1, -2);
+        let alignment = traceback(&score_matrix, query, subject, 1, -1, -2);
+        
+        // Should find the alignment even though it's near the boundary
+        assert_eq!(alignment, "A\nA");
+        assert_eq!(score_matrix[1][1], 1);
+    }
 }