Skip to content

Commit 2eea7a4

Browse files
Fix: address Copilot feedback - custom scoring, efficiency, and edge cases.
1 parent 3fcaa85 commit 2eea7a4

File tree

1 file changed

+124
-54
lines changed

1 file changed

+124
-54
lines changed

src/dynamic_programming/smith_waterman.rs

Lines changed: 124 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
//! This module contains the Smith-Waterman algorithm implementation for local sequence alignment.
2-
//!
2+
//!
33
//! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining
44
//! similar regions between two sequences (nucleotide or protein sequences). It is particularly
55
//! useful in bioinformatics for identifying optimal local alignments.
66
//!
77
//! # Algorithm Overview
8-
//!
8+
//!
99
//! The algorithm works by:
1010
//! 1. Creating a scoring matrix where each cell represents the maximum alignment score
1111
//! ending at that position
@@ -14,15 +14,15 @@
1414
//! 4. Tracing back from the highest scoring position to reconstruct the alignment
1515
//!
1616
//! # Time Complexity
17-
//!
17+
//!
1818
//! O(m * n) where m and n are the lengths of the two sequences
1919
//!
2020
//! # Space Complexity
21-
//!
21+
//!
2222
//! O(m * n) for the scoring matrix
2323
//!
2424
//! # References
25-
//!
25+
//!
2626
//! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5)
2727
//! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm)
2828
@@ -46,7 +46,7 @@ use std::cmp::max;
4646
///
4747
/// ```
4848
/// use the_algorithms_rust::dynamic_programming::score_function;
49-
///
49+
///
5050
/// let score = score_function('A', 'A', 1, -1, -2);
5151
/// assert_eq!(score, 1); // Match
5252
///
@@ -93,7 +93,7 @@ pub fn score_function(
9393
///
9494
/// ```
9595
/// use the_algorithms_rust::dynamic_programming::smith_waterman;
96-
///
96+
///
9797
/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
9898
/// assert_eq!(score_matrix.len(), 5); // query length + 1
9999
/// assert_eq!(score_matrix[0].len(), 3); // subject length + 1
@@ -107,37 +107,36 @@ pub fn smith_waterman(
107107
) -> Vec<Vec<i32>> {
108108
let query_upper: Vec<char> = query.to_uppercase().chars().collect();
109109
let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
110-
110+
111111
let m = query_upper.len();
112112
let n = subject_upper.len();
113-
113+
114114
// Initialize scoring matrix with zeros
115115
let mut score = vec![vec![0; n + 1]; m + 1];
116-
116+
117117
// Fill the scoring matrix using dynamic programming
118118
for i in 1..=m {
119119
for j in 1..=n {
120120
// Calculate score for match/mismatch
121-
let match_or_mismatch = score[i - 1][j - 1]
122-
+ score_function(
123-
query_upper[i - 1],
124-
subject_upper[j - 1],
125-
match_score,
126-
mismatch_score,
127-
gap_score,
128-
);
129-
121+
let match_or_mismatch = score[i - 1][j - 1] + score_function(
122+
query_upper[i - 1],
123+
subject_upper[j - 1],
124+
match_score,
125+
mismatch_score,
126+
gap_score,
127+
);
128+
130129
// Calculate score for deletion (gap in subject)
131130
let delete = score[i - 1][j] + gap_score;
132-
131+
133132
// Calculate score for insertion (gap in query)
134133
let insert = score[i][j - 1] + gap_score;
135-
134+
136135
// Take maximum of all options, but never go below 0 (local alignment)
137136
score[i][j] = max(0, max(match_or_mismatch, max(delete, insert)));
138137
}
139138
}
140-
139+
141140
score
142141
}
143142

@@ -152,6 +151,9 @@ pub fn smith_waterman(
152151
/// * `score` - The score matrix from the Smith-Waterman algorithm
153152
/// * `query` - Original query sequence used in alignment
154153
/// * `subject` - Original subject sequence used in alignment
154+
/// * `match_score` - Score for matching characters (should match smith_waterman call)
155+
/// * `mismatch_score` - Score for mismatching characters (should match smith_waterman call)
156+
/// * `gap_score` - Penalty for gaps (should match smith_waterman call)
155157
///
156158
/// # Returns
157159
///
@@ -162,19 +164,26 @@ pub fn smith_waterman(
162164
///
163165
/// ```
164166
/// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback};
165-
///
167+
///
166168
/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2);
167-
/// let alignment = traceback(&score_matrix, "ACAC", "CA");
169+
/// let alignment = traceback(&score_matrix, "ACAC", "CA", 1, -1, -2);
168170
/// assert_eq!(alignment, "CA\nCA");
169171
/// ```
170-
pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String {
172+
pub fn traceback(
173+
score: &[Vec<i32>],
174+
query: &str,
175+
subject: &str,
176+
match_score: i32,
177+
mismatch_score: i32,
178+
gap_score: i32,
179+
) -> String {
171180
let query_upper: Vec<char> = query.to_uppercase().chars().collect();
172181
let subject_upper: Vec<char> = subject.to_uppercase().chars().collect();
173-
182+
174183
// Find the cell with maximum score
175-
let mut max_value = i32::MIN;
184+
let mut max_value = 0;
176185
let (mut i_max, mut j_max) = (0, 0);
177-
186+
178187
for (i, row) in score.iter().enumerate() {
179188
for (j, &value) in row.iter().enumerate() {
180189
if value > max_value {
@@ -184,46 +193,57 @@ pub fn traceback(score: &[Vec<i32>], query: &str, subject: &str) -> String {
184193
}
185194
}
186195
}
187-
196+
188197
// If no significant alignment found, return empty string
189-
if i_max == 0 || j_max == 0 {
198+
if max_value <= 0 {
190199
return String::new();
191200
}
192-
201+
193202
// Traceback from the maximum scoring cell
194203
let (mut i, mut j) = (i_max, j_max);
195-
let mut align1 = String::new();
196-
let mut align2 = String::new();
197-
204+
let mut align1 = Vec::new();
205+
let mut align2 = Vec::new();
206+
198207
// Continue tracing back until we hit a cell with score 0
199208
while i > 0 && j > 0 && score[i][j] > 0 {
200209
let current_score = score[i][j];
201-
210+
202211
// Check if we came from diagonal (match/mismatch)
203-
if current_score
204-
== score[i - 1][j - 1]
205-
+ score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2)
206-
{
207-
align1.insert(0, query_upper[i - 1]);
208-
align2.insert(0, subject_upper[j - 1]);
212+
if current_score == score[i - 1][j - 1] + score_function(
213+
query_upper[i - 1],
214+
subject_upper[j - 1],
215+
match_score,
216+
mismatch_score,
217+
gap_score,
218+
) {
219+
align1.push(query_upper[i - 1]);
220+
align2.push(subject_upper[j - 1]);
209221
i -= 1;
210222
j -= 1;
211-
}
223+
}
212224
// Check if we came from above (deletion/gap in subject)
213-
else if current_score == score[i - 1][j] - 2 {
214-
align1.insert(0, query_upper[i - 1]);
215-
align2.insert(0, '-');
225+
else if current_score == score[i - 1][j] + gap_score {
226+
align1.push(query_upper[i - 1]);
227+
align2.push('-');
216228
i -= 1;
217-
}
229+
}
218230
// Otherwise we came from left (insertion/gap in query)
219231
else {
220-
align1.insert(0, '-');
221-
align2.insert(0, subject_upper[j - 1]);
232+
align1.push('-');
233+
align2.push(subject_upper[j - 1]);
222234
j -= 1;
223235
}
224236
}
225-
226-
format!("{align1}\n{align2}")
237+
238+
// Reverse the sequences (we built them backwards)
239+
align1.reverse();
240+
align2.reverse();
241+
242+
format!(
243+
"{}\n{}",
244+
align1.into_iter().collect::<String>(),
245+
align2.into_iter().collect::<String>()
246+
)
227247
}
228248

229249
#[cfg(test)]
@@ -247,8 +267,8 @@ mod tests {
247267
$(
248268
#[test]
249269
fn $name() {
250-
let (score, query, subject, expected) = $test_cases;
251-
assert_eq!(traceback(&score, query, subject), expected);
270+
let (score, query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases;
271+
assert_eq!(traceback(&score, query, subject, match_score, mismatch_score, gap_score), expected);
252272
}
253273
)*
254274
}
@@ -300,6 +320,7 @@ mod tests {
300320
],
301321
"ACAC",
302322
"CA",
323+
1, -1, -2,
303324
"CA\nCA",
304325
),
305326
test_traceback_agt_agt: (
@@ -311,9 +332,23 @@ mod tests {
311332
],
312333
"AGT",
313334
"AGT",
335+
1, -1, -2,
314336
"AGT\nAGT",
315337
),
316-
test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""),
338+
test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", 1, -1, -2, ""),
339+
test_traceback_custom_scoring: (
340+
vec![
341+
vec![0, 0, 0],
342+
vec![0, 0, 2],
343+
vec![0, 2, 0],
344+
vec![0, 0, 4],
345+
vec![0, 2, 0],
346+
],
347+
"ACAC",
348+
"CA",
349+
2, -2, -3,
350+
"CA\nCA",
351+
),
317352
}
318353

319354
#[test]
@@ -339,8 +374,43 @@ mod tests {
339374
let result1 = smith_waterman("acac", "CA", 1, -1, -2);
340375
let result2 = smith_waterman("ACAC", "ca", 1, -1, -2);
341376
let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2);
342-
377+
343378
assert_eq!(result1, result2);
344379
assert_eq!(result2, result3);
345380
}
381+
382+
#[test]
383+
fn test_custom_scoring_end_to_end() {
384+
// Test with custom scoring parameters (match=2, mismatch=-2, gap=-3)
385+
let query = "ACGT";
386+
let subject = "ACGT";
387+
let match_score = 2;
388+
let mismatch_score = -2;
389+
let gap_score = -3;
390+
391+
// Generate score matrix with custom parameters
392+
let score_matrix = smith_waterman(query, subject, match_score, mismatch_score, gap_score);
393+
394+
// Traceback using the same custom parameters
395+
let alignment = traceback(&score_matrix, query, subject, match_score, mismatch_score, gap_score);
396+
397+
// With perfect match and match_score=2, we expect alignment "ACGT\nACGT"
398+
assert_eq!(alignment, "ACGT\nACGT");
399+
400+
// Verify the score is correct (4 matches × 2 = 8)
401+
assert_eq!(score_matrix[4][4], 8);
402+
}
403+
404+
#[test]
405+
fn test_alignment_at_boundary() {
406+
// Test case where optimal alignment might be at row/column 0
407+
let query = "A";
408+
let subject = "A";
409+
let score_matrix = smith_waterman(query, subject, 1, -1, -2);
410+
let alignment = traceback(&score_matrix, query, subject, 1, -1, -2);
411+
412+
// Should find the alignment even though it's near the boundary
413+
assert_eq!(alignment, "A\nA");
414+
assert_eq!(score_matrix[1][1], 1);
415+
}
346416
}

0 commit comments

Comments
 (0)