Skip to content

Commit 05233f1

Browse files
authored
Add fuzzy matching for article grid search filter (#59284)
1 parent fc833c8 commit 05233f1

File tree

3 files changed

+194
-11
lines changed

3 files changed

+194
-11
lines changed

src/landings/components/shared/LandingArticleGridWithFilter.tsx

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { useTranslation } from '@/languages/components/useTranslation'
88
import { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
99
import { LandingType } from '@/landings/context/LandingContext'
1010
import type { QueryParams } from '@/search/components/hooks/useMultiQueryParams'
11+
import { fuzzyMatchScore } from '@/landings/lib/fuzzy-match'
1112

1213
import styles from './LandingArticleGridWithFilter.module.scss'
1314

@@ -151,20 +152,27 @@ export const ArticleGrid = ({
151152
let results = filteredArticlesByLandingType
152153

153154
if (searchQuery) {
154-
results = results.filter((token) => {
155-
return Object.values(token).some((value) => {
156-
if (typeof value === 'string') {
157-
return value.toLowerCase().includes(searchQuery.toLowerCase())
158-
} else if (Array.isArray(value)) {
159-
return value.some((item) => {
160-
if (typeof item === 'string') {
161-
return item.toLowerCase().includes(searchQuery.toLowerCase())
155+
// Calculate match scores for each article
156+
const scoredResults = results
157+
.map((token) => {
158+
let maxScore = -1
159+
for (const value of Object.values(token)) {
160+
if (typeof value === 'string') {
161+
maxScore = Math.max(maxScore, fuzzyMatchScore(value, searchQuery))
162+
} else if (Array.isArray(value)) {
163+
for (const item of value) {
164+
if (typeof item === 'string') {
165+
maxScore = Math.max(maxScore, fuzzyMatchScore(item, searchQuery))
166+
}
162167
}
163-
})
168+
}
164169
}
165-
return false
170+
return { token, score: maxScore }
166171
})
167-
})
172+
.filter(({ score }) => score >= 0)
173+
.sort((a, b) => b.score - a.score)
174+
175+
results = scoredResults.map(({ token }) => token)
168176
}
169177

170178
if (selectedCategory !== ALL_CATEGORIES) {

src/landings/lib/fuzzy-match.ts

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
 * Minimum fraction of the search term's bigrams that must be present in a
 * text for that text to count as a fuzzy match.
 *
 * 60% threshold: empirically chosen to balance precision vs recall.
 * - Lower values (e.g., 40%) match too loosely: "agent" would match "general",
 *   which shares only 2 of its 4 bigrams ("ge", "en" = 50% coverage).
 * - Higher values (e.g., 80%) miss reasonable matches such as singular/plural
 *   variations ("repositories" vs "repository" is 8 of 11 bigrams, ~73%).
 * - 60% captures most typo corrections and word form variations while
 *   filtering noise. It still admits close look-alikes: "agent" vs "urgent"
 *   shares 3 of 4 bigrams (75%) and therefore matches at this threshold.
 */
const BIGRAM_COVERAGE_THRESHOLD = 0.6
6+
7+
// Memoization cache for bigram computation
8+
const bigramCache = new Map<string, Set<string>>()
9+
10+
// Extract character bigrams from a string (e.g., "agent" → ["ag", "ge", "en", "nt"])
11+
const getBigrams = (str: string): Set<string> => {
12+
const key = str.toLowerCase()
13+
if (bigramCache.has(key)) {
14+
return bigramCache.get(key)!
15+
}
16+
17+
const s = key.replace(/\s+/g, '')
18+
const bigrams = new Set<string>()
19+
for (let i = 0; i < s.length - 1; i++) {
20+
bigrams.add(s.slice(i, i + 2))
21+
}
22+
23+
bigramCache.set(key, bigrams)
24+
return bigrams
25+
}
26+
27+
// Coverage: what percentage of search bigrams are found in text
28+
// Better for matching short queries against long text
29+
export const bigramCoverage = (text: string, search: string): number => {
30+
const textBigrams = getBigrams(text)
31+
const searchBigrams = getBigrams(search)
32+
33+
if (searchBigrams.size === 0) return 0
34+
35+
const found = [...searchBigrams].filter((b) => textBigrams.has(b)).length
36+
return found / searchBigrams.size
37+
}
38+
39+
/**
 * Scores how well `searchTerm` matches `text`.
 *
 * - Returns 1 when the lowercased search term is a substring of the lowercased
 *   text. An empty search term is a substring of every text, so it returns 1.
 * - Returns -1 for a non-empty search term that consists only of whitespace.
 *   Without this guard a single space would score 1 against any text that
 *   contains a space, while several spaces would score -1.
 * - Otherwise returns the bigram coverage when it reaches
 *   BIGRAM_COVERAGE_THRESHOLD, and -1 when it does not. A non-substring match
 *   can still score 1 if every search bigram is present in the text.
 *
 * @param text - Text being searched.
 * @param searchTerm - The user's query.
 * @returns 1, a coverage value between the threshold and 1, or -1 for no match.
 */
export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
  const lowerText = text.toLowerCase()
  const lowerSearch = searchTerm.toLowerCase()

  // Whitespace-only queries carry nothing to search for
  if (lowerSearch.length > 0 && lowerSearch.trim().length === 0) return -1

  // Exact substring match gets highest score
  if (lowerText.includes(lowerSearch)) return 1

  // Bigram coverage: what % of search bigrams appear in text
  // This works better than Jaccard when text is much longer than search
  const score = bigramCoverage(text, searchTerm)
  return score >= BIGRAM_COVERAGE_THRESHOLD ? score : -1
}
52+
53+
/**
 * Boolean form of fuzzyMatchScore, intended for filtering: true whenever the
 * score is non-negative.
 *
 * @param text - Text being searched.
 * @param searchTerm - The user's query.
 * @returns Whether `searchTerm` matches `text`.
 */
export const fuzzyMatch = (text: string, searchTerm: string): boolean => {
  const score = fuzzyMatchScore(text, searchTerm)
  return score >= 0
}

src/landings/tests/fuzzy-match.ts

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import { describe, expect, test } from 'vitest'
2+
3+
import { fuzzyMatch, fuzzyMatchScore, bigramCoverage } from '@/landings/lib/fuzzy-match'
4+
5+
// Behavioral tests for the boolean fuzzyMatch helper
describe('fuzzyMatch', () => {
  test('matches exact substrings', () => {
    expect(fuzzyMatch('GitHub Copilot agents', 'agent')).toBe(true)
    expect(fuzzyMatch('GitHub Copilot agents', 'copilot')).toBe(true)
  })

  test('matches singular vs plural via bigrams', () => {
    expect(fuzzyMatch('GitHub Copilot agent', 'agents')).toBe(true)
    expect(fuzzyMatch('Managing your repository', 'repositories')).toBe(true)
  })

  test('is case insensitive', () => {
    expect(fuzzyMatch('GitHub Copilot', 'COPILOT')).toBe(true)
    expect(fuzzyMatch('AGENTS', 'agents')).toBe(true)
  })

  test('returns false for non-matching text', () => {
    expect(fuzzyMatch('GitHub Copilot', 'xyz')).toBe(false)
    expect(fuzzyMatch('Repository settings', 'workflow')).toBe(false)
  })

  test('matches multi-word queries via bigram coverage', () => {
    expect(fuzzyMatch('About GitHub Copilot agent features', 'copilot agent')).toBe(true)
    expect(fuzzyMatch('Using agent in Copilot', 'copilot agent')).toBe(true)
  })

  test('multi-word queries require sufficient bigram overlap', () => {
    expect(fuzzyMatch('xyz abc', 'copilot agents')).toBe(false)
  })

  test('handles edge cases gracefully', () => {
    // Built explicitly so the run of spaces cannot be collapsed by formatting
    const spaces = ' '.repeat(3)

    // Empty strings
    expect(fuzzyMatch('GitHub Copilot', '')).toBe(true) // empty search matches anything
    expect(fuzzyMatch('', 'copilot')).toBe(false)
    expect(fuzzyMatch('', '')).toBe(true)

    // Whitespace-only queries: no bigrams to compare, and a three-space run is
    // not a substring of the single-spaced text
    expect(fuzzyMatch('GitHub Copilot', spaces)).toBe(false)

    // Multiple consecutive spaces in query: not an exact substring, but the
    // bigram comparison ignores whitespace
    expect(fuzzyMatch('GitHub Copilot agent', `copilot${spaces}agent`)).toBe(true)
  })
})
48+
49+
// Numeric-score tests: exact matches, misses, and threshold-passing fuzzy matches
describe('fuzzyMatchScore', () => {
  test('returns 1 for exact substring match', () => {
    const exact = fuzzyMatchScore('GitHub Copilot agents', 'copilot')
    expect(exact).toBe(1)
  })

  test('returns -1 for no match', () => {
    const miss = fuzzyMatchScore('GitHub Copilot', 'xyz')
    expect(miss).toBe(-1)
  })

  test('returns bigram coverage score for fuzzy matches', () => {
    // Word order differs, so this is not a substring hit; the score comes from
    // bigram coverage and must land strictly between 0.6 and 1
    const reordered = fuzzyMatchScore('About Copilot memory features', 'memory copilot')
    expect(reordered).toBeGreaterThan(0.6)
    expect(reordered).toBeLessThan(1)
  })

  test('matches singular vs plural via bigrams', () => {
    // Query "agents" → ag, ge, en, nt, ts (5 bigrams)
    // Text contains "agent" → ag, ge, en, nt (4 of them)
    // 4/5 = 0.8 coverage, above the 0.6 threshold
    const pluralQuery = fuzzyMatchScore('GitHub Copilot agent', 'agents')
    expect(pluralQuery).toBeGreaterThan(0.6)
  })

  test('exact substring matches score higher than fuzzy matches', () => {
    const fuzzyScore = fuzzyMatchScore('About Copilot memory features', 'memory copilot')
    const exactScore = fuzzyMatchScore('copilot agent guide', 'copilot agent')
    expect(exactScore).toBe(1)
    expect(fuzzyScore).toBeLessThan(1)
  })
})
80+
81+
// Tests for the raw coverage ratio (no threshold applied)
describe('bigramCoverage', () => {
  test('returns 1.0 when all search bigrams are found in text', () => {
    const full = bigramCoverage('copilot agent', 'agent')
    expect(full).toBe(1)
  })

  test('returns 0 for completely different texts', () => {
    const none = bigramCoverage('xyz', 'abc')
    expect(none).toBe(0)
  })

  test('returns 0 for empty search string', () => {
    const emptySearch = bigramCoverage('some text', '')
    expect(emptySearch).toBe(0)
  })

  test('handles singular vs plural with high coverage', () => {
    // Search "agents" → ag, ge, en, nt, ts (5 bigrams)
    // Text "agent"   → ag, ge, en, nt     (4 of those)
    // Expected coverage: 4/5 = 0.8
    const expected = 4 / 5
    expect(bigramCoverage('agent', 'agents')).toBeCloseTo(expected, 2)
  })

  test('calculates partial coverage correctly', () => {
    // Text "hello"  → he, el, ll, lo
    // Search "help" → he, el, lp
    // Shared: he, el → 2 of 3 ≈ 0.67
    const expected = 2 / 3
    expect(bigramCoverage('hello', 'help')).toBeCloseTo(expected, 2)
  })

  test('is case insensitive', () => {
    const upperText = bigramCoverage('COPILOT', 'copilot')
    const upperSearch = bigramCoverage('copilot', 'COPILOT')
    expect(upperText).toBe(1)
    expect(upperSearch).toBe(1)
  })

  test('ignores whitespace in both text and search', () => {
    const spacedText = bigramCoverage('co pi lot', 'copilot')
    const spacedSearch = bigramCoverage('copilot', 'co pi lot')
    expect(spacedText).toBe(1)
    expect(spacedSearch).toBe(1)
  })
})

0 commit comments

Comments
 (0)