Add fuzzy matching for article grid search filter (#59284)

heiskr · web-flow · commit 05233f1b3642 · 2026-01-23T01:31:59.000Z
diff --git a/src/landings/components/shared/LandingArticleGridWithFilter.tsx b/src/landings/components/shared/LandingArticleGridWithFilter.tsx
@@ -8,6 +8,7 @@ import { useTranslation } from '@/languages/components/useTranslation'
 import { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
 import { LandingType } from '@/landings/context/LandingContext'
 import type { QueryParams } from '@/search/components/hooks/useMultiQueryParams'
+import { fuzzyMatchScore } from '@/landings/lib/fuzzy-match'
 
 import styles from './LandingArticleGridWithFilter.module.scss'
 
@@ -151,20 +152,27 @@ export const ArticleGrid = ({
     let results = filteredArticlesByLandingType
 
     if (searchQuery) {
-      results = results.filter((token) => {
-        return Object.values(token).some((value) => {
-          if (typeof value === 'string') {
-            return value.toLowerCase().includes(searchQuery.toLowerCase())
-          } else if (Array.isArray(value)) {
-            return value.some((item) => {
-              if (typeof item === 'string') {
-                return item.toLowerCase().includes(searchQuery.toLowerCase())
+      // Calculate match scores for each article
+      const scoredResults = results
+        .map((token) => {
+          let maxScore = -1
+          for (const value of Object.values(token)) {
+            if (typeof value === 'string') {
+              maxScore = Math.max(maxScore, fuzzyMatchScore(value, searchQuery))
+            } else if (Array.isArray(value)) {
+              for (const item of value) {
+                if (typeof item === 'string') {
+                  maxScore = Math.max(maxScore, fuzzyMatchScore(item, searchQuery))
+                }
               }
-            })
+            }
           }
-          return false
+          return { token, score: maxScore }
         })
-      })
+        .filter(({ score }) => score >= 0)
+        .sort((a, b) => b.score - a.score)
+
+      results = scoredResults.map(({ token }) => token)
     }
 
     if (selectedCategory !== ALL_CATEGORIES) {
diff --git a/src/landings/lib/fuzzy-match.ts b/src/landings/lib/fuzzy-match.ts
@@ -0,0 +1,56 @@
+// 60% threshold: Empirically chosen to balance precision vs recall.
+// Lower values (e.g., 40%) match too loosely (e.g., "agent" would match "pages" at 50% coverage).
+// Higher values (e.g., 80%) miss reasonable matches like singular/plural variations.
+// 60% captures most typo corrections and word form variations while filtering noise.
+const BIGRAM_COVERAGE_THRESHOLD = 0.6
+
+// Memoization cache for bigram computation, keyed by the lowercased input (unbounded; entries are never evicted)
+const bigramCache = new Map<string, Set<string>>()
+
+// Extract the set of character bigrams from a string, lowercased and with whitespace removed (e.g., "agent" → {"ag", "ge", "en", "nt"})
+const getBigrams = (str: string): Set<string> => {
+  const key = str.toLowerCase()
+  if (bigramCache.has(key)) {
+    return bigramCache.get(key)!
+  }
+
+  const s = key.replace(/\s+/g, '')
+  const bigrams = new Set<string>()
+  for (let i = 0; i < s.length - 1; i++) {
+    bigrams.add(s.slice(i, i + 2))
+  }
+
+  bigramCache.set(key, bigrams)
+  return bigrams
+}
+
+// Coverage: what fraction (0-1) of the search's bigrams are found in text; 0 if search has no bigrams
+// Better for matching short queries against long text
+export const bigramCoverage = (text: string, search: string): number => {
+  const textBigrams = getBigrams(text)
+  const searchBigrams = getBigrams(search)
+
+  if (searchBigrams.size === 0) return 0
+
+  const found = [...searchBigrams].filter((b) => textBigrams.has(b)).length
+  return found / searchBigrams.size
+}
+
+// Returns a match score: 1 for a case-insensitive substring match, otherwise the bigram coverage if it meets the threshold (0.6-1), or -1 for no match
+export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
+  const lowerText = text.toLowerCase()
+  const lowerSearch = searchTerm.toLowerCase()
+
+  // Exact substring match gets highest score
+  if (lowerText.includes(lowerSearch)) return 1
+
+  // Bigram coverage: what % of search bigrams appear in text
+  // This works better than Jaccard when text is much longer than search
+  const score = bigramCoverage(text, searchTerm)
+  return score >= BIGRAM_COVERAGE_THRESHOLD ? score : -1
+}
+
+// Check if searchTerm matches text (for filtering)
+export const fuzzyMatch = (text: string, searchTerm: string): boolean => {
+  return fuzzyMatchScore(text, searchTerm) >= 0
+}
diff --git a/src/landings/tests/fuzzy-match.ts b/src/landings/tests/fuzzy-match.ts
@@ -0,0 +1,119 @@
+import { describe, expect, test } from 'vitest'
+
+import { fuzzyMatch, fuzzyMatchScore, bigramCoverage } from '@/landings/lib/fuzzy-match'
+
+describe('fuzzyMatch', () => {
+  test('matches exact substrings', () => {
+    expect(fuzzyMatch('GitHub Copilot agents', 'agent')).toBe(true)
+    expect(fuzzyMatch('GitHub Copilot agents', 'copilot')).toBe(true)
+  })
+
+  test('matches singular vs plural via bigrams', () => {
+    expect(fuzzyMatch('GitHub Copilot agent', 'agents')).toBe(true)
+    expect(fuzzyMatch('Managing your repository', 'repositories')).toBe(true)
+  })
+
+  test('is case insensitive', () => {
+    expect(fuzzyMatch('GitHub Copilot', 'COPILOT')).toBe(true)
+    expect(fuzzyMatch('AGENTS', 'agents')).toBe(true)
+  })
+
+  test('returns false for non-matching text', () => {
+    expect(fuzzyMatch('GitHub Copilot', 'xyz')).toBe(false)
+    expect(fuzzyMatch('Repository settings', 'workflow')).toBe(false)
+  })
+
+  test('matches multi-word queries via bigram coverage', () => {
+    expect(fuzzyMatch('About GitHub Copilot agent features', 'copilot agent')).toBe(true)
+    expect(fuzzyMatch('Using agent in Copilot', 'copilot agent')).toBe(true)
+  })
+
+  test('multi-word queries require sufficient bigram overlap', () => {
+    expect(fuzzyMatch('xyz abc', 'copilot agents')).toBe(false)
+  })
+
+  test('handles edge cases gracefully', () => {
+    // Empty strings
+    expect(fuzzyMatch('GitHub Copilot', '')).toBe(true) // empty search matches anything
+    expect(fuzzyMatch('', 'copilot')).toBe(false)
+    expect(fuzzyMatch('', '')).toBe(true)
+
+    // Whitespace-only queries
+    expect(fuzzyMatch('GitHub Copilot', '   ')).toBe(false)
+
+    // Multiple consecutive spaces in query
+    expect(fuzzyMatch('GitHub Copilot agent', 'copilot   agent')).toBe(true)
+  })
+})
+
+describe('fuzzyMatchScore', () => {
+  test('returns 1 for exact substring match', () => {
+    expect(fuzzyMatchScore('GitHub Copilot agents', 'copilot')).toBe(1)
+  })
+
+  test('returns -1 for no match', () => {
+    expect(fuzzyMatchScore('GitHub Copilot', 'xyz')).toBe(-1)
+  })
+
+  test('returns bigram coverage score for fuzzy matches', () => {
+    // Bigram coverage should give a score between 0.6 and 1: 11 of the 12 search bigrams are found (all but "yc"), so ≈ 0.92
+    const score = fuzzyMatchScore('About Copilot memory features', 'memory copilot')
+    expect(score).toBeGreaterThan(0.6)
+    expect(score).toBeLessThan(1)
+  })
+
+  test('matches singular vs plural via bigrams', () => {
+    // "agents" bigrams: ag, ge, en, nt, ts (5)
+    // "agent" in text has: ag, ge, en, nt (4)
+    // Coverage: 4/5 = 0.8, which is > 0.6 threshold
+    const score = fuzzyMatchScore('GitHub Copilot agent', 'agents')
+    expect(score).toBeGreaterThan(0.6)
+  })
+
+  test('exact substring matches score higher than fuzzy matches', () => {
+    const exactScore = fuzzyMatchScore('copilot agent guide', 'copilot agent')
+    const fuzzyScore = fuzzyMatchScore('About Copilot memory features', 'memory copilot')
+    expect(exactScore).toBe(1)
+    expect(fuzzyScore).toBeLessThan(1)
+  })
+})
+
+describe('bigramCoverage', () => {
+  test('returns 1.0 when all search bigrams are found in text', () => {
+    expect(bigramCoverage('copilot agent', 'agent')).toBe(1)
+  })
+
+  test('returns 0 for completely different texts', () => {
+    expect(bigramCoverage('xyz', 'abc')).toBe(0)
+  })
+
+  test('returns 0 for empty search string', () => {
+    expect(bigramCoverage('some text', '')).toBe(0)
+  })
+
+  test('handles singular vs plural with high coverage', () => {
+    // "agents" bigrams: ag, ge, en, nt, ts (5)
+    // "agent" in text has: ag, ge, en, nt (4)
+    // Coverage: 4/5 = 0.8
+    const coverage = bigramCoverage('agent', 'agents')
+    expect(coverage).toBeCloseTo(4 / 5, 2)
+  })
+
+  test('calculates partial coverage correctly', () => {
+    // Text "hello" has bigrams: he, el, ll, lo
+    // Search "help" has bigrams: he, el, lp
+    // Found: he, el (2 of 3) = 0.67
+    const coverage = bigramCoverage('hello', 'help')
+    expect(coverage).toBeCloseTo(2 / 3, 2)
+  })
+
+  test('is case insensitive', () => {
+    expect(bigramCoverage('COPILOT', 'copilot')).toBe(1)
+    expect(bigramCoverage('copilot', 'COPILOT')).toBe(1)
+  })
+
+  test('ignores whitespace in both text and search', () => {
+    expect(bigramCoverage('co pi lot', 'copilot')).toBe(1)
+    expect(bigramCoverage('copilot', 'co pi lot')).toBe(1)
+  })
+})