Skip to content

Commit f13293b

Browse files
ndbroadbent and claude committed
Add media, game, and theatre entity types for combined categories
- Add "media" type combining movie and tv_show for ambiguous content
- Add "game" type combining video_game and physical_game
- Rename "play" to "theatre" for musicals/shows/plays (clearer naming)
- Update classifier prompt with guidance on when to use combined types
- Add Wikidata QIDs for new entity types
- Update Google search query building for new types
- Update E2E test to accept media type for movies
- Re-record flaky integration test fixtures
- Update biome schema to 2.3.10
- Add scraper/index.ts to knip entry points
- Anonymize test fixture data

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 3230860 commit f13293b

File tree

155 files changed

+4388
-2308
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

155 files changed

+4388
-2308
lines changed

knip.json

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,10 @@
11
{
2-
"entry": ["src/cli/steps/index.ts", "src/test-support/index.ts"],
2+
"entry": [
3+
"src/cli/steps/index.ts",
4+
"src/test-support/index.ts",
5+
"src/search/index.ts",
6+
"src/scraper/index.ts"
7+
],
38
"project": ["src/**/*.ts"],
49
"ignore": [
510
"src/**/*.test.ts",

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,4 +112,4 @@
112112
"engines": {
113113
"node": ">=18.0.0"
114114
}
115-
}
115+
}

scripts/check-query-embeddings.ts

Lines changed: 52 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -6,68 +6,77 @@
66
* Usage: bun scripts/check-query-embeddings.ts
77
*/
88

9-
import { readFileSync } from 'node:fs'
10-
import { join } from 'node:path'
11-
import { gunzipSync } from 'node:zlib'
9+
import { readFileSync } from "node:fs";
10+
import { join } from "node:path";
11+
import { gunzipSync } from "node:zlib";
1212

13-
import activityTypes from '../src/extraction/embeddings/queries/activity-types.json'
14-
import agreementQueries from '../src/extraction/embeddings/queries/agreement.json'
15-
import suggestionQueries from '../src/extraction/embeddings/queries/suggestions.json'
13+
import activityTypes from "../src/extraction/embeddings/queries/activity-types.json";
14+
import agreementQueries from "../src/extraction/embeddings/queries/agreement.json";
15+
import suggestionQueries from "../src/extraction/embeddings/queries/suggestions.json";
1616

1717
// Load compressed embeddings
18-
const embeddingsPath = join(import.meta.dir, '../src/extraction/embeddings/queries/query-embeddings.json.gz')
19-
const compressed = readFileSync(embeddingsPath)
20-
const jsonData = gunzipSync(compressed).toString()
21-
const queryEmbeddings = JSON.parse(jsonData) as { queries: Array<{ text: string }> }
18+
const embeddingsPath = join(
19+
import.meta.dir,
20+
"../src/extraction/embeddings/queries/query-embeddings.json.gz",
21+
);
22+
const compressed = readFileSync(embeddingsPath);
23+
const jsonData = gunzipSync(compressed).toString();
24+
const queryEmbeddings = JSON.parse(jsonData) as {
25+
queries: Array<{ text: string }>;
26+
};
2227

2328
// Get all queries from source files
24-
const allActivityTypes = Object.values(activityTypes).flat()
25-
const allQueries = new Set([...suggestionQueries, ...agreementQueries, ...allActivityTypes])
29+
const allActivityTypes = Object.values(activityTypes).flat();
30+
const allQueries = new Set([
31+
...suggestionQueries,
32+
...agreementQueries,
33+
...allActivityTypes,
34+
]);
2635

2736
// Get queries that have embeddings
28-
const embeddedQueries = new Set(queryEmbeddings.queries.map((q) => q.text))
37+
const embeddedQueries = new Set(queryEmbeddings.queries.map((q) => q.text));
2938

3039
// Find missing
31-
const missing: string[] = []
40+
const missing: string[] = [];
3241
for (const query of allQueries) {
33-
if (!embeddedQueries.has(query)) {
34-
missing.push(query)
35-
}
42+
if (!embeddedQueries.has(query)) {
43+
missing.push(query);
44+
}
3645
}
3746

3847
// Find stale (in embeddings but not in source)
39-
const stale: string[] = []
48+
const stale: string[] = [];
4049
for (const query of embeddedQueries) {
41-
if (!allQueries.has(query)) {
42-
stale.push(query)
43-
}
50+
if (!allQueries.has(query)) {
51+
stale.push(query);
52+
}
4453
}
4554

4655
if (missing.length > 0 || stale.length > 0) {
47-
console.error('❌ Query embeddings are out of sync!\n')
56+
console.error("❌ Query embeddings are out of sync!\n");
4857

49-
if (missing.length > 0) {
50-
console.error(`Missing embeddings for ${missing.length} queries:`)
51-
for (const q of missing.slice(0, 10)) {
52-
console.error(` - "${q}"`)
53-
}
54-
if (missing.length > 10) {
55-
console.error(` ... and ${missing.length - 10} more`)
56-
}
57-
}
58+
if (missing.length > 0) {
59+
console.error(`Missing embeddings for ${missing.length} queries:`);
60+
for (const q of missing.slice(0, 10)) {
61+
console.error(` - "${q}"`);
62+
}
63+
if (missing.length > 10) {
64+
console.error(` ... and ${missing.length - 10} more`);
65+
}
66+
}
5867

59-
if (stale.length > 0) {
60-
console.error(`\nStale embeddings for ${stale.length} removed queries:`)
61-
for (const q of stale.slice(0, 10)) {
62-
console.error(` - "${q}"`)
63-
}
64-
if (stale.length > 10) {
65-
console.error(` ... and ${stale.length - 10} more`)
66-
}
67-
}
68+
if (stale.length > 0) {
69+
console.error(`\nStale embeddings for ${stale.length} removed queries:`);
70+
for (const q of stale.slice(0, 10)) {
71+
console.error(` - "${q}"`);
72+
}
73+
if (stale.length > 10) {
74+
console.error(` ... and ${stale.length - 10} more`);
75+
}
76+
}
6877

69-
console.error('\nRun: bun scripts/generate-query-embeddings.ts')
70-
process.exit(1)
78+
console.error("\nRun: bun scripts/generate-query-embeddings.ts");
79+
process.exit(1);
7180
}
7281

73-
console.log(`✓ All ${allQueries.size} queries have embeddings`)
82+
console.log(`✓ All ${allQueries.size} queries have embeddings`);

scripts/generate-query-embeddings.ts

Lines changed: 93 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -6,103 +6,112 @@
66
* Usage: bun scripts/generate-query-embeddings.ts
77
*/
88

9-
import { writeFileSync } from 'node:fs'
10-
import { join } from 'node:path'
11-
import { gzipSync } from 'node:zlib'
9+
import { writeFileSync } from "node:fs";
10+
import { join } from "node:path";
11+
import { gzipSync } from "node:zlib";
1212

1313
// Load queries
14-
import activityTypes from '../src/extraction/embeddings/queries/activity-types.json'
15-
import agreementQueries from '../src/extraction/embeddings/queries/agreement.json'
16-
import suggestionQueries from '../src/extraction/embeddings/queries/suggestions.json'
14+
import activityTypes from "../src/extraction/embeddings/queries/activity-types.json";
15+
import agreementQueries from "../src/extraction/embeddings/queries/agreement.json";
16+
import suggestionQueries from "../src/extraction/embeddings/queries/suggestions.json";
1717

18-
const OPENAI_API_KEY = process.env.OPENAI_API_KEY
18+
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
1919
if (!OPENAI_API_KEY) {
20-
console.error('Error: OPENAI_API_KEY environment variable required')
21-
console.error('Set it in .env or export it')
22-
process.exit(1)
20+
console.error("Error: OPENAI_API_KEY environment variable required");
21+
console.error("Set it in .env or export it");
22+
process.exit(1);
2323
}
2424

25-
const MODEL = 'text-embedding-3-large'
25+
const MODEL = "text-embedding-3-large";
2626

2727
interface OpenAIEmbeddingResponse {
28-
data: Array<{ embedding: number[]; index: number }>
29-
model: string
30-
usage: { prompt_tokens: number; total_tokens: number }
28+
data: Array<{ embedding: number[]; index: number }>;
29+
model: string;
30+
usage: { prompt_tokens: number; total_tokens: number };
3131
}
3232

3333
async function embedBatch(texts: string[]): Promise<number[][]> {
34-
const response = await fetch('https://api.openai.com/v1/embeddings', {
35-
method: 'POST',
36-
headers: {
37-
'Content-Type': 'application/json',
38-
Authorization: `Bearer ${OPENAI_API_KEY}`
39-
},
40-
body: JSON.stringify({ model: MODEL, input: texts })
41-
})
42-
43-
if (!response.ok) {
44-
const error = await response.text()
45-
throw new Error(`OpenAI API error: ${response.status} ${error}`)
46-
}
47-
48-
const data = (await response.json()) as OpenAIEmbeddingResponse
49-
50-
// Sort by index and return embeddings
51-
const embeddings: number[][] = new Array(texts.length)
52-
for (const item of data.data) {
53-
embeddings[item.index] = item.embedding
54-
}
55-
56-
return embeddings
34+
const response = await fetch("https://api.openai.com/v1/embeddings", {
35+
method: "POST",
36+
headers: {
37+
"Content-Type": "application/json",
38+
Authorization: `Bearer ${OPENAI_API_KEY}`,
39+
},
40+
body: JSON.stringify({ model: MODEL, input: texts }),
41+
});
42+
43+
if (!response.ok) {
44+
const error = await response.text();
45+
throw new Error(`OpenAI API error: ${response.status} ${error}`);
46+
}
47+
48+
const data = (await response.json()) as OpenAIEmbeddingResponse;
49+
50+
// Sort by index and return embeddings
51+
const embeddings: number[][] = new Array(texts.length);
52+
for (const item of data.data) {
53+
embeddings[item.index] = item.embedding;
54+
}
55+
56+
return embeddings;
5757
}
5858

5959
async function main() {
60-
console.log('Generating query embeddings...\n')
61-
62-
// Flatten all queries
63-
const allActivityTypes = Object.values(activityTypes).flat()
64-
const allQueries = [...suggestionQueries, ...agreementQueries, ...allActivityTypes]
65-
66-
console.log(`Suggestion queries: ${suggestionQueries.length}`)
67-
console.log(`Agreement queries: ${agreementQueries.length}`)
68-
console.log(`Activity types: ${allActivityTypes.length}`)
69-
console.log(`Total queries: ${allQueries.length}\n`)
70-
71-
// Embed in batches of 100
72-
const BATCH_SIZE = 100
73-
const allEmbeddings: number[][] = []
74-
75-
for (let i = 0; i < allQueries.length; i += BATCH_SIZE) {
76-
const batch = allQueries.slice(i, i + BATCH_SIZE)
77-
console.log(`Embedding batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(allQueries.length / BATCH_SIZE)}...`)
78-
79-
const embeddings = await embedBatch(batch)
80-
allEmbeddings.push(...embeddings)
81-
}
82-
83-
// Build output structure
84-
const output = {
85-
model: MODEL,
86-
generatedAt: new Date().toISOString(),
87-
queryCount: allQueries.length,
88-
dimensions: allEmbeddings[0]?.length ?? 0,
89-
queries: allQueries.map((query, i) => ({
90-
text: query,
91-
embedding: allEmbeddings[i]
92-
}))
93-
}
94-
95-
// Write compressed file
96-
const outputPath = join(import.meta.dir, '../src/extraction/embeddings/queries/query-embeddings.json.gz')
97-
const jsonData = JSON.stringify(output)
98-
const compressed = gzipSync(jsonData)
99-
writeFileSync(outputPath, compressed)
100-
101-
const sizeMB = (compressed.length / 1024 / 1024).toFixed(1)
102-
console.log(`\nWritten ${allQueries.length} embeddings to:`)
103-
console.log(outputPath)
104-
console.log(`\nDimensions: ${output.dimensions}`)
105-
console.log(`Compressed size: ${sizeMB}MB`)
60+
console.log("Generating query embeddings...\n");
61+
62+
// Flatten all queries
63+
const allActivityTypes = Object.values(activityTypes).flat();
64+
const allQueries = [
65+
...suggestionQueries,
66+
...agreementQueries,
67+
...allActivityTypes,
68+
];
69+
70+
console.log(`Suggestion queries: ${suggestionQueries.length}`);
71+
console.log(`Agreement queries: ${agreementQueries.length}`);
72+
console.log(`Activity types: ${allActivityTypes.length}`);
73+
console.log(`Total queries: ${allQueries.length}\n`);
74+
75+
// Embed in batches of 100
76+
const BATCH_SIZE = 100;
77+
const allEmbeddings: number[][] = [];
78+
79+
for (let i = 0; i < allQueries.length; i += BATCH_SIZE) {
80+
const batch = allQueries.slice(i, i + BATCH_SIZE);
81+
console.log(
82+
`Embedding batch ${Math.floor(i / BATCH_SIZE) + 1}/${Math.ceil(allQueries.length / BATCH_SIZE)}...`,
83+
);
84+
85+
const embeddings = await embedBatch(batch);
86+
allEmbeddings.push(...embeddings);
87+
}
88+
89+
// Build output structure
90+
const output = {
91+
model: MODEL,
92+
generatedAt: new Date().toISOString(),
93+
queryCount: allQueries.length,
94+
dimensions: allEmbeddings[0]?.length ?? 0,
95+
queries: allQueries.map((query, i) => ({
96+
text: query,
97+
embedding: allEmbeddings[i],
98+
})),
99+
};
100+
101+
// Write compressed file
102+
const outputPath = join(
103+
import.meta.dir,
104+
"../src/extraction/embeddings/queries/query-embeddings.json.gz",
105+
);
106+
const jsonData = JSON.stringify(output);
107+
const compressed = gzipSync(jsonData);
108+
writeFileSync(outputPath, compressed);
109+
110+
const sizeMB = (compressed.length / 1024 / 1024).toFixed(1);
111+
console.log(`\nWritten ${allQueries.length} embeddings to:`);
112+
console.log(outputPath);
113+
console.log(`\nDimensions: ${output.dimensions}`);
114+
console.log(`Compressed size: ${sizeMB}MB`);
106115
}
107116

108-
main().catch(console.error)
117+
main().catch(console.error);

0 commit comments

Comments
 (0)