Skip to content

Commit aeafaac

Browse files
committed
feat: ingestion, chunk, embeddings
1 parent 5f3b2a7 commit aeafaac

16 files changed

Lines changed: 1262 additions & 26 deletions

File tree

.env.example

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Database connection
2+
# For local development with Docker:
3+
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/ecma_spec
4+
5+
# For production with NeonDB:
6+
# DATABASE_URL=postgresql://user:password@ep-xxx.us-east-1.aws.neon.tech/ecma_spec?sslmode=require
7+
8+
# Embedding API keys (only one required, based on chosen provider)
9+
OPENAI_API_KEY=sk-...
10+
# GOOGLE_API_KEY=...
11+
# VOYAGE_API_KEY=...
12+
# COHERE_API_KEY=...
13+
14+
# Embedding configuration - EMBEDDING_DIMENSIONS must match the vector(N) column in db/schema.sql (currently vector(1024))
15+
EMBEDDING_PROVIDER=openai
16+
EMBEDDING_MODEL=text-embedding-3-large
17+
EMBEDDING_DIMENSIONS=3072

bun.lock

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

db/schema.sql

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
-- ECMA-376 Spec Vector Database Schema
2+
-- Simple single-table design - evolve as needed
3+
4+
CREATE EXTENSION IF NOT EXISTS vector;
5+
6+
-- Single table for all spec content
7+
CREATE TABLE spec_content (
8+
id SERIAL PRIMARY KEY,
9+
part_number INT NOT NULL,
10+
section_id TEXT,
11+
title TEXT,
12+
content TEXT NOT NULL,
13+
content_type TEXT DEFAULT 'text',
14+
embedding vector(1024),
15+
created_at TIMESTAMPTZ DEFAULT NOW()
16+
);
17+
18+
-- Vector similarity search
19+
CREATE INDEX idx_content_embedding ON spec_content USING hnsw (embedding vector_cosine_ops);
20+
21+
-- Filtering indexes
22+
CREATE INDEX idx_content_part ON spec_content(part_number);
23+
CREATE INDEX idx_content_section ON spec_content(section_id);

docker-compose.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
services:
2+
db:
3+
image: pgvector/pgvector:pg16
4+
container_name: ecma-spec-db
5+
environment:
6+
POSTGRES_USER: postgres
7+
POSTGRES_PASSWORD: postgres
8+
POSTGRES_DB: ecma_spec
9+
ports:
10+
- "5432:5432"
11+
volumes:
12+
- pgdata:/var/lib/postgresql/data
13+
- ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
14+
healthcheck:
15+
test: ["CMD-SHELL", "pg_isready -U postgres"]
16+
interval: 5s
17+
timeout: 5s
18+
retries: 5
19+
20+
volumes:
21+
pgdata:

package.json

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,16 @@
1414
"typecheck": "bun run --cwd apps/web typecheck && bun run --cwd apps/mcp-server typecheck && bun run --cwd packages/shared typecheck",
1515
"release": "semantic-release",
1616
"deploy": "bun run format && bun run lint && bun run --cwd apps/web deploy",
17-
"prepare": "husky"
17+
"prepare": "husky",
18+
"db:up": "docker compose up -d",
19+
"db:down": "docker compose down",
20+
"db:reset": "docker compose down -v && docker compose up -d",
21+
"db:shell": "docker compose exec db psql -U postgres -d ecma_spec",
22+
"ingest": "bun scripts/ingest/pipeline.ts",
23+
"ingest:chunk": "bun scripts/ingest/chunk.ts",
24+
"ingest:embed": "bun scripts/ingest/embed.ts",
25+
"ingest:upload": "bun scripts/ingest/upload.ts",
26+
"ingest:setup": "pip install -r scripts/requirements.txt"
1827
},
1928
"devDependencies": {
2029
"@biomejs/biome": "^2.3.11",

packages/shared/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
"scripts": {
1515
"typecheck": "tsc --noEmit"
1616
},
17+
"dependencies": {
18+
"postgres": "^3.4.5"
19+
},
1720
"devDependencies": {
1821
"typescript": "~5.9.3"
1922
}

packages/shared/src/db/index.ts

Lines changed: 146 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,148 @@
1-
// Database client for NeonDB
2-
// TODO: Implement NeonDB connection
1+
import postgres from "postgres";
2+
import type { SearchResult, SpecContent } from "../types";
33

4-
export function createDbClient(_connectionString: string) {
5-
throw new Error("Not implemented yet");
4+
/** Shape of the client object returned by `createDbClient`, derived from the factory's return type. */
export type DbClient = ReturnType<typeof createDbClient>;
5+
6+
/**
 * Creates a client for the `spec_content` table.
 *
 * Opens a postgres.js connection for `connectionString` and returns an object
 * exposing the raw `sql` tag plus helpers for inserting, searching and
 * inspecting spec content.
 *
 * @param connectionString - Postgres connection URL passed straight to `postgres()`.
 * @returns The client object; call `close()` when done to end the connection.
 */
export function createDbClient(connectionString: string) {
  const sql = postgres(connectionString);

  /** Column layout shared by the SELECT queries below (snake_case, as stored). */
  type ContentRow = {
    id: number;
    part_number: number;
    section_id: string | null;
    title: string | null;
    content: string;
    content_type: string;
  };

  /** Formats an embedding as the `[v1,v2,...]` text literal sent for the vector column. */
  const toVectorLiteral = (embedding: number[]): string => `[${embedding.join(",")}]`;

  /** Maps a database row to the camelCase field names used by the shared types. */
  const toContentFields = (row: ContentRow) => ({
    id: row.id,
    partNumber: row.part_number,
    sectionId: row.section_id,
    title: row.title,
    content: row.content,
    contentType: row.content_type,
  });

  return {
    /** The underlying postgres.js tag, exposed for ad-hoc queries. */
    sql,

    /** Ends the underlying connection via `sql.end()`. */
    async close() {
      await sql.end();
    },

    /**
     * Inserts a single content row.
     *
     * @param content - Row to insert; a missing embedding is stored as NULL.
     * @returns The generated `id` of the new row.
     */
    async insert(content: Omit<SpecContent, "id">) {
      const [result] = await sql<[{ id: number }]>`
        INSERT INTO spec_content (part_number, section_id, title, content, content_type, embedding)
        VALUES (
          ${content.partNumber},
          ${content.sectionId},
          ${content.title},
          ${content.content},
          ${content.contentType},
          ${content.embedding ? toVectorLiteral(content.embedding) : null}
        )
        RETURNING id
      `;
      return result.id;
    },

    /**
     * Inserts many content rows in one statement.
     *
     * @param items - Rows to insert; an empty array is a no-op.
     * @returns The generated ids of the inserted rows.
     */
    async insertBatch(items: Omit<SpecContent, "id">[]): Promise<number[]> {
      // The multi-row insert helper derives its column list from the first
      // row, so it cannot be called with an empty array.
      if (items.length === 0) {
        return [];
      }

      const values = items.map((item) => ({
        part_number: item.partNumber,
        section_id: item.sectionId,
        title: item.title,
        content: item.content,
        content_type: item.contentType,
        embedding: item.embedding ? toVectorLiteral(item.embedding) : null,
      }));

      const result = await sql`
        INSERT INTO spec_content ${sql(values)}
        RETURNING id
      `;
      return result.map((r) => r.id as number);
    },

    /**
     * Replaces the embedding of an existing row.
     *
     * @param id - Primary key of the row to update.
     * @param embedding - New embedding vector.
     */
    async updateEmbedding(id: number, embedding: number[]) {
      await sql`
        UPDATE spec_content
        SET embedding = ${toVectorLiteral(embedding)}
        WHERE id = ${id}
      `;
    },

    /**
     * Semantic search over rows that have an embedding.
     *
     * Rows are ordered by the `<=>` distance to `queryEmbedding`; `score` is
     * `1 - distance`, so higher means more similar.
     *
     * @param queryEmbedding - Embedding of the query text.
     * @param options - `limit` (default 5) plus optional `partNumber` and
     *   `contentType` filters; falsy filter values are treated as "no filter".
     * @returns Matching rows, best match first.
     */
    async search(
      queryEmbedding: number[],
      options: { limit?: number; partNumber?: number; contentType?: string } = {},
    ): Promise<SearchResult[]> {
      const { limit = 5, partNumber, contentType } = options;
      const embeddingStr = toVectorLiteral(queryEmbedding);

      const results = await sql<Array<ContentRow & { score: number }>>`
        SELECT
          id, part_number, section_id, title, content, content_type,
          1 - (embedding <=> ${embeddingStr}::vector) as score
        FROM spec_content
        WHERE embedding IS NOT NULL
        ${partNumber ? sql`AND part_number = ${partNumber}` : sql``}
        ${contentType ? sql`AND content_type = ${contentType}` : sql``}
        ORDER BY embedding <=> ${embeddingStr}::vector
        LIMIT ${limit}
      `;

      return results.map((row) => ({
        ...toContentFields(row),
        score: row.score,
      }));
    },

    /**
     * Fetches all rows of one section, ordered by id.
     *
     * @param partNumber - Part the section belongs to.
     * @param sectionId - Section identifier to match exactly.
     * @returns The section's rows (without embeddings), in insertion-id order.
     */
    async getBySection(partNumber: number, sectionId: string): Promise<SpecContent[]> {
      const results = await sql<ContentRow[]>`
        SELECT id, part_number, section_id, title, content, content_type
        FROM spec_content
        WHERE part_number = ${partNumber} AND section_id = ${sectionId}
        ORDER BY id
      `;

      return results.map((row) => toContentFields(row));
    },

    /**
     * Counts rows in `spec_content`.
     *
     * @returns `total` rows and how many of them have an embedding.
     */
    async getStats() {
      // COUNT(*) is a bigint, which postgres.js hands back as a string by
      // default; convert explicitly before returning.
      const [stats] = await sql<[{ total: string; embedded: string }]>`
        SELECT
          COUNT(*) as total,
          COUNT(*) FILTER (WHERE embedding IS NOT NULL) as embedded
        FROM spec_content
      `;
      return {
        total: Number(stats.total),
        embedded: Number(stats.embedded),
      };
    },

    /** Removes every row from `spec_content` and restarts its id sequence. */
    async clearAll() {
      await sql`TRUNCATE spec_content RESTART IDENTITY`;
    },
  };
}

0 commit comments

Comments
 (0)