-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfaiss_store.py
More file actions
64 lines (53 loc) · 2.14 KB
/
faiss_store.py
File metadata and controls
64 lines (53 loc) · 2.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
from pdf_chunker import load_and_chunk_pdf # Use the updated chunker
from gemini_utils import embed_text
import faiss
import numpy as np
import pickle
import os
# Define PDF path and output paths
PDF_PATH = "grade-11-history-text-book.pdf"
INDEX_PATH = "faiss_index.index"
METADATA_PATH = "faiss_metadata.pkl"

print(f"🔄 Starting FAISS index build process for {PDF_PATH}...")

# Load chunks with enhanced metadata using the updated pdf_chunker
# Adjust chunk_size and overlap as needed
chunks = load_and_chunk_pdf(PDF_PATH, chunk_size=500, overlap=50)
if not chunks:
    print("❌ No chunks were generated. Exiting.")
    # Raise SystemExit directly instead of calling the bare exit() helper:
    # exit() is injected by the `site` module (so it is missing under
    # `python -S`), and when called without an argument it reports success
    # (status 0) even though the build failed.
    raise SystemExit(1)
# Embed all chunks
# The three lists below are parallel: position k in each one describes the
# same chunk, and position k is also the row that vector gets in the index.
texts = []
embeddings = []
metadatas = []  # Will store dicts like {"page": X, "section": Y, "paragraphs": Z}
print(f"🧠 Embedding {len(chunks)} chunks...")
for i, chunk in enumerate(chunks):
    # Simple progress indicator
    if (i + 1) % 50 == 0:
        print(f" Embedding chunk {i + 1}/{len(chunks)}")
    try:
        # Do everything that can fail BEFORE touching the result lists, so a
        # failure (embedding error or a missing "text"/"metadata" key) can
        # never leave one list longer than the others.
        text = chunk["text"]
        metadata = chunk["metadata"]
        emb = embed_text(text)
    except Exception as e:
        # Deliberate best-effort: one bad chunk should not abort the build.
        print(f"⚠️ Error embedding chunk {i + 1}: {e}. Skipping this chunk.")
        continue
    embeddings.append(emb)
    texts.append(text)
    # Store the whole metadata dict from the chunker
    metadatas.append(metadata)

if not embeddings:
    print("❌ No embeddings were generated. Exiting.")
    # SystemExit(1) rather than bare exit(): signals failure to the caller
    # and does not depend on the `site`-provided exit() helper.
    raise SystemExit(1)
# Convert to numpy array
embedding_dim = len(embeddings[0])
# The index needs one fixed-width row per vector. Check the widths up front
# so a mismatch is reported clearly instead of surfacing as an opaque
# NumPy/FAISS shape error further down.
mismatched = [emb for emb in embeddings if len(emb) != embedding_dim]
if mismatched:
    print(
        f"❌ {len(mismatched)} embeddings do not match dimension "
        f"{embedding_dim}. Exiting."
    )
    raise SystemExit(1)
# Build the float32 matrix in one step (no intermediate default-dtype copy).
embeddings_np = np.asarray(embeddings, dtype="float32")
print(f"🔢 Converted embeddings to NumPy array shape: {embeddings_np.shape}")

# Build FAISS index (using IndexFlatL2 for simplicity)
print(f"🛠️ Building FAISS index (Dimension: {embedding_dim})...")
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_np)
print(f"✅ FAISS index built. Total vectors: {index.ntotal}")
# Persist the results: the FAISS index goes to INDEX_PATH, and the chunk
# texts plus their metadata dicts go to METADATA_PATH as a single pickle.
print(f"💾 Saving FAISS index to {INDEX_PATH}...")
faiss.write_index(index, INDEX_PATH)

print(f"💾 Saving metadata to {METADATA_PATH}...")
payload = {"texts": texts, "metadatas": metadatas}
with open(METADATA_PATH, "wb") as metadata_file:
    pickle.dump(payload, metadata_file)

print("✅ FAISS index build process complete.")