-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathVectorDBManager.js
More file actions
124 lines (106 loc) · 3.34 KB
/
VectorDBManager.js
File metadata and controls
124 lines (106 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pdf from 'pdf-parse-fork';
import fs from 'fs';
import path from 'path';
import { ChromaClient, DefaultEmbeddingFunction } from "chromadb";
import docx from 'docx-parser'
/**
 * Loads local documents (.pdf, .docx, .txt) from a folder into a Chroma
 * collection and runs text queries against that collection.
 *
 * Usage: construct, `await setup(collectionName, folderPath)`, then
 * `await queryCollection(nResults, queryTexts)`.
 */
export default class VectorDBManager {
  #oClient;
  #oEmbeddingFunction;
  #oCollection;

  /**
   * Creates the Chroma client and embedding function with their default
   * options. No collection is selected until `setup()` is called.
   */
  constructor() {
    this.#oClient = new ChromaClient();
    this.#oEmbeddingFunction = new DefaultEmbeddingFunction();
    this.#oCollection = null;
  }

  /**
   * Splits text into chunks of at most `nChunkSize` whitespace-separated words.
   *
   * @param {string} sText - Raw document text.
   * @param {number} [nChunkSize=100] - Maximum number of words per chunk.
   * @returns {string[]} Chunks with words re-joined by single spaces; an empty
   *   array when the text contains no words.
   */
  #chunkText(sText, nChunkSize = 100) {
    // Trim first: splitting untrimmed text on /\s+/ yields empty-string
    // "words" at the edges, and an all-whitespace text yields one empty chunk.
    const sTrimmed = String(sText ?? '').trim();
    if (sTrimmed === '') {
      return [];
    }

    const aWords = sTrimmed.split(/\s+/);
    const aChunks = [];
    for (let i = 0; i < aWords.length; i += nChunkSize) {
      aChunks.push(aWords.slice(i, i + nChunkSize).join(' '));
    }
    return aChunks;
  }

  /**
   * Reads a PDF from disk and returns its extracted text.
   *
   * @param {string} sFilePath - Path to the .pdf file.
   * @returns {Promise<string>} The `text` property of the parser result.
   */
  async #extractTextFromPDF(sFilePath) {
    const oDataBuffer = fs.readFileSync(sFilePath);
    const oParsed = await pdf(oDataBuffer);
    return oParsed.text;
  }

  /**
   * Extracts the text of a .docx file, adapting the callback-style
   * `docx.parseDocx` API to a Promise.
   *
   * @param {string} sFilePath - Path to the .docx file.
   * @returns {Promise<string>} Resolves with the data passed to the callback.
   * @throws {Error} Rejects when the callback receives a falsy value.
   */
  async #extractTextFromDocx(sFilePath) {
    return new Promise((resolve, reject) => {
      docx.parseDocx(sFilePath, (sData) => {
        if (sData) {
          resolve(sData);
        } else {
          reject(new Error("No data returned from parseDocx"));
        }
      });
    });
  }

  /**
   * Picks the extractor matching the file extension (case-insensitive).
   *
   * @param {string} sFilePath - Path to the candidate file.
   * @returns {Promise<string|null>} The file's text, or `null` when the
   *   extension is not one of .pdf, .docx or .txt.
   */
  async #extractText(sFilePath) {
    switch (path.extname(sFilePath).toLowerCase()) {
      case ".pdf":
        return this.#extractTextFromPDF(sFilePath);
      case ".docx":
        return this.#extractTextFromDocx(sFilePath);
      case ".txt":
        return fs.readFileSync(sFilePath, 'utf8');
      default:
        return null;
    }
  }

  /**
   * Chunks a document's text and adds the chunks to the current collection.
   * Ids have the form `<fileName>_chunk_<n>` (1-based) and every chunk carries
   * `{ source: <fileName> }` as metadata.
   *
   * @param {string} sText - Full text of the document.
   * @param {string} sFilePath - Path the text was read from.
   * @returns {Promise<void>}
   */
  async #addTextToCollection(sText, sFilePath) {
    console.log(`Ingesting File ${sFilePath}\n...`);

    const aChunks = this.#chunkText(sText);
    if (aChunks.length === 0) {
      // Nothing to embed; avoid sending an empty batch to the collection.
      console.log(`Skipping ${sFilePath}: no text content found`);
      return;
    }

    // path.basename handles the platform separator; splitting on '/' would
    // keep the whole path on Windows, where path.join produces '\'.
    const sFileName = path.basename(sFilePath);
    const aIds = aChunks.map((_, index) => `${sFileName}_chunk_${index + 1}`);
    const aMetadata = aChunks.map(() => ({ source: sFileName }));

    const oResult = await this.#oCollection.add({
      ids: aIds,
      metadatas: aMetadata,
      documents: aChunks,
    });
    console.log(oResult);
  }

  /**
   * Ingests every supported file directly inside a folder, one at a time.
   * Entries with other extensions are skipped; sub-folders are not traversed.
   *
   * @param {string} sFolderPath - Folder to scan.
   * @returns {Promise<void>}
   */
  async #addFilesToCollection(sFolderPath) {
    for (const sFileName of fs.readdirSync(sFolderPath)) {
      const sFilePath = path.join(sFolderPath, sFileName);
      const sText = await this.#extractText(sFilePath);
      if (sText === null) {
        continue;
      }
      await this.#addTextToCollection(sText, sFilePath);
    }
  }

  /**
   * Fetches the named collection, creating it if it does not exist.
   *
   * @param {string} sCollectionName - Collection name.
   * @returns {Promise<object>} The collection returned by the client.
   */
  async #getOrCreateCollection(sCollectionName) {
    return await this.#oClient.getOrCreateCollection({
      name: sCollectionName,
      metadata: {
        description: "Private Docs",
        "hnsw:space": "l2" // define distance function
      },
      embeddingFunction: this.#oEmbeddingFunction,
    });
  }

  /**
   * Selects (or creates) the collection and ingests the folder's documents.
   *
   * @param {string} [sCollectionName] - Collection name; any falsy value
   *   falls back to "defaultCollection".
   * @param {string} sContentPath - Folder containing the documents to ingest.
   * @returns {Promise<void>}
   * @throws {Error} When `sContentPath` is missing. The collection has
   *   already been selected at that point.
   */
  async setup(sCollectionName, sContentPath) {
    this.#oCollection = await this.#getOrCreateCollection(sCollectionName || "defaultCollection");
    if (!sContentPath) {
      throw new Error("No content path to load content");
    }
    await this.#addFilesToCollection(sContentPath);
  }

  /**
   * Queries the current collection.
   *
   * @param {number} nResults - Passed to the collection query as `nResults`.
   * @param {string[]} aQueryTexts - Passed to the collection query as `queryTexts`.
   * @returns {Promise<object>} The raw result of the collection query.
   * @throws {Error} When called before `setup()` has selected a collection.
   */
  async queryCollection(nResults, aQueryTexts) {
    if (this.#oCollection === null) {
      throw new Error("Collection is not initialised; call setup() before queryCollection()");
    }
    return await this.#oCollection.query({
      nResults,
      queryTexts: aQueryTexts,
    });
  }
}