// Source: chatDialog/VectorDBManager.js (repository praseodeveloper/chatDialog, branch main)
import pdf from 'pdf-parse-fork';
import fs from 'fs';
import path from 'path';
import { ChromaClient, DefaultEmbeddingFunction } from "chromadb";
import docx from 'docx-parser'

/**
 * Ingests the text of local documents (.pdf, .docx, .txt) into a Chroma
 * collection and runs text queries against that collection.
 *
 * Typical use: construct, `await setup(name, folder)`, then
 * `await queryCollection(n, [texts])`.
 */
export default class VectorDBManager {
  /** Default number of whitespace-separated words per stored chunk. */
  static #DEFAULT_CHUNK_WORD_COUNT = 100;

  #oClient;
  #oEmbeddingFunction;
  #oCollection;

  /**
   * Creates the Chroma client and the embedding function. No collection is
   * bound until `setup()` has completed.
   */
  constructor() {
    this.#oClient = new ChromaClient();
    this.#oEmbeddingFunction = new DefaultEmbeddingFunction();
    this.#oCollection = null;
  }

  /**
   * Splits text into chunks of at most `nChunkSize` words, joined by single spaces.
   *
   * @param {string} sText - Text to split; `null`/`undefined` is treated as empty.
   * @param {number} [nChunkSize] - Maximum words per chunk (defaults to 100).
   * @returns {string[]} The chunks; an empty array when the text has no words.
   */
  #chunkText(sText, nChunkSize = VectorDBManager.#DEFAULT_CHUNK_WORD_COUNT) {
    // split(/\s+/) yields "" entries for leading/trailing whitespace (and for
    // blank input); drop them so no chunk is padded or entirely empty.
    const aWords = String(sText ?? "").split(/\s+/).filter(Boolean);
    const aChunks = [];

    for (let i = 0; i < aWords.length; i += nChunkSize) {
      aChunks.push(aWords.slice(i, i + nChunkSize).join(" "));
    }

    return aChunks;
  }

  /**
   * PDF parser: reads the file and returns the `text` field of the parse result.
   *
   * @param {string} sFilePath - Path of the PDF file.
   * @returns {Promise<string>} Extracted text.
   */
  async #extractTextFromPDF(sFilePath) {
    const oDataBuffer = await fs.promises.readFile(sFilePath);
    const oPdfData = await pdf(oDataBuffer);
    return oPdfData.text;
  }

  /**
   * Docx parser: adapts the callback-style `docx.parseDocx` to a promise.
   *
   * @param {string} sFilePath - Path of the .docx file.
   * @returns {Promise<string>} Extracted text; rejects when the parser hands
   *   back a falsy value.
   */
  #extractTextFromDocx(sFilePath) {
    return new Promise((resolve, reject) => {
      docx.parseDocx(sFilePath, (sData) => {
        if (sData) {
          resolve(sData);
        } else {
          reject(new Error("No data returned from parseDocx"));
        }
      });
    });
  }

  /**
   * Picks the extractor matching the file extension (case-insensitive).
   *
   * @param {string} sFilePath - Path of the file to read.
   * @returns {Promise<string|null>} The file's text, or `null` when the
   *   extension is not one of .pdf, .docx, .txt.
   */
  async #extractText(sFilePath) {
    switch (path.extname(sFilePath).toLowerCase()) {
      case ".pdf":
        return this.#extractTextFromPDF(sFilePath);
      case ".docx":
        return this.#extractTextFromDocx(sFilePath);
      case ".txt":
        return fs.promises.readFile(sFilePath, "utf8");
      default:
        return null;
    }
  }

  /**
   * Chunks the text and adds the chunks to the bound collection. Each chunk
   * gets the id `<fileName>_chunk_<n>` (1-based) and `{ source: fileName }`
   * as metadata.
   *
   * @param {string} sText - Text extracted from the file.
   * @param {string} sFilePath - Path the text came from.
   * @returns {Promise<void>}
   */
  async #addTextToCollection(sText, sFilePath) {
    console.log(`Ingesting File ${sFilePath}\n...`);
    // path.basename understands the platform separator; splitting on "/"
    // would leave the whole path in the id on Windows.
    const sFileName = path.basename(sFilePath);
    const aChunks = this.#chunkText(sText);

    if (aChunks.length === 0) {
      // Nothing to embed; do not send an empty batch to the collection.
      console.warn(`Skipping ${sFileName}: no text content found`);
      return;
    }

    const aIds = aChunks.map((_, index) => `${sFileName}_chunk_${index + 1}`);
    const aMetadata = aChunks.map(() => ({ source: sFileName }));

    const oResult = await this.#oCollection.add({
      ids: aIds,
      metadatas: aMetadata,
      documents: aChunks,
    });

    console.log(oResult);
  }

  /**
   * Master parser: ingests every supported file directly inside the folder,
   * one after another. Entries with other extensions are ignored.
   *
   * @param {string} sFolderPath - Folder whose entries are ingested.
   * @returns {Promise<void>}
   */
  async #addFilesToCollection(sFolderPath) {
    const aFileNames = fs.readdirSync(sFolderPath);

    for (const sFileName of aFileNames) {
      const sFilePath = path.join(sFolderPath, sFileName);
      const sText = await this.#extractText(sFilePath);
      if (sText === null) {
        continue; // unsupported extension
      }
      await this.#addTextToCollection(sText, sFilePath);
    }
  }

  /**
   * Fetches the named collection from the client, creating it if needed,
   * with this instance's embedding function attached.
   *
   * @param {string} sCollectionName - Name of the collection.
   * @returns {Promise<object>} The collection returned by the client.
   */
  async #getOrCreateCollection(sCollectionName) {
    return this.#oClient.getOrCreateCollection({
      name: sCollectionName,
      metadata: {
        description: "Private Docs",
        "hnsw:space": "l2", // define distance function
      },
      embeddingFunction: this.#oEmbeddingFunction,
    });
  }

  /**
   * Binds the collection and ingests the documents found in `sContentPath`.
   *
   * @param {string} [sCollectionName] - Collection name; a falsy value selects
   *   "defaultCollection".
   * @param {string} sContentPath - Folder containing the documents.
   * @returns {Promise<void>}
   * @throws {Error} When `sContentPath` is falsy (the collection has already
   *   been fetched/created at that point).
   */
  async setup(sCollectionName, sContentPath) {
    this.#oCollection = await this.#getOrCreateCollection(sCollectionName || "defaultCollection");
    if (!sContentPath) {
      throw new Error("No content path to load content");
    }
    await this.#addFilesToCollection(sContentPath);
  }

  /**
   * Queries the bound collection.
   *
   * @param {number} nResults - Passed to the collection query as `nResults`.
   * @param {string[]} aQueryTexts - Passed to the collection query as `queryTexts`.
   * @returns {Promise<object>} Whatever the collection's `query` resolves to.
   * @throws {Error} When no collection is bound yet, i.e. `setup()` has not
   *   successfully fetched one.
   */
  async queryCollection(nResults, aQueryTexts) {
    if (!this.#oCollection) {
      throw new Error("Collection is not initialized; call setup() before queryCollection()");
    }
    return this.#oCollection.query({
      nResults,
      queryTexts: aQueryTexts,
    });
  }
}