// uploadDocs.js — forked from tiprock-network/azure-qa-rag-mpesa
import 'dotenv/config'
import fs from 'fs'
import { client, connectToMongoDB, vectorCollection } from './mongodbConnection.js'
import path from 'path';
import { fileURLToPath } from 'url';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import { CharacterTextSplitter } from 'langchain/text_splitter';
import { AzureOpenAIEmbeddings } from '@langchain/openai'
//import vector store instance
import { MongoDBAtlasVectorSearch } from '@langchain/mongodb'
import { DefaultAzureCredential, getBearerTokenProvider } from '@azure/identity'
// ES modules do not define __filename/__dirname, so rebuild both from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Splitter applied to every loaded document (chunkSize 1000, no overlap between chunks).
const rawSplitter = new CharacterTextSplitter({ chunkOverlap: 0, chunkSize: 1000 });

// Bearer-token provider built from DefaultAzureCredential for the Cognitive Services scope;
// it is passed to the embeddings client further down.
const COGNITIVE_SERVICES_SCOPE = "https://cognitiveservices.azure.com/.default";
const credentials = new DefaultAzureCredential();
const azureADTokenProvider = getBearerTokenProvider(credentials, COGNITIVE_SERVICES_SCOPE);
// Embedding model client for Azure OpenAI, authenticated through azureADTokenProvider.
// Instance name, embeddings deployment and API version are read from the environment.
// The base path can be overridden with AZURE_OPENAI_BASE_PATH; when that variable is unset
// or empty, the previously hard-coded East US 2 endpoint is used, so existing setups keep
// working unchanged.
const azOpenEmbedding = new AzureOpenAIEmbeddings({
  azureADTokenProvider,
  azureOpenAIApiInstanceName: process.env.AZURE_OPENAI_API_INSTANCE_NAME,
  azureOpenAIApiEmbeddingsDeploymentName: process.env.AZURE_OPENAI_API_DEPLOYMENT_EMBEDDING_NAME,
  azureOpenAIApiVersion: process.env.AZURE_OPENAI_API_VERSION,
  // `||` (not `??`) so an empty string in .env also falls back to the default.
  azureOpenAIBasePath:
    process.env.AZURE_OPENAI_BASE_PATH ||
    "https://eastus2.api.cognitive.microsoft.com/openai/deployments"
});
// Absolute path of the `docs` folder located next to this script.
const folderPath = path.join(__dirname, 'docs');
/**
 * List the regular files directly inside a folder.
 *
 * Sub-directories and other non-file entries are skipped; the listing is not recursive.
 *
 * @param {string} folderPath - Folder to list.
 * @returns {Promise<string[]>} One path per file, each built by joining `folderPath` with
 *   the file name, so the paths stay valid whatever the current working directory is.
 * @throws Rejects with the underlying fs error when the folder cannot be read.
 */
async function getCreateFileNameArray(folderPath) {
  let entries;
  try {
    // withFileTypes yields Dirent objects, so no synchronous per-entry stat call is needed
    // and every failure surfaces as a rejection of the returned promise.
    entries = await fs.promises.readdir(folderPath, { withFileTypes: true });
  } catch (err) {
    console.error('Error reading directory:', err);
    throw err;
  }

  return entries
    .filter((entry) => entry.isFile())
    .map((entry) => path.join(folderPath, entry.name));
}
/**
 * Load one file with PDFLoader and return the documents it produces.
 * The first five loaded entries are echoed to the console.
 *
 * @param {string} filePath - Path handed to PDFLoader.
 * @returns {Promise<Array>} Whatever `PDFLoader#load` resolves to.
 */
async function getRawDoc(filePath) {
  const loadedDocs = await new PDFLoader(filePath).load();
  console.log(loadedDocs.slice(0, 5));
  return loadedDocs;
}
/**
 * Load every file listed under `folderPath`, split each one with `rawSplitter`, and
 * return all resulting chunks in a single flat array.
 *
 * Files are handled strictly one after another, in the order the listing returns them.
 *
 * @returns {Promise<Array>} The chunks of all files, concatenated in processing order.
 */
const returnSplittedContent = async () => {
  const filePaths = await getCreateFileNameArray(folderPath);
  const chunks = [];

  for (let i = 0; i < filePaths.length; i += 1) {
    const loaded = await getRawDoc(filePaths[i]);
    const pieces = await rawSplitter.splitDocuments(loaded);
    for (const piece of pieces) {
      chunks.push(piece);
    }
  }

  return chunks;
}
/**
 * Split the documents via returnSplittedContent, embed the chunks with azOpenEmbedding and
 * write them to vectorCollection through MongoDBAtlasVectorSearch.
 *
 * Failures never propagate to the caller: they are logged and reported through the
 * return value.
 *
 * @returns {Promise<boolean>} true when the store was created and populated, false otherwise.
 */
const storeToCosmosVectorStore = async () => {
  try {
    const documents = await returnSplittedContent();

    // Nothing to embed: stop with a clear message instead of handing an empty batch to
    // the vector store.
    if (documents.length === 0) {
      console.error('No document chunks were produced; nothing to upload.');
      return false;
    }

    // Create the store instance; fromDocuments embeds and inserts the chunks.
    const store = await MongoDBAtlasVectorSearch.fromDocuments(
      documents,
      azOpenEmbedding,
      {
        collection: vectorCollection,
        indexName: "myrag_index",
        textKey: "text",
        embeddingKey: "embedding",
      }
    );

    if (!store) {
      console.log('Something wrong happened while creating store or getting store!');
      return false;
    }

    console.log('Done creating/getting and uploading to store.');
    return true;
  } catch (e) {
    // Log the error object itself (not its string form) so the stack trace is kept,
    // and send it to stderr.
    console.error('This error occurred:', e);
    return false;
  }
}
// Run the upload once, report the outcome, and always release the MongoDB connection.
// NOTE(review): assumes `client` exported by ./mongodbConnection.js is the MongoClient
// behind `vectorCollection` — confirm. If it is left open, its sockets would keep this
// one-shot script from exiting after the upload.
try {
  const uploaded = await storeToCosmosVectorStore();
  console.log(uploaded ? 'Successfully uploaded the embedded documents ' : 'Failed to upload the documents ');
} finally {
  await client.close();
}