-
Notifications
You must be signed in to change notification settings - Fork 0
/
save-embeddings.js
65 lines (52 loc) · 2.01 KB
/
save-embeddings.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import Replicate from 'replicate';
import * as dotenv from 'dotenv';
import * as fs from 'fs';
// Load environment variables (such as REPLICATE_API_TOKEN) from a local .env file
dotenv.config();

// Initialize Replicate with API token
const replicate = new Replicate({
  auth: process.env.REPLICATE_API_TOKEN,
});

// Model information for embedding.
// NOTE(review): the slug previously read "all-mpnet-base-v25", which looks like
// a typo for the public "replicate/all-mpnet-base-v2" model — confirm against
// the model page that the pinned version hash below belongs to it.
const version = 'b6b7585c9640cd7a9572c6e129c9549d79c9c31f0d3fdce7baac7c67ca38f305';
const model = 'replicate/all-mpnet-base-v2';

// Read the source text and split it into "chunks", one per non-empty line
console.log('Reading source file...');
const raw = fs.readFileSync('data/processing-course.txt', 'utf-8');
const chunks = raw
  .split(/\n+/)
  // Trim each chunk and drop the ones that end up empty
  .map((chunk) => chunk.trim())
  .filter((chunk) => chunk !== '');
console.log(`Total chunks to process: ${chunks.length}`);

// Start the process of generating embeddings.
// createEmbeddings is async: handle a rejection explicitly so an API or
// file-system failure is reported and reflected in the exit code instead of
// surfacing as an unhandled promise rejection.
console.log('Starting embedding generation...');
createEmbeddings(chunks).catch((error) => {
  // Leading newline: the progress indicator leaves the cursor mid-line
  console.error('\nFailed to generate embeddings:', error);
  process.exitCode = 1;
});
/**
 * Create an embedding for every text chunk and write them all to
 * `data/embeddings.json` as `{ embeddings: [{ text, embedding }, ...] }`.
 *
 * Chunks are sent to the model in batches, one request at a time, with the
 * batch serialized as a JSON string in the `text_batch` input field. Each
 * response is expected to be an array holding one `{ embedding }` object per
 * submitted text, in the same order.
 *
 * @param {string[]} chunks - Text chunks to embed.
 * @param {number} [batchSize=10] - Number of chunks sent per request.
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 * @throws {RangeError} If `batchSize` is not a positive integer.
 * @throws {Error} If a response does not contain an embedding for every text.
 */
async function createEmbeddings(chunks, batchSize = 10) {
  // A zero or negative step would make the batching loop below never end
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
  }

  const totalBatches = Math.ceil(chunks.length / batchSize);
  // Array to store all embeddings
  const embeddings = [];

  // Process chunks in batches
  for (let i = 0; i < chunks.length; i += batchSize) {
    // "\r" rewrites the same terminal line so progress updates in place
    process.stdout.write(`\rProcessing batch ${i / batchSize + 1}/${totalBatches}`);

    const texts = chunks.slice(i, i + batchSize);
    const input = {
      text_batch: JSON.stringify(texts),
    };

    // Generate embeddings for the current batch
    const output = await replicate.run(`${model}:${version}`, { input });

    // Fail loudly on an unexpected response shape rather than with a cryptic
    // destructuring TypeError
    if (!Array.isArray(output) || output.length < texts.length) {
      const received = Array.isArray(output) ? output.length : typeof output;
      throw new Error(
        `Expected ${texts.length} embeddings from the model but received ${received}`
      );
    }

    // Pair each text with its corresponding embedding
    texts.forEach((text, index) => {
      const entry = output[index];
      // A missing embedding would otherwise be dropped silently by JSON.stringify
      if (entry == null || entry.embedding === undefined) {
        throw new Error(`Missing embedding for chunk ${i + index}`);
      }
      embeddings.push({ text, embedding: entry.embedding });
    });
  }

  // Write the embeddings to a JSON file
  const jsonOut = { embeddings };
  const fileOut = 'data/embeddings.json';
  fs.writeFileSync(fileOut, JSON.stringify(jsonOut));
  console.log(`\nEmbeddings saved to ${fileOut}`);
}