HnswDensevector SafeTensor Generator #2515

Closed · wants to merge 48 commits

Changes shown are from 9 of the 48 commits.

Commits
85fe2e0  not issue (Panizghi, Jun 1, 2024)
e0a7937  update (Panizghi, Jun 1, 2024)
b486073  test working (Panizghi, Jun 2, 2024)
b1b4ef0  fix paths (Panizghi, Jun 2, 2024)
07f777c  fix (Panizghi, Jun 2, 2024)
2d84a38  delete extra files (Panizghi, Jun 2, 2024)
03dfdde  file ext fix (Panizghi, Jun 2, 2024)
9e40ec3  refactor code based on feedback (Jun 11, 2024)
7187238  python fix (Jun 12, 2024)
bbc8a20  hardcoded path (Jun 13, 2024)
008864e  indent fix (Panizghi, Jun 16, 2024)
23c90b8  path python fix (Panizghi, Jun 16, 2024)
59165a6  java (Panizghi, Jun 16, 2024)
699f42e  Merge pull request #1 from Panizghi/Melissa (Panizghi, Jul 9, 2024)
9a8bbff  Update HnswJsonWithSafeTensorsDenseVectorDocumentGenerator.java (Panizghi, Jul 9, 2024)
2094e24  Update AbstractIndexer.java (Panizghi, Jul 9, 2024)
4868d00  Update json_to_bin.py (Panizghi, Jul 9, 2024)
bca9754  Delete src/main/java/io/anserini/index/SafeTensorsIndexCollection.java (Panizghi, Jul 9, 2024)
01a642e  Fixed paths and updated indexing commands (Panizghi, Jul 9, 2024)
8403c48  Merge branch 'master' into clean-fix-branch (Panizghi, Jul 9, 2024)
9ef81c7  Merge pull request #2 from Panizghi/clean-fix-branch (Panizghi, Jul 9, 2024)
2aa4435  fix eof (Panizghi, Jul 9, 2024)
af44889  revert indexhnsw (Panizghi, Jul 11, 2024)
5e80648  abstarct ndexer update (Panizghi, Jul 11, 2024)
507866c  Add to onboarding reproduction logs (#2546) (XKTZ, Jul 14, 2024)
7d7ce98  file (Panizghi, Jul 15, 2024)
7d3726a  fix eof (Panizghi, Jul 9, 2024)
24e9bb8  abstarct ndexer update (Panizghi, Jul 11, 2024)
5f1d8c8  Merge pull request #3 from castorini/master (Panizghi, Jul 15, 2024)
76cf238  test (Panizghi, Jul 15, 2024)
ae1044a  test (Panizghi, Jul 15, 2024)
d1c3a3c  test (Panizghi, Jul 15, 2024)
f804c6e  test (Panizghi, Jul 15, 2024)
92987dd  test (Panizghi, Jul 15, 2024)
01b1c51  fixed collection and indexer (Panizghi, Jul 15, 2024)
1fcd094  update (Panizghi, Jul 17, 2024)
897fac7  added comments (Panizghi, Jul 17, 2024)
7551989  fixed thread exception (Panizghi, Aug 25, 2024)
20ab166  remove extra files (Panizghi, Aug 25, 2024)
229331c  bar progress and high level logging (Panizghi, Aug 25, 2024)
a2ec450  Merge branch 'castorini:master' into master (Panizghi, Aug 25, 2024)
d50c552  update header (Panizghi, Aug 25, 2024)
381e5cd  fix space (Panizghi, Aug 25, 2024)
2e56f19  testing refactored (Panizghi, Aug 26, 2024)
7429ed8  multiple jsonl handeling (Panizghi, Sep 1, 2024)
50272dd  remove test (Panizghi, Sep 1, 2024)
f17420b  Delete src/main/python/safetensors/test.py (Panizghi, Sep 1, 2024)
bc68411  update tools (Panizghi, Sep 1, 2024)
166 changes: 166 additions & 0 deletions src/main/java/io/anserini/index/generator/HnswJsonWithSafeTensorsDenseVectorDocumentGenerator.java
@@ -0,0 +1,166 @@
package io.anserini.index.generator;

import com.fasterxml.jackson.databind.ObjectMapper;
import io.anserini.collection.SourceDocument;
import io.anserini.index.Constants;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Converts a {@link SourceDocument} into a Lucene {@link Document}, attaching a dense vector
 * loaded from SafeTensors files, ready to be indexed.
 *
 * @param <T> type of the source document
 */
public class HnswJsonWithSafeTensorsDenseVectorDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {

  public HnswJsonWithSafeTensorsDenseVectorDocumentGenerator() {
  }

  // public void setCurrentJsonlFile(String currentJsonlFile) {
  //   this.currentJsonlFile = currentJsonlFile;
  // }

  @Override
  public Document createDocument(T src) throws InvalidDocumentException {
    try {
      // Hardcoded nfcorpus paths; these should eventually be passed in as parameters.
      String vectorsFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_vectors.safetensors";
      String docidsFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_docids.safetensors";
      String docidToIdxFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_docid_to_idx.json";

      // Read the raw SafeTensors files.
      byte[] vectorsData = Files.readAllBytes(Paths.get(vectorsFilePath));
      byte[] docidsData = Files.readAllBytes(Paths.get(docidsFilePath));

      // Deserialize docid_to_idx.json and build the inverse mapping.
      ObjectMapper objectMapper = new ObjectMapper();
      @SuppressWarnings("unchecked")
      Map<String, Integer> docidToIdx = objectMapper.readValue(Files.readAllBytes(Paths.get(docidToIdxFilePath)), Map.class);
      Map<Integer, String> idxToDocid = new LinkedHashMap<>();
      for (Map.Entry<String, Integer> entry : docidToIdx.entrySet()) {
        idxToDocid.put(entry.getValue(), entry.getKey());
      }

      // Deserialize docids.
      Map<String, Object> docidsHeader = parseHeader(docidsData);
      int[] docidIndices = extractDocidIndices(docidsData, docidsHeader);
      String[] docids = new String[docidIndices.length];
      for (int i = 0; i < docidIndices.length; i++) {
        docids[i] = idxToDocid.get(docidIndices[i]);
      }

      // Deserialize vectors.
      Map<String, Object> vectorsHeader = parseHeader(vectorsData);
      double[][] vectors = extractVectors(vectorsData, vectorsHeader);

      // Look up the row for this document; docidToIdx maps docid -> index directly,
      // so there is no need to scan idxToDocid (the previous stream-based lookup
      // also threw NullPointerException on a missing docid instead of returning null).
      String id = src.id();
      Integer indexObj = docidToIdx.get(id);
      if (indexObj == null) {
        throw new InvalidDocumentException();
      }
      int index = indexObj;
      float[] contents = new float[vectors[index].length];
      for (int i = 0; i < contents.length; i++) {
        contents[i] = (float) vectors[index][i];
      }

      // Create the Lucene document.
      final Document document = new Document();
      document.add(new StringField(Constants.ID, id, Field.Store.YES));
      document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id)));
      document.add(new KnnFloatVectorField(Constants.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT));

      return document;
    } catch (Exception e) {
      // Any I/O or parsing failure invalidates the document.
      throw new InvalidDocumentException();
    }
  }

  private Map<String, Object> parseHeader(byte[] data) throws IOException {
    // SafeTensors layout: an 8-byte little-endian header length, a JSON header, then raw tensor data.
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    long headerSize = buffer.getLong();
    byte[] headerBytes = new byte[(int) headerSize];
    buffer.get(headerBytes);
    String headerJson = new String(headerBytes, StandardCharsets.UTF_8).trim();
    ObjectMapper objectMapper = new ObjectMapper();
    return objectMapper.readValue(headerJson, Map.class);
  }

  private double[][] extractVectors(byte[] data, Map<String, Object> header) {
    @SuppressWarnings("unchecked")
    Map<String, Object> vectorsInfo = (Map<String, Object>) header.get("vectors");
    String dtype = (String) vectorsInfo.get("dtype");

    @SuppressWarnings("unchecked")
    List<Integer> shapeList = (List<Integer>) vectorsInfo.get("shape");
    int rows = shapeList.get(0);
    int cols = shapeList.get(1);

    @SuppressWarnings("unchecked")
    List<Number> dataOffsets = (List<Number>) vectorsInfo.get("data_offsets");
    long begin = dataOffsets.get(0).longValue();

    // Offsets in the header are relative to the start of the data section, which
    // begins after the 8-byte length prefix plus the JSON header (buffer.getLong(0)).
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    buffer.position((int) (begin + buffer.getLong(0) + 8));

    double[][] vectors = new double[rows][cols];
    if (dtype.equals("F64")) {
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          vectors[i][j] = buffer.getDouble();
        }
      }
    } else {
      throw new UnsupportedOperationException("Unsupported data type: " + dtype);
    }

    return vectors;
  }

  private int[] extractDocidIndices(byte[] data, Map<String, Object> header) {
    @SuppressWarnings("unchecked")
    Map<String, Object> docidsInfo = (Map<String, Object>) header.get("docids");
    String dtype = (String) docidsInfo.get("dtype");

    @SuppressWarnings("unchecked")
    List<Integer> shapeList = (List<Integer>) docidsInfo.get("shape");
    int length = shapeList.get(0);

    @SuppressWarnings("unchecked")
    List<Number> dataOffsets = (List<Number>) docidsInfo.get("data_offsets");
    long begin = dataOffsets.get(0).longValue();

    // Same pointer arithmetic as extractVectors: skip the length prefix and JSON header.
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    buffer.position((int) (begin + buffer.getLong(0) + 8));

    int[] docidIndices = new int[length];
    if (dtype.equals("I64")) {
      for (int i = 0; i < length; i++) {
        docidIndices[i] = (int) buffer.getLong();
      }
    } else {
      throw new UnsupportedOperationException("Unsupported data type: " + dtype);
    }

    return docidIndices;
  }
}
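For reference, the byte layout that parseHeader and extractVectors walk through can be sanity-checked from Python. The sketch below is not part of the PR; it re-reads one of the generated files by hand, assuming the F64 dtype and the hardcoded nfcorpus path used above.

import json
import struct

# Path hardcoded in the generator above; adjust to wherever json_to_bin.py wrote its output.
path = 'collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_vectors.safetensors'

with open(path, 'rb') as f:
    data = f.read()

# SafeTensors layout: 8-byte little-endian header length, a JSON header, then raw tensor data.
header_size = struct.unpack('<Q', data[:8])[0]
header = json.loads(data[8:8 + header_size])

info = header['vectors']
rows, cols = info['shape']
begin, end = info['data_offsets']  # offsets are relative to the start of the data section

# Mirror the Java pointer arithmetic: the data section starts at 8 + header_size.
payload = data[8 + header_size + begin:8 + header_size + end]
first_row = struct.unpack(f'<{cols}d', payload[:cols * 8])  # 'd' = float64, matching 'F64'
print(rows, cols, first_row[:4])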
50 changes: 50 additions & 0 deletions src/main/python/safetensors/compare_jsonl.py
@@ -0,0 +1,50 @@
import json
import sys

def read_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

def compare_jsonl(vectors_file, contents_file):
    vectors_data = read_jsonl(vectors_file)
    contents_data = read_jsonl(contents_file)

    vectors_dict = {entry['docid']: entry['vector'] for entry in vectors_data}
    contents_dict = {entry['docid']: entry for entry in contents_data}  # Keep full entry for accurate comparison

    all_docids = set(vectors_dict.keys()).union(contents_dict.keys())

    differences = []

    for docid in sorted(all_docids):
        vector = vectors_dict.get(docid)
        content_entry = contents_dict.get(docid)

        # Explicit None checks: an empty vector is falsy but still a present entry.
        if vector is None or content_entry is None:
            differences.append(f"Missing entry for docid: {docid}")
            continue

        content_docid = content_entry.get('docid')

        if docid != content_docid:
            differences.append(f"Docid mismatch for docid: {docid}, content docid: {content_docid}")
        elif vector != content_entry.get('vector'):
            differences.append(f"Vector mismatch for docid: {docid}")

    if differences:
        print("Differences found:")
        for difference in differences:
            print(difference)
    else:
        print("No differences found. The files are identical.")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python compare_jsonl.py <vectors_file.jsonl> <contents_file.jsonl>")
        sys.exit(1)

    vectors_file = sys.argv[1]
    contents_file = sys.argv[2]

    compare_jsonl(vectors_file, contents_file)
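A quick way to exercise the comparison script end to end is to feed it two identical toy files. This is a hypothetical smoke test, not part of the PR, and assumes it is run from the directory containing compare_jsonl.py.

import json
import subprocess

rows = [{'docid': 'd1', 'vector': [0.1, 0.2]}, {'docid': 'd2', 'vector': [0.3, 0.4]}]
for name in ('a.jsonl', 'b.jsonl'):
    with open(name, 'w') as f:
        f.writelines(json.dumps(r) + '\n' for r in rows)

# Expected output: "No differences found. The files are identical."
subprocess.run(['python', 'compare_jsonl.py', 'a.jsonl', 'b.jsonl'], check=True)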
66 changes: 66 additions & 0 deletions src/main/python/safetensors/json_to_bin.py
@@ -0,0 +1,66 @@
import json
import os
import subprocess

import torch
from safetensors.torch import save_file

# Base directory relative to the expected script execution path in the Anserini repository
base_directory = './collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus'

input_directory = './collections/beir-v1.0.0/bge-base-en-v1.5/nfcorpus'
# Complete path to the JSONL file (assuming it's gzipped and needs to be decompressed)
jsonl_gz_file = os.path.join(input_directory, 'vectors.part00.jsonl.gz')

# Check if the gzipped file exists and unzip it
if os.path.exists(jsonl_gz_file):
    subprocess.run(['gzip', '-d', jsonl_gz_file], check=True)
    print(f"Unzipped the file in the directory {input_directory}")
else:
    print(f"File not found: {jsonl_gz_file}")
    # exit(1)  # Exit if the file does not exist to avoid further errors

# Process all JSONL files in the input directory
for input_filename in os.listdir(input_directory):
    if input_filename.endswith('.jsonl'):
        input_file_path = os.path.join(input_directory, input_filename)

        # Extract the base name (e.g., "vectors.part00" from "vectors.part00.jsonl")
        base_name = os.path.splitext(input_filename)[0]

        # Define paths for output files using the new naming convention
        vectors_path = os.path.join(base_directory, f'{base_name}_vectors.safetensors')
        docids_path = os.path.join(base_directory, f'{base_name}_docids.safetensors')
        docid_to_idx_path = os.path.join(base_directory, f'{base_name}_docid_to_idx.json')

        # Initialize lists to hold data
        vectors = []
        docids = []

        # Process the JSONL file to extract vectors and docids
        with open(input_file_path, 'r') as file:
            for line in file:
                entry = json.loads(line)
                if isinstance(entry['vector'][0], float):
                    vectors.append(entry['vector'])
                    docids.append(entry['docid'])
                else:
                    print(f"Skipped invalid vector entry with docid: {entry['docid']}")
        # Convert lists to tensors. Docids are enumerated in file order (and assumed
        # unique) so that docid_to_idx agrees with the row order of vectors_tensor;
        # enumerating set(docids), as before, produced an arbitrary ordering that the
        # Java generator's vectors[idx] lookup could not rely on.
        vectors_tensor = torch.tensor(vectors, dtype=torch.float64)
        docid_to_idx = {docid: idx for idx, docid in enumerate(docids)}
        idxs = [docid_to_idx[docid] for docid in docids]
        docids_tensor = torch.tensor(idxs, dtype=torch.int64)

        # Save the tensors to SafeTensors files
        save_file({'vectors': vectors_tensor}, vectors_path)
        save_file({'docids': docids_tensor}, docids_path)

        # Save the docid_to_idx mapping to a JSON file
        with open(docid_to_idx_path, 'w') as f:
            json.dump(docid_to_idx, f)

        print(f"Saved vectors to {vectors_path}")
        print(f"Saved docids to {docids_path}")
        print(f"Saved docid_to_idx mapping to {docid_to_idx_path}")
26 changes: 26 additions & 0 deletions src/main/python/safetensors/requirements.txt
@@ -0,0 +1,26 @@
certifi==2024.2.2
charset-normalizer==3.3.2
contourpy==1.2.1
cycler==0.12.1
filelock==3.14.0
fonttools==4.52.1
fsspec==2024.5.0
idna==3.7
Jinja2==3.1.4
kiwisolver==1.4.5
MarkupSafe==2.1.5
matplotlib==3.9.0
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
packaging==24.0
pillow==10.3.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
requests==2.32.2
safetensors==0.4.3
six==1.16.0
sympy==1.12
torch==2.3.0
typing_extensions==4.11.0
urllib3==2.2.1