HnswDensevector SafeTensor Generator #2515

Closed · wants to merge 48 commits

Changes shown are from 9 of the 48 commits.

Commits
85fe2e0  not issue (Panizghi, Jun 1, 2024)
e0a7937  update (Panizghi, Jun 1, 2024)
b486073  test working (Panizghi, Jun 2, 2024)
b1b4ef0  fix paths (Panizghi, Jun 2, 2024)
07f777c  fix (Panizghi, Jun 2, 2024)
2d84a38  delete extra files (Panizghi, Jun 2, 2024)
03dfdde  file ext fix (Panizghi, Jun 2, 2024)
9e40ec3  refactor code based on feedback (Jun 11, 2024)
7187238  python fix (Jun 12, 2024)
bbc8a20  hardcoded path (Jun 13, 2024)
008864e  indent fix (Panizghi, Jun 16, 2024)
23c90b8  path python fix (Panizghi, Jun 16, 2024)
59165a6  java (Panizghi, Jun 16, 2024)
699f42e  Merge pull request #1 from Panizghi/Melissa (Panizghi, Jul 9, 2024)
9a8bbff  Update HnswJsonWithSafeTensorsDenseVectorDocumentGenerator.java (Panizghi, Jul 9, 2024)
2094e24  Update AbstractIndexer.java (Panizghi, Jul 9, 2024)
4868d00  Update json_to_bin.py (Panizghi, Jul 9, 2024)
bca9754  Delete src/main/java/io/anserini/index/SafeTensorsIndexCollection.java (Panizghi, Jul 9, 2024)
01a642e  Fixed paths and updated indexing commands (Panizghi, Jul 9, 2024)
8403c48  Merge branch 'master' into clean-fix-branch (Panizghi, Jul 9, 2024)
9ef81c7  Merge pull request #2 from Panizghi/clean-fix-branch (Panizghi, Jul 9, 2024)
2aa4435  fix eof (Panizghi, Jul 9, 2024)
af44889  revert indexhnsw (Panizghi, Jul 11, 2024)
5e80648  abstarct ndexer update (Panizghi, Jul 11, 2024)
507866c  Add to onboarding reproduction logs (#2546) (XKTZ, Jul 14, 2024)
7d7ce98  file (Panizghi, Jul 15, 2024)
7d3726a  fix eof (Panizghi, Jul 9, 2024)
24e9bb8  abstarct ndexer update (Panizghi, Jul 11, 2024)
5f1d8c8  Merge pull request #3 from castorini/master (Panizghi, Jul 15, 2024)
76cf238  test (Panizghi, Jul 15, 2024)
ae1044a  test (Panizghi, Jul 15, 2024)
d1c3a3c  test (Panizghi, Jul 15, 2024)
f804c6e  test (Panizghi, Jul 15, 2024)
92987dd  test (Panizghi, Jul 15, 2024)
01b1c51  fixed collection and indexer (Panizghi, Jul 15, 2024)
1fcd094  update (Panizghi, Jul 17, 2024)
897fac7  added comments (Panizghi, Jul 17, 2024)
7551989  fixed thread exception (Panizghi, Aug 25, 2024)
20ab166  remove extra files (Panizghi, Aug 25, 2024)
229331c  bar progress and high level logging (Panizghi, Aug 25, 2024)
a2ec450  Merge branch 'castorini:master' into master (Panizghi, Aug 25, 2024)
d50c552  update header (Panizghi, Aug 25, 2024)
381e5cd  fix space (Panizghi, Aug 25, 2024)
2e56f19  testing refactored (Panizghi, Aug 26, 2024)
7429ed8  multiple jsonl handeling (Panizghi, Sep 1, 2024)
50272dd  remove test (Panizghi, Sep 1, 2024)
f17420b  Delete src/main/python/safetensors/test.py (Panizghi, Sep 1, 2024)
bc68411  update tools (Panizghi, Sep 1, 2024)
166 changes: 166 additions & 0 deletions src/main/java/io/anserini/index/generator/HnswJsonWithSafeTensorsDenseVectorDocumentGenerator.java
@@ -0,0 +1,166 @@
package io.anserini.index.generator;

import com.fasterxml.jackson.databind.ObjectMapper;
import io.anserini.collection.SourceDocument;
import io.anserini.index.Constants;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnFloatVectorField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Converts a {@link SourceDocument} into a Lucene {@link Document}, attaching a dense vector
 * loaded from SafeTensors files, ready to be indexed.
 *
 * @param <T> type of the source document
 */
public class HnswJsonWithSafeTensorsDenseVectorDocumentGenerator<T extends SourceDocument> implements LuceneDocumentGenerator<T> {

  public HnswJsonWithSafeTensorsDenseVectorDocumentGenerator() {
  }

  // public void setCurrentJsonlFile(String currentJsonlFile) {
  //   this.currentJsonlFile = currentJsonlFile;
  // }

  @Override
  public Document createDocument(T src) throws InvalidDocumentException {
    try {
      // Hardcoded nfcorpus paths; these should eventually be passed in as parameters.
      String vectorsFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_vectors.safetensors";
      String docidsFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_docids.safetensors";
      String docidToIdxFilePath = "collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_docid_to_idx.json";

      // Read the raw SafeTensors files.
      byte[] vectorsData = Files.readAllBytes(Paths.get(vectorsFilePath));
      byte[] docidsData = Files.readAllBytes(Paths.get(docidsFilePath));

      // Deserialize docid_to_idx.json and build the inverse mapping.
      ObjectMapper objectMapper = new ObjectMapper();
      @SuppressWarnings("unchecked")
      Map<String, Integer> docidToIdx = objectMapper.readValue(Files.readAllBytes(Paths.get(docidToIdxFilePath)), Map.class);
      Map<Integer, String> idxToDocid = new LinkedHashMap<>();
      for (Map.Entry<String, Integer> entry : docidToIdx.entrySet()) {
        idxToDocid.put(entry.getValue(), entry.getKey());
      }

      // Deserialize docids.
      Map<String, Object> docidsHeader = parseHeader(docidsData);
      int[] docidIndices = extractDocidIndices(docidsData, docidsHeader);
      String[] docids = new String[docidIndices.length];
      for (int i = 0; i < docidIndices.length; i++) {
        docids[i] = idxToDocid.get(docidIndices[i]);
      }

      // Deserialize vectors.
      Map<String, Object> vectorsHeader = parseHeader(vectorsData);
      double[][] vectors = extractVectors(vectorsData, vectorsHeader);

      // Look up the row for this document; docidToIdx maps docid -> index directly,
      // so there is no need to scan idxToDocid (the previous stream-based lookup
      // also threw NullPointerException on a missing docid instead of returning null).
      String id = src.id();
      Integer indexObj = docidToIdx.get(id);
      if (indexObj == null) {
        throw new InvalidDocumentException();
      }
      int index = indexObj;
      float[] contents = new float[vectors[index].length];
      for (int i = 0; i < contents.length; i++) {
        contents[i] = (float) vectors[index][i];
      }

      // Create the Lucene document.
      final Document document = new Document();
      document.add(new StringField(Constants.ID, id, Field.Store.YES));
      document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id)));
      document.add(new KnnFloatVectorField(Constants.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT));

      return document;
    } catch (Exception e) {
      // Any I/O or parsing failure invalidates the document.
      throw new InvalidDocumentException();
    }
  }

  private Map<String, Object> parseHeader(byte[] data) throws IOException {
    // SafeTensors layout: an 8-byte little-endian header length, a JSON header, then raw tensor data.
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    long headerSize = buffer.getLong();
    byte[] headerBytes = new byte[(int) headerSize];
    buffer.get(headerBytes);
    String headerJson = new String(headerBytes, StandardCharsets.UTF_8).trim();
    ObjectMapper objectMapper = new ObjectMapper();
    return objectMapper.readValue(headerJson, Map.class);
  }

  private double[][] extractVectors(byte[] data, Map<String, Object> header) {
    @SuppressWarnings("unchecked")
    Map<String, Object> vectorsInfo = (Map<String, Object>) header.get("vectors");
    String dtype = (String) vectorsInfo.get("dtype");

    @SuppressWarnings("unchecked")
    List<Integer> shapeList = (List<Integer>) vectorsInfo.get("shape");
    int rows = shapeList.get(0);
    int cols = shapeList.get(1);

    @SuppressWarnings("unchecked")
    List<Number> dataOffsets = (List<Number>) vectorsInfo.get("data_offsets");
    long begin = dataOffsets.get(0).longValue();

    // Offsets in the header are relative to the start of the data section, which
    // begins after the 8-byte length prefix plus the JSON header (buffer.getLong(0)).
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    buffer.position((int) (begin + buffer.getLong(0) + 8));

    double[][] vectors = new double[rows][cols];
    if (dtype.equals("F64")) {
      for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
          vectors[i][j] = buffer.getDouble();
        }
      }
    } else {
      throw new UnsupportedOperationException("Unsupported data type: " + dtype);
    }

    return vectors;
  }

  private int[] extractDocidIndices(byte[] data, Map<String, Object> header) {
    @SuppressWarnings("unchecked")
    Map<String, Object> docidsInfo = (Map<String, Object>) header.get("docids");
    String dtype = (String) docidsInfo.get("dtype");

    @SuppressWarnings("unchecked")
    List<Integer> shapeList = (List<Integer>) docidsInfo.get("shape");
    int length = shapeList.get(0);

    @SuppressWarnings("unchecked")
    List<Number> dataOffsets = (List<Number>) docidsInfo.get("data_offsets");
    long begin = dataOffsets.get(0).longValue();

    // Same pointer arithmetic as extractVectors: skip the length prefix and JSON header.
    ByteBuffer buffer = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
    buffer.position((int) (begin + buffer.getLong(0) + 8));

    int[] docidIndices = new int[length];
    if (dtype.equals("I64")) {
      for (int i = 0; i < length; i++) {
        docidIndices[i] = (int) buffer.getLong();
      }
    } else {
      throw new UnsupportedOperationException("Unsupported data type: " + dtype);
    }

    return docidIndices;
  }
}
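For reference, the byte layout that parseHeader and extractVectors walk through can be sanity-checked from Python. The sketch below is not part of the PR; it re-reads one of the generated files by hand, assuming the F64 dtype and the hardcoded nfcorpus path used above.

import json
import struct

# Path hardcoded in the generator above; adjust to wherever json_to_bin.py wrote its output.
path = 'collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus/vectors.part00_vectors.safetensors'

with open(path, 'rb') as f:
    data = f.read()

# SafeTensors layout: 8-byte little-endian header length, a JSON header, then raw tensor data.
header_size = struct.unpack('<Q', data[:8])[0]
header = json.loads(data[8:8 + header_size])

info = header['vectors']
rows, cols = info['shape']
begin, end = info['data_offsets']  # offsets are relative to the start of the data section

# Mirror the Java pointer arithmetic: the data section starts at 8 + header_size.
payload = data[8 + header_size + begin:8 + header_size + end]
first_row = struct.unpack(f'<{cols}d', payload[:cols * 8])  # 'd' = float64, matching 'F64'
print(rows, cols, first_row[:4])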
50 changes: 50 additions & 0 deletions src/main/python/safetensors/compare_jsonl.py
@@ -0,0 +1,50 @@
import json
import sys

def read_jsonl(file_path):
    with open(file_path, 'r') as f:
        return [json.loads(line) for line in f]

def compare_jsonl(vectors_file, contents_file):
    vectors_data = read_jsonl(vectors_file)
    contents_data = read_jsonl(contents_file)

    vectors_dict = {entry['docid']: entry['vector'] for entry in vectors_data}
    contents_dict = {entry['docid']: entry for entry in contents_data}  # Keep full entry for accurate comparison

    all_docids = set(vectors_dict.keys()).union(contents_dict.keys())

    differences = []

    for docid in sorted(all_docids):
        vector = vectors_dict.get(docid)
        content_entry = contents_dict.get(docid)

        # Explicit None checks: an empty vector is falsy but still a present entry.
        if vector is None or content_entry is None:
            differences.append(f"Missing entry for docid: {docid}")
            continue

        content_docid = content_entry.get('docid')

        if docid != content_docid:
            differences.append(f"Docid mismatch for docid: {docid}, content docid: {content_docid}")
        elif vector != content_entry.get('vector'):
            differences.append(f"Vector mismatch for docid: {docid}")

    if differences:
        print("Differences found:")
        for difference in differences:
            print(difference)
    else:
        print("No differences found. The files are identical.")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python compare_jsonl.py <vectors_file.jsonl> <contents_file.jsonl>")
        sys.exit(1)

    vectors_file = sys.argv[1]
    contents_file = sys.argv[2]

    compare_jsonl(vectors_file, contents_file)
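A quick way to exercise the comparison script end to end is to feed it two identical toy files. This is a hypothetical smoke test, not part of the PR, and assumes it is run from the directory containing compare_jsonl.py.

import json
import subprocess

rows = [{'docid': 'd1', 'vector': [0.1, 0.2]}, {'docid': 'd2', 'vector': [0.3, 0.4]}]
for name in ('a.jsonl', 'b.jsonl'):
    with open(name, 'w') as f:
        f.writelines(json.dumps(r) + '\n' for r in rows)

# Expected output: "No differences found. The files are identical."
subprocess.run(['python', 'compare_jsonl.py', 'a.jsonl', 'b.jsonl'], check=True)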
66 changes: 66 additions & 0 deletions src/main/python/safetensors/json_to_bin.py
@@ -0,0 +1,66 @@
import json
import os
import subprocess

import torch
from safetensors.torch import save_file

# Base directory relative to the expected script execution path in the Anserini repository
base_directory = './collections/beir-v1.0.0/bge-base-en-v1.5.safetensors/nfcorpus'

input_directory = './collections/beir-v1.0.0/bge-base-en-v1.5/nfcorpus'
# Complete path to the JSONL file (assuming it's gzipped and needs to be decompressed)
jsonl_gz_file = os.path.join(input_directory, 'vectors.part00.jsonl.gz')

# Check if the gzipped file exists and unzip it
if os.path.exists(jsonl_gz_file):
    subprocess.run(['gzip', '-d', jsonl_gz_file], check=True)
    print(f"Unzipped the file in the directory {input_directory}")
else:
    print(f"File not found: {jsonl_gz_file}")
    # exit(1)  # Exit if the file does not exist to avoid further errors

# Process all JSONL files in the input directory
for input_filename in os.listdir(input_directory):
    if input_filename.endswith('.jsonl'):
        input_file_path = os.path.join(input_directory, input_filename)

        # Extract the base name (e.g., "vectors.part00" from "vectors.part00.jsonl")
        base_name = os.path.splitext(input_filename)[0]

        # Define paths for output files using the new naming convention
        vectors_path = os.path.join(base_directory, f'{base_name}_vectors.safetensors')
        docids_path = os.path.join(base_directory, f'{base_name}_docids.safetensors')
        docid_to_idx_path = os.path.join(base_directory, f'{base_name}_docid_to_idx.json')

        # Initialize lists to hold data
        vectors = []
        docids = []

        # Process the JSONL file to extract vectors and docids
        with open(input_file_path, 'r') as file:
            for line in file:
                entry = json.loads(line)
                if isinstance(entry['vector'][0], float):
                    vectors.append(entry['vector'])
                    docids.append(entry['docid'])
                else:
                    print(f"Skipped invalid vector entry with docid: {entry['docid']}")
        # Convert lists to tensors. Docids are enumerated in file order (and assumed
        # unique) so that docid_to_idx agrees with the row order of vectors_tensor;
        # enumerating set(docids), as before, produced an arbitrary ordering that the
        # Java generator's vectors[idx] lookup could not rely on.
        vectors_tensor = torch.tensor(vectors, dtype=torch.float64)
        docid_to_idx = {docid: idx for idx, docid in enumerate(docids)}
        idxs = [docid_to_idx[docid] for docid in docids]
        docids_tensor = torch.tensor(idxs, dtype=torch.int64)

        # Save the tensors to SafeTensors files
        save_file({'vectors': vectors_tensor}, vectors_path)
        save_file({'docids': docids_tensor}, docids_path)

        # Save the docid_to_idx mapping to a JSON file
        with open(docid_to_idx_path, 'w') as f:
            json.dump(docid_to_idx, f)

        print(f"Saved vectors to {vectors_path}")
        print(f"Saved docids to {docids_path}")
        print(f"Saved docid_to_idx mapping to {docid_to_idx_path}")
26 changes: 26 additions & 0 deletions src/main/python/safetensors/requirements.txt
@@ -0,0 +1,26 @@
certifi==2024.2.2
charset-normalizer==3.3.2
contourpy==1.2.1
cycler==0.12.1
filelock==3.14.0
fonttools==4.52.1
fsspec==2024.5.0
idna==3.7
Jinja2==3.1.4
kiwisolver==1.4.5
MarkupSafe==2.1.5
matplotlib==3.9.0
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
packaging==24.0
pillow==10.3.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
requests==2.32.2
safetensors==0.4.3
six==1.16.0
sympy==1.12
torch==2.3.0
typing_extensions==4.11.0
urllib3==2.2.1