Gpsr_command_similarity_parser (#161)
* fix: incorrect remapping keys for handover
* fix: remove speech server in launch file
* feat: handle multiple txt files and indices in faiss vector db
* feat: util to split a large text file into a set number of chunks
* feat: command similarity matcher state
* feat: working similarity state, but need to speed up
* feat: handle different index types and multiple data sources
* feat: querying with smart lookup
* feat: working blazingly fast command similarity matcher
* Update tasks/gpsr/states/command_similarity_matcher.py Co-authored-by: Jared Swift <[email protected]>
* Update tasks/gpsr/states/command_similarity_matcher.py Co-authored-by: Jared Swift <[email protected]>
* Update tasks/gpsr/states/command_similarity_matcher.py Co-authored-by: Jared Swift <[email protected]>

---------

Co-authored-by: Jared Swift <[email protected]>
Showing 11 changed files with 364 additions and 60 deletions.
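The feature this commit adds boils down to embedding a transcribed GPSR command and looking up its nearest neighbour among a set of known commands. A minimal sketch of that idea follows; the model name, example commands, and search code here are illustrative assumptions, not taken from the repository:

# Minimal sketch of embedding-based command matching.
# Assumption: sentence-transformers is available and "all-MiniLM-L6-v2" stands in
# for whatever embedding model the repository actually uses.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
known_commands = [
    "bring me a cup from the kitchen",
    "greet the person waiting at the door",
]
heard = "bring me a cup form the kitchn"  # noisy ASR transcription

known_vecs = np.asarray(model.encode(known_commands), dtype="float32")
query_vec = np.asarray(model.encode([heard]), dtype="float32")

# Nearest neighbour by L2 distance; FAISS performs the same search at scale.
distances = np.linalg.norm(known_vecs - query_vec, axis=1)
print(known_commands[int(distances.argmin())])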
8 changes: 7 additions & 1 deletion
.../vector_databases/lasr_vector_databases_faiss/src/lasr_vector_databases_faiss/__init__.py
@@ -1,2 +1,8 @@
-from .database_utils import create_vector_database, load_vector_database, query_database
+from .database_utils import (
+    load_vector_database,
+    query_database,
+    save_index_to_disk,
+    add_vectors_to_index,
+    construct_faiss_index,
+)
 from .get_sentence_embeddings import get_sentence_embeddings, load_model, parse_txt_file
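The widened export list suggests a build / persist / load / query workflow around a FAISS index. A sketch of the raw faiss calls such helpers typically wrap is below; the faiss APIs are real, but the mapping to the package's own functions is an assumption, since their bodies are not part of this diff.

# Sketch of the FAISS operations the re-exported helpers presumably correspond to.
import faiss
import numpy as np

d = 384                                          # embedding dimensionality (assumed)
vectors = np.random.rand(100, d).astype("float32")

index = faiss.index_factory(d, "Flat")           # cf. construct_faiss_index
index.add(vectors)                               # cf. add_vectors_to_index
faiss.write_index(index, "commands.index")       # cf. save_index_to_disk

index = faiss.read_index("commands.index")       # cf. load_vector_database
distances, ids = index.search(vectors[:1], k=5)  # cf. query_database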
27 changes: 27 additions & 0 deletions
...r_databases/lasr_vector_databases_faiss/src/lasr_vector_databases_faiss/split_txt_file.py
@@ -0,0 +1,27 @@
+import argparse
+from typing import Dict
+
+
+def split_txt_file(input_file, output_file, num_splits):
+    with open(input_file, "r", encoding="utf8") as src:
+        lines = src.readlines()
+    split_size = len(lines) // num_splits
+    for i in range(num_splits):
+        with open(f"{output_file}_chunk_{i+1}.txt", "w", encoding="utf8") as dest:
+            dest.writelines(lines[i * split_size : (i + 1) * split_size])
+
+
+def parse_args() -> Dict:
+    parser = argparse.ArgumentParser(description="Split a txt file into chunks")
+    parser.add_argument("input_file", type=str, help="Path to the input txt file")
+    parser.add_argument("output_file", type=str, help="Path to the output txt file")
+    parser.add_argument(
+        "num_splits", type=int, help="Number of chunks to split the file into"
+    )
+    known, _ = parser.parse_known_args()
+    return vars(known)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    split_txt_file(args["input_file"], args["output_file"], args["num_splits"])
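Usage follows directly from the argument parser above; the file names below are illustrative:

# Split a large command file into four chunks (hypothetical paths):
#   python split_txt_file.py all_commands.txt all_commands 4
# which writes all_commands_chunk_1.txt .. all_commands_chunk_4.txt, or from Python:
split_txt_file("all_commands.txt", "all_commands", 4)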
15 changes: 11 additions & 4 deletions
common/vector_databases/lasr_vector_databases_msgs/srv/TxtIndex.srv
@@ -1,7 +1,14 @@
-# Path to input text file
-string txt_path
+# Path to input text files
+string[] txt_paths
 
-# Output path to save index
-string index_path
+# Output path to save created indices
+# If multiple text files are provided, but one
+# index file path is provided, this index will contain
+# all of the vectors from all of the txt files.
+string[] index_paths
 
+# Specifies the type of index to create
+# see https://github.com/facebookresearch/faiss/wiki/The-index-factory
+string index_factory_string
 ---
+int32[] vecs_per_txt_file
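A hedged sketch of calling this service from a Python node follows. Only the request and response fields come from the .srv definition above; the service name and file paths are placeholders, and the package import name is inferred from the msgs package path.

# Hypothetical client call for the TxtIndex service; the service name and paths
# below are placeholders, only the fields come from TxtIndex.srv.
import rospy
from lasr_vector_databases_msgs.srv import TxtIndex, TxtIndexRequest

rospy.init_node("txt_index_client")
rospy.wait_for_service("/lasr_faiss/txt_index")
create_index = rospy.ServiceProxy("/lasr_faiss/txt_index", TxtIndex)

request = TxtIndexRequest(
    txt_paths=["commands_chunk_1.txt", "commands_chunk_2.txt"],
    index_paths=["commands.index"],      # one index holding vectors from all txt files
    index_factory_string="Flat",         # see the FAISS index-factory wiki page
)
response = create_index(request)
print(response.vecs_per_txt_file)        # vectors added per input txt file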