You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
If you pass a string to lembed() for which the tokenized array length exceeds the model's maximum, a segmentation fault is triggered. The code below will output 'success for 2040' and then segfault.
It would be nice if you could pass a truncate flag (or something similar) so that over-long input is truncated instead of crashing.
from transformers import AutoTokenizer
# Count how many tokens the model's tokenizer produces for the first 2040
# characters of the repeated 'abc ' sample text.
MODEL_ID = "Snowflake/snowflake-arctic-embed-m-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
sample_text = ('abc ' * 2000)[:2040]
token_ids = tokenizer(sample_text)['input_ids']
print(len(token_ids))  # 512
import sqlite3
import sqlite_vec
import sqlite_lembed
import time
def q(sql):
    """Run *sql* on the module-level ``db`` connection and return all rows."""
    result = db.execute(sql)
    return result.fetchall()
# Reproduction: call lembed() on text of two different lengths using an
# in-memory SQLite database with the sqlite_vec and sqlite_lembed extensions.
db = sqlite3.connect(":memory:")
# Extension loading is switched on only while the two extensions are loaded.
db.enable_load_extension(True)
sqlite_vec.load(db)
sqlite_lembed.load(db)
db.enable_load_extension(False)
cursor = db.cursor()
# Register the model file under the name 'default'. The path points into the
# reporter's local Hugging Face cache and must be adjusted on other machines.
cursor.execute("""insert into lembed_models(name, model) values ('default', lembed_model_from_file('/Users/ntaylor/.cache/huggingface/hub/models--yishan-wang--snowflake-arctic-embed-m-v1.5-Q8_0-GGUF/blobs/1073b6711706f55b451efe6c3ecf7398bf93c8c3b9b2df918673df9b77146a34'));""")
cursor.execute("create table foo(mycol text);")
# Case 1: substr length argument 2040 -- reported to succeed.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*2000])
# NOTE(review): SQLite substr() is 1-based, so a start index of 0 presumably
# returns one character fewer than the length argument (unlike the Python
# [:2040] slice used for the tokenizer check) -- confirm against SQLite docs.
q("select lembed(substr(mycol, 0, 2040)) from foo")
print('success for 2040')
time.sleep(1)
# Case 2: substr length argument 2041 -- reported to segfault, so the final
# print below is not expected to be reached.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*5000])
q("select lembed(substr(mycol, 0, 2041)) from foo")
print('success for 2041')
The text was updated successfully, but these errors were encountered:
If you pass a string to `lembed()` for which the tokenized array length exceeds the model's maximum, a segmentation fault is triggered. The code below will output 'success for 2040' and then segfault. It would be nice if you could pass a truncate flag or something.
The text was updated successfully, but these errors were encountered: