Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom Tokenizers #184

Merged
merged 43 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
00db50d
User defined c files is compiled and callable
ospfranco Nov 1, 2024
b8f7013
basic header generation
ospfranco Nov 1, 2024
03f8319
Pass tokenizer list as compilation param
ospfranco Nov 2, 2024
d5efc75
Correct tokenizers list variable
ospfranco Nov 2, 2024
e648e74
Tokenizer init works
ospfranco Nov 3, 2024
1ccfe32
Working tokenizer plus test
ospfranco Nov 3, 2024
59c03ae
Working include of dynamically generated header
ospfranco Nov 3, 2024
66c350a
Partial Android implementation
ospfranco Nov 4, 2024
8366e55
Working Android adding of sources
ospfranco Nov 10, 2024
182ae6c
Fix codegen script
ospfranco Nov 10, 2024
840367f
Bump version to beta
ospfranco Nov 10, 2024
9b73943
Add missing file to publish
ospfranco Nov 10, 2024
356dee8
Corrections
ospfranco Nov 10, 2024
53264f2
Fix header path
ospfranco Nov 11, 2024
6c0ff38
Fix sources path
ospfranco Nov 11, 2024
7b64a87
Fix sources path
ospfranco Nov 11, 2024
36dc9d2
Copy file sources to podspec dir
ospfranco Nov 12, 2024
6ed63ee
Copy files on Android
ospfranco Nov 12, 2024
573a310
Fix typo in empty macro
ospfranco Nov 15, 2024
4a323a6
Bump
ospfranco Nov 15, 2024
f42bf52
Fix tokenizers header generation
ospfranco Nov 15, 2024
cffc893
Bump
ospfranco Nov 15, 2024
e854895
Clean up source files
ospfranco Nov 18, 2024
60673ce
Restore tests
ospfranco Nov 18, 2024
2a25122
Get rid of warn
ospfranco Nov 18, 2024
15807dc
Add docs to libsql functions
ospfranco Nov 18, 2024
4e477cc
Modify turbo files
ospfranco Nov 18, 2024
e64213a
Modify Android test script
ospfranco Nov 18, 2024
8d6e624
Modify android script
ospfranco Nov 18, 2024
148f17e
Restore turbo
ospfranco Nov 18, 2024
1b66ac7
Change turbo
ospfranco Nov 18, 2024
ebbddd2
Get rid of turbo on android
ospfranco Nov 18, 2024
d121fb4
CI
ospfranco Nov 18, 2024
301a731
CI
ospfranco Nov 18, 2024
b271b15
CI
ospfranco Nov 18, 2024
1b30cdd
Add message to build.gradle when tokenizers are enabled
ospfranco Nov 18, 2024
4844bdf
CI
ospfranco Nov 18, 2024
ccd79a4
CI
ospfranco Nov 18, 2024
c2190fd
CI
ospfranco Nov 18, 2024
33404b5
Set server error
ospfranco Nov 18, 2024
8675ec3
Fix tests
ospfranco Nov 18, 2024
109541f
Disable tokenizer tests for libsql
ospfranco Nov 18, 2024
52cdf25
Turn off libsql
ospfranco Nov 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -420,12 +420,9 @@ jobs:
env:
TURBO_CACHE_DIR: .turbo/android
steps:
- name: Checkout
uses: actions/checkout@v4
- uses: actions/checkout@v4

- name: Turn on libsql
run: |
node ./scripts/turnOnLibsql.js
- run: node ./scripts/turnOnLibsql.js

- name: Setup
uses: ./.github/actions/setup
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,6 @@ android/gradle/
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions
!.yarn/versions

android/c_sources
5 changes: 0 additions & 5 deletions Gemfile

This file was deleted.

8 changes: 8 additions & 0 deletions android/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set (BUILD_DIR ${CMAKE_SOURCE_DIR}/build)
../cpp
../cpp/sqlcipher
../cpp/libsql
# ../example/c_sources
)

add_definitions(
Expand Down Expand Up @@ -72,6 +73,13 @@ find_package(ReactAndroid REQUIRED CONFIG)
find_package(fbjni REQUIRED CONFIG)
find_library(LOG_LIB log)

# Add user defined files
# USER_DEFINED_SOURCE_FILES and USER_DEFINED_TOKENIZERS_HEADER_PATH are passed in
# by android/build.gradle. The former is a ';'-joined file list, or 0 (false in
# this if()) when no tokenizers are configured.
if (USER_DEFINED_SOURCE_FILES)
target_sources(${PACKAGE_NAME} PRIVATE ${USER_DEFINED_SOURCE_FILES})

# Define TOKENIZERS_HEADER_PATH as a quoted string so that cpp/bridge.cpp can
# use it directly as the operand of #include.
add_definitions("-DTOKENIZERS_HEADER_PATH=\"${USER_DEFINED_TOKENIZERS_HEADER_PATH}\"")
endif()

if (USE_SQLCIPHER)
if (ReactAndroid_VERSION_MINOR GREATER_EQUAL 76)
target_link_libraries(
Expand Down
41 changes: 37 additions & 4 deletions android/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,18 @@ def sqliteFlags = ""
def enableFTS5 = false
def useSqliteVec = false
def enableRtree = false
def tokenizers = []

def packageJsonFile = new File("$rootDir/../package.json")
def packageJson = new JsonSlurper().parseText(packageJsonFile.text)
def isInsideNodeModules = rootDir.absolutePath.contains("node_modules")
def packageJson

if ( isInsideNodeModules ) {
def packageJsonFile = new File("$rootDir/../../../package.json")
packageJson = new JsonSlurper().parseText(packageJsonFile.text)
} else {
def packageJsonFile = new File("$rootDir/../package.json")
packageJson = new JsonSlurper().parseText(packageJsonFile.text)
}

def opsqliteConfig = packageJson["op-sqlite"]
if(opsqliteConfig) {
Expand All @@ -49,6 +58,7 @@ if(opsqliteConfig) {
enableFTS5 = opsqliteConfig["fts5"]
useLibsql = opsqliteConfig["libsql"]
enableRtree = opsqliteConfig["rtree"]
tokenizers = opsqliteConfig["tokenizers"] ? opsqliteConfig["tokenizers"] : []
}

if(useSQLCipher) {
Expand Down Expand Up @@ -83,6 +93,11 @@ if(useSqliteVec) {
println "[OP-SQLITE] Sqlite Vec enabled! ↗️"
}


if (!tokenizers.isEmpty()) {
println "[OP-SQLITE] Tokenizers enabled! 🧾 Tokenizers: " + tokenizers
}

if (isNewArchitectureEnabled()) {
apply plugin: "com.facebook.react"
}
Expand Down Expand Up @@ -153,14 +168,32 @@ android {
cppFlags += "-DOP_SQLITE_USE_SQLITE_VEC=1"
}

cppFlags "-O2", "-fexceptions", "-frtti", "-std=c++1y", "-DONANDROID"
// These default to 0 because they are passed as CMake arguments, where 0 evaluates as false
def sourceFiles = 0
// def tokenizerInitStrings = 0
def tokenizersHeaderPath = 0
if (!tokenizers.isEmpty()) {
def sourceDir = isInsideNodeModules ? file("$rootDir/../../../c_sources") : file("$rootDir/../c_sources")
def destDir = file("$buildscript.sourceFile.parentFile/c_sources")
copy {
from sourceDir
into destDir
include "**/*.cpp", "**/*.h"
}
sourceFiles = fileTree(dir: destDir, include: ["**/*.cpp", "**/*.h"]).files.join(";")
tokenizersHeaderPath = "../c_sources/tokenizers.h"
}

cppFlags "-O2", "-fexceptions", "-DONANDROID"
abiFilters 'x86', 'x86_64', 'armeabi-v7a', 'arm64-v8a'
arguments "-DANDROID_STL=c++_shared",
"-DSQLITE_FLAGS='$sqliteFlags'",
"-DUSE_SQLCIPHER=${useSQLCipher ? 1 : 0}",
"-DUSE_CRSQLITE=${useCRSQLite ? 1 : 0}",
"-DUSE_LIBSQL=${useLibsql ? 1 : 0}",
"-DUSE_SQLITE_VEC=${useSqliteVec ? 1 : 0}"
"-DUSE_SQLITE_VEC=${useSqliteVec ? 1 : 0}",
"-DUSER_DEFINED_SOURCE_FILES=${sourceFiles}",
"-DUSER_DEFINED_TOKENIZERS_HEADER_PATH='${tokenizersHeaderPath}'"
}
}

Expand Down
88 changes: 88 additions & 0 deletions c_sources/tokenizers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#include "tokenizers.h"
#include <cctype>
#include <memory>
#include <new>
#include <string>

namespace opsqlite {

/// Looks up the FTS5 extension API object for a database connection.
///
/// Runs `SELECT fts5(?1)` with a pointer binding of type "fts5_api_ptr" so the
/// fts5() SQL function can write the API pointer into a local variable.
///
/// @param db  connection to query
/// @return the fts5_api pointer, or nullptr if the statement could not be
///         prepared or the pointer was never filled in
fts5_api *fts5_api_from_db(sqlite3 *db) {
  fts5_api *api = nullptr;
  sqlite3_stmt *stmt = nullptr;

  const int prepareRc =
      sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &stmt, nullptr);
  if (prepareRc == SQLITE_OK) {
    // fts5() stores its API pointer through the address bound here.
    sqlite3_bind_pointer(stmt, 1, static_cast<void *>(&api), "fts5_api_ptr",
                         nullptr);
    sqlite3_step(stmt);
  }

  // Finalizing runs unconditionally, exactly as before; stmt is still nullptr
  // here if the prepare call did not set it.
  sqlite3_finalize(stmt);
  return api;
}

/// Stateless tokenizer instance. It carries no data; it only exists so that
/// xCreate has an object to hand to FTS5 and xDelete has something to free.
/// The implicitly generated constructor and destructor are all it needs.
class WordTokenizer {};

// Define `xCreate`, which initializes the tokenizer
int wordTokenizerCreate(void *pUnused, const char **azArg, int nArg,
Fts5Tokenizer **ppOut) {
auto tokenizer = std::make_unique<WordTokenizer>();
*ppOut = reinterpret_cast<Fts5Tokenizer *>(
tokenizer.release()); // Cast to Fts5Tokenizer*
return SQLITE_OK;
}

/// FTS5 `xDelete` callback: destroys a tokenizer created by
/// wordTokenizerCreate.
///
/// @param pTokenizer  opaque handle that is really a WordTokenizer pointer
void wordTokenizerDelete(Fts5Tokenizer *pTokenizer) {
  // Undo the cast performed in xCreate before releasing the object.
  auto *instance = reinterpret_cast<WordTokenizer *>(pTokenizer);
  delete instance;
}

/// FTS5 `xTokenize` callback: splits the input into runs of alphanumeric bytes.
///
/// Every byte for which std::isalnum is false acts as a separator. Each
/// non-empty run between separators is reported through `xToken` together
/// with its start and end byte offsets. `pTokenizer` and `flags` are unused.
///
/// @param pCtx    opaque context forwarded unchanged to `xToken`
/// @param pText   text to tokenize (not required to be NUL-terminated)
/// @param nText   number of bytes in `pText`
/// @param xToken  callback invoked once per token
/// @return SQLITE_OK, or the first non-OK code returned by `xToken`
int wordTokenizerTokenize(Fts5Tokenizer *pTokenizer, void *pCtx, int flags,
                          const char *pText, int nText,
                          int (*xToken)(void *, int, const char *, int, int,
                                        int)) {
  int tokenBegin = 0;

  // pos deliberately reaches nText so that a token ending at the very end of
  // the buffer is flushed by the same code path as one ended by a separator.
  for (int pos = 0; pos <= nText; ++pos) {
    const bool atEnd = (pos == nText);
    if (!atEnd && std::isalnum(static_cast<unsigned char>(pText[pos]))) {
      continue; // still inside the current token
    }

    if (pos > tokenBegin) {
      const int status = xToken(pCtx, 0, pText + tokenBegin, pos - tokenBegin,
                                tokenBegin, pos);
      if (status != SQLITE_OK) {
        return status;
      }
    }
    tokenBegin = pos + 1;
  }

  return SQLITE_OK;
}

/// Registers the word tokenizer with FTS5 on `db` under the name
/// "wordtokenizer".
///
/// `error` and `api` are part of the signature but are not used here.
///
/// @return SQLITE_ERROR if the FTS5 API pointer cannot be obtained, otherwise
///         the result of xCreateTokenizer
int opsqlite_wordtokenizer_init(sqlite3 *db, char **error,
                                sqlite3_api_routines const *api) {
  fts5_api *const fts = fts5_api_from_db(db);
  if (fts == nullptr) {
    return SQLITE_ERROR;
  }

  fts5_tokenizer callbacks = {wordTokenizerCreate, wordTokenizerDelete,
                              wordTokenizerTokenize};
  return fts->xCreateTokenizer(fts, "wordtokenizer", nullptr, &callbacks,
                               nullptr);
}

/// Registers a tokenizer with FTS5 on `db` under the name "portertokenizer".
///
/// It uses the same create/delete/tokenize callbacks as the word tokenizer.
/// `error` and `api` are part of the signature but are not used here.
///
/// @return SQLITE_ERROR if the FTS5 API pointer cannot be obtained, otherwise
///         the result of xCreateTokenizer
int opsqlite_porter_init(sqlite3 *db, char **error,
                         sqlite3_api_routines const *api) {
  fts5_api *const fts = fts5_api_from_db(db);
  if (fts == nullptr) {
    return SQLITE_ERROR;
  }

  fts5_tokenizer callbacks = {wordTokenizerCreate, wordTokenizerDelete,
                              wordTokenizerTokenize};
  return fts->xCreateTokenizer(fts, "portertokenizer", nullptr, &callbacks,
                               nullptr);
}

} // namespace opsqlite
15 changes: 15 additions & 0 deletions c_sources/tokenizers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#ifndef TOKENIZERS_H
#define TOKENIZERS_H

// Statement list that calls every tokenizer init function declared below.
// It is expanded in cpp/bridge.cpp, so identifiers named `db` and `errMsg`
// must be in scope at the expansion site. The int results of the calls are
// discarded by the expansion.
#define TOKENIZER_LIST opsqlite_wordtokenizer_init(db,&errMsg,nullptr);opsqlite_porter_init(db,&errMsg,nullptr);

#include "sqlite3.h"

namespace opsqlite {

// Registers the "wordtokenizer" FTS5 tokenizer on `db`. Returns SQLITE_ERROR
// when the FTS5 API cannot be obtained from the connection, otherwise the
// result of the registration call. `error` and `api` are currently unused by
// the implementation in tokenizers.cpp.
int opsqlite_wordtokenizer_init(sqlite3 *db, char **error, sqlite3_api_routines const *api);
// Registers the "portertokenizer" FTS5 tokenizer on `db`, with the same return
// convention and the same unused parameters as the function above.
int opsqlite_porter_init(sqlite3 *db, char **error, sqlite3_api_routines const *api);

} // namespace opsqlite

#endif // TOKENIZERS_H
3 changes: 2 additions & 1 deletion cpp/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ void clearState() {
thread_pool->restartPool();
}

void install(jsi::Runtime &rt, const std::shared_ptr<react::CallInvoker>& invoker,
void install(jsi::Runtime &rt,
const std::shared_ptr<react::CallInvoker> &invoker,
const char *base_path, const char *crsqlite_path,
const char *sqlite_vec_path) {
invalidated = false;
Expand Down
10 changes: 9 additions & 1 deletion cpp/bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@
#include "logs.h"
#include "utils.h"
#include <iostream>
#include <sstream>
#include <unordered_map>
#include <variant>

// Pull in the user-supplied tokenizers header when the build defines
// TOKENIZERS_HEADER_PATH. Otherwise define TOKENIZER_LIST as empty so that its
// use further down in this file expands to nothing.
#ifdef TOKENIZERS_HEADER_PATH
#include TOKENIZERS_HEADER_PATH
#else
#define TOKENIZER_LIST
#endif

namespace opsqlite {

/// Maps to hold the different objects
Expand Down Expand Up @@ -109,9 +116,10 @@ BridgeResult opsqlite_open(std::string const &name,
if (errMsg != nullptr) {
return {.type = SQLiteError, .message = errMsg};
}

#endif

TOKENIZER_LIST

return {.type = SQLiteOk, .affectedRows = 0};
}

Expand Down
10 changes: 10 additions & 0 deletions cpp/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,4 +341,14 @@ int mkdir(std::string const &path) {
return 0;
}

/// Splits a comma-separated string into its fields.
///
/// Fields are returned verbatim (no trimming). Empty fields between commas are
/// kept, but a single trailing comma does not produce a trailing empty field,
/// and an empty input yields an empty vector.
///
/// @param str  text to split on ','
/// @return the fields in order of appearance
std::vector<std::string> parse_string_list(const std::string &str) {
  std::vector<std::string> fields;
  std::string::size_type fieldStart = 0;

  // Stopping once fieldStart reaches the end is what drops the empty field
  // after a trailing comma and makes "" produce no fields at all.
  while (fieldStart < str.size()) {
    const std::string::size_type comma = str.find(',', fieldStart);
    if (comma == std::string::npos) {
      fields.push_back(str.substr(fieldStart));
      break;
    }
    fields.push_back(str.substr(fieldStart, comma - fieldStart));
    fieldStart = comma + 1;
  }

  return fields;
}

} // namespace opsqlite
10 changes: 10 additions & 0 deletions cpp/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,28 @@
#include <jsi/jsilib.h>
#include <map>
#include <vector>
#include <string>

namespace opsqlite {

namespace jsi = facebook::jsi;

jsi::Value toJSI(jsi::Runtime &rt, const JSVariant &value);

JSVariant toVariant(jsi::Runtime &rt, jsi::Value const &value);

std::vector<std::string> to_string_vec(jsi::Runtime &rt, jsi::Value const &xs);

std::vector<JSVariant> to_variant_vec(jsi::Runtime &rt, jsi::Value const &xs);

std::vector<int> to_int_vec(jsi::Runtime &rt, jsi::Value const &xs);

jsi::Value createResult(jsi::Runtime &rt, BridgeResult status,
std::vector<DumbHostObject> *results,
std::shared_ptr<std::vector<SmartHostObject>> metadata);

jsi::Value create_js_rows(jsi::Runtime &rt, const BridgeResult &status);

jsi::Value
create_raw_result(jsi::Runtime &rt, BridgeResult status,
const std::vector<std::vector<JSVariant>> *results);
Expand All @@ -38,6 +46,8 @@ bool folder_exists(const std::string &foldername);

bool file_exists(const std::string &path);

std::vector<std::string> parse_string_list(const std::string& str);

} // namespace opsqlite

#endif /* utils_h */
6 changes: 4 additions & 2 deletions example/Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@ source 'https://rubygems.org'

ruby '>= 2.7.6'

gem 'cocoapods', '>= 1.13', '!= 1.15.0', '!= 1.15.1'
gem 'activesupport', '>= 6.1.7.5', '!= 7.1.0'
gem 'cocoapods', '=1.15.2'
gem 'activesupport', '>= 6.1.7.5', '!= 7.1.0'
gem 'bigdecimal'
gem 'mutex_m'
Loading
Loading