Add example FTS5 tokenizer sources and stop ignoring c_sources/

Summary of what this patch contains:

* .gitignore
    The `c_sources/` ignore rule is commented out. Per the comment added
    next to it, Android does not generate the sources on its own, so the
    directory is left in the repository for CI.

* c_sources/tokenizers.cpp (new)
    - fts5_api_from_db(): runs `SELECT fts5(?1)` with a pointer bound under
      the "fts5_api_ptr" type tag and returns the fts5_api pointer written
      through it (stays null when the prepare fails).
    - WordTokenizer plus the xCreate / xDelete / xTokenize callbacks. The
      tokenize callback reports every maximal run of std::isalnum bytes as
      one token, passing byte offsets [start, i) to xToken, and stops early
      with xToken's return code if that is not SQLITE_OK.
    - opsqlite_wordtokenizer_init() / opsqlite_porter_init(): register the
      same three callbacks with FTS5 under the names "wordtokenizer" and
      "portertokenizer" respectively; both return SQLITE_ERROR when the
      fts5_api pointer cannot be obtained.

* c_sources/tokenizers.h (new)
    Declares the two init functions and defines TOKENIZER_LIST, a macro that
    expands to calls of both with `db`, `&errMsg` and `nullptr`.

NOTE(review): this patch text was rebuilt from a copy whose line breaks had
been collapsed and whose <...> spans had been stripped. Please confirm:
  - the third include in tokenizers.cpp (<string> is an assumption; the code
    itself only needs <cctype> and <memory>);
  - the include in tokenizers.h (<sqlite3.h> is an assumption);
  - the position of the blank context line in the .gitignore hunk;
  - indentation of the added lines (restored to conventional 2-space style).

diff --git a/.gitignore b/.gitignore
index 76f0c21..9ec7746 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,4 +84,5 @@ android/gradle/
 !.yarn/versions
 
 android/c_sources
-c_sources/
\ No newline at end of file
+# Android does not generate the sources on its own, leave this in for CI
+# c_sources/
\ No newline at end of file
diff --git a/c_sources/tokenizers.cpp b/c_sources/tokenizers.cpp
new file mode 100644
index 0000000..5b0a977
--- /dev/null
+++ b/c_sources/tokenizers.cpp
@@ -0,0 +1,88 @@
+#include "tokenizers.h"
+#include <cctype>
+#include <memory>
+#include <string>
+
+namespace opsqlite {
+
+fts5_api *fts5_api_from_db(sqlite3 *db) {
+  fts5_api *pRet = 0;
+  sqlite3_stmt *pStmt = 0;
+
+  if (SQLITE_OK == sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &pStmt, 0)) {
+    sqlite3_bind_pointer(pStmt, 1, (void *)&pRet, "fts5_api_ptr", NULL);
+    sqlite3_step(pStmt);
+  }
+  sqlite3_finalize(pStmt);
+  return pRet;
+}
+
+class WordTokenizer {
+public:
+  WordTokenizer() = default;
+  ~WordTokenizer() = default;
+};
+
+// Define `xCreate`, which initializes the tokenizer
+int wordTokenizerCreate(void *pUnused, const char **azArg, int nArg,
+                        Fts5Tokenizer **ppOut) {
+  auto tokenizer = std::make_unique<WordTokenizer>();
+  *ppOut = reinterpret_cast<Fts5Tokenizer *>(
+      tokenizer.release()); // Cast to Fts5Tokenizer*
+  return SQLITE_OK;
+}
+
+// Define `xDelete`, which frees the tokenizer
+void wordTokenizerDelete(Fts5Tokenizer *pTokenizer) {
+  delete reinterpret_cast<WordTokenizer *>(pTokenizer);
+}
+
+// Define `xTokenize`, which performs the actual tokenization
+int wordTokenizerTokenize(Fts5Tokenizer *pTokenizer, void *pCtx, int flags,
+                          const char *pText, int nText,
+                          int (*xToken)(void *, int, const char *, int, int,
+                                        int)) {
+  int start = 0;
+  int i = 0;
+
+  while (i <= nText) {
+    if (i == nText || !std::isalnum(static_cast<unsigned char>(pText[i]))) {
+      if (start < i) { // Found a token
+        int rc = xToken(pCtx, 0, pText + start, i - start, start, i);
+        if (rc != SQLITE_OK)
+          return rc;
+      }
+      start = i + 1;
+    }
+    i++;
+  }
+  return SQLITE_OK;
+}
+
+int opsqlite_wordtokenizer_init(sqlite3 *db, char **error,
+                                sqlite3_api_routines const *api) {
+  fts5_tokenizer wordtokenizer = {wordTokenizerCreate, wordTokenizerDelete,
+                                  wordTokenizerTokenize};
+
+  fts5_api *ftsApi = (fts5_api *)fts5_api_from_db(db);
+  if (ftsApi == NULL)
+    return SQLITE_ERROR;
+
+  return ftsApi->xCreateTokenizer(ftsApi, "wordtokenizer", NULL, &wordtokenizer,
+                                  NULL);
+}
+
+int opsqlite_porter_init(sqlite3 *db, char **error,
+                         sqlite3_api_routines const *api) {
+  fts5_tokenizer porter_tokenizer = {wordTokenizerCreate, wordTokenizerDelete,
+                                     wordTokenizerTokenize};
+
+  fts5_api *ftsApi = (fts5_api *)fts5_api_from_db(db);
+  if (ftsApi == nullptr)
+    return SQLITE_ERROR;
+
+  return ftsApi->xCreateTokenizer(ftsApi, "portertokenizer", NULL,
+                                  &porter_tokenizer, NULL);
+}
+
+} // namespace opsqlite
diff --git a/c_sources/tokenizers.h b/c_sources/tokenizers.h
new file mode 100644
index 0000000..2a41fbe
--- /dev/null
+++ b/c_sources/tokenizers.h
@@ -0,0 +1,15 @@
+#ifndef TOKENIZERS_H
+#define TOKENIZERS_H
+
+#define TOKENIZER_LIST opsqlite_wordtokenizer_init(db,&errMsg,nullptr);opsqlite_porter_init(db,&errMsg,nullptr);
+
+#include <sqlite3.h>
+
+namespace opsqlite {
+
+int opsqlite_wordtokenizer_init(sqlite3 *db, char **error, sqlite3_api_routines const *api);
+int opsqlite_porter_init(sqlite3 *db, char **error, sqlite3_api_routines const *api);
+
+} // namespace opsqlite
+
+#endif // TOKENIZERS_H