-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add snowball libstemmer v2.2.0 as a dependency and use it to implement word_stem() as a scalar UDF. With the libstemmer API, each language requires its own sb_stemmer instance, which consumes 114 bytes, including the default 10 bytes for the output stem. The library uses realloc to grow the memory block for the output stem if needed. Signed-off-by: Yihong Wang <[email protected]>
- Loading branch information
Showing
10 changed files
with
343 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
24 changes: 24 additions & 0 deletions
24
CMake/resolve_dependency_modules/libstemmer/Makefile.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
--- a/Makefile | ||
+++ b/Makefile | ||
@@ -3,7 +3,7 @@ | ||
EXEEXT=.exe | ||
endif | ||
CFLAGS=-O2 | ||
-CPPFLAGS=-Iinclude | ||
+CPPFLAGS=-Iinclude -fPIC | ||
all: libstemmer.a stemwords$(EXEEXT) | ||
libstemmer.a: $(snowball_sources:.c=.o) | ||
$(AR) -cru $@ $^ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
include_guard(GLOBAL)

# Pinned libstemmer release: version, archive checksum and download location.
set(VELOX_STEMMER_VERSION 2.2.0)
set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM
    b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3)
set(VELOX_STEMMER_SOURCE_URL
    "https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz"
)

# NOTE(review): presumably finalizes the VELOX_STEMMER_* URL/checksum variables
# (e.g. applying user overrides); the macro is defined outside this file.
resolve_dependency_url(STEMMER)

message(STATUS "Building stemmer from source")
# libstemmer ships a plain Makefile, so a `make` executable is mandatory.
find_program(MAKE_PROGRAM make REQUIRED)

# Everything related to the external build lives under this prefix.
set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer")
set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include)

# We can not use FetchContent as libstemmer does not use cmake
ExternalProject_Add(
  libstemmer
  URL ${VELOX_STEMMER_SOURCE_URL}
  URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM}
  PREFIX ${STEMMER_PREFIX}
  SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer
  # Patch the upstream Makefile (adds -fPIC) before building.
  PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch
  # No configure or install step: the static library is built in the source
  # tree and consumed from there.
  CONFIGURE_COMMAND ""
  BUILD_IN_SOURCE TRUE
  BUILD_COMMAND ${MAKE_PROGRAM}
  INSTALL_COMMAND ""
  BUILD_BYPRODUCTS
    ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
)

# The include directory is created up front so that the imported target can
# reference it before the external project has been downloaded and built.
file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH})

# Expose the externally built archive as an imported static library.
add_library(stemmer STATIC IMPORTED)
add_library(stemmer::stemmer ALIAS stemmer)
set_target_properties(
  stemmer
  PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH}
    IMPORTED_LOCATION
      ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
)

# Make sure the archive is built before anything that links against it.
add_dependencies(stemmer libstemmer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <folly/container/F14Map.h> | ||
#include <libstemmer.h> | ||
|
||
#include "velox/functions/Udf.h" | ||
#include "velox/functions/lib/string/StringImpl.h" | ||
|
||
namespace facebook::velox::functions { | ||
|
||
namespace detail { | ||
// Wrap the sbstemmer library and use its sb_stemmer_stem | ||
// to get word stem. | ||
class Stemmer { | ||
public: | ||
Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) { | ||
VELOX_CHECK_NOT_NULL(stemmer); | ||
} | ||
|
||
~Stemmer() { | ||
sb_stemmer_delete(sbStemmer_); | ||
} | ||
|
||
// Returns the word stem or nullptr if an unlikely out-of-memory error occurs. | ||
const char* stem(const std::string& input) { | ||
return (const char*)(sb_stemmer_stem( | ||
sbStemmer_, | ||
reinterpret_cast<unsigned char const*>(input.c_str()), | ||
input.length())); | ||
} | ||
|
||
private: | ||
sb_stemmer* sbStemmer_; | ||
}; | ||
} // namespace detail | ||
|
||
/// word_stem function | ||
/// word_stem(word) -> varchar | ||
/// return the stem of the word in the English language | ||
/// word_stem(word, lang) -> varchar | ||
/// return the stem of the word in the specified language | ||
/// | ||
/// Use the snowball stemmer library to calculate the stem. | ||
/// https://snowballstem.org | ||
/// The website provides a Java implementation, which is used in Presto, as | ||
/// well as a C implementation. Therefore, both Presto and Prestissimo | ||
/// would have the same word stem results. | ||
template <typename TExec> | ||
struct WordStemFunction { | ||
VELOX_DEFINE_FUNCTION_TYPES(TExec); | ||
|
||
// ASCII input always produces ASCII result. | ||
static constexpr bool is_default_ascii_behavior = true; | ||
|
||
FOLLY_ALWAYS_INLINE void call( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input) { | ||
return doCall<false>(result, input); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void callAscii( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input) { | ||
return doCall<true>(result, input); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void call( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const arg_type<Varchar>& lang) { | ||
return doCall<false>(result, input, lang.str()); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void callAscii( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const arg_type<Varchar>& lang) { | ||
return doCall<true>(result, input, lang.str()); | ||
} | ||
|
||
template <bool isAscii> | ||
FOLLY_ALWAYS_INLINE void doCall( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const std::string& lang = "en") { | ||
auto* stemmer = getStemmer(lang); | ||
VELOX_USER_CHECK_NOT_NULL( | ||
stemmer, "Unsupported stemmer language: {}", lang); | ||
|
||
std::string lowerOutput; | ||
stringImpl::lower<isAscii>(lowerOutput, input); | ||
auto* stem = stemmer->stem(lowerOutput); | ||
VELOX_CHECK_NOT_NULL( | ||
stem, "Stemmer library returned a NULL (out-of-memory)") | ||
result = stem; | ||
} | ||
|
||
private: | ||
folly::F14FastMap<std::string, std::unique_ptr<detail::Stemmer>> stemmers_; | ||
|
||
// Get a detail::Stemmer from the the map using the lang as the key or create | ||
// a new one if it doesn't exist. Return nullptr if the specified lang is not | ||
// supported. | ||
detail::Stemmer* getStemmer(const std::string& lang) { | ||
if (auto found = stemmers_.find(lang); found != stemmers_.end()) { | ||
return found->second.get(); | ||
} | ||
// Only support ASCII and UTF-8. | ||
if (auto sbStemmer = sb_stemmer_new(lang.c_str(), "UTF_8")) { | ||
auto* stemmer = new detail::Stemmer(sbStemmer); | ||
stemmers_[lang] = std::unique_ptr<detail::Stemmer>(stemmer); | ||
return stemmer; | ||
} | ||
return nullptr; | ||
} | ||
}; | ||
} // namespace facebook::velox::functions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <optional> | ||
#include <string> | ||
|
||
#include "velox/common/base/tests/GTestUtils.h" | ||
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" | ||
|
||
using namespace facebook::velox::functions::test; | ||
|
||
namespace facebook::velox::functions { | ||
namespace { | ||
class WordStemTest : public FunctionBaseTest { | ||
protected: | ||
std::string wordStem(const std::string& word, const std::string& lang) { | ||
return evaluateOnce<std::string>( | ||
"word_stem(c0, c1)", std::optional(word), std::optional(lang)) | ||
.value(); | ||
} | ||
|
||
std::string wordStem(const std::string& word) { | ||
return evaluateOnce<std::string>("word_stem(c0)", std::optional(word)) | ||
.value(); | ||
} | ||
}; | ||
|
||
/// Borrow test cases from Presto Java: | ||
/// https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/operator/scalar/TestWordStemFunction.java | ||
TEST_F(WordStemTest, asciiWord) {
  // Single-argument form: no language is passed to word_stem().
  struct DefaultLangCase {
    const char* word;
    const char* expectedStem;
  };
  const DefaultLangCase defaultLangCases[] = {
      {"", ""},
      {"x", "x"},
      {"abc", "abc"},
      {"generally", "general"},
      {"useful", "use"},
      {"runs", "run"},
      {"run", "run"},
  };
  for (const auto& testCase : defaultLangCases) {
    EXPECT_EQ(wordStem(testCase.word), testCase.expectedStem);
  }

  // Two-argument form: the language is given explicitly.
  struct ExplicitLangCase {
    const char* word;
    const char* lang;
    const char* expectedStem;
  };
  const ExplicitLangCase explicitLangCases[] = {
      {"authorized", "en", "author"},
      {"accessories", "en", "accessori"},
      {"intensifying", "en", "intensifi"},
      {"resentment", "en", "resent"},
      {"faithfulness", "en", "faith"},
      {"continuerait", "fr", "continu"},
      {"torpedearon", "es", "torped"},
      {"quilomtricos", "pt", "quilomtr"},
      {"pronunziare", "it", "pronunz"},
      {"auferstnde", "de", "auferstnd"},
  };
  for (const auto& testCase : explicitLangCases) {
    EXPECT_EQ(wordStem(testCase.word, testCase.lang), testCase.expectedStem);
  }
}
|
||
TEST_F(WordStemTest, invalidLang) {
  // A language the stemmer does not support must be reported as an error
  // that names the offending language.
  const std::string unsupportedLang = "xx";
  VELOX_ASSERT_THROW(
      wordStem("hello", unsupportedLang), "Unsupported stemmer language: xx");
}
|
||
TEST_F(WordStemTest, unicodeWord) {
  // Turkish word containing non-ASCII letters and a leading upper-case 'K';
  // the expected stem is all lower-case.
  const std::string turkishWord =
      "\u004b\u0069\u0074\u0061\u0062\u0131\u006d\u0131\u007a\u0064\u0131";
  EXPECT_EQ(wordStem(turkishWord, "tr"), "kitap");

  // Russian word written in Cyrillic; the expected stem is Cyrillic as well.
  const std::string russianWord =
      "\u0432\u0435\u0441\u0435\u043d\u043d\u0438\u0439";
  const std::string russianStem = "\u0432\u0435\u0441\u0435\u043d";
  EXPECT_EQ(wordStem(russianWord, "ru"), russianStem);
}
|
||
} // namespace | ||
} // namespace facebook::velox::functions |