Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add word_stem Presto function #9363

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMake/resolve_dependency_modules/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ by Velox. See details on bundling below.
| wangle | v2024.04.01.00 | No |
| mvfst | v2024.04.01.00 | No |
| fbthrift | v2024.04.01.00 | No |
| libstemmer | 2.2.0 | Yes |
| DuckDB (testing) | 0.8.1 | Yes |
| cpr (testing) | 1.10.15 | Yes |

Expand Down
24 changes: 24 additions & 0 deletions CMake/resolve_dependency_modules/libstemmer/Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
EXEEXT=.exe
endif
CFLAGS=-O2
-CPPFLAGS=-Iinclude
+CPPFLAGS=-Iinclude -fPIC
all: libstemmer.a stemwords$(EXEEXT)
libstemmer.a: $(snowball_sources:.c=.o)
$(AR) -cru $@ $^
57 changes: 57 additions & 0 deletions CMake/resolve_dependency_modules/stemmer.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include_guard(GLOBAL)

set(VELOX_STEMMER_VERSION 2.2.0)
set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM
b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3)
set(VELOX_STEMMER_SOURCE_URL
"https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz"
)

resolve_dependency_url(STEMMER)

message(STATUS "Building stemmer from source")
# The upstream tarball only ships a plain Makefile, so a 'make' executable is
# required to build it.
find_program(MAKE_PROGRAM make REQUIRED)

# Everything libstemmer related (download, sources, build output) lives under
# this prefix inside the build tree.
set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer")
set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include)

# We cannot use FetchContent because libstemmer does not provide a CMake
# build. ExternalProject_Add is used instead: it downloads the tarball,
# applies libstemmer/Makefile.patch (which adds -fPIC to CPPFLAGS) and runs
# make inside the source directory. There is no configure or install step;
# the static library is consumed directly from the source directory.
ExternalProject_Add(
libstemmer
PREFIX ${STEMMER_PREFIX}
SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer
URL ${VELOX_STEMMER_SOURCE_URL}
URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ${MAKE_PROGRAM}
INSTALL_COMMAND ""
PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch
BUILD_BYPRODUCTS
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
)

add_library(stemmer STATIC IMPORTED)
yhwang marked this conversation as resolved.
Show resolved Hide resolved
# Namespaced alias so consumers can link against stemmer::stemmer.
add_library(stemmer::stemmer ALIAS stemmer)
# Create the include directory up front. NOTE(review): presumably needed
# because the sources are only unpacked when the libstemmer project is built,
# while the include path below is already referenced at configure time -
# confirm.
file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH})
# Point the imported target at the static library produced by the libstemmer
# external project (same path as its BUILD_BYPRODUCTS) and at its headers.
set_target_properties(
stemmer
PROPERTIES
IMPORTED_LOCATION
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH})

# Make sure the external project is built before anything that links against
# the imported target.
add_dependencies(stemmer libstemmer)
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,9 @@ endif()
set_source(xsimd)
resolve_dependency(xsimd 10.0.0)

set(stemmer_SOURCE BUNDLED)
resolve_dependency(stemmer)
yhwang marked this conversation as resolved.
Show resolved Hide resolved

if(VELOX_BUILD_TESTING)
set(BUILD_TESTING ON)
include(CTest) # include after project() but before add_subdirectory()
Expand Down
37 changes: 37 additions & 0 deletions velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,43 @@ String Functions

Converts ``string`` to uppercase.

.. function:: word_stem(word) -> varchar

Returns the stem of ``word`` in the English language. If the ``word`` is not an English word,
the ``word`` in lowercase is returned.

.. function:: word_stem(word, lang) -> varchar

Returns the stem of ``word`` in the ``lang`` language. This function supports the following languages:

=========== ================
lang Language
=========== ================
``ca`` ``Catalan``
``da`` ``Danish``
``de`` ``German``
``en`` ``English``
``es`` ``Spanish``
``eu`` ``Basque``
``fi`` ``Finnish``
``fr`` ``French``
``hu`` ``Hungarian``
``hy`` ``Armenian``
``ir`` ``Irish``
``it`` ``Italian``
``lt`` ``Lithuanian``
``nl`` ``Dutch``
``no`` ``Norwegian``
``pt`` ``Portuguese``
``ro`` ``Romanian``
``ru`` ``Russian``
``sv`` ``Swedish``
``tr`` ``Turkish``
=========== ================

If the specified ``lang`` is not supported, this function throws a user error.


Unicode Functions
-----------------

Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ target_link_libraries(
velox_type_tz
velox_presto_types
velox_functions_util
Folly::folly)
Folly::folly
stemmer::stemmer)

set_property(TARGET velox_functions_prestosql_impl PROPERTY JOB_POOL_COMPILE
high_memory_pool)
Expand Down
132 changes: 132 additions & 0 deletions velox/functions/prestosql/WordStem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <folly/container/F14Map.h>
#include <libstemmer.h>
#include <memory>
#include <string>

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {

namespace detail {
// Wraps an sb_stemmer handle from the libstemmer library and uses its
// sb_stemmer_stem function to compute word stems.
class Stemmer {
public:
Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) {
VELOX_CHECK_NOT_NULL(stemmer);
}

~Stemmer() {
yhwang marked this conversation as resolved.
Show resolved Hide resolved
sb_stemmer_delete(sbStemmer_);
}

// Returns the word stem or nullptr if an unlikely out-of-memory error occurs.
const char* stem(const std::string& input) {
yhwang marked this conversation as resolved.
Show resolved Hide resolved
return (const char*)(sb_stemmer_stem(
sbStemmer_,
reinterpret_cast<unsigned char const*>(input.c_str()),
input.length()));
}

private:
sb_stemmer* sbStemmer_;
};
} // namespace detail

/// word_stem function
/// word_stem(word) -> varchar
/// return the stem of the word in the English language
/// word_stem(word, lang) -> varchar
/// return the stem of the word in the specified language
///
/// Use the snowball stemmer library to calculate the stem.
/// https://snowballstem.org
/// The website provides a Java implementation, which is used in Presto, as
/// well as a C implementation. Therefore, both Presto and Prestissimo
/// would have the same word stem results.
template <typename TExec>
struct WordStemFunction {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yhwang All you need to do is the following:

1- Add member variable to WordStemFunction to store a map of stemmers.
2- Move getStemmer method into WordStemFunction struct as a private method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I move getStemmer into WordStemFunction, it moves the std::map<std::string, std::unique_ptr<Stemmer>> stemmers together. Then wait for the thread-local decision, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then wait for the thread-local decision, right?

Not sure if we need to wait. Let's remove 'thread-local' and make the map a regular private member of the WordStemFunction struct.

FYI, in case you haven't seen these already: https://facebookincubator.github.io/velox/develop/scalar-functions.html

VELOX_DEFINE_FUNCTION_TYPES(TExec);

// ASCII input always produces ASCII result.
static constexpr bool is_default_ascii_behavior = true;

/// word_stem(word): stems 'input' with the default language of doCall ("en")
/// using the non-ASCII lower-casing path.
FOLLY_ALWAYS_INLINE void call(
    out_type<Varchar>& result,
    const arg_type<Varchar>& input) {
  doCall</*isAscii=*/false>(result, input);
}

/// ASCII variant of word_stem(word): same as call() but lower-cases 'input'
/// with the ASCII path. NOTE(review): presumably invoked by the function
/// framework when the input is known to be ASCII - confirm.
FOLLY_ALWAYS_INLINE void callAscii(
    out_type<Varchar>& result,
    const arg_type<Varchar>& input) {
  doCall</*isAscii=*/true>(result, input);
}

/// word_stem(word, lang): stems 'input' with the stemmer selected by 'lang'
/// using the non-ASCII lower-casing path.
FOLLY_ALWAYS_INLINE void call(
    out_type<Varchar>& result,
    const arg_type<Varchar>& input,
    const arg_type<Varchar>& lang) {
  doCall</*isAscii=*/false>(result, input, lang.str());
}

/// ASCII variant of word_stem(word, lang): same as the two-argument call()
/// but lower-cases 'input' with the ASCII path.
FOLLY_ALWAYS_INLINE void callAscii(
    out_type<Varchar>& result,
    const arg_type<Varchar>& input,
    const arg_type<Varchar>& lang) {
  doCall</*isAscii=*/true>(result, input, lang.str());
}

/// Shared implementation behind call() and callAscii().
///
/// Looks up (or lazily creates) the stemmer for 'lang', lower-cases 'input'
/// using the ASCII or non-ASCII path selected by 'isAscii', stems the
/// lower-cased word and assigns the stem to 'result'.
///
/// Fails with a user error if no stemmer can be created for 'lang', and fails
/// a VELOX_CHECK if the stemmer library returns a null stem.
template <bool isAscii>
FOLLY_ALWAYS_INLINE void doCall(
    out_type<Varchar>& result,
    const arg_type<Varchar>& input,
    const std::string& lang = "en") {
  auto* stemmer = getStemmer(lang);
  VELOX_USER_CHECK_NOT_NULL(
      stemmer, "Unsupported stemmer language: {}", lang);

  // The word is lower-cased before it is handed to the stemmer.
  std::string lowerOutput;
  stringImpl::lower<isAscii>(lowerOutput, input);
  auto* stem = stemmer->stem(lowerOutput);
  VELOX_CHECK_NOT_NULL(
      stem, "Stemmer library returned a NULL (out-of-memory)");
  result = stem;
}

private:
folly::F14FastMap<std::string, std::unique_ptr<detail::Stemmer>> stemmers_;

// Get a detail::Stemmer from the map using the lang as the key or create
// a new one if it doesn't exist. Return nullptr if the specified lang is not
// supported. The returned pointer is owned by stemmers_.
detail::Stemmer* getStemmer(const std::string& lang) {
  if (auto found = stemmers_.find(lang); found != stemmers_.end()) {
    return found->second.get();
  }
  // Only support ASCII and UTF-8.
  auto* sbStemmer = sb_stemmer_new(lang.c_str(), "UTF_8");
  if (sbStemmer == nullptr) {
    return nullptr;
  }
  // The lookup above missed, so this always inserts. emplace avoids the
  // second lookup and the default-constructed entry that operator[] creates.
  auto inserted =
      stemmers_.emplace(lang, std::make_unique<detail::Stemmer>(sbStemmer));
  return inserted.first->second.get();
}
};
} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "velox/functions/prestosql/SplitPart.h"
#include "velox/functions/prestosql/SplitToMap.h"
#include "velox/functions/prestosql/StringFunctions.h"
#include "velox/functions/prestosql/WordStem.h"

namespace facebook::velox::functions {

Expand Down Expand Up @@ -127,5 +128,10 @@ void registerStringFunctions(const std::string& prefix) {
{prefix + "strrpos"});
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar, int64_t>(
{prefix + "strrpos"});

// word_stem function
registerFunction<WordStemFunction, Varchar, Varchar>({prefix + "word_stem"});
registerFunction<WordStemFunction, Varchar, Varchar, Varchar>(
{prefix + "word_stem"});
}
} // namespace facebook::velox::functions
1 change: 1 addition & 0 deletions velox/functions/prestosql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ add_executable(
URLFunctionsTest.cpp
Utf8Test.cpp
WidthBucketArrayTest.cpp
WordStemTest.cpp
ZipTest.cpp
ZipWithTest.cpp)

Expand Down
80 changes: 80 additions & 0 deletions velox/functions/prestosql/tests/WordStemTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <optional>
#include <string>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"

using namespace facebook::velox::functions::test;

namespace facebook::velox::functions {
namespace {
class WordStemTest : public FunctionBaseTest {
yhwang marked this conversation as resolved.
Show resolved Hide resolved
protected:
// Evaluates word_stem(word, lang) on a single row and returns the stem.
// The result is expected to be non-null; value() throws otherwise.
std::string wordStem(const std::string& word, const std::string& lang) {
  auto stemmed = evaluateOnce<std::string>(
      "word_stem(c0, c1)", std::optional(word), std::optional(lang));
  return stemmed.value();
}

// Evaluates the single-argument word_stem(word) on a single row and returns
// the stem. The result is expected to be non-null; value() throws otherwise.
std::string wordStem(const std::string& word) {
  auto stemmed =
      evaluateOnce<std::string>("word_stem(c0)", std::optional(word));
  return stemmed.value();
}
};

/// Borrow test cases from Presto Java:
/// https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/operator/scalar/TestWordStemFunction.java
///
/// Covers ASCII-only inputs for both the single-argument form and the
/// two-argument form with an explicit language code.
TEST_F(WordStemTest, asciiWord) {
// Single-argument form: no language is passed.
EXPECT_EQ(wordStem(""), "");
EXPECT_EQ(wordStem("x"), "x");
EXPECT_EQ(wordStem("abc"), "abc");
EXPECT_EQ(wordStem("generally"), "general");
EXPECT_EQ(wordStem("useful"), "use");
EXPECT_EQ(wordStem("runs"), "run");
EXPECT_EQ(wordStem("run"), "run");
// Two-argument form with an explicit English language code.
EXPECT_EQ(wordStem("authorized", "en"), "author");
EXPECT_EQ(wordStem("accessories", "en"), "accessori");
EXPECT_EQ(wordStem("intensifying", "en"), "intensifi");
EXPECT_EQ(wordStem("resentment", "en"), "resent");
EXPECT_EQ(wordStem("faithfulness", "en"), "faith");
// Two-argument form with non-English language codes (inputs are ASCII-only).
EXPECT_EQ(wordStem("continuerait", "fr"), "continu");
EXPECT_EQ(wordStem("torpedearon", "es"), "torped");
EXPECT_EQ(wordStem("quilomtricos", "pt"), "quilomtr");
EXPECT_EQ(wordStem("pronunziare", "it"), "pronunz");
EXPECT_EQ(wordStem("auferstnde", "de"), "auferstnd");
}

// A language code for which no stemmer is available must fail with the
// "Unsupported stemmer language" error instead of returning a result.
TEST_F(WordStemTest, invalidLang) {
VELOX_ASSERT_THROW(
wordStem("hello", "xx"), "Unsupported stemmer language: xx");
}

// Covers inputs that contain non-ASCII characters. The literals are spelled
// with universal character names; the readable forms are given in the
// comments below.
TEST_F(WordStemTest, unicodeWord) {
// Turkish "Kitabımızdı" -> "kitap". The input starts with an upper-case 'K'
// while the expected stem is lower-case.
EXPECT_EQ(
wordStem(
"\u004b\u0069\u0074\u0061\u0062\u0131\u006d\u0131\u007a\u0064\u0131",
"tr"),
"kitap");
// Russian "весенний" -> "весен".
EXPECT_EQ(
wordStem("\u0432\u0435\u0441\u0435\u043d\u043d\u0438\u0439", "ru"),
"\u0432\u0435\u0441\u0435\u043d");
}

} // namespace
} // namespace facebook::velox::functions
Loading