From b0698b130625f768f4b2554100a9d1c5c6ff4cc2 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Thu, 23 May 2024 16:11:51 +0530 Subject: [PATCH] Revert "Add word_stem Presto function (#9363)" This reverts commit 47970417ac92135e862c0fde350d4d60fa2f1423. --- CMake/resolve_dependency_modules/README.md | 1 - .../libstemmer/Makefile.patch | 24 ---- .../resolve_dependency_modules/stemmer.cmake | 57 -------- CMakeLists.txt | 3 - velox/docs/functions/presto/string.rst | 37 ----- velox/functions/prestosql/CMakeLists.txt | 3 +- velox/functions/prestosql/WordStem.h | 132 ------------------ .../StringFunctionsRegistration.cpp | 6 - .../functions/prestosql/tests/CMakeLists.txt | 1 - .../prestosql/tests/WordStemTest.cpp | 80 ----------- 10 files changed, 1 insertion(+), 343 deletions(-) delete mode 100644 CMake/resolve_dependency_modules/libstemmer/Makefile.patch delete mode 100644 CMake/resolve_dependency_modules/stemmer.cmake delete mode 100644 velox/functions/prestosql/WordStem.h delete mode 100644 velox/functions/prestosql/tests/WordStemTest.cpp diff --git a/CMake/resolve_dependency_modules/README.md b/CMake/resolve_dependency_modules/README.md index a88273280978..651926be6f19 100644 --- a/CMake/resolve_dependency_modules/README.md +++ b/CMake/resolve_dependency_modules/README.md @@ -37,7 +37,6 @@ by Velox. See details on bundling below. | wangle | v2024.04.01.00 | No | | mvfst | v2024.04.01.00 | No | | fbthrift | v2024.04.01.00 | No | -| libstemmer | 2.2.0 | Yes | | DuckDB (testing) | 0.8.1 | Yes | | cpr (testing) | 1.10.15 | Yes | diff --git a/CMake/resolve_dependency_modules/libstemmer/Makefile.patch b/CMake/resolve_dependency_modules/libstemmer/Makefile.patch deleted file mode 100644 index e7d691052111..000000000000 --- a/CMake/resolve_dependency_modules/libstemmer/Makefile.patch +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- a/Makefile -+++ b/Makefile -@@ -3,7 +3,7 @@ - EXEEXT=.exe - endif - CFLAGS=-O2 --CPPFLAGS=-Iinclude -+CPPFLAGS=-Iinclude -fPIC - all: libstemmer.a stemwords$(EXEEXT) - libstemmer.a: $(snowball_sources:.c=.o) - $(AR) -cru $@ $^ diff --git a/CMake/resolve_dependency_modules/stemmer.cmake b/CMake/resolve_dependency_modules/stemmer.cmake deleted file mode 100644 index dbaca146341b..000000000000 --- a/CMake/resolve_dependency_modules/stemmer.cmake +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -include_guard(GLOBAL) - -set(VELOX_STEMMER_VERSION 2.2.0) -set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM - b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3) -set(VELOX_STEMMER_SOURCE_URL - "https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz" -) - -resolve_dependency_url(STEMMER) - -message(STATUS "Building stemmer from source") -find_program(MAKE_PROGRAM make REQUIRED) - -set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer") -set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include) - -# We can not use FetchContent as libstemmer does not use cmake -ExternalProject_Add( - libstemmer - PREFIX ${STEMMER_PREFIX} - SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer - URL ${VELOX_STEMMER_SOURCE_URL} - URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM} - BUILD_IN_SOURCE TRUE - CONFIGURE_COMMAND "" - BUILD_COMMAND ${MAKE_PROGRAM} - INSTALL_COMMAND "" - PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch - BUILD_BYPRODUCTS - ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX} -) - -add_library(stemmer STATIC IMPORTED GLOBAL) -add_library(stemmer::stemmer ALIAS stemmer) -file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH}) -set_target_properties( - stemmer - PROPERTIES - IMPORTED_LOCATION - ${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX} - INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH}) - -add_dependencies(stemmer libstemmer) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8cb34164dbc..c78016e89281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -550,9 +550,6 @@ endif() set_source(xsimd) resolve_dependency(xsimd 10.0.0) -set(stemmer_SOURCE BUNDLED) -resolve_dependency(stemmer) - if(VELOX_BUILD_TESTING) set(BUILD_TESTING ON) include(CTest) # include after project() but before add_subdirectory() diff --git a/velox/docs/functions/presto/string.rst b/velox/docs/functions/presto/string.rst index 44fb82cef4fa..2db49c6f6c81 100644 --- a/velox/docs/functions/presto/string.rst +++ b/velox/docs/functions/presto/string.rst @@ -255,43 +255,6 @@ String Functions Converts ``string`` to uppercase. -.. function:: word_stem(word) -> varchar - - Returns the stem of ``word`` in the English language. If the ``word`` is not an English word, - the ``word`` in lowercase is returned. - -.. function:: word_stem(word, lang) -> varchar - - Returns the stem of ``word`` in the ``lang`` language. This function supports the following languages: - - =========== ================ - lang Language - =========== ================ - ``ca`` ``Catalan`` - ``da`` ``Danish`` - ``de`` ``German`` - ``en`` ``English`` - ``es`` ``Spanish`` - ``eu`` ``Basque`` - ``fi`` ``Finnish`` - ``fr`` ``French`` - ``hu`` ``Hungarian`` - ``hy`` ``Armenian`` - ``ir`` ``Irish`` - ``it`` ``Italian`` - ``lt`` ``Lithuanian`` - ``nl`` ``Dutch`` - ``no`` ``Norwegian`` - ``pt`` ``Portuguese`` - ``ro`` ``Romanian`` - ``ru`` ``Russian`` - ``sv`` ``Swedish`` - ``tr`` ``Turkish`` - =========== ================ - - If the specified ``lang`` is not supported, this function throws a user error. - - Unicode Functions ----------------- diff --git a/velox/functions/prestosql/CMakeLists.txt b/velox/functions/prestosql/CMakeLists.txt index 18b8d3fff65b..e52e21695ba5 100644 --- a/velox/functions/prestosql/CMakeLists.txt +++ b/velox/functions/prestosql/CMakeLists.txt @@ -72,8 +72,7 @@ target_link_libraries( velox_type_tz velox_presto_types velox_functions_util - Folly::folly - stemmer::stemmer) + Folly::folly) set_property(TARGET velox_functions_prestosql_impl PROPERTY JOB_POOL_COMPILE high_memory_pool) diff --git a/velox/functions/prestosql/WordStem.h b/velox/functions/prestosql/WordStem.h deleted file mode 100644 index 9d088b202467..000000000000 --- a/velox/functions/prestosql/WordStem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include // @manual - -#include "velox/functions/Udf.h" -#include "velox/functions/lib/string/StringImpl.h" - -namespace facebook::velox::functions { - -namespace detail { -// Wrap the sbstemmer library and use its sb_stemmer_stem -// to get word stem. -class Stemmer { - public: - Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) { - VELOX_CHECK_NOT_NULL(stemmer); - } - - ~Stemmer() { - sb_stemmer_delete(sbStemmer_); - } - - // Get the word stem or NULL if out of memory. - const char* stem(const std::string& input) { - return (const char*)(sb_stemmer_stem( - sbStemmer_, - reinterpret_cast(input.c_str()), - input.length())); - } - - private: - sb_stemmer* sbStemmer_; -}; -} // namespace detail - -/// word_stem function -/// word_stem(word) -> varchar -/// return the stem of the word in the English language -/// word_stem(word, lang) -> varchar -/// return the stem of the word in the specificed language -/// -/// Use the snowball stemmer library to calculate the stem. -/// https://snowballstem.org -/// The website provides Java implementation which is used in Presto as well -/// as C implementation. Therefore, both Presto and Prestimissio -/// would have the same word stem results. -template -struct WordStemFunction { - VELOX_DEFINE_FUNCTION_TYPES(TExec); - - // ASCII input always produces ASCII result. - static constexpr bool is_default_ascii_behavior = true; - - FOLLY_ALWAYS_INLINE void call( - out_type& result, - const arg_type& input) { - return doCall(result, input); - } - - FOLLY_ALWAYS_INLINE void callAscii( - out_type& result, - const arg_type& input) { - return doCall(result, input); - } - - FOLLY_ALWAYS_INLINE void call( - out_type& result, - const arg_type& input, - const arg_type& lang) { - return doCall(result, input, lang); - } - - FOLLY_ALWAYS_INLINE void callAscii( - out_type& result, - const arg_type& input, - const arg_type& lang) { - return doCall(result, input, lang); - } - - template - FOLLY_ALWAYS_INLINE void doCall( - out_type& result, - const arg_type& input, - const std::string& lang = "en") { - auto* stemmer = getStemmer(lang); - VELOX_USER_CHECK_NOT_NULL( - stemmer, "Unsupported stemmer language: {}", lang); - - std::string lowerOutput; - stringImpl::lower(lowerOutput, input); - auto* stem = stemmer->stem(lowerOutput); - VELOX_CHECK_NOT_NULL( - stem, "Stemmer library returned a NULL (out-of-memory)") - result = stem; - } - - private: - folly::F14FastMap> stemmers_; - - // Get a detail::Stemmer from the the map using the lang as the key or create - // a new one if it doesn't exist. Return nullptr if the specified lang is not - // supported. - detail::Stemmer* getStemmer(const std::string& lang) { - if (auto found = stemmers_.find(lang); found != stemmers_.end()) { - return found->second.get(); - } - // Only support ASCII and UTF-8. - if (auto sbStemmer = sb_stemmer_new(lang.c_str(), "UTF_8")) { - auto* stemmer = new detail::Stemmer(sbStemmer); - stemmers_[lang] = std::unique_ptr(stemmer); - return stemmer; - } - return nullptr; - } -}; -} // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp b/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp index 4f9a93872d72..9420be9c7fca 100644 --- a/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/StringFunctionsRegistration.cpp @@ -19,7 +19,6 @@ #include "velox/functions/prestosql/SplitPart.h" #include "velox/functions/prestosql/SplitToMap.h" #include "velox/functions/prestosql/StringFunctions.h" -#include "velox/functions/prestosql/WordStem.h" namespace facebook::velox::functions { @@ -131,10 +130,5 @@ void registerStringFunctions(const std::string& prefix) { {prefix + "strrpos"}); registerFunction( {prefix + "strrpos"}); - - // word_stem function - registerFunction({prefix + "word_stem"}); - registerFunction( - {prefix + "word_stem"}); } } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/tests/CMakeLists.txt b/velox/functions/prestosql/tests/CMakeLists.txt index 7b6dd6694400..8f4ca76b8813 100644 --- a/velox/functions/prestosql/tests/CMakeLists.txt +++ b/velox/functions/prestosql/tests/CMakeLists.txt @@ -98,7 +98,6 @@ add_executable( URLFunctionsTest.cpp Utf8Test.cpp WidthBucketArrayTest.cpp - WordStemTest.cpp ZipTest.cpp ZipWithTest.cpp) diff --git a/velox/functions/prestosql/tests/WordStemTest.cpp b/velox/functions/prestosql/tests/WordStemTest.cpp deleted file mode 100644 index e94783e1dd1a..000000000000 --- a/velox/functions/prestosql/tests/WordStemTest.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include "velox/common/base/tests/GTestUtils.h" -#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" - -using namespace facebook::velox::functions::test; - -namespace facebook::velox::functions { -namespace { -class WordStemTest : public FunctionBaseTest { - protected: - std::string wordStem(const std::string& word, const std::string& lang) { - return evaluateOnce( - "word_stem(c0, c1)", std::optional(word), std::optional(lang)) - .value(); - } - - std::string wordStem(const std::string& word) { - return evaluateOnce("word_stem(c0)", std::optional(word)) - .value(); - } -}; - -/// Borrow test cases from Presto Java: -/// https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/operator/scalar/TestWordStemFunction.java -TEST_F(WordStemTest, asciiWord) { - EXPECT_EQ(wordStem(""), ""); - EXPECT_EQ(wordStem("x"), "x"); - EXPECT_EQ(wordStem("abc"), "abc"); - EXPECT_EQ(wordStem("generally"), "general"); - EXPECT_EQ(wordStem("useful"), "use"); - EXPECT_EQ(wordStem("runs"), "run"); - EXPECT_EQ(wordStem("run"), "run"); - EXPECT_EQ(wordStem("authorized", "en"), "author"); - EXPECT_EQ(wordStem("accessories", "en"), "accessori"); - EXPECT_EQ(wordStem("intensifying", "en"), "intensifi"); - EXPECT_EQ(wordStem("resentment", "en"), "resent"); - EXPECT_EQ(wordStem("faithfulness", "en"), "faith"); - EXPECT_EQ(wordStem("continuerait", "fr"), "continu"); - EXPECT_EQ(wordStem("torpedearon", "es"), "torped"); - EXPECT_EQ(wordStem("quilomtricos", "pt"), "quilomtr"); - EXPECT_EQ(wordStem("pronunziare", "it"), "pronunz"); - EXPECT_EQ(wordStem("auferstnde", "de"), "auferstnd"); -} - -TEST_F(WordStemTest, invalidLang) { - VELOX_ASSERT_THROW( - wordStem("hello", "xx"), "Unsupported stemmer language: xx"); -} - -TEST_F(WordStemTest, unicodeWord) { - EXPECT_EQ( - wordStem( - "\u004b\u0069\u0074\u0061\u0062\u0131\u006d\u0131\u007a\u0064\u0131", - "tr"), - "kitap"); - EXPECT_EQ( - wordStem("\u0432\u0435\u0441\u0435\u043d\u043d\u0438\u0439", "ru"), - "\u0432\u0435\u0441\u0435\u043d"); -} - -} // namespace -} // namespace facebook::velox::functions