-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update word_stem impl to address comments
- doc update - separate the impl to a new header file - separate the test to a new cpp file - apply code conventions Signed-off-by: Yihong Wang <[email protected]>
- Loading branch information
Showing
7 changed files
with
242 additions
and
169 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <libstemmer.h> | ||
|
||
#include "velox/functions/Udf.h" | ||
#include "velox/functions/lib/string/StringImpl.h" | ||
|
||
namespace facebook::velox::functions { | ||
|
||
namespace { | ||
/// Wrap the sbstemmer library and use its sb_stemmer_stem | ||
/// to get word stem | ||
class Stemmer { | ||
private: | ||
sb_stemmer* sbStemmer_; | ||
Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) {} | ||
|
||
public: | ||
~Stemmer() { | ||
sb_stemmer_delete(sbStemmer_); | ||
} | ||
|
||
/// Get a Stemmer from the the map stored in thread local storage | ||
/// or create a new one if it doesn't exist. Return nullptr if the | ||
/// specified lang is not supported. | ||
static Stemmer* getStemmer(const char* lang) { | ||
thread_local std::map<std::string, std::unique_ptr<Stemmer>> stemmers; | ||
if (auto found = stemmers.find(lang); found != stemmers.end()) { | ||
return found->second.get(); | ||
} | ||
Stemmer* stemmer = nullptr; | ||
// Only support ASCII and UTF-8 | ||
if (auto sbStemmer = sb_stemmer_new(lang, "UTF_8")) { | ||
stemmer = new Stemmer(sbStemmer); | ||
stemmers[lang] = std::unique_ptr<Stemmer>(stemmer); | ||
} | ||
return stemmer; | ||
} | ||
|
||
/// Get the word stem or NULL if out of memory | ||
const char* stem(const std::string& input) { | ||
return (const char*)(sb_stemmer_stem( | ||
sbStemmer_, | ||
reinterpret_cast<unsigned char const*>(input.c_str()), | ||
input.length())); | ||
} | ||
}; | ||
} // namespace | ||
|
||
/// word_stem function | ||
/// word_stem(word) -> varchar | ||
/// return the stem of the word in the English language | ||
/// word_stem(word, lang) -> varchar | ||
/// return the stem of the word in the specificed language | ||
/// | ||
/// It uses the snowball stemmer library to calculate the stem. | ||
/// https://snowballstem.org | ||
/// It provides Java implementation which is used in Presto as well | ||
/// as C implementation. Therefore, both Presto and Prestimissio | ||
/// would have the same word stem results. | ||
template <typename TExec> | ||
struct WordStemFunction { | ||
VELOX_DEFINE_FUNCTION_TYPES(TExec); | ||
|
||
// Results refer to strings in the first argument. | ||
static constexpr int32_t reuse_strings_from_arg = 0; | ||
|
||
// ASCII input always produces ASCII result. | ||
static constexpr bool is_default_ascii_behavior = true; | ||
|
||
FOLLY_ALWAYS_INLINE void call( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input) { | ||
return doCall<false>(result, input); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void callAscii( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input) { | ||
return doCall<true>(result, input); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void call( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const arg_type<Varchar>& lang) { | ||
return doCall<false>(result, input, lang.data()); | ||
} | ||
|
||
FOLLY_ALWAYS_INLINE void callAscii( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const arg_type<Varchar>& lang) { | ||
return doCall<true>(result, input, lang.data()); | ||
} | ||
|
||
template <bool isAscii> | ||
FOLLY_ALWAYS_INLINE void doCall( | ||
out_type<Varchar>& result, | ||
const arg_type<Varchar>& input, | ||
const char* lang = "en") { | ||
auto stemmer = Stemmer::getStemmer(lang); | ||
if (!stemmer) { | ||
// language is not supported | ||
VELOX_USER_FAIL("Unknown stemmer language: \"{}\"", lang); | ||
} | ||
|
||
std::string lowerOutput; | ||
stringImpl::lower<isAscii>(lowerOutput, input); | ||
auto stem = stemmer->stem(lowerOutput); | ||
VELOX_CHECK_NOT_NULL( | ||
stem, "Stemmer library returned a NULL (out-of-memory)") | ||
result = stem; | ||
} | ||
}; | ||
} // namespace facebook::velox::functions |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.