Skip to content

Commit

Permalink
Add tokenizer factory to support plugin custom tokenizer (7484)
Browse files Browse the repository at this point in the history
  • Loading branch information
JkSelf authored and zhztheplayer committed Dec 29, 2023
1 parent cd61634 commit 0e4237f
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 52 deletions.
14 changes: 6 additions & 8 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,19 @@

namespace facebook::velox::common {

Subfield::Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators) {
Tokenizer tokenizer(path, separators);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);
Subfield::Subfield(const std::string& path) {
auto tokenizer = Tokenizer::getInstance(path);
VELOX_CHECK(tokenizer->hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer.next();
auto firstElement = tokenizer->next();
VELOX_CHECK(
firstElement->kind() == kNestedField,
"Subfield path must start with a name: {}",
path);
std::vector<std::unique_ptr<PathElement>> pathElements;
pathElements.push_back(std::move(firstElement));
while (tokenizer.hasNext()) {
pathElements.push_back(tokenizer.next());
while (tokenizer->hasNext()) {
pathElements.push_back(tokenizer->next());
}
path_ = std::move(pathElements);
}
Expand Down
5 changes: 1 addition & 4 deletions velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,7 @@ class Subfield {
};

public:
// Separators: the customized separators to tokenize field name.
explicit Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get());
explicit Subfield(const std::string& path);

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
65 changes: 43 additions & 22 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@

namespace facebook::velox::common {

Tokenizer::Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators)
: path_(path), separators_(separators) {
// Builds a tokenizer over a copy of `path`, using the separators returned by
// Separators::get(). Scanning starts at the first character and no token has
// been computed yet.
DefaultTokenizer::DefaultTokenizer(const std::string& path)
    : path_(path), separators_(Separators::get()) {
  index_ = 0;
  state = State::kNotReady;
}

bool Tokenizer::hasNext() {
bool DefaultTokenizer::hasNext() {
switch (state) {
case State::kDone:
return false;
Expand All @@ -39,19 +37,19 @@ bool Tokenizer::hasNext() {
return tryToComputeNext();
}

std::unique_ptr<Subfield::PathElement> Tokenizer::next() {
// Returns the next path element and hands its ownership to the caller.
// Fails with "No more tokens" when hasNext() reports that nothing is left.
std::unique_ptr<Subfield::PathElement> DefaultTokenizer::next() {
  if (!hasNext()) {
    VELOX_FAIL("No more tokens");
  }
  // Take the prepared element out of the tokenizer, then mark the tokenizer as
  // needing to compute a fresh one on the following hasNext() call.
  auto token = std::move(next_);
  state = State::kNotReady;
  return token;
}

bool Tokenizer::hasNextCharacter() {
// Returns true while the scan position is still inside the path string.
bool DefaultTokenizer::hasNextCharacter() {
  return path_.length() > index_;
}

std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
std::unique_ptr<Subfield::PathElement> DefaultTokenizer::computeNext() {
if (!hasNextCharacter()) {
state = State::kDone;
return nullptr;
Expand Down Expand Up @@ -83,33 +81,33 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
VELOX_UNREACHABLE();
}

bool Tokenizer::tryMatchSeparator(char expected) {
// Consumes `expected` only if it is one of the configured separators and it is
// the next character in the path. Returns whether a character was consumed.
bool DefaultTokenizer::tryMatchSeparator(char expected) {
  if (!separators_->isSeparator(expected)) {
    return false;
  }
  return tryMatch(expected);
}

void Tokenizer::match(char expected) {
// Consumes `expected`, which must be the next character in the path;
// otherwise reports an invalid subfield path.
void DefaultTokenizer::match(char expected) {
  if (tryMatch(expected)) {
    return;
  }
  invalidSubfieldPath();
}

bool Tokenizer::tryMatch(char expected) {
// Advances past the next character if it equals `expected`. Returns true when
// the character was consumed; leaves the position untouched otherwise.
bool DefaultTokenizer::tryMatch(char expected) {
  if (hasNextCharacter() && peekCharacter() == expected) {
    ++index_;
    return true;
  }
  return false;
}

void Tokenizer::nextCharacter() {
// Unconditionally advances the scan position by one character.
void DefaultTokenizer::nextCharacter() {
  ++index_;
}

char Tokenizer::peekCharacter() {
// Returns the character at the scan position without consuming it. No bounds
// check is done here.
char DefaultTokenizer::peekCharacter() {
  const char current = path_[index_];
  return current;
}

std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
std::unique_ptr<Subfield::PathElement> DefaultTokenizer::matchPathSegment() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && !separators_->isSeparator(peekCharacter()) &&
Expand All @@ -128,7 +126,8 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
return std::make_unique<Subfield::NestedField>(token);
}

std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchUnquotedSubscript() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && isUnquotedSubscriptCharacter(peekCharacter())) {
Expand All @@ -151,16 +150,17 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
return std::make_unique<Subfield::LongSubscript>(index);
}

bool Tokenizer::isUnquotedPathCharacter(char c) {
// Returns true if `c` may appear in an unquoted path segment: one of the
// punctuation characters listed below, or any character accepted by
// isUnquotedSubscriptCharacter().
bool DefaultTokenizer::isUnquotedPathCharacter(char c) {
  switch (c) {
    case ':':
    case '$':
    case '-':
    case '/':
    case '@':
    case '|':
    case '#':
    case '.':
      return true;
    default:
      return isUnquotedSubscriptCharacter(c);
  }
}

bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
// Returns true if `c` may appear in an unquoted subscript: '-', '_', or a
// character that isalnum() classifies as alphanumeric.
bool DefaultTokenizer::isUnquotedSubscriptCharacter(char c) {
  // isalnum() is only defined for EOF and values representable as
  // unsigned char. Plain char may be signed, so a byte >= 0x80 would arrive as
  // a negative int; convert through unsigned char first.
  return c == '-' || c == '_' || isalnum(static_cast<unsigned char>(c));
}

std::unique_ptr<Subfield::PathElement> Tokenizer::matchQuotedSubscript() {
std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchQuotedSubscript() {
// quote has already been matched

// seek until we see the close quote
Expand Down Expand Up @@ -200,20 +200,21 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchQuotedSubscript() {
return std::make_unique<Subfield::StringSubscript>(token);
}

std::unique_ptr<Subfield::PathElement> Tokenizer::matchWildcardSubscript() {
std::unique_ptr<Subfield::PathElement>
DefaultTokenizer::matchWildcardSubscript() {
return std::make_unique<Subfield::AllSubscripts>();
}

void Tokenizer::invalidSubfieldPath() {
void DefaultTokenizer::invalidSubfieldPath() {
VELOX_FAIL("Invalid subfield path: {}", this->toString());
}

std::string Tokenizer::toString() {
// Renders the path with the separators' caret marker inserted at the current
// scan position, i.e. <consumed part><caret><remaining part>.
std::string DefaultTokenizer::toString() {
  std::string annotated = path_.substr(0, index_);
  annotated += separators_->unicodeCaret;
  annotated += path_.substr(index_);
  return annotated;
}

bool Tokenizer::tryToComputeNext() {
bool DefaultTokenizer::tryToComputeNext() {
state = State::kFailed; // temporary pessimism
next_ = computeNext();
if (state != State::kDone) {
Expand All @@ -222,4 +223,24 @@ bool Tokenizer::tryToComputeNext() {
}
return false;
}

// Definition of the static factory used to create Tokenizer instances. It
// starts out empty; registerInstanceFactory() replaces it.
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
Tokenizer::tokenizerFactory_ = nullptr;

// static
// Creates a tokenizer for `path`. Uses the factory installed through
// registerInstanceFactory() when one is set; otherwise returns a
// DefaultTokenizer.
std::unique_ptr<Tokenizer> Tokenizer::getInstance(const std::string& path) {
  // Only read the shared factory on this path. Lazily assigning a default
  // here would make concurrent getInstance() calls race on that write.
  if (tokenizerFactory_) {
    return tokenizerFactory_(path);
  }
  return std::make_unique<DefaultTokenizer>(path);
}

// static
// Installs `tokenizerFactory` as the factory that getInstance() uses to create
// tokenizers, replacing any previously registered one. No synchronization is
// performed here.
void Tokenizer::registerInstanceFactory(
    std::function<std::unique_ptr<Tokenizer>(const std::string&)>
        tokenizerFactory) {
  // The argument is already a private copy (taken by value), so move it into
  // place instead of copying the callable a second time.
  tokenizerFactory_ = std::move(tokenizerFactory);
}
} // namespace facebook::velox::common
28 changes: 22 additions & 6 deletions velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,30 @@ class Tokenizer {
kFailed,
};

// Separators: the customized separators to tokenize field name.
explicit Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators);
virtual ~Tokenizer() = default;

bool hasNext();
virtual bool hasNext() = 0;

std::unique_ptr<Subfield::PathElement> next();
virtual std::unique_ptr<Subfield::PathElement> next() = 0;

static std::unique_ptr<Tokenizer> getInstance(const std::string& path);

static void registerInstanceFactory(
std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory);

private:
static std::function<std::unique_ptr<Tokenizer>(const std::string&)>
tokenizerFactory_;
};

class DefaultTokenizer : public Tokenizer {
public:
explicit DefaultTokenizer(const std::string& path);

bool hasNext() override;

std::unique_ptr<Subfield::PathElement> next() override;

private:
const std::string path_;
Expand Down
62 changes: 50 additions & 12 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@
using namespace facebook::velox::common;

std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path,
const std::shared_ptr<Separators>& separators = Separators::get()) {
const std::string& path) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
Tokenizer tokenizer(path, separators);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
auto tokenizer = Tokenizer::getInstance(path);
while (tokenizer->hasNext()) {
elements.push_back(tokenizer->next());
}
return elements;
}
Expand All @@ -48,10 +47,8 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

void testColumnName(
const std::string& name,
const std::shared_ptr<Separators>& separators = Separators::get()) {
auto elements = tokenize(name, separators);
void testColumnName(const std::string& name) {
auto elements = tokenize(name);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -62,9 +59,6 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
auto separators = std::make_shared<Separators>();
separators->dot = '\0';
testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down Expand Up @@ -154,3 +148,47 @@ TEST(SubfieldTest, longSubscript) {
ASSERT_TRUE(longSubscript);
ASSERT_EQ(longSubscript->index(), 3309189884973035076);
}

class FakeTokenizer : public Tokenizer {
public:
explicit FakeTokenizer(const std::string& path) : path_(path) {
state = State::kNotReady;
}

bool hasNext() override {
if (state == State::kDone) {
return false;
} else if (state == State::kNotReady) {
return true;
}
VELOX_FAIL("Illegal state");
}

std::unique_ptr<Subfield::PathElement> next() override {
if (!hasNext()) {
VELOX_USER_FAIL("No more tokens");
}
state = State::kDone;
return std::make_unique<Subfield::NestedField>(path_);
}

private:
const std::string path_;
State state;
};

// Verifies that a factory registered through
// Tokenizer::registerInstanceFactory() is the one used for tokenization:
// FakeTokenizer returns each input as a single NestedField, including names
// containing '.' and '*'.
TEST(SubfieldTest, CustomTokenizer) {
  // Re-registers the DefaultTokenizer factory when the test body is left by
  // any path, so an exception escaping the checks below cannot leave
  // FakeTokenizer installed for later tests.
  struct DefaultFactoryRestorer {
    ~DefaultFactoryRestorer() {
      Tokenizer::registerInstanceFactory([](const std::string& p) {
        return std::make_unique<DefaultTokenizer>(p);
      });
    }
  } restoreDefaultFactory;

  Tokenizer::registerInstanceFactory(
      [](const std::string& p) { return std::make_unique<FakeTokenizer>(p); });

  testColumnName("$bucket");
  testColumnName("apollo-11");
  testColumnName("a/b/c:12");
  testColumnName("@basis");
  testColumnName("@basis|city_id");
  testColumnName("city.id@address*:number/date|day$a-b$10_bucket");
}

0 comments on commit 0e4237f

Please sign in to comment.