Skip to content

Commit

Permalink
llama : move grammar code into llama-grammar
Browse files Browse the repository at this point in the history
ggml-ci
  • Loading branch information
ggerganov committed Jul 16, 2024
1 parent 516746a commit a0fff5d
Show file tree
Hide file tree
Showing 12 changed files with 739 additions and 672 deletions.
18 changes: 17 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,8 @@ OBJ_GGML += \

OBJ_LLAMA = \
src/llama.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/unicode.o \
src/unicode-data.o
Expand Down Expand Up @@ -1058,6 +1060,20 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
Expand Down Expand Up @@ -1440,7 +1456,7 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift

tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

Expand Down
4 changes: 2 additions & 2 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
llama_grammar_sample(ctx_main, &single_token_data_array, ctx_sampling->grammar);

// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
Expand Down Expand Up @@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(

// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
llama_grammar_sample(ctx_main, &cur_p, ctx_sampling->grammar);
}

return cur_p;
Expand Down
13 changes: 8 additions & 5 deletions examples/gbnf-validator/gbnf-validator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,23 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;

llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);

size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
const llama_grammar_rules & prev_rules = llama_grammar_get_rules (grammar);
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(prev_rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
cur_stacks = prev_stacks;
return false;
}
++pos;
}

for (const auto & stack : grammar->stacks) {
for (const auto & stack : cur_stacks) {
if (stack.empty()) {
return true;
}
Expand Down
59 changes: 31 additions & 28 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -1000,6 +1000,18 @@ extern "C" {

LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);

/// @details Apply constraints from grammar
LLAMA_API void llama_grammar_sample(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);

/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);

//
// Sampling functions
//
Expand Down Expand Up @@ -1118,18 +1130,6 @@ extern "C" {
struct llama_context * ctx,
llama_token_data_array * candidates);

/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);

/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);

//
// Model split
//
Expand Down Expand Up @@ -1172,38 +1172,41 @@ extern "C" {

struct ggml_tensor;

const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);

struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};

struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;

// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};

struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};

const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
using llama_grammar_rules = std::vector<std::vector<llama_grammar_element>>;
using llama_grammar_stacks = std::vector<std::vector<const llama_grammar_element *>>;

const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);

void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
const uint32_t chr,
llama_grammar_stacks & new_stacks);

std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<const llama_grammar_element *> & stack,
const std::vector<llama_grammar_candidate> & candidates);

std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
llama_partial_utf8 partial_start);

// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ endif()
add_library(llama
../include/llama.h
llama.cpp
llama-vocab.cpp
llama-grammar.cpp
llama-sampling.cpp
unicode.h
unicode.cpp
Expand Down
Loading

0 comments on commit a0fff5d

Please sign in to comment.