diff --git a/.editorconfig b/.editorconfig
index bd525e13f3ece..f88f8da67cd78 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -28,4 +28,5 @@ indent_size = 2
 indent_style = tab
 
 [examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
 insert_final_newline = unset
diff --git a/Makefile b/Makefile
index dddf647cd551d..4ea59c0b4ef29 100644
--- a/Makefile
+++ b/Makefile
@@ -1051,7 +1051,7 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
diff --git a/common/common.cpp b/common/common.cpp
index 64f160af1c18c..cfdedcbae0cd9 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1989,8 +1989,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "cvector", "       --completions-file FNAME", "completions file (default: '%s')", params.cvector_completions_file.c_str() });
     options.push_back({ "cvector", "       --completions N", "number of lines of completions file to use (default: %d)", params.n_completions });
-    options.push_back({ "cvector", "       --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
-    options.push_back({ "cvector", "       --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
+    options.push_back({ "cvector", "       --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch });
+    options.push_back({ "cvector", "       --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
 
     printf("usage: %s [options]\n", argv[0]);
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index a6751cc80e682..3107b69f7e42e 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -967,7 +967,11 @@ def set_vocab(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
+        # because vocab_size is the count of items, and indexes start at 0.
+        max_vocab_index = max(tokenizer.get_vocab().values())
+        if max_vocab_index >= vocab_size:
+            raise ValueError("Vocabulary size exceeds expected maximum size.")
 
         reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md
index 7b0e79c1ffba8..5182e906d9180 100644
--- a/examples/cvector-generator/README.md
+++ b/examples/cvector-generator/README.md
@@ -17,7 +17,7 @@ Related PRs:
 ./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
 
 # With advanced options
-./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
+./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100
 
 # To see help message
 ./cvector-generator -h
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index 9941683db677e..355905cb03d60 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -40,7 +40,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
     printf("\nexample usage:\n");
     printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
     printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
-    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
+    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --pca-batch 100\n", argv[0]);
     printf("\n");
 }
 
@@ -377,8 +377,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
     // create templated prompts
     std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
     auto format_template = [](std::string persona, std::string suffix) {
-        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
-        return persona + " " + suffix;
+        // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST] "
+        return persona + suffix;
     };
     for (size_t i = 0; i < positive_prompts.size(); ++i) {
         for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
diff --git a/examples/cvector-generator/negative.txt b/examples/cvector-generator/negative.txt
index 2ac3387f184b0..3e9951752e886 100644
--- a/examples/cvector-generator/negative.txt
+++ b/examples/cvector-generator/negative.txt
@@ -1 +1 @@
-[INST] Act like a person who is extremely sad. [/INST]
\ No newline at end of file
+[INST] Act like a person who is extremely sad. [/INST] 
diff --git a/examples/cvector-generator/positive.txt b/examples/cvector-generator/positive.txt
index f28e9aa1aeb72..8802367873cd9 100644
--- a/examples/cvector-generator/positive.txt
+++ b/examples/cvector-generator/positive.txt
@@ -1 +1 @@
-[INST] Act like a person who is extremely happy. [/INST]
\ No newline at end of file
+[INST] Act like a person who is extremely happy. [/INST] 
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 28584e14b788c..76e2052d55d79 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -16,41 +16,41 @@ struct quant_option {
 };
 
 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 4.78G, +0.4511 ppl @ Llama-3-8B",  },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 5.21G, +0.1316 ppl @ Llama-3-8B",  },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 5.65G, +0.1062 ppl @ Llama-3-8B",  },
     { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
     { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
     { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
     { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
     { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            },
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
     { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
     { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            },
     { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        },
-    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M"                   },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization",             },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B",  },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B",  },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B",  },
     { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
     { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
-    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
-    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M",                  },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B",  },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B",  },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M",                  },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B",  },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, +0.0020 ppl @ Mistral-7B",  },
+    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B",  },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B",                       },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
+    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing",  },
 };
 
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 8787fb1ec6987..96f90c01e0d97 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -7,11 +7,16 @@
 #include "ggml.h"
 #include "llama.h"
 #include "grammar-parser.h"
+#include "json-schema-to-grammar.h"
 #include "unicode.h"
 #include <cassert>
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
+//#define INCLUDE_FAILING_TESTS 1
+
 static llama_grammar* build_grammar(const std::string & grammar_str) {
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@@ -65,8 +70,8 @@ static bool match_string(const std::string & input, llama_grammar* grammar) {
     return false;
 }
 
-static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
-    fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str());
+static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
     fflush(stderr);
 
     auto grammar = build_grammar(grammar_str);
@@ -85,6 +90,23 @@ static void test_grammar(const std::string & test_desc, const std::string & gram
         if (!matched) {
             fprintf(stderr, "❌ (failed to match)\n");
+
+            // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
+            // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
+            FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
+            if (grammar_file) {
+                fprintf(grammar_file, "%s", grammar_str.c_str());
+                fclose(grammar_file);
+            }
+
+            // DEBUG: Write the test string to test-grammar-integration.string.txt
+            FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
+            if (string_file) {
+                fprintf(string_file, "%s", test_string.c_str());
+                fclose(string_file);
+            }
+
+            fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
         } else {
             fprintf(stdout, "✅︎\n");
         }
@@ -118,6 +140,12 @@ static void test_grammar(const std::string & test_desc, const std::string & gram
     // Clean up allocated memory
     llama_grammar_free(grammar);
 }
+static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
+}
+static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
+    test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings);
+}
 
 static void test_simple_grammar() {
     // Test case for a simple grammar
@@ -400,10 +428,11 @@ static void test_quantifiers() {
 static void test_failure_missing_root() {
     fprintf(stderr, "⚫ Testing missing root node:\n");
     // Test case for a grammar that is missing a root rule
-    const std::string grammar_str = R"""(rot ::= expr
-expr ::= term ("+" term)*
-term ::= number
-number ::= [0-9]+)""";
+    const std::string grammar_str = R"""(
+        rot ::= expr
+        expr ::= term ("+" term)*
+        term ::= number
+        number ::= [0-9]+)""";
 
     grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
@@ -420,10 +449,10 @@ static void test_failure_missing_reference() {
     // Test case for a grammar that is missing a referenced rule
     const std::string grammar_str =
-R"""(root ::= expr
-expr ::= term ("+" term)*
-term ::= numero
-number ::= [0-9]+)""";
+        R"""(root ::= expr
+        expr ::= term ("+" term)*
+        term ::= numero
+        number ::= [0-9]+)""";
 
     fprintf(stderr, "    Expected error: ");
@@ -445,29 +474,558 @@ static void test_failure_left_recursion() {
     // Test more complicated left recursion detection
     const std::string medium_str = R"""(
-root ::= asdf
-asdf ::= "a" | asdf "a"
-)""";
+        root ::= asdf
+        asdf ::= "a" | asdf "a"
+        )""";
     assert(test_build_grammar_fails(medium_str));
 
     // Test even more complicated left recursion detection
     const std::string hard_str = R"""(
-root ::= asdf
-asdf ::= "a" | foo "b"
-foo ::= "c" | asdf "d" | "e")""";
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | asdf "d" | "e")""";
     assert(test_build_grammar_fails(hard_str));
 
     // Test yet even more complicated left recursion detection
     const std::string hardest_str = R"""(
-root ::= asdf
-asdf ::= "a" | foo "b"
-foo ::= "c" | empty asdf "d" | "e"
-empty ::= "blah" | )""";
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | empty asdf "d" | "e"
+        empty ::= "blah" | )""";
     assert(test_build_grammar_fails(hardest_str));
 
     fprintf(stderr, "  ✅︎ Passed\n");
 }
 
+static void test_json_schema() {
+    // Note that this is similar to the regular grammar tests,
+    //  but we convert each json schema to a grammar before parsing.
+    // Otherwise, this test structure is the same.
+
+    test_schema(
+        "empty schema (object)",
+        // Schema
+        R"""(
+            {}
+        )""",
+        // Passing strings
+        {
+            "{}",
+            R"""({"foo": "bar"})""",
+        },
+        // Failing strings
+        {
+            "",
+            "[]",
+            "null",
+            "\"\"",
+            "true",
+        }
+    );
+
+    test_schema(
+        "exotic formats (list)",
+        // Schema
+        R"""(
+            {
+                "items": [
+                    { "format": "date" },
+                    { "format": "uuid" },
+                    { "format": "time" },
+                    { "format": "date-time" }
+                ]
+            }
+        )""",
+        // Passing strings
+        {
+            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
+            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
+        },
+        // Failing strings
+        {
+            R"""(["foo", "bar"])""",
+            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
+        }
+    );
+
+    test_schema(
+        "string",
+        // Schema
+        R"""(
+            {
+                "type": "string"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+        },
+        // Failing strings
+        {
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 1",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "{}",
+            "\"foo\": \"bar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min length 3",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"foobar\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        }
+    );
+
+    test_schema(
+        "string w/ max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "maxLength": 3
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"\"",
+            "\"f\"",
+            "\"fo\"",
+        },
+        // Failing strings
+        {
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "string w/ min & max length",
+        // Schema
+        R"""(
+            {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 4
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+            "\"bar\"",
+            "\"f\"",
+            "\"barf\"",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"barfo\"",
+            "\"foobar\"",
+        }
+    );
+
+    test_schema(
+        "boolean",
+        // Schema
+        R"""(
+            {
+                "type": "boolean"
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+            "false",
+        },
+        // Failing strings
+        {
+            "\"\"",
+            "\"true\"",
+            "True",
+            "FALSE",
+        }
+    );
+
+    test_schema(
+        "integer",
+        // Schema
+        R"""(
+            {
+                "type": "integer"
+            }
+        )""",
+        // Passing strings
+        {
+            "0",
+            "12345",
+            "1234567890123456"
+        },
+        // Failing strings
+        {
+            "",
+            "01",
+            "007",
+            "12345678901234567"
+        }
+    );
+
+    test_schema(
+        "string const",
+        // Schema
+        R"""(
+            {
+                "const": "foo"
+            }
+        )""",
+        // Passing strings
+        {
+            "\"foo\"",
+        },
+        // Failing strings
+        {
+            "foo",
+            "\"bar\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "const": true
+            }
+        )""",
+        // Passing strings
+        {
+            "true",
+        },
+        // Failing strings
+        {
+            "",
+            "foo",
+            "\"true\"",
+        }
+    );
+
+    test_schema(
+        "non-string const",
+        // Schema
+        R"""(
+            {
+                "enum": ["red", "amber", "green", null, 42, ["foo"]]
+            }
+        )""",
+        // Passing strings
+        {
+            "\"red\"",
+            "null",
+            "42",
+            "[\"foo\"]",
+        },
+        // Failing strings
+        {
+            "",
+            "420",
+            "true",
+            "foo",
+        }
+    );
+
+
+    test_schema(
+        "min+max items",
+        // Schema
+        R"""(
+            {
+                "items": {
+                    "type": ["number", "integer"]
+                },
+                "minItems": 3,
+                "maxItems": 5
+            }
+        )""",
+        // Passing strings
+        {
+            "[1, 2, 3]",
+            "[1, 2, 3, 4]",
+            "[1, 2, 3, 4, 5]",
+        },
+        // Failing strings
+        {
+            "[1, 2]",
+            "[1, 2, 3, 4, 5, 6]",
+            "1"
+        }
+    );
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                }
+            }
+        )""",
+        // Passing strings
+        {
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            R"""({ "street_name": "Pennsylvania" })""",
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+            // "By default, providing additional properties is valid"
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
+#endif
+        },
+        // Failing strings
+        {
+            // Change datatype from number to string
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // Reorder properties
+            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
+            // Reorder properties
+            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+        }
+    );
+
+
+    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
+    test_schema(
+        "object properties, additionalProperties: true",
+        // Schema
+        R"""(
+            {
+                "type": "object",
+                "properties": {
+                    "number": { "type": "number" },
+                    "street_name": { "type": "string" },
+                    "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
+                },
+                "additionalProperties": true
+            }
+        )""",
+        // Passing strings
+        {
+            // "By extension, even an empty object is valid"
+            R"""({})""",
+#ifdef INCLUDE_FAILING_TESTS
+            // TODO: Following line should pass and doesn't
+            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
+            // "By default, leaving out properties is valid"
+            // TODO: Following line should pass and doesn't
+            R"""({ "street_name": "Pennsylvania" })""",
+            // TODO: Following line should pass and doesn't
+            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
+            // "By default, providing additional properties is valid"
+            // TODO: The following should pass, but currently FAILS. Additional properties should be permitted by default.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+            // TODO: Spaces should be permitted around enum values, but currently they fail to pass.
+ R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", +#endif + }, + // Failing strings + { + // Change datatype from number to string + R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // Reorder properties + R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""", + } + ); + + // Additional properties: false + test_schema( + "required + optional props each in original order", + // Schema + R"""( + { + "type": "object", + "properties": { + "number": { "type": "number" }, + "street_name": { "type": "string" }, + "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } + }, + "additionalProperties": false + } + )""", + // Passing strings + { + R"""({ "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_type":"Avenue"})""", + R"""({ "number": 1600, "street_name": "Pennsylvania" })""", + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", +#ifdef INCLUDE_FAILING_TESTS + // TODO: Spaces should be permitted around enum values, but currently they fail to pass. + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", +#endif + }, + // Failing strings + { + // Reorder properties + R"""({ "street_type": "Avenue", "number": 1600 })""", + // Add "direction" + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""", + } + ); + + test_schema( + "required + optional props each in original order", + // Schema + R"""( + { + "properties": { + "b": {"type": "string"}, + "a": {"type": "string"}, + "d": {"type": "string"}, + "c": {"type": "string"} + }, + "required": ["a", "b"], + "additionalProperties": false + } + )""", + // Passing strings + { + R"""({"b": "foo", "a": "bar"})""", + R"""({"b":"foo","a":"bar","d":"qux"})""", + R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""", + }, + // Failing strings + { + R"""({"a": "foo", "b": "bar"})""", + R"""({"b": "bar"})""", + R"""({"a": "foo", "c": "baz"})""", + R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""", + } + ); + + // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties + test_schema( + "required props", + // Schema + R"""( + { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/product.schema.json", + "title": "Product", + "description": "A product from Acme's catalog", + "type": "object", + "properties": { + "productId": { + "description": "The unique identifier for a product", + "type": "integer" + }, + "productName": { + "description": "Name of the product", + "type": "string" + }, + "price": { + "description": "The price of the product", + "type": "number", + "exclusiveMinimum": 0 + }, + "tags": { + "description": "Tags for the product", + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": true + }, + "dimensions": { + "type": "object", + "properties": { + "length": { + "type": "number" + }, + "width": { + "type": "number" + }, + "height": { + "type": "number" + } + }, + "required": [ "length", "width", "height" ] + } + }, + "required": [ "productId", "productName", "price" ] + } + )""", + // Passing strings + { + R"""({"productId": 1, "productName": "A green door", "price": 12.50})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""", + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], 
"dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""", + }, + // Failing strings + { + R"""({})""", // Missing all required properties + R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties + // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement. + // Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex. + // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""", + R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price) + R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId) + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1 + R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order + // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement. + // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""", + } + ); +} + int main() { fprintf(stdout, "Running grammar integration tests...\n"); test_simple_grammar(); @@ -477,6 +1035,7 @@ int main() { test_failure_missing_root(); test_failure_missing_reference(); test_failure_left_recursion(); + test_json_schema(); fprintf(stdout, "All tests passed.\n"); return 0; }