From 7b2bdbe1f734460cfd8dac5ab61f169b6594b4df Mon Sep 17 00:00:00 2001
From: Jack Tysoe
Date: Tue, 26 Nov 2024 23:09:12 +0000
Subject: [PATCH 01/10] fix(AG-178): set correct conf field for error logger

(cherry picked from commit f43002625a3aec88d138394a5b689f1ffab9717c)
---
 .../ai-request-transformer/filters/transform-request.lua   | 2 +-
 .../ai-response-transformer/filters/transform-response.lua | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kong/plugins/ai-request-transformer/filters/transform-request.lua b/kong/plugins/ai-request-transformer/filters/transform-request.lua
index efa69dbe206e..b9f8952ce5b8 100644
--- a/kong/plugins/ai-request-transformer/filters/transform-request.lua
+++ b/kong/plugins/ai-request-transformer/filters/transform-request.lua
@@ -70,7 +70,7 @@ function _M:run(conf)
   local identity_interface = _KEYBASTION[conf.llm]

   if identity_interface and identity_interface.error then
-    kong.log.err("error authenticating with ", conf.model.provider, " using native provider auth, ", identity_interface.error)
+    kong.log.err("error authenticating with ", conf.llm.model.provider, " using native provider auth, ", identity_interface.error)
     return kong.response.exit(500, "LLM request failed before proxying")
   end

diff --git a/kong/plugins/ai-response-transformer/filters/transform-response.lua b/kong/plugins/ai-response-transformer/filters/transform-response.lua
index 354edbc17049..7c64dc83ed06 100644
--- a/kong/plugins/ai-response-transformer/filters/transform-response.lua
+++ b/kong/plugins/ai-response-transformer/filters/transform-response.lua
@@ -122,7 +122,7 @@ function _M:run(conf)
   local identity_interface = _KEYBASTION[conf.llm]

   if identity_interface and identity_interface.error then
-    kong.log.err("error authenticating with ", conf.model.provider, " using native provider auth, ", identity_interface.error)
+    kong.log.err("error authenticating with ", conf.llm.model.provider, " using native provider auth, ", identity_interface.error)
     return kong.response.exit(500, "LLM request failed before proxying")
   end

From 3594ad3d48cf1eedb9575cba6393d95c275fd4b8 Mon Sep 17 00:00:00 2001
From: Jack Tysoe
Date: Tue, 26 Nov 2024 23:12:31 +0000
Subject: [PATCH 02/10] fix(AG-178): send correct object to ctx buffer

(cherry picked from commit 1f158b1fd584bfa33f222d5cd24bd8a47395a3ab)
---
 kong/llm/plugin/shared-filters/normalize-sse-chunk.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
index 975322c78ed0..4684f8d081ec 100644
--- a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
+++ b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -73,7 +73,7 @@ local function handle_streaming_frame(conf, chunk, finished)
     -- how do we know if this is false but some other filter will need the body?
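-- [editor's sketch, not part of the patch] A minimal standalone illustration of the
-- object mix-up this hunk corrects below: the shared ctx was previously handed the
-- `buffer` module itself rather than the instance returned by buffer.new(), so later
-- filters saw no accumulated body. The `ctx` table here is a hypothetical stand-in
-- for the plugin's shared-ctx helpers.
local buffer = require("string.buffer")   -- LuaJIT string.buffer API

local body_buffer = buffer.new()          -- instance that accumulates SSE payload text
body_buffer:put("the answer is 2.")

local ctx = {}
ctx.sse_body_buffer = body_buffer         -- store the instance, not the `buffer` module
print(ctx.sse_body_buffer:tostring())     -- prints "the answer is 2."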
if conf.logging and conf.logging.log_payloads and not body_buffer then body_buffer = buffer.new() - set_global_ctx("sse_body_buffer", buffer) + set_global_ctx("sse_body_buffer", body_buffer) else kong.log.debug("using existing body buffer created by: ", source) end From d29cb7bc63e5da587b5b25d6f457a9041effe5a6 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:13:50 +0000 Subject: [PATCH 03/10] fix(AG-178): make analytics collection run AFTER response transformation (cherry picked from commit 55e7241e65617794a3232daba4788b1a59125fb2) --- .../shared-filters/normalize-json-response.lua | 16 ++++++++++++++++ .../shared-filters/parse-json-response.lua | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/kong/llm/plugin/shared-filters/normalize-json-response.lua b/kong/llm/plugin/shared-filters/normalize-json-response.lua index 63e2cc9389cd..37693da95464 100644 --- a/kong/llm/plugin/shared-filters/normalize-json-response.lua +++ b/kong/llm/plugin/shared-filters/normalize-json-response.lua @@ -38,6 +38,22 @@ local function transform_body(conf) response_body = cjson.encode({ error = { message = err }}) end + local t, err + if response_body then + t, err = cjson.decode(response_body) + if err then + kong.log.warn("failed to decode response body for usage introspection: ", err) + end + + if t and t.usage and t.usage.prompt_tokens then + ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", t.usage.prompt_tokens) + end + + if t and t.usage and t.usage.completion_tokens then + ai_plugin_o11y.metrics_set("llm_completion_tokens_count", t.usage.completion_tokens) + end + end + set_global_ctx("response_body", response_body) -- to be sent out later or consumed by other plugins end diff --git a/kong/llm/plugin/shared-filters/parse-json-response.lua b/kong/llm/plugin/shared-filters/parse-json-response.lua index fc5308ec89b3..259e104874c3 100644 --- a/kong/llm/plugin/shared-filters/parse-json-response.lua +++ b/kong/llm/plugin/shared-filters/parse-json-response.lua @@ -46,4 +46,4 @@ function _M:run(_) return true end -return _M \ No newline at end of file +return _M From e20b76badb4b6e96ae4f4970d08ee0cfb1e38a8a Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:16:00 +0000 Subject: [PATCH 04/10] fix(AG-178): divide by zero protection on analytics maths (cherry picked from commit 08bf1a25176e56d3f84e96010661cd384bcc88f8) --- kong/llm/drivers/shared.lua | 5 ++++- kong/llm/plugin/observability.lua | 8 ++++++-- .../llm/plugin/shared-filters/normalize-json-response.lua | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 139cc739d97b..0a1ec6d64965 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -719,7 +719,10 @@ function _M.post_request(conf, response_object) meta_container[log_entry_keys.LLM_LATENCY] = llm_latency if response_object.usage and response_object.usage.completion_tokens then - local time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens) + local time_per_token = 0 + if response_object.usage.completion_tokens > 0 then + time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens) + end request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TIME_PER_TOKEN] = time_per_token end end diff --git a/kong/llm/plugin/observability.lua b/kong/llm/plugin/observability.lua index 685ae4ee1a9e..916aeb19c6fd 100644 --- a/kong/llm/plugin/observability.lua +++ b/kong/llm/plugin/observability.lua 
@@ -70,7 +70,11 @@ function _M.metrics_get(key) -- process automatic calculation if not metrics[key] then if key == "llm_tpot_latency" then - return math.floor(_M.metrics_get("llm_e2e_latency") / _M.metrics_get("llm_completion_tokens_count")) + local llm_completion_tokens_count = _M.metrics_get("llm_completion_tokens_count") + if llm_completion_tokens_count > 0 then + return _M.metrics_get("llm_e2e_latency") / llm_completion_tokens_count + end + return 0 elseif key == "llm_total_tokens_count" then return _M.metrics_get("llm_prompt_tokens_count") + _M.metrics_get("llm_completion_tokens_count") end @@ -102,4 +106,4 @@ function _M.record_request_end() return latency end -return _M \ No newline at end of file +return _M diff --git a/kong/llm/plugin/shared-filters/normalize-json-response.lua b/kong/llm/plugin/shared-filters/normalize-json-response.lua index 37693da95464..1e0988f52495 100644 --- a/kong/llm/plugin/shared-filters/normalize-json-response.lua +++ b/kong/llm/plugin/shared-filters/normalize-json-response.lua @@ -38,6 +38,8 @@ local function transform_body(conf) response_body = cjson.encode({ error = { message = err }}) end + -- TODO: avoid json encode and decode when transforming + -- deduplicate body usage parsing from parse-json-response local t, err if response_body then t, err = cjson.decode(response_body) From 865c5d636da5c648f6f69b803d4dd447ffbc92c8 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:20:37 +0000 Subject: [PATCH 05/10] fix(AG-178): properly support all streaming content types (cherry picked from commit 34cd042f097702bf2551b372664d2e440e1962eb) --- kong/llm/drivers/shared.lua | 6 ++++++ kong/llm/plugin/shared-filters/parse-sse-chunk.lua | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 0a1ec6d64965..03c00bbcddb9 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -78,6 +78,12 @@ _M._CONST = { ["AWS_STREAM_CONTENT_TYPE"] = "application/vnd.amazon.eventstream", } +_M._SUPPORTED_STREAMING_CONTENT_TYPES = { + ["text/event-stream"] = true, + ["application/vnd.amazon.eventstream"] = true, + ["application/json"] = true, +} + _M.streaming_has_token_counts = { ["cohere"] = true, ["llama2"] = true, diff --git a/kong/llm/plugin/shared-filters/parse-sse-chunk.lua b/kong/llm/plugin/shared-filters/parse-sse-chunk.lua index 219a5017259c..b0c9195a8e27 100644 --- a/kong/llm/plugin/shared-filters/parse-sse-chunk.lua +++ b/kong/llm/plugin/shared-filters/parse-sse-chunk.lua @@ -20,7 +20,7 @@ local function handle_streaming_frame(conf, chunk, finished) local content_type = kong.service.response.get_header("Content-Type") local normalized_content_type = content_type and content_type:sub(1, (content_type:find(";") or 0) - 1) - if normalized_content_type and normalized_content_type ~= "text/event-stream" and normalized_content_type ~= ai_shared._CONST.AWS_STREAM_CONTENT_TYPE then + if normalized_content_type and (not ai_shared._SUPPORTED_STREAMING_CONTENT_TYPES[normalized_content_type]) then return true end From 7a0505c52d5133b866ff470560d885a3c2238150 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:48:45 +0000 Subject: [PATCH 06/10] fix(AG-178): (ai-transformers): add statistic logger test for non-openai format response (cherry picked from commit e02fd84cfc7bdf4ec0cfdcc31eb0098dd9063b5c) --- request.json | 35 ++++++ .../02-integration_spec.lua | 112 +++++++++++++++++- .../request-transformer/response-in-json.json | 22 ++++ 3 files changed, 166 
insertions(+), 3 deletions(-) create mode 100644 request.json create mode 100644 spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json diff --git a/request.json b/request.json new file mode 100644 index 000000000000..3d7efa85675b --- /dev/null +++ b/request.json @@ -0,0 +1,35 @@ +{ + "contents": [ + { + "role": "user", + "parts": [ + { + "text": "Hi Gemini" + } + ] + } + ] + , "generationConfig": { + "temperature": 1 + ,"maxOutputTokens": 8192 + ,"topP": 0.95 + }, + "safetySettings": [ + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "OFF" + } + ] +} diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua index c8472b61959f..f61af5c5a031 100644 --- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua +++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua @@ -8,6 +8,7 @@ local MOCK_PORT = helpers.get_available_port() local PLUGIN_NAME = "ai-request-transformer" local FILE_LOG_PATH_STATS_ONLY = os.tmpname() +local FILE_LOG_PATH_GEMINI_STATS_ONLY = os.tmpname() local function wait_for_json_log_entry(FILE_LOG_PATH) local json @@ -55,6 +56,29 @@ local OPENAI_FLAT_RESPONSE = { } local GEMINI_GOOD = { + route_type = "llm/v1/chat", + logging = { + log_payloads = false, + log_statistics = true, + }, + model = { + name = "gemini-1.5-flash", + provider = "gemini", + options = { + max_tokens = 512, + temperature = 0.6, + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/geminiflat", + input_cost = 20.0, + output_cost = 20.0, + }, + }, + auth = { + header_name = "x-goog-api-key", + header_value = "123", + }, +} + +local GEMINI_GOOD_FAILS_SAFETY = { route_type = "llm/v1/chat", logging = { log_payloads = false, @@ -160,6 +184,28 @@ local _EXPECTED_CHAT_STATS = { }, } + +local _EXPECTED_CHAT_STATS_GEMINI = { + ["ai-request-transformer"] = { + meta = { + plugin_id = '71083e79-4921-4f9f-97a4-ee7810b6cd8b', + provider_name = 'gemini', + request_model = 'UNSPECIFIED', + response_model = 'gemini-1.5-flash', + llm_latency = 1 + }, + usage = { + prompt_tokens = 2, + completion_tokens = 11, + total_tokens = 13, + time_per_token = 1, + cost = 0.00026, + }, + cache = {} + }, +} + + local SYSTEM_PROMPT = "You are a mathematician. " .. "Multiply all numbers in my JSON request, by 2." 
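-- [editor's sketch, not part of the patch] How the expected figures above can be
-- reproduced from the fixture values. The per-million-token divisor is an assumption
-- that is consistent with these expectations, not a quote of the plugin internals.
local prompt_tokens, completion_tokens = 2, 11   -- from the Gemini fixture's usageMetadata
local input_cost, output_cost = 20.0, 20.0       -- from GEMINI_GOOD model options

local cost = (prompt_tokens * input_cost + completion_tokens * output_cost) / 1000000
print(cost)   -- 0.00026, the value asserted in _EXPECTED_CHAT_STATS_GEMINI

-- llm_latency and time_per_token are normalized to 1 by the test before the comparison,
-- so only the token counts and the cost above are asserted literally.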
@@ -191,6 +237,13 @@ for _, strategy in helpers.all_strategies() do } } + location ~/geminiflat { + content_by_lua_block { + local pl_file = require "pl.file" + ngx.print(pl_file.read("spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json")) + } + } + location = "/badrequest" { content_by_lua_block { local pl_file = require "pl.file" @@ -234,7 +287,6 @@ for _, strategy in helpers.all_strategies() do llm = OPENAI_FLAT_RESPONSE, }, } - bp.plugins:insert { name = "file-log", route = { id = without_response_instructions.id }, @@ -243,6 +295,27 @@ for _, strategy in helpers.all_strategies() do }, } + -- echo server via 'non-openai' LLM + local gemini_without_response_instructions = assert(bp.routes:insert { + paths = { "/gemini-echo-flat" } + }) + bp.plugins:insert { + name = PLUGIN_NAME, + id = "71083e79-4921-4f9f-97a4-ee7810b6cd8b", + route = { id = gemini_without_response_instructions.id }, + config = { + prompt = SYSTEM_PROMPT, + llm = GEMINI_GOOD, + }, + } + bp.plugins:insert { + name = "file-log", + route = { id = gemini_without_response_instructions.id }, + config = { + path = FILE_LOG_PATH_GEMINI_STATS_ONLY, + }, + } + local bad_request = assert(bp.routes:insert { paths = { "/echo-bad-request" } }) @@ -263,7 +336,7 @@ for _, strategy in helpers.all_strategies() do route = { id = fails_safety.id }, config = { prompt = SYSTEM_PROMPT, - llm = GEMINI_GOOD, + llm = GEMINI_GOOD_FAILS_SAFETY, }, } @@ -322,7 +395,7 @@ for _, strategy in helpers.all_strategies() do assert.same(EXPECTED_RESULT_FLAT, body_table.post_data.params) end) - it("logs statistics", function() + it("logs statistics - openai format", function() local r = client:get("/echo-flat", { headers = { ["content-type"] = "application/json", @@ -355,6 +428,39 @@ for _, strategy in helpers.all_strategies() do assert.same(actual_time_per_token, time_per_token) end) + it("logs statistics - non-openai format", function() + local r = client:get("/gemini-echo-flat", { + headers = { + ["content-type"] = "application/json", + ["accept"] = "application/json", + }, + body = REQUEST_BODY, + }) + + local body = assert.res_status(200 , r) + local _, err = cjson.decode(body) + + assert.is_nil(err) + + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_GEMINI_STATS_ONLY) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + -- test ai-request-transformer stats + local actual_chat_stats = log_message.ai + local actual_llm_latency = actual_chat_stats["ai-request-transformer"].meta.llm_latency + local actual_time_per_token = actual_chat_stats["ai-request-transformer"].usage.time_per_token + local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-request-transformer"].usage.completion_tokens) + + log_message.ai["ai-request-transformer"].meta.llm_latency = 1 + log_message.ai["ai-request-transformer"].usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS_GEMINI, log_message.ai) + assert.is_true(actual_llm_latency >= 0) + assert.same(actual_time_per_token, time_per_token) + end) + it("bad request from LLM", function() local r = client:get("/echo-bad-request", { headers = { diff --git a/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json b/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json new file mode 100644 index 000000000000..503796be6057 --- /dev/null +++ b/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json @@ -0,0 +1,22 @@ +{ + "candidates": [ + { 
+ "content": { + "role": "model", + "parts": [ + { + "text": " {\n \"persons\": [\n {\n \"name\": \"Kong A\",\n \"age\": 62\n },\n {\n \"name\": \"Kong B\",\n \"age\": 84\n }\n ]\n }\n" + } + ] + }, + "finishReason": "STOP", + "avgLogprobs": -0.013348851691592823 + } + ], + "usageMetadata": { + "promptTokenCount": 2, + "candidatesTokenCount": 11, + "totalTokenCount": 13 + }, + "modelVersion": "gemini-1.5-flash-002" +} From d3a292fc95d25a4f422d6d91fe9fcf09ddf9a03e Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 00:21:18 +0000 Subject: [PATCH 07/10] fix(AG-178): add logging tests for non-openai format 'gemini' (cherry picked from commit db5618fae51b5c6da283d798d9d156d3f4e9f246) --- .../shared-filters/serialize-analytics.lua | 2 + .../09-streaming_integration_spec.lua | 25 ++ .../11-gemini_integration_spec.lua | 222 ++++++++++++++++++ .../gemini/llm-v1-chat/responses/good.json | 22 ++ 4 files changed, 271 insertions(+) create mode 100644 spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua create mode 100644 spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json diff --git a/kong/llm/plugin/shared-filters/serialize-analytics.lua b/kong/llm/plugin/shared-filters/serialize-analytics.lua index ffaff8bbb31c..1b246fa3ecd6 100644 --- a/kong/llm/plugin/shared-filters/serialize-analytics.lua +++ b/kong/llm/plugin/shared-filters/serialize-analytics.lua @@ -66,6 +66,8 @@ function _M:run(conf) cost = ai_plugin_o11y.metrics_get("llm_usage_cost"), } + kong.log.inspect(usage) + kong.log.set_serialize_value(string.format("ai.%s.usage", ai_plugin_o11y.NAMESPACE), usage) diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index a2a35a0fac0a..d7c89b79b0b7 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -632,6 +632,31 @@ for _, strategy in helpers.all_strategies() do assert.equal(#events, 8) assert.equal(buf:tostring(), "The answer to 1 + 1 is 2.") + + -- test analytics on this item + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_WITH_PAYLOADS) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + local actual_stats = log_message.ai.proxy + + local actual_llm_latency = actual_stats.meta.llm_latency + local actual_time_per_token = actual_stats.usage.time_per_token + local time_per_token = actual_llm_latency / actual_stats.usage.completion_tokens + + local actual_request_log = actual_stats.payload.request or "ERROR: NONE_RETURNED" + local actual_response_log = actual_stats.payload.response or "ERROR: NONE_RETURNED" + actual_stats.payload = nil + + actual_stats.meta.llm_latency = 1 + actual_stats.usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS, actual_stats) + assert.is_true(actual_llm_latency >= 0) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) + assert.match_re(actual_request_log, [[.*content.*What is 1 \+ 1.*]]) + assert.match_re(actual_response_log, [[.*content.*The answer.*]]) end) it("good stream request cohere", function() diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua new file mode 100644 index 000000000000..76c80d541ebb --- /dev/null +++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua @@ -0,0 +1,222 @@ +local 
helpers = require("spec.helpers") +local cjson = require("cjson") +local pl_file = require("pl.file") +local strip = require("kong.tools.string").strip + +local PLUGIN_NAME = "ai-proxy" +local MOCK_PORT = helpers.get_available_port() + +local FILE_LOG_PATH_WITH_PAYLOADS = os.tmpname() + +local truncate_file = function(path) + local file = io.open(path, "w") + file:close() +end + +local function wait_for_json_log_entry(FILE_LOG_PATH) + local json + + assert + .with_timeout(10) + .ignore_exceptions(true) + .eventually(function() + local data = assert(pl_file.read(FILE_LOG_PATH)) + + data = strip(data) + assert(#data > 0, "log file is empty") + + data = data:match("%b{}") + assert(data, "log file does not contain JSON") + + json = cjson.decode(data) + end) + .has_no_error("log file contains a valid JSON entry") + + return json +end + +local _EXPECTED_CHAT_STATS = { + meta = { + plugin_id = '17434c15-2c7c-4c2f-b87a-58880533a3c1', + provider_name = 'gemini', + request_model = 'gemini-1.5-pro', + response_model = 'gemini-1.5-pro', + llm_latency = 1, + }, + usage = { + prompt_tokens = 2, + completion_tokens = 11, + total_tokens = 13, + time_per_token = 1, + cost = 0.000195, + }, +} + +for _, strategy in helpers.all_strategies() do + if strategy ~= "cassandra" then + describe(PLUGIN_NAME .. ": (access) [#" .. strategy .. "]", function() + local client + + lazy_setup(function() + local bp = helpers.get_db_utils(strategy == "off" and "postgres" or strategy, nil, { PLUGIN_NAME }) + + -- set up gemini mock fixtures + local fixtures = { + http_mock = {}, + } + + fixtures.http_mock.gemini = [[ + server { + server_name gemini; + listen ]] .. MOCK_PORT .. [[; + + default_type 'application/json'; + + location = "/v1/chat/completions" { + content_by_lua_block { + local pl_file = require "pl.file" + local json = require("cjson.safe") + + local token = ngx.req.get_headers()["authorization"] + if token == "Bearer gemini-key" then + ngx.req.read_body() + local body, err = ngx.req.get_body_data() + body, err = json.decode(body) + + ngx.status = 200 + ngx.print(pl_file.read("spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json")) + end + } + } + } + ]] + + local empty_service = assert(bp.services:insert({ + name = "empty_service", + host = "localhost", --helpers.mock_upstream_host, + port = 8080, --MOCK_PORT, + path = "/", + })) + + -- 200 chat good with one option + local chat_good = assert(bp.routes:insert({ + service = empty_service, + protocols = { "http" }, + strip_path = true, + paths = { "/gemini/llm/v1/chat/good" }, + })) + bp.plugins:insert({ + name = PLUGIN_NAME, + id = "17434c15-2c7c-4c2f-b87a-58880533a3c1", + route = { id = chat_good.id }, + config = { + route_type = "llm/v1/chat", + auth = { + header_name = "Authorization", + header_value = "Bearer gemini-key", + }, + logging = { + log_payloads = true, + log_statistics = true, + }, + model = { + name = "gemini-1.5-pro", + provider = "gemini", + options = { + max_tokens = 256, + temperature = 1.0, + upstream_url = "http://" .. helpers.mock_upstream_host .. ":" .. MOCK_PORT .. 
"/v1/chat/completions", + input_cost = 15.0, + output_cost = 15.0, + }, + }, + }, + }) + bp.plugins:insert { + name = "file-log", + route = { id = chat_good.id }, + config = { + path = FILE_LOG_PATH_WITH_PAYLOADS, + }, + } + + -- start kong + assert(helpers.start_kong({ + -- set the strategy + database = strategy, + -- use the custom test template to create a local mock server + nginx_conf = "spec/fixtures/custom_nginx.template", + -- make sure our plugin gets loaded + plugins = "bundled," .. PLUGIN_NAME, + -- write & load declarative config, only if 'strategy=off' + declarative_config = strategy == "off" and helpers.make_yaml_file() or nil, + }, nil, nil, fixtures)) + end) + + lazy_teardown(function() + helpers.stop_kong() + os.remove(FILE_LOG_PATH_WITH_PAYLOADS) + end) + + before_each(function() + client = helpers.proxy_client() + truncate_file(FILE_LOG_PATH_WITH_PAYLOADS) + end) + + after_each(function() + if client then + client:close() + end + end) + + describe("gemini llm/v1/chat", function() + it("good request", function() + local r = client:get("/gemini/llm/v1/chat/good", { + headers = { + ["content-type"] = "application/json", + ["accept"] = "application/json", + }, + body = pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/requests/good.json"), + }) + -- validate that the request succeeded, response status 200 + local body = assert.res_status(200, r) + local json = cjson.decode(body) + + -- check this is in the 'kong' response format + assert.equals(json.model, "gemini-1.5-pro") + assert.equals(json.object, "chat.completion") + assert.equals(json.choices[1].finish_reason, "stop") + + assert.is_table(json.choices) + assert.is_string(json.choices[1].message.content) + assert.same("Everything is okay.", json.choices[1].message.content) + + -- test stats from file-log + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_WITH_PAYLOADS) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + local actual_stats = log_message.ai.proxy + + local actual_llm_latency = actual_stats.meta.llm_latency + local actual_time_per_token = actual_stats.usage.time_per_token + local time_per_token = actual_llm_latency / actual_stats.usage.completion_tokens + + local actual_request_log = actual_stats.payload.request + local actual_response_log = actual_stats.payload.response + actual_stats.payload = nil + + actual_stats.meta.llm_latency = 1 + actual_stats.usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS, actual_stats) + assert.is_true(actual_llm_latency >= 0) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) + assert.match_re(actual_request_log, [[.*contents.*What is 1 \+ 1.*]]) + assert.match_re(actual_response_log, [[.*content.*Everything is okay.*]]) + end) + end) + end) + end +end diff --git a/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json b/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json new file mode 100644 index 000000000000..0242dfd3b45f --- /dev/null +++ b/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json @@ -0,0 +1,22 @@ +{ + "candidates": [ + { + "content": { + "role": "model", + "parts": [ + { + "text": "Everything is okay." 
+ } + ] + }, + "finishReason": "STOP", + "avgLogprobs": -0.013348851691592823 + } + ], + "usageMetadata": { + "promptTokenCount": 2, + "candidatesTokenCount": 11, + "totalTokenCount": 13 + }, + "modelVersion": "gemini-1.5-flash-002" +} From 2c436dbb33c84f575ac203480453b48753b3d373 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 00:40:35 +0000 Subject: [PATCH 08/10] fix(AG-178): add logging tests for streaming (cherry picked from commit bd526e5ef1bcce2e641970da9ea4adf8f2f8c9a0) --- .../09-streaming_integration_spec.lua | 61 +++++++++++++++++++ .../11-gemini_integration_spec.lua | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index d7c89b79b0b7..5cb8a94ed531 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -1,12 +1,59 @@ local helpers = require "spec.helpers" local cjson = require "cjson.safe" local pl_file = require "pl.file" +local strip = require("kong.tools.string").strip local http = require("resty.http") local PLUGIN_NAME = "ai-proxy" local MOCK_PORT = helpers.get_available_port() +local FILE_LOG_PATH_WITH_PAYLOADS = os.tmpname() + +local _EXPECTED_CHAT_STATS = { + meta = { + plugin_id = '6e7c40f6-ce96-48e4-a366-d109c169e444', + provider_name = 'openai', + request_model = 'gpt-3.5-turbo', + response_model = 'gpt-3.5-turbo', + llm_latency = 1 + }, + usage = { + prompt_tokens = 18, + completion_tokens = 7, + total_tokens = 25, + time_per_token = 1, + cost = 0.00037, + }, +} + +local truncate_file = function(path) + local file = io.open(path, "w") + file:close() +end + +local function wait_for_json_log_entry(FILE_LOG_PATH) + local json + + assert + .with_timeout(10) + .ignore_exceptions(true) + .eventually(function() + local data = assert(pl_file.read(FILE_LOG_PATH)) + + data = strip(data) + assert(#data > 0, "log file is empty") + + data = data:match("%b{}") + assert(data, "log file does not contain JSON") + + json = cjson.decode(data) + end) + .has_no_error("log file contains a valid JSON entry") + + return json +end + for _, strategy in helpers.all_strategies() do describe(PLUGIN_NAME .. ": (access) [#" .. strategy .. 
"]", function() local client @@ -353,6 +400,7 @@ for _, strategy in helpers.all_strategies() do }) bp.plugins:insert { name = PLUGIN_NAME, + id = "6e7c40f6-ce96-48e4-a366-d109c169e444", route = { id = openai_chat_partial.id }, config = { route_type = "llm/v1/chat", @@ -360,6 +408,10 @@ for _, strategy in helpers.all_strategies() do header_name = "Authorization", header_value = "Bearer openai-key", }, + logging = { + log_payloads = true, + log_statistics = true, + }, model = { name = "gpt-3.5-turbo", provider = "openai", @@ -371,6 +423,13 @@ for _, strategy in helpers.all_strategies() do }, }, } + bp.plugins:insert { + name = "file-log", + route = { id = openai_chat_partial.id }, + config = { + path = FILE_LOG_PATH_WITH_PAYLOADS, + }, + } -- -- 200 chat cohere @@ -497,10 +556,12 @@ for _, strategy in helpers.all_strategies() do lazy_teardown(function() helpers.stop_kong() + os.remove(FILE_LOG_PATH_WITH_PAYLOADS) end) before_each(function() client = helpers.proxy_client() + truncate_file(FILE_LOG_PATH_WITH_PAYLOADS) end) after_each(function() diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua index 76c80d541ebb..1e1bce8081bb 100644 --- a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua @@ -219,4 +219,4 @@ for _, strategy in helpers.all_strategies() do end) end) end -end +end \ No newline at end of file From 6b13d961f4216e3d2f300f99affd6e8238a915e7 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 01:49:17 +0000 Subject: [PATCH 09/10] fix(AG-178): fix gemini parsing multiple chained tool calls (cherry picked from commit 581571b0c35035f0304282978610f2cbe4f2c8b4) --- kong/llm/drivers/gemini.lua | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua index 1cbdd6095e5f..99ba1b4adbac 100644 --- a/kong/llm/drivers/gemini.lua +++ b/kong/llm/drivers/gemini.lua @@ -268,22 +268,23 @@ local function from_gemini_chat_openai(response, model_info, route_type) messages.model = model_info.name elseif is_tool_content(response) then + messages.choices[1] = { + index = 0, + message = { + role = "assistant", + tool_calls = {}, + }, + } + local function_call_responses = response.candidates[1].content.parts for i, v in ipairs(function_call_responses) do - messages.choices[i] = { - index = 0, - message = { - role = "assistant", - tool_calls = { - { - ['function'] = { - name = v.functionCall.name, - arguments = cjson.encode(v.functionCall.args), - }, - }, + messages.choices[1].message.tool_calls[i] = + { + ['function'] = { + name = v.functionCall.name, + arguments = cjson.encode(v.functionCall.args), }, - }, - } + } end end From d6314a7d6ca9776aeb7a9da77685f240caa8db58 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Thu, 28 Nov 2024 13:51:50 +0000 Subject: [PATCH 10/10] fix(llm): fix streaming sse filter not ran twice and prompt token count and missing metadata (cherry picked from commit e053601b40e4b642575cb184e64204a17414d484) --- kong/llm/plugin/base.lua | 9 ++++++- .../shared-filters/normalize-sse-chunk.lua | 21 +++++++++++----- .../02-openai_integration_spec.lua | 8 +++---- .../09-streaming_integration_spec.lua | 24 +++++++++++++------ 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/kong/llm/plugin/base.lua b/kong/llm/plugin/base.lua index 3a23c78a8732..0daca7a29419 100644 --- a/kong/llm/plugin/base.lua +++ 
b/kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
   RES_POST_PROCESSING = 7,
 }

+-- Filters in these stages are allowed to execute more than once in a request
+-- TODO: implement singleton support, so that in one iteration of body_filter each
+-- filter only runs once. This is not an issue today as they are only used in one plugin.
+local REPEATED_PHASES = {
+  [STAGES.STREAMING] = true,
+}
+
 local MetaPlugin = {}

 local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)

     if not f then
       kong.log.err("no filter named '" .. name .. "' registered")

-    elseif not ai_executed_filters[name] then
+    elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
       ai_executed_filters[name] = true

       kong.log.debug("executing filter ", name)

diff --git a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
index 4684f8d081ec..d97d5c59b949 100644
--- a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
+++ b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)

   for _, event in ipairs(events) do
+    -- TODO: currently only a subset of drivers follow the body, err, metadata pattern;
+    -- unify this so that it is always extracted from the body
     local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)

     if formatted then
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
       if body_buffer then
         body_buffer:put(token_t)
       end
-
-      -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-      -- but this is all we can do until OpenAI fixes this...
-      --
-      -- essentially, every 4 characters is a token, with minimum of 1*4 per event
-      ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
     end
   end

@@ -141,6 +137,19 @@ local function handle_streaming_frame(conf, chunk, finished)
   local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
   local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")
+
+  if conf.logging and conf.logging.log_statistics then
+    -- no metadata populated in the event streams, do our estimation
+    if completion_tokens_count == 0 then
+      -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+      -- but this is all we can do until OpenAI fixes this...
+ -- + -- essentially, every 4 characters is a token, with minimum of 1*4 per event + completion_tokens_count = math.ceil(#strip(response) / 4) + ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count) + end + end + -- populate cost if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then local cost = (prompt_tokens_count * conf.model.options.input_cost + diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua index f4dbf536ab70..9831385efe95 100644 --- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua @@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do local _, first_got = next(log_message.ai) local actual_llm_latency = first_got.meta.llm_latency local actual_time_per_token = first_got.usage.time_per_token - local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens) + local time_per_token = actual_llm_latency / first_got.usage.completion_tokens first_got.meta.llm_latency = 1 first_got.usage.time_per_token = 1 assert.same(first_expected, first_got) assert.is_true(actual_llm_latency >= 0) - assert.same(actual_time_per_token, time_per_token) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) assert.same(first_got.meta.request_model, "gpt-3.5-turbo") end) @@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do local actual_llm_latency = first_got.meta.llm_latency local actual_time_per_token = first_got.usage.time_per_token - local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens) + local time_per_token = actual_llm_latency / first_got.usage.completion_tokens first_got.meta.llm_latency = 1 first_got.usage.time_per_token = 1 assert.same(first_expected, first_got) assert.is_true(actual_llm_latency >= 0) - assert.same(actual_time_per_token, time_per_token) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) end) it("logs payloads", function() diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index 5cb8a94ed531..6ba0d7bdf97d 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = { }, usage = { prompt_tokens = 18, - completion_tokens = 7, - total_tokens = 25, + completion_tokens = 13, -- this was from estimation + total_tokens = 31, time_per_token = 1, - cost = 0.00037, + cost = 0.00031, }, } @@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do options = { max_tokens = 256, temperature = 1.0, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good", + input_cost = 10.0, + output_cost = 10.0, }, }, }, @@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do options = { max_tokens = 256, temperature = 1.0, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial", + input_cost = 10.0, + output_cost = 10.0, }, }, }, @@ -454,7 +458,9 
@@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -492,6 +498,8 @@ for _, strategy in helpers.all_strategies() do
           temperature = 1.0,
           upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
           anthropic_version = "2023-06-01",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -527,7 +535,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
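A minimal standalone sketch (an editor's illustration, not part of the patch series) of the estimation fallback introduced in PATCH 10: when a provider's event stream carries no usage metadata, completion tokens are approximated as one token per four characters of the stripped response. The local strip() below is a stand-in for kong.tools.string.strip.

local function strip(s)
  return (s:gsub("^%s+", ""):gsub("%s+$", ""))
end

local function estimate_completion_tokens(response_text)
  -- roughly one token per four characters, as the relocated comment describes
  return math.ceil(#strip(response_text) / 4)
end

print(estimate_completion_tokens("The answer to 1 + 1 is 2."))  -- 7 for the visible answer text alone

The updated streaming expectations (completion_tokens = 13, cost = 0.00031 with input_cost and output_cost of 10.0) are consistent with the estimate being taken over the full reassembled response body rather than the visible answer text, and with a cost of (18 * 10.0 + 13 * 10.0) / 1000000 under a per-million-token pricing assumption.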