From 7b2bdbe1f734460cfd8dac5ab61f169b6594b4df Mon Sep 17 00:00:00 2001
From: Jack Tysoe
Date: Tue, 26 Nov 2024 23:09:12 +0000
Subject: [PATCH 01/10] fix(AG-178): set correct conf field for error logger

(cherry picked from commit f43002625a3aec88d138394a5b689f1ffab9717c)
---
 .../ai-request-transformer/filters/transform-request.lua   | 2 +-
 .../ai-response-transformer/filters/transform-response.lua | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kong/plugins/ai-request-transformer/filters/transform-request.lua b/kong/plugins/ai-request-transformer/filters/transform-request.lua
index efa69dbe206e..b9f8952ce5b8 100644
--- a/kong/plugins/ai-request-transformer/filters/transform-request.lua
+++ b/kong/plugins/ai-request-transformer/filters/transform-request.lua
@@ -70,7 +70,7 @@ function _M:run(conf)
   local identity_interface = _KEYBASTION[conf.llm]

   if identity_interface and identity_interface.error then
-    kong.log.err("error authenticating with ", conf.model.provider, " using native provider auth, ", identity_interface.error)
+    kong.log.err("error authenticating with ", conf.llm.model.provider, " using native provider auth, ", identity_interface.error)
     return kong.response.exit(500, "LLM request failed before proxying")
   end

diff --git a/kong/plugins/ai-response-transformer/filters/transform-response.lua b/kong/plugins/ai-response-transformer/filters/transform-response.lua
index 354edbc17049..7c64dc83ed06 100644
--- a/kong/plugins/ai-response-transformer/filters/transform-response.lua
+++ b/kong/plugins/ai-response-transformer/filters/transform-response.lua
@@ -122,7 +122,7 @@ function _M:run(conf)
   local identity_interface = _KEYBASTION[conf.llm]

   if identity_interface and identity_interface.error then
-    kong.log.err("error authenticating with ", conf.model.provider, " using native provider auth, ", identity_interface.error)
+    kong.log.err("error authenticating with ", conf.llm.model.provider, " using native provider auth, ", identity_interface.error)
     return kong.response.exit(500, "LLM request failed before proxying")
   end

From 3594ad3d48cf1eedb9575cba6393d95c275fd4b8 Mon Sep 17 00:00:00 2001
From: Jack Tysoe
Date: Tue, 26 Nov 2024 23:12:31 +0000
Subject: [PATCH 02/10] fix(AG-178): send correct object to ctx buffer

(cherry picked from commit 1f158b1fd584bfa33f222d5cd24bd8a47395a3ab)
---
 kong/llm/plugin/shared-filters/normalize-sse-chunk.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
index 975322c78ed0..4684f8d081ec 100644
--- a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
+++ b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -73,7 +73,7 @@ local function handle_streaming_frame(conf, chunk, finished)
     -- how do we know if this is false but some other filter will need the body?
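-- [editor's sketch, not part of the patch] A minimal standalone illustration of the
-- object mix-up this hunk corrects below: the shared ctx was previously handed the
-- `buffer` module itself rather than the instance returned by buffer.new(), so later
-- filters saw no accumulated body. The `ctx` table here is a hypothetical stand-in
-- for the plugin's shared-ctx helpers.
local buffer = require("string.buffer")   -- LuaJIT string.buffer API

local body_buffer = buffer.new()          -- instance that accumulates SSE payload text
body_buffer:put("the answer is 2.")

local ctx = {}
ctx.sse_body_buffer = body_buffer         -- store the instance, not the `buffer` module
print(ctx.sse_body_buffer:tostring())     -- prints "the answer is 2."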
if conf.logging and conf.logging.log_payloads and not body_buffer then body_buffer = buffer.new() - set_global_ctx("sse_body_buffer", buffer) + set_global_ctx("sse_body_buffer", body_buffer) else kong.log.debug("using existing body buffer created by: ", source) end From d29cb7bc63e5da587b5b25d6f457a9041effe5a6 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:13:50 +0000 Subject: [PATCH 03/10] fix(AG-178): make analytics collection run AFTER response transformation (cherry picked from commit 55e7241e65617794a3232daba4788b1a59125fb2) --- .../shared-filters/normalize-json-response.lua | 16 ++++++++++++++++ .../shared-filters/parse-json-response.lua | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/kong/llm/plugin/shared-filters/normalize-json-response.lua b/kong/llm/plugin/shared-filters/normalize-json-response.lua index 63e2cc9389cd..37693da95464 100644 --- a/kong/llm/plugin/shared-filters/normalize-json-response.lua +++ b/kong/llm/plugin/shared-filters/normalize-json-response.lua @@ -38,6 +38,22 @@ local function transform_body(conf) response_body = cjson.encode({ error = { message = err }}) end + local t, err + if response_body then + t, err = cjson.decode(response_body) + if err then + kong.log.warn("failed to decode response body for usage introspection: ", err) + end + + if t and t.usage and t.usage.prompt_tokens then + ai_plugin_o11y.metrics_set("llm_prompt_tokens_count", t.usage.prompt_tokens) + end + + if t and t.usage and t.usage.completion_tokens then + ai_plugin_o11y.metrics_set("llm_completion_tokens_count", t.usage.completion_tokens) + end + end + set_global_ctx("response_body", response_body) -- to be sent out later or consumed by other plugins end diff --git a/kong/llm/plugin/shared-filters/parse-json-response.lua b/kong/llm/plugin/shared-filters/parse-json-response.lua index fc5308ec89b3..259e104874c3 100644 --- a/kong/llm/plugin/shared-filters/parse-json-response.lua +++ b/kong/llm/plugin/shared-filters/parse-json-response.lua @@ -46,4 +46,4 @@ function _M:run(_) return true end -return _M \ No newline at end of file +return _M From e20b76badb4b6e96ae4f4970d08ee0cfb1e38a8a Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:16:00 +0000 Subject: [PATCH 04/10] fix(AG-178): divide by zero protection on analytics maths (cherry picked from commit 08bf1a25176e56d3f84e96010661cd384bcc88f8) --- kong/llm/drivers/shared.lua | 5 ++++- kong/llm/plugin/observability.lua | 8 ++++++-- .../llm/plugin/shared-filters/normalize-json-response.lua | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 139cc739d97b..0a1ec6d64965 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -719,7 +719,10 @@ function _M.post_request(conf, response_object) meta_container[log_entry_keys.LLM_LATENCY] = llm_latency if response_object.usage and response_object.usage.completion_tokens then - local time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens) + local time_per_token = 0 + if response_object.usage.completion_tokens > 0 then + time_per_token = math.floor(llm_latency / response_object.usage.completion_tokens) + end request_analytics_plugin[log_entry_keys.USAGE_CONTAINER][log_entry_keys.TIME_PER_TOKEN] = time_per_token end end diff --git a/kong/llm/plugin/observability.lua b/kong/llm/plugin/observability.lua index 685ae4ee1a9e..916aeb19c6fd 100644 --- a/kong/llm/plugin/observability.lua +++ b/kong/llm/plugin/observability.lua 
@@ -70,7 +70,11 @@ function _M.metrics_get(key) -- process automatic calculation if not metrics[key] then if key == "llm_tpot_latency" then - return math.floor(_M.metrics_get("llm_e2e_latency") / _M.metrics_get("llm_completion_tokens_count")) + local llm_completion_tokens_count = _M.metrics_get("llm_completion_tokens_count") + if llm_completion_tokens_count > 0 then + return _M.metrics_get("llm_e2e_latency") / llm_completion_tokens_count + end + return 0 elseif key == "llm_total_tokens_count" then return _M.metrics_get("llm_prompt_tokens_count") + _M.metrics_get("llm_completion_tokens_count") end @@ -102,4 +106,4 @@ function _M.record_request_end() return latency end -return _M \ No newline at end of file +return _M diff --git a/kong/llm/plugin/shared-filters/normalize-json-response.lua b/kong/llm/plugin/shared-filters/normalize-json-response.lua index 37693da95464..1e0988f52495 100644 --- a/kong/llm/plugin/shared-filters/normalize-json-response.lua +++ b/kong/llm/plugin/shared-filters/normalize-json-response.lua @@ -38,6 +38,8 @@ local function transform_body(conf) response_body = cjson.encode({ error = { message = err }}) end + -- TODO: avoid json encode and decode when transforming + -- deduplicate body usage parsing from parse-json-response local t, err if response_body then t, err = cjson.decode(response_body) From 865c5d636da5c648f6f69b803d4dd447ffbc92c8 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:20:37 +0000 Subject: [PATCH 05/10] fix(AG-178): properly support all streaming content types (cherry picked from commit 34cd042f097702bf2551b372664d2e440e1962eb) --- kong/llm/drivers/shared.lua | 6 ++++++ kong/llm/plugin/shared-filters/parse-sse-chunk.lua | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kong/llm/drivers/shared.lua b/kong/llm/drivers/shared.lua index 0a1ec6d64965..03c00bbcddb9 100644 --- a/kong/llm/drivers/shared.lua +++ b/kong/llm/drivers/shared.lua @@ -78,6 +78,12 @@ _M._CONST = { ["AWS_STREAM_CONTENT_TYPE"] = "application/vnd.amazon.eventstream", } +_M._SUPPORTED_STREAMING_CONTENT_TYPES = { + ["text/event-stream"] = true, + ["application/vnd.amazon.eventstream"] = true, + ["application/json"] = true, +} + _M.streaming_has_token_counts = { ["cohere"] = true, ["llama2"] = true, diff --git a/kong/llm/plugin/shared-filters/parse-sse-chunk.lua b/kong/llm/plugin/shared-filters/parse-sse-chunk.lua index 219a5017259c..b0c9195a8e27 100644 --- a/kong/llm/plugin/shared-filters/parse-sse-chunk.lua +++ b/kong/llm/plugin/shared-filters/parse-sse-chunk.lua @@ -20,7 +20,7 @@ local function handle_streaming_frame(conf, chunk, finished) local content_type = kong.service.response.get_header("Content-Type") local normalized_content_type = content_type and content_type:sub(1, (content_type:find(";") or 0) - 1) - if normalized_content_type and normalized_content_type ~= "text/event-stream" and normalized_content_type ~= ai_shared._CONST.AWS_STREAM_CONTENT_TYPE then + if normalized_content_type and (not ai_shared._SUPPORTED_STREAMING_CONTENT_TYPES[normalized_content_type]) then return true end From 7a0505c52d5133b866ff470560d885a3c2238150 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Tue, 26 Nov 2024 23:48:45 +0000 Subject: [PATCH 06/10] fix(AG-178): (ai-transformers): add statistic logger test for non-openai format response (cherry picked from commit e02fd84cfc7bdf4ec0cfdcc31eb0098dd9063b5c) --- request.json | 35 ++++++ .../02-integration_spec.lua | 112 +++++++++++++++++- .../request-transformer/response-in-json.json | 22 ++++ 3 files changed, 166 
insertions(+), 3 deletions(-) create mode 100644 request.json create mode 100644 spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json diff --git a/request.json b/request.json new file mode 100644 index 000000000000..3d7efa85675b --- /dev/null +++ b/request.json @@ -0,0 +1,35 @@ +{ + "contents": [ + { + "role": "user", + "parts": [ + { + "text": "Hi Gemini" + } + ] + } + ] + , "generationConfig": { + "temperature": 1 + ,"maxOutputTokens": 8192 + ,"topP": 0.95 + }, + "safetySettings": [ + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "OFF" + }, + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "OFF" + } + ] +} diff --git a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua index c8472b61959f..f61af5c5a031 100644 --- a/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua +++ b/spec/03-plugins/39-ai-request-transformer/02-integration_spec.lua @@ -8,6 +8,7 @@ local MOCK_PORT = helpers.get_available_port() local PLUGIN_NAME = "ai-request-transformer" local FILE_LOG_PATH_STATS_ONLY = os.tmpname() +local FILE_LOG_PATH_GEMINI_STATS_ONLY = os.tmpname() local function wait_for_json_log_entry(FILE_LOG_PATH) local json @@ -55,6 +56,29 @@ local OPENAI_FLAT_RESPONSE = { } local GEMINI_GOOD = { + route_type = "llm/v1/chat", + logging = { + log_payloads = false, + log_statistics = true, + }, + model = { + name = "gemini-1.5-flash", + provider = "gemini", + options = { + max_tokens = 512, + temperature = 0.6, + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/geminiflat", + input_cost = 20.0, + output_cost = 20.0, + }, + }, + auth = { + header_name = "x-goog-api-key", + header_value = "123", + }, +} + +local GEMINI_GOOD_FAILS_SAFETY = { route_type = "llm/v1/chat", logging = { log_payloads = false, @@ -160,6 +184,28 @@ local _EXPECTED_CHAT_STATS = { }, } + +local _EXPECTED_CHAT_STATS_GEMINI = { + ["ai-request-transformer"] = { + meta = { + plugin_id = '71083e79-4921-4f9f-97a4-ee7810b6cd8b', + provider_name = 'gemini', + request_model = 'UNSPECIFIED', + response_model = 'gemini-1.5-flash', + llm_latency = 1 + }, + usage = { + prompt_tokens = 2, + completion_tokens = 11, + total_tokens = 13, + time_per_token = 1, + cost = 0.00026, + }, + cache = {} + }, +} + + local SYSTEM_PROMPT = "You are a mathematician. " .. "Multiply all numbers in my JSON request, by 2." 
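-- [editor's sketch, not part of the patch] How the expected figures above can be
-- reproduced from the fixture values. The per-million-token divisor is an assumption
-- that is consistent with these expectations, not a quote of the plugin internals.
local prompt_tokens, completion_tokens = 2, 11   -- from the Gemini fixture's usageMetadata
local input_cost, output_cost = 20.0, 20.0       -- from GEMINI_GOOD model options

local cost = (prompt_tokens * input_cost + completion_tokens * output_cost) / 1000000
print(cost)   -- 0.00026, the value asserted in _EXPECTED_CHAT_STATS_GEMINI

-- llm_latency and time_per_token are normalized to 1 by the test before the comparison,
-- so only the token counts and the cost above are asserted literally.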
@@ -191,6 +237,13 @@ for _, strategy in helpers.all_strategies() do } } + location ~/geminiflat { + content_by_lua_block { + local pl_file = require "pl.file" + ngx.print(pl_file.read("spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json")) + } + } + location = "/badrequest" { content_by_lua_block { local pl_file = require "pl.file" @@ -234,7 +287,6 @@ for _, strategy in helpers.all_strategies() do llm = OPENAI_FLAT_RESPONSE, }, } - bp.plugins:insert { name = "file-log", route = { id = without_response_instructions.id }, @@ -243,6 +295,27 @@ for _, strategy in helpers.all_strategies() do }, } + -- echo server via 'non-openai' LLM + local gemini_without_response_instructions = assert(bp.routes:insert { + paths = { "/gemini-echo-flat" } + }) + bp.plugins:insert { + name = PLUGIN_NAME, + id = "71083e79-4921-4f9f-97a4-ee7810b6cd8b", + route = { id = gemini_without_response_instructions.id }, + config = { + prompt = SYSTEM_PROMPT, + llm = GEMINI_GOOD, + }, + } + bp.plugins:insert { + name = "file-log", + route = { id = gemini_without_response_instructions.id }, + config = { + path = FILE_LOG_PATH_GEMINI_STATS_ONLY, + }, + } + local bad_request = assert(bp.routes:insert { paths = { "/echo-bad-request" } }) @@ -263,7 +336,7 @@ for _, strategy in helpers.all_strategies() do route = { id = fails_safety.id }, config = { prompt = SYSTEM_PROMPT, - llm = GEMINI_GOOD, + llm = GEMINI_GOOD_FAILS_SAFETY, }, } @@ -322,7 +395,7 @@ for _, strategy in helpers.all_strategies() do assert.same(EXPECTED_RESULT_FLAT, body_table.post_data.params) end) - it("logs statistics", function() + it("logs statistics - openai format", function() local r = client:get("/echo-flat", { headers = { ["content-type"] = "application/json", @@ -355,6 +428,39 @@ for _, strategy in helpers.all_strategies() do assert.same(actual_time_per_token, time_per_token) end) + it("logs statistics - non-openai format", function() + local r = client:get("/gemini-echo-flat", { + headers = { + ["content-type"] = "application/json", + ["accept"] = "application/json", + }, + body = REQUEST_BODY, + }) + + local body = assert.res_status(200 , r) + local _, err = cjson.decode(body) + + assert.is_nil(err) + + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_GEMINI_STATS_ONLY) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + -- test ai-request-transformer stats + local actual_chat_stats = log_message.ai + local actual_llm_latency = actual_chat_stats["ai-request-transformer"].meta.llm_latency + local actual_time_per_token = actual_chat_stats["ai-request-transformer"].usage.time_per_token + local time_per_token = math.floor(actual_llm_latency / actual_chat_stats["ai-request-transformer"].usage.completion_tokens) + + log_message.ai["ai-request-transformer"].meta.llm_latency = 1 + log_message.ai["ai-request-transformer"].usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS_GEMINI, log_message.ai) + assert.is_true(actual_llm_latency >= 0) + assert.same(actual_time_per_token, time_per_token) + end) + it("bad request from LLM", function() local r = client:get("/echo-bad-request", { headers = { diff --git a/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json b/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json new file mode 100644 index 000000000000..503796be6057 --- /dev/null +++ b/spec/fixtures/ai-proxy/gemini/request-transformer/response-in-json.json @@ -0,0 +1,22 @@ +{ + "candidates": [ + { 
+ "content": { + "role": "model", + "parts": [ + { + "text": " {\n \"persons\": [\n {\n \"name\": \"Kong A\",\n \"age\": 62\n },\n {\n \"name\": \"Kong B\",\n \"age\": 84\n }\n ]\n }\n" + } + ] + }, + "finishReason": "STOP", + "avgLogprobs": -0.013348851691592823 + } + ], + "usageMetadata": { + "promptTokenCount": 2, + "candidatesTokenCount": 11, + "totalTokenCount": 13 + }, + "modelVersion": "gemini-1.5-flash-002" +} From d3a292fc95d25a4f422d6d91fe9fcf09ddf9a03e Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 00:21:18 +0000 Subject: [PATCH 07/10] fix(AG-178): add logging tests for non-openai format 'gemini' (cherry picked from commit db5618fae51b5c6da283d798d9d156d3f4e9f246) --- .../shared-filters/serialize-analytics.lua | 2 + .../09-streaming_integration_spec.lua | 25 ++ .../11-gemini_integration_spec.lua | 222 ++++++++++++++++++ .../gemini/llm-v1-chat/responses/good.json | 22 ++ 4 files changed, 271 insertions(+) create mode 100644 spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua create mode 100644 spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json diff --git a/kong/llm/plugin/shared-filters/serialize-analytics.lua b/kong/llm/plugin/shared-filters/serialize-analytics.lua index ffaff8bbb31c..1b246fa3ecd6 100644 --- a/kong/llm/plugin/shared-filters/serialize-analytics.lua +++ b/kong/llm/plugin/shared-filters/serialize-analytics.lua @@ -66,6 +66,8 @@ function _M:run(conf) cost = ai_plugin_o11y.metrics_get("llm_usage_cost"), } + kong.log.inspect(usage) + kong.log.set_serialize_value(string.format("ai.%s.usage", ai_plugin_o11y.NAMESPACE), usage) diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index a2a35a0fac0a..d7c89b79b0b7 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -632,6 +632,31 @@ for _, strategy in helpers.all_strategies() do assert.equal(#events, 8) assert.equal(buf:tostring(), "The answer to 1 + 1 is 2.") + + -- test analytics on this item + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_WITH_PAYLOADS) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + local actual_stats = log_message.ai.proxy + + local actual_llm_latency = actual_stats.meta.llm_latency + local actual_time_per_token = actual_stats.usage.time_per_token + local time_per_token = actual_llm_latency / actual_stats.usage.completion_tokens + + local actual_request_log = actual_stats.payload.request or "ERROR: NONE_RETURNED" + local actual_response_log = actual_stats.payload.response or "ERROR: NONE_RETURNED" + actual_stats.payload = nil + + actual_stats.meta.llm_latency = 1 + actual_stats.usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS, actual_stats) + assert.is_true(actual_llm_latency >= 0) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) + assert.match_re(actual_request_log, [[.*content.*What is 1 \+ 1.*]]) + assert.match_re(actual_response_log, [[.*content.*The answer.*]]) end) it("good stream request cohere", function() diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua new file mode 100644 index 000000000000..76c80d541ebb --- /dev/null +++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua @@ -0,0 +1,222 @@ +local 
helpers = require("spec.helpers") +local cjson = require("cjson") +local pl_file = require("pl.file") +local strip = require("kong.tools.string").strip + +local PLUGIN_NAME = "ai-proxy" +local MOCK_PORT = helpers.get_available_port() + +local FILE_LOG_PATH_WITH_PAYLOADS = os.tmpname() + +local truncate_file = function(path) + local file = io.open(path, "w") + file:close() +end + +local function wait_for_json_log_entry(FILE_LOG_PATH) + local json + + assert + .with_timeout(10) + .ignore_exceptions(true) + .eventually(function() + local data = assert(pl_file.read(FILE_LOG_PATH)) + + data = strip(data) + assert(#data > 0, "log file is empty") + + data = data:match("%b{}") + assert(data, "log file does not contain JSON") + + json = cjson.decode(data) + end) + .has_no_error("log file contains a valid JSON entry") + + return json +end + +local _EXPECTED_CHAT_STATS = { + meta = { + plugin_id = '17434c15-2c7c-4c2f-b87a-58880533a3c1', + provider_name = 'gemini', + request_model = 'gemini-1.5-pro', + response_model = 'gemini-1.5-pro', + llm_latency = 1, + }, + usage = { + prompt_tokens = 2, + completion_tokens = 11, + total_tokens = 13, + time_per_token = 1, + cost = 0.000195, + }, +} + +for _, strategy in helpers.all_strategies() do + if strategy ~= "cassandra" then + describe(PLUGIN_NAME .. ": (access) [#" .. strategy .. "]", function() + local client + + lazy_setup(function() + local bp = helpers.get_db_utils(strategy == "off" and "postgres" or strategy, nil, { PLUGIN_NAME }) + + -- set up gemini mock fixtures + local fixtures = { + http_mock = {}, + } + + fixtures.http_mock.gemini = [[ + server { + server_name gemini; + listen ]] .. MOCK_PORT .. [[; + + default_type 'application/json'; + + location = "/v1/chat/completions" { + content_by_lua_block { + local pl_file = require "pl.file" + local json = require("cjson.safe") + + local token = ngx.req.get_headers()["authorization"] + if token == "Bearer gemini-key" then + ngx.req.read_body() + local body, err = ngx.req.get_body_data() + body, err = json.decode(body) + + ngx.status = 200 + ngx.print(pl_file.read("spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json")) + end + } + } + } + ]] + + local empty_service = assert(bp.services:insert({ + name = "empty_service", + host = "localhost", --helpers.mock_upstream_host, + port = 8080, --MOCK_PORT, + path = "/", + })) + + -- 200 chat good with one option + local chat_good = assert(bp.routes:insert({ + service = empty_service, + protocols = { "http" }, + strip_path = true, + paths = { "/gemini/llm/v1/chat/good" }, + })) + bp.plugins:insert({ + name = PLUGIN_NAME, + id = "17434c15-2c7c-4c2f-b87a-58880533a3c1", + route = { id = chat_good.id }, + config = { + route_type = "llm/v1/chat", + auth = { + header_name = "Authorization", + header_value = "Bearer gemini-key", + }, + logging = { + log_payloads = true, + log_statistics = true, + }, + model = { + name = "gemini-1.5-pro", + provider = "gemini", + options = { + max_tokens = 256, + temperature = 1.0, + upstream_url = "http://" .. helpers.mock_upstream_host .. ":" .. MOCK_PORT .. 
"/v1/chat/completions", + input_cost = 15.0, + output_cost = 15.0, + }, + }, + }, + }) + bp.plugins:insert { + name = "file-log", + route = { id = chat_good.id }, + config = { + path = FILE_LOG_PATH_WITH_PAYLOADS, + }, + } + + -- start kong + assert(helpers.start_kong({ + -- set the strategy + database = strategy, + -- use the custom test template to create a local mock server + nginx_conf = "spec/fixtures/custom_nginx.template", + -- make sure our plugin gets loaded + plugins = "bundled," .. PLUGIN_NAME, + -- write & load declarative config, only if 'strategy=off' + declarative_config = strategy == "off" and helpers.make_yaml_file() or nil, + }, nil, nil, fixtures)) + end) + + lazy_teardown(function() + helpers.stop_kong() + os.remove(FILE_LOG_PATH_WITH_PAYLOADS) + end) + + before_each(function() + client = helpers.proxy_client() + truncate_file(FILE_LOG_PATH_WITH_PAYLOADS) + end) + + after_each(function() + if client then + client:close() + end + end) + + describe("gemini llm/v1/chat", function() + it("good request", function() + local r = client:get("/gemini/llm/v1/chat/good", { + headers = { + ["content-type"] = "application/json", + ["accept"] = "application/json", + }, + body = pl_file.read("spec/fixtures/ai-proxy/openai/llm-v1-chat/requests/good.json"), + }) + -- validate that the request succeeded, response status 200 + local body = assert.res_status(200, r) + local json = cjson.decode(body) + + -- check this is in the 'kong' response format + assert.equals(json.model, "gemini-1.5-pro") + assert.equals(json.object, "chat.completion") + assert.equals(json.choices[1].finish_reason, "stop") + + assert.is_table(json.choices) + assert.is_string(json.choices[1].message.content) + assert.same("Everything is okay.", json.choices[1].message.content) + + -- test stats from file-log + local log_message = wait_for_json_log_entry(FILE_LOG_PATH_WITH_PAYLOADS) + assert.same("127.0.0.1", log_message.client_ip) + assert.is_number(log_message.request.size) + assert.is_number(log_message.response.size) + + local actual_stats = log_message.ai.proxy + + local actual_llm_latency = actual_stats.meta.llm_latency + local actual_time_per_token = actual_stats.usage.time_per_token + local time_per_token = actual_llm_latency / actual_stats.usage.completion_tokens + + local actual_request_log = actual_stats.payload.request + local actual_response_log = actual_stats.payload.response + actual_stats.payload = nil + + actual_stats.meta.llm_latency = 1 + actual_stats.usage.time_per_token = 1 + + assert.same(_EXPECTED_CHAT_STATS, actual_stats) + assert.is_true(actual_llm_latency >= 0) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) + assert.match_re(actual_request_log, [[.*contents.*What is 1 \+ 1.*]]) + assert.match_re(actual_response_log, [[.*content.*Everything is okay.*]]) + end) + end) + end) + end +end diff --git a/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json b/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json new file mode 100644 index 000000000000..0242dfd3b45f --- /dev/null +++ b/spec/fixtures/ai-proxy/gemini/llm-v1-chat/responses/good.json @@ -0,0 +1,22 @@ +{ + "candidates": [ + { + "content": { + "role": "model", + "parts": [ + { + "text": "Everything is okay." 
+ } + ] + }, + "finishReason": "STOP", + "avgLogprobs": -0.013348851691592823 + } + ], + "usageMetadata": { + "promptTokenCount": 2, + "candidatesTokenCount": 11, + "totalTokenCount": 13 + }, + "modelVersion": "gemini-1.5-flash-002" +} From 2c436dbb33c84f575ac203480453b48753b3d373 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 00:40:35 +0000 Subject: [PATCH 08/10] fix(AG-178): add logging tests for streaming (cherry picked from commit bd526e5ef1bcce2e641970da9ea4adf8f2f8c9a0) --- .../09-streaming_integration_spec.lua | 61 +++++++++++++++++++ .../11-gemini_integration_spec.lua | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index d7c89b79b0b7..5cb8a94ed531 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -1,12 +1,59 @@ local helpers = require "spec.helpers" local cjson = require "cjson.safe" local pl_file = require "pl.file" +local strip = require("kong.tools.string").strip local http = require("resty.http") local PLUGIN_NAME = "ai-proxy" local MOCK_PORT = helpers.get_available_port() +local FILE_LOG_PATH_WITH_PAYLOADS = os.tmpname() + +local _EXPECTED_CHAT_STATS = { + meta = { + plugin_id = '6e7c40f6-ce96-48e4-a366-d109c169e444', + provider_name = 'openai', + request_model = 'gpt-3.5-turbo', + response_model = 'gpt-3.5-turbo', + llm_latency = 1 + }, + usage = { + prompt_tokens = 18, + completion_tokens = 7, + total_tokens = 25, + time_per_token = 1, + cost = 0.00037, + }, +} + +local truncate_file = function(path) + local file = io.open(path, "w") + file:close() +end + +local function wait_for_json_log_entry(FILE_LOG_PATH) + local json + + assert + .with_timeout(10) + .ignore_exceptions(true) + .eventually(function() + local data = assert(pl_file.read(FILE_LOG_PATH)) + + data = strip(data) + assert(#data > 0, "log file is empty") + + data = data:match("%b{}") + assert(data, "log file does not contain JSON") + + json = cjson.decode(data) + end) + .has_no_error("log file contains a valid JSON entry") + + return json +end + for _, strategy in helpers.all_strategies() do describe(PLUGIN_NAME .. ": (access) [#" .. strategy .. 
"]", function() local client @@ -353,6 +400,7 @@ for _, strategy in helpers.all_strategies() do }) bp.plugins:insert { name = PLUGIN_NAME, + id = "6e7c40f6-ce96-48e4-a366-d109c169e444", route = { id = openai_chat_partial.id }, config = { route_type = "llm/v1/chat", @@ -360,6 +408,10 @@ for _, strategy in helpers.all_strategies() do header_name = "Authorization", header_value = "Bearer openai-key", }, + logging = { + log_payloads = true, + log_statistics = true, + }, model = { name = "gpt-3.5-turbo", provider = "openai", @@ -371,6 +423,13 @@ for _, strategy in helpers.all_strategies() do }, }, } + bp.plugins:insert { + name = "file-log", + route = { id = openai_chat_partial.id }, + config = { + path = FILE_LOG_PATH_WITH_PAYLOADS, + }, + } -- -- 200 chat cohere @@ -497,10 +556,12 @@ for _, strategy in helpers.all_strategies() do lazy_teardown(function() helpers.stop_kong() + os.remove(FILE_LOG_PATH_WITH_PAYLOADS) end) before_each(function() client = helpers.proxy_client() + truncate_file(FILE_LOG_PATH_WITH_PAYLOADS) end) after_each(function() diff --git a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua index 76c80d541ebb..1e1bce8081bb 100644 --- a/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/11-gemini_integration_spec.lua @@ -219,4 +219,4 @@ for _, strategy in helpers.all_strategies() do end) end) end -end +end \ No newline at end of file From 6b13d961f4216e3d2f300f99affd6e8238a915e7 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Wed, 27 Nov 2024 01:49:17 +0000 Subject: [PATCH 09/10] fix(AG-178): fix gemini parsing multiple chained tool calls (cherry picked from commit 581571b0c35035f0304282978610f2cbe4f2c8b4) --- kong/llm/drivers/gemini.lua | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/kong/llm/drivers/gemini.lua b/kong/llm/drivers/gemini.lua index 1cbdd6095e5f..99ba1b4adbac 100644 --- a/kong/llm/drivers/gemini.lua +++ b/kong/llm/drivers/gemini.lua @@ -268,22 +268,23 @@ local function from_gemini_chat_openai(response, model_info, route_type) messages.model = model_info.name elseif is_tool_content(response) then + messages.choices[1] = { + index = 0, + message = { + role = "assistant", + tool_calls = {}, + }, + } + local function_call_responses = response.candidates[1].content.parts for i, v in ipairs(function_call_responses) do - messages.choices[i] = { - index = 0, - message = { - role = "assistant", - tool_calls = { - { - ['function'] = { - name = v.functionCall.name, - arguments = cjson.encode(v.functionCall.args), - }, - }, + messages.choices[1].message.tool_calls[i] = + { + ['function'] = { + name = v.functionCall.name, + arguments = cjson.encode(v.functionCall.args), }, - }, - } + } end end From d6314a7d6ca9776aeb7a9da77685f240caa8db58 Mon Sep 17 00:00:00 2001 From: Jack Tysoe Date: Thu, 28 Nov 2024 13:51:50 +0000 Subject: [PATCH 10/10] fix(llm): fix streaming sse filter not ran twice and prompt token count and missing metadata (cherry picked from commit e053601b40e4b642575cb184e64204a17414d484) --- kong/llm/plugin/base.lua | 9 ++++++- .../shared-filters/normalize-sse-chunk.lua | 21 +++++++++++----- .../02-openai_integration_spec.lua | 8 +++---- .../09-streaming_integration_spec.lua | 24 +++++++++++++------ 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/kong/llm/plugin/base.lua b/kong/llm/plugin/base.lua index 3a23c78a8732..0daca7a29419 100644 --- a/kong/llm/plugin/base.lua +++ 
b/kong/llm/plugin/base.lua
@@ -19,6 +19,13 @@ local STAGES = {
   RES_POST_PROCESSING = 7,
 }

+-- Filters in these stages are allowed to execute more than once in a request
+-- TODO: implement singleton support, so that in one iteration of body_filter each
+-- filter only runs once. This is not an issue today as they are only used in one plugin.
+local REPEATED_PHASES = {
+  [STAGES.STREAMING] = true,
+}
+
 local MetaPlugin = {}

 local all_filters = {}
@@ -38,7 +45,7 @@ local function run_stage(stage, sub_plugin, conf)

     if not f then
       kong.log.err("no filter named '" .. name .. "' registered")

-    elseif not ai_executed_filters[name] then
+    elseif not ai_executed_filters[name] or REPEATED_PHASES[stage] then
       ai_executed_filters[name] = true

       kong.log.debug("executing filter ", name)

diff --git a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
index 4684f8d081ec..d97d5c59b949 100644
--- a/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
+++ b/kong/llm/plugin/shared-filters/normalize-sse-chunk.lua
@@ -83,6 +83,8 @@ local function handle_streaming_frame(conf, chunk, finished)

   for _, event in ipairs(events) do
+    -- TODO: currently only a subset of drivers follow the body, err, metadata pattern;
+    -- unify this so that it is always extracted from the body
     local formatted, _, metadata = ai_driver.from_format(event, conf.model, "stream/" .. conf.route_type)

     if formatted then
@@ -106,12 +108,6 @@ local function handle_streaming_frame(conf, chunk, finished)
       if body_buffer then
         body_buffer:put(token_t)
       end
-
-      -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
-      -- but this is all we can do until OpenAI fixes this...
-      --
-      -- essentially, every 4 characters is a token, with minimum of 1*4 per event
-      ai_plugin_o11y.metrics_add("llm_completion_tokens_count", math.ceil(#strip(token_t) / 4))
     end
   end

@@ -141,6 +137,19 @@ local function handle_streaming_frame(conf, chunk, finished)
   local prompt_tokens_count = ai_plugin_o11y.metrics_get("llm_prompt_tokens_count")
   local completion_tokens_count = ai_plugin_o11y.metrics_get("llm_completion_tokens_count")
+
+  if conf.logging and conf.logging.log_statistics then
+    -- no metadata populated in the event streams, do our estimation
+    if completion_tokens_count == 0 then
+      -- incredibly loose estimate based on https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
+      -- but this is all we can do until OpenAI fixes this...
+ -- + -- essentially, every 4 characters is a token, with minimum of 1*4 per event + completion_tokens_count = math.ceil(#strip(response) / 4) + ai_plugin_o11y.metrics_set("llm_completion_tokens_count", completion_tokens_count) + end + end + -- populate cost if conf.model.options and conf.model.options.input_cost and conf.model.options.output_cost then local cost = (prompt_tokens_count * conf.model.options.input_cost + diff --git a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua index f4dbf536ab70..9831385efe95 100644 --- a/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/02-openai_integration_spec.lua @@ -871,14 +871,14 @@ for _, strategy in helpers.all_strategies() do local _, first_got = next(log_message.ai) local actual_llm_latency = first_got.meta.llm_latency local actual_time_per_token = first_got.usage.time_per_token - local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens) + local time_per_token = actual_llm_latency / first_got.usage.completion_tokens first_got.meta.llm_latency = 1 first_got.usage.time_per_token = 1 assert.same(first_expected, first_got) assert.is_true(actual_llm_latency >= 0) - assert.same(actual_time_per_token, time_per_token) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) assert.same(first_got.meta.request_model, "gpt-3.5-turbo") end) @@ -1529,14 +1529,14 @@ for _, strategy in helpers.all_strategies() do local actual_llm_latency = first_got.meta.llm_latency local actual_time_per_token = first_got.usage.time_per_token - local time_per_token = math.floor(actual_llm_latency / first_got.usage.completion_tokens) + local time_per_token = actual_llm_latency / first_got.usage.completion_tokens first_got.meta.llm_latency = 1 first_got.usage.time_per_token = 1 assert.same(first_expected, first_got) assert.is_true(actual_llm_latency >= 0) - assert.same(actual_time_per_token, time_per_token) + assert.same(tonumber(string.format("%.3f", actual_time_per_token)), tonumber(string.format("%.3f", time_per_token))) end) it("logs payloads", function() diff --git a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua index 5cb8a94ed531..6ba0d7bdf97d 100644 --- a/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua +++ b/spec/03-plugins/38-ai-proxy/09-streaming_integration_spec.lua @@ -20,10 +20,10 @@ local _EXPECTED_CHAT_STATS = { }, usage = { prompt_tokens = 18, - completion_tokens = 7, - total_tokens = 25, + completion_tokens = 13, -- this was from estimation + total_tokens = 31, time_per_token = 1, - cost = 0.00037, + cost = 0.00031, }, } @@ -377,7 +377,9 @@ for _, strategy in helpers.all_strategies() do options = { max_tokens = 256, temperature = 1.0, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/good", + input_cost = 10.0, + output_cost = 10.0, }, }, }, @@ -418,7 +420,9 @@ for _, strategy in helpers.all_strategies() do options = { max_tokens = 256, temperature = 1.0, - upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial" + upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/partial", + input_cost = 10.0, + output_cost = 10.0, }, }, }, @@ -454,7 +458,9 
@@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/cohere/llm/v1/chat/good",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -492,6 +498,8 @@ for _, strategy in helpers.all_strategies() do
           temperature = 1.0,
           upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/anthropic/llm/v1/chat/good",
           anthropic_version = "2023-06-01",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
@@ -527,7 +535,9 @@ for _, strategy in helpers.all_strategies() do
         options = {
           max_tokens = 256,
           temperature = 1.0,
-          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad"
+          upstream_url = "http://"..helpers.mock_upstream_host..":"..MOCK_PORT.."/openai/llm/v1/chat/bad",
+          input_cost = 10.0,
+          output_cost = 10.0,
         },
       },
     },
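A minimal standalone sketch (an editor's illustration, not part of the patch series) of the estimation fallback introduced in PATCH 10: when a provider's event stream carries no usage metadata, completion tokens are approximated as one token per four characters of the stripped response. The local strip() below is a stand-in for kong.tools.string.strip.

local function strip(s)
  return (s:gsub("^%s+", ""):gsub("%s+$", ""))
end

local function estimate_completion_tokens(response_text)
  -- roughly one token per four characters, as the relocated comment describes
  return math.ceil(#strip(response_text) / 4)
end

print(estimate_completion_tokens("The answer to 1 + 1 is 2."))  -- 7 for the visible answer text alone

The updated streaming expectations (completion_tokens = 13, cost = 0.00031 with input_cost and output_cost of 10.0) are consistent with the estimate being taken over the full reassembled response body rather than the visible answer text, and with a cost of (18 * 10.0 + 13 * 10.0) / 1000000 under a per-million-token pricing assumption.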