From d9a20eb599cdb21dcddd3c04269b4b3b4345abbb Mon Sep 17 00:00:00 2001
From: Brandon <132288221+brandon-groundlight@users.noreply.github.com>
Date: Mon, 14 Oct 2024 12:37:54 -0700
Subject: [PATCH] Utilize source field for confidence and answered (#264)

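There is currently a bug for counting and multiclass detectors where the
confidence can be below 0.5. That breaks the earlier assumption that every
answered query has a confidence above the 0.5 placeholder value, so the
result's source field is now used to decide whether a query is answered.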

---------

Co-authored-by: Auto-format Bot <autoformatbot@groundlight.ai>
---
 pyproject.toml                                |   2 +-
 src/groundlight/internalapi.py                |  15 +-
 test/integration/test_groundlight.py          |  88 +-----------
 .../integration/test_groundlight_expensive.py | 135 ++++++++++++++++++
 4 files changed, 142 insertions(+), 98 deletions(-)
 create mode 100644 test/integration/test_groundlight_expensive.py

diff --git a/pyproject.toml b/pyproject.toml
index 10200297..ed154e3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ packages = [
     {include = "**/*.py", from = "src"},
 ]
 readme = "README.md"
-version = "0.18.3"
+version = "0.18.4"
 
 [tool.poetry.dependencies]
 # For certifi, use ">=" instead of "^" since it upgrades its "major version" every year, not really following semver
diff --git a/src/groundlight/internalapi.py b/src/groundlight/internalapi.py
index f4984810..5952c264 100644
--- a/src/groundlight/internalapi.py
+++ b/src/groundlight/internalapi.py
@@ -11,7 +11,7 @@
 
 import requests
 from groundlight_openapi_client.api_client import ApiClient, ApiException
-from model import Detector, ImageQuery
+from model import Detector, ImageQuery, Source
 
 from groundlight.status_codes import is_ok
 from groundlight.version import get_version
@@ -61,12 +61,12 @@ def _generate_request_id():
 
 def iq_is_confident(iq: ImageQuery, confidence_threshold: float) -> bool:
     """Returns True if the image query's confidence is above threshold.
-    The only subtletie here is that currently confidence of None means
+    The only subtlety here is that currently confidence of None means
     human label, which is treated as confident.
     """
     if iq.result.confidence is None:
         # Human label
         return True
     return iq.result.confidence >= confidence_threshold
 
 
@@ -74,11 +74,9 @@ def iq_is_answered(iq: ImageQuery) -> bool:
     """Returns True if the image query has a ML or human label.
-    Placeholder and special labels (out of domain) have confidences exactly 0.5
+    A query is unanswered while its result source is STILL_PROCESSING (or missing).
     """
-    if iq.result.confidence is None:
-        # Human label
-        return True
-    placeholder_confidence = 0.5
-    return iq.result.confidence > placeholder_confidence
+    if (iq.result.source == Source.STILL_PROCESSING) or (iq.result.source is None):  # Should never be None
+        return False
+    return True
 
 
 class InternalApiError(ApiException, RuntimeError):
diff --git a/test/integration/test_groundlight.py b/test/integration/test_groundlight.py
index 8240b607..85a472c2 100644
--- a/test/integration/test_groundlight.py
+++ b/test/integration/test_groundlight.py
@@ -12,7 +12,7 @@
 import pytest
 from groundlight import Groundlight
 from groundlight.binary_labels import VALID_DISPLAY_LABELS, DeprecatedLabel, Label, convert_internal_label_to_display
-from groundlight.internalapi import InternalApiError, NotFoundError, iq_is_answered
+from groundlight.internalapi import InternalApiError, NotFoundError
 from groundlight.optional_imports import *
 from groundlight.status_codes import is_user_error
 from model import (
@@ -652,92 +652,6 @@ def test_submit_numpy_image(gl: Groundlight, detector: Detector):
     assert is_valid_display_result(_image_query.result)
 
 
-@pytest.mark.skip(reason="This test can block development depending on the state of the service")
-@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow")  # type: ignore
-def test_detector_improvement(gl: Groundlight):
-    # test that we get confidence improvement after sending images in
-    # Pass two of each type of image in
-    import random
-    import time
-
-    from PIL import Image, ImageEnhance
-
-    random.seed(2741)
-
-    name = f"Test test_detector_improvement {datetime.utcnow()}"  # Need a unique name
-    query = "Is there a dog?"
-    detector = gl.create_detector(name=name, query=query)
-
-    def submit_noisy_image(image, label=None):
-        sharpness = ImageEnhance.Sharpness(image)
-        noisy_image = sharpness.enhance(random.uniform(0.75, 1.25))
-        color = ImageEnhance.Color(noisy_image)
-        noisy_image = color.enhance(random.uniform(0.75, 1))
-        contrast = ImageEnhance.Contrast(noisy_image)
-        noisy_image = contrast.enhance(random.uniform(0.75, 1))
-        brightness = ImageEnhance.Brightness(noisy_image)
-        noisy_image = brightness.enhance(random.uniform(0.75, 1))
-        img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER")
-        if label is not None:
-            gl.add_label(img_query, label)
-        return img_query
-
-    dog = Image.open("test/assets/dog.jpeg")
-    cat = Image.open("test/assets/cat.jpeg")
-
-    submit_noisy_image(dog, "YES")
-    submit_noisy_image(dog, "YES")
-    submit_noisy_image(cat, "NO")
-    submit_noisy_image(cat, "NO")
-
-    # wait to give enough time to train
-    wait_period = 30  # seconds
-    num_wait_periods = 4  # 2 minutes total
-    result_confidence = 0.6
-    new_dog_query = None
-    new_cat_query = None
-    for _ in range(num_wait_periods):
-        time.sleep(wait_period)
-        new_dog_query = submit_noisy_image(dog)
-        new_cat_query = submit_noisy_image(cat)
-        new_cat_result_confidence = new_cat_query.result.confidence
-        new_dog_result_confidence = new_dog_query.result.confidence
-
-        if (
-            new_cat_result_confidence and new_cat_result_confidence < result_confidence
-        ) or new_cat_query.result.label == "YES":
-            # If the new query is not confident enough, we'll try again
-            continue
-        elif (
-            new_dog_result_confidence and new_dog_result_confidence < result_confidence
-        ) or new_dog_query.result.label == "NO":
-            # If the new query is not confident enough, we'll try again
-            continue
-        else:
-            assert True
-            return
-
-    assert (
-        False
-    ), f"The detector {detector} quality has not improved after two minutes q.v. {new_dog_query}, {new_cat_query}"
-
-
-@pytest.mark.skip(
-    reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result"
-)
-def test_ask_method_quality(gl: Groundlight, detector: Detector):
-    # asks for some level of quality on how fast ask_ml is and that we will get a confident result from ask_confident
-    fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
-    assert iq_is_answered(fast_always_yes_iq)
-    name = f"Test {datetime.utcnow()}"  # Need a unique name
-    query = "Is there a dog?"
-    detector = gl.create_detector(name=name, query=query, confidence_threshold=0.8)
-    fast_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
-    assert iq_is_answered(fast_iq)
-    confident_iq = gl.ask_confident(detector=detector.id, image="test/assets/dog.jpeg", wait=180)
-    assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD)
-
-
 @pytest.mark.skip_for_edge_endpoint(reason="The edge-endpoint doesn't support inspection_id")
 def test_start_inspection(gl: Groundlight):
     inspection_id = gl.start_inspection()
diff --git a/test/integration/test_groundlight_expensive.py b/test/integration/test_groundlight_expensive.py
new file mode 100644
index 00000000..a45888c4
--- /dev/null
+++ b/test/integration/test_groundlight_expensive.py
@@ -0,0 +1,135 @@
+"""
+We collect various expensive tests here. These tests should not be run regularly.
+"""
+
+# Optional star-imports are weird and not usually recommended ...
+# ruff: noqa: F403,F405
+# pylint: disable=wildcard-import,unused-wildcard-import,redefined-outer-name,import-outside-toplevel
+import random
+import time
+from datetime import datetime
+
+import pytest
+from groundlight import Groundlight
+from groundlight.internalapi import iq_is_answered, iq_is_confident
+from groundlight.optional_imports import *
+from model import (
+    Detector,
+)
+
+DEFAULT_CONFIDENCE_THRESHOLD = 0.9
+IQ_IMPROVEMENT_THRESHOLD = 0.75
+
+
+@pytest.fixture(name="gl")
+def fixture_gl() -> Groundlight:
+    """Provides a Groundlight client whose DEFAULT_WAIT is set to 10 for the tests."""
+    client = Groundlight()
+    client.DEFAULT_WAIT = 10
+    return client
+
+
+@pytest.mark.skip(reason="This test requires a human labeler who does not need to be in the testing loop")
+def test_human_label(gl: Groundlight):
+    """Waits until a human-reviewed query has a non-algorithm answer, then checks both iq_* helpers."""
+    detector = gl.create_detector(name=f"Test {datetime.utcnow()}", query="Is there a dog?")
+    img_query = gl.submit_image_query(
+        detector=detector.id, image="test/assets/dog.jpeg", wait=60, human_review="ALWAYS"
+    )
+
+    count = 0
+    sleep_time, total_time = 5, 60  # seconds
+    while img_query.result.source == "ALGORITHM" or img_query.result.label == "STILL_PROCESSING":
+        count += 1
+        time.sleep(sleep_time)
+        img_query = gl.get_image_query(img_query.id)
+        if count > total_time / sleep_time:
+            pytest.fail(f"Human review is taking too long: {img_query}")
+
+    assert iq_is_answered(img_query)
+    assert iq_is_confident(img_query, confidence_threshold=DEFAULT_CONFIDENCE_THRESHOLD)
+
+
+@pytest.mark.skip(reason="This test can block development depending on the state of the service")
+@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow")  # type: ignore
+def test_detector_improvement(gl: Groundlight):
+    """Checks that a new detector gives confident, correct answers after a few labeled images.
+
+    Two noisy copies of each image are sent in with labels, then unlabeled copies are polled.
+    """
+    from PIL import Image, ImageEnhance
+
+    random.seed(2741)
+
+    name = f"Test test_detector_improvement {datetime.utcnow()}"  # Need a unique name
+    query = "Is there a dog?"
+    detector = gl.create_detector(name=name, query=query)
+
+    def submit_noisy_image(image, label=None):
+        sharpness = ImageEnhance.Sharpness(image)
+        noisy_image = sharpness.enhance(random.uniform(0.75, 1.25))
+        color = ImageEnhance.Color(noisy_image)
+        noisy_image = color.enhance(random.uniform(0.75, 1))
+        contrast = ImageEnhance.Contrast(noisy_image)
+        noisy_image = contrast.enhance(random.uniform(0.75, 1))
+        brightness = ImageEnhance.Brightness(noisy_image)
+        noisy_image = brightness.enhance(random.uniform(0.75, 1))
+        img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER")
+        if label is not None:
+            gl.add_label(img_query, label)
+        return img_query
+
+    dog = Image.open("test/assets/dog.jpeg")
+    cat = Image.open("test/assets/cat.jpeg")
+
+    submit_noisy_image(dog, "YES")
+    submit_noisy_image(dog, "YES")
+    submit_noisy_image(cat, "NO")
+    submit_noisy_image(cat, "NO")
+
+    # wait to give enough time to train
+    wait_period = 30  # seconds
+    num_wait_periods = 4  # 2 minutes total
+    result_confidence = 0.6
+    new_dog_query = None
+    new_cat_query = None
+    for _ in range(num_wait_periods):
+        time.sleep(wait_period)
+        new_dog_query = submit_noisy_image(dog)
+        new_cat_query = submit_noisy_image(cat)
+        new_cat_result_confidence = new_cat_query.result.confidence
+        new_dog_result_confidence = new_dog_query.result.confidence
+
+        # Compare against None explicitly: a plain truthiness test would skip the
+        # threshold check for a confidence of exactly 0.0.
+        cat_ok = new_cat_query.result.label != "YES" and (
+            new_cat_result_confidence is None or new_cat_result_confidence >= result_confidence
+        )
+        dog_ok = new_dog_query.result.label != "NO" and (
+            new_dog_result_confidence is None or new_dog_result_confidence >= result_confidence
+        )
+        if cat_ok and dog_ok:
+            # Both unlabeled queries came back with the right label and enough confidence
+            return
+        # Otherwise the new queries are not confident enough yet, so we'll try again
+
+    # The loop only falls through when no round produced good answers for both images.
+    pytest.fail(
+        f"The detector {detector} quality has not improved after two minutes q.v. {new_dog_query}, {new_cat_query}"
+    )
+
+
+@pytest.mark.skip(
+    reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result"
+)
+def test_ask_method_quality(gl: Groundlight, detector: Detector):
+    """Checks that ask_ml with wait=0 returns an answered query and ask_confident a confident one."""
+    fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
+    assert iq_is_answered(fast_always_yes_iq)
+    new_detector = gl.create_detector(
+        name=f"Test {datetime.utcnow()}", query="Is there a dog?", confidence_threshold=0.8  # Need a unique name
+    )
+    fast_iq = gl.ask_ml(detector=new_detector.id, image="test/assets/dog.jpeg", wait=0)
+    assert iq_is_answered(fast_iq)
+    confident_iq = gl.ask_confident(detector=new_detector.id, image="test/assets/dog.jpeg", wait=180)
+    assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD)