Utilize source field for confidence and answered (#264)
There's currently a bug for counting and multiclass detectors, where confidence can
legitimately fall below 0.5 (with N candidate answers, the top confidence need only exceed 1/N), which breaks the previous assumption that any answered query has confidence above the 0.5 placeholder value.

---------

Co-authored-by: Auto-format Bot <[email protected]>
brandon-groundlight and Auto-format Bot authored Oct 14, 2024
1 parent b89936d commit d9a20eb
Showing 4 changed files with 142 additions and 98 deletions.
pyproject.toml: 2 changes (1 addition, 1 deletion)
@@ -9,7 +9,7 @@ packages = [
{include = "**/*.py", from = "src"},
]
readme = "README.md"
version = "0.18.3"
version = "0.18.4"

[tool.poetry.dependencies]
# For certifi, use ">=" instead of "^" since it upgrades its "major version" every year, not really following semver
src/groundlight/internalapi.py: 15 changes (5 additions, 10 deletions)
@@ -11,7 +11,7 @@

import requests
from groundlight_openapi_client.api_client import ApiClient, ApiException
from model import Detector, ImageQuery
from model import Detector, ImageQuery, Source

from groundlight.status_codes import is_ok
from groundlight.version import get_version
@@ -61,24 +61,19 @@ def _generate_request_id():

def iq_is_confident(iq: ImageQuery, confidence_threshold: float) -> bool:
"""Returns True if the image query's confidence is above threshold.
The only subtletie here is that currently confidence of None means
The only subtlety here is that currently confidence of None means
human label, which is treated as confident.
"""
if iq.result.confidence is None:
# Human label
return True
return iq.result.confidence >= confidence_threshold


def iq_is_answered(iq: ImageQuery) -> bool:
"""Returns True if the image query has a ML or human label.
Placeholder and special labels (out of domain) have confidences exactly 0.5
"""
if iq.result.confidence is None:
# Human label
return True
placeholder_confidence = 0.5
return iq.result.confidence > placeholder_confidence
if (iq.result.source == Source.STILL_PROCESSING) or (iq.result.source is None): # Should never be None
return False
return True


class InternalApiError(ApiException, RuntimeError):
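For context on the fix (a sketch, not part of the commit): with counting and multiclass detectors, an ML answer over N candidate labels can legitimately carry confidence anywhere above 1/N, so the old placeholder check of confidence > 0.5 misread real low-confidence answers as unanswered. The Result class and local Source enum below are hypothetical stand-ins for the SDK's models; the two functions mirror the before and after logic shown in the diff above.

    # Sketch only: Result and this local Source enum are hypothetical
    # stand-ins for the SDK's models; the two functions mirror the diff above.
    from dataclasses import dataclass
    from enum import Enum
    from typing import Optional

    class Source(str, Enum):
        STILL_PROCESSING = "STILL_PROCESSING"
        ALGORITHM = "ALGORITHM"
        HUMAN = "HUMAN"  # assumed member name, for illustration only

    @dataclass
    class Result:
        confidence: Optional[float]
        source: Optional[Source]

    def iq_is_answered_old(result: Result) -> bool:
        """Pre-fix logic: assumes placeholder answers sit at exactly 0.5."""
        if result.confidence is None:  # human label
            return True
        placeholder_confidence = 0.5
        return result.confidence > placeholder_confidence

    def iq_is_answered_new(result: Result) -> bool:
        """Post-fix logic: trust the source field, not a confidence cutoff."""
        if result.source == Source.STILL_PROCESSING or result.source is None:
            return False
        return True

    # A four-way multiclass answer can be genuinely answered at confidence 0.4:
    ml_answer = Result(confidence=0.4, source=Source.ALGORITHM)
    assert not iq_is_answered_old(ml_answer)  # old check: wrongly "unanswered"
    assert iq_is_answered_new(ml_answer)      # new check: correctly answered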
test/integration/test_groundlight.py: 88 changes (1 addition, 87 deletions)
@@ -12,7 +12,7 @@
import pytest
from groundlight import Groundlight
from groundlight.binary_labels import VALID_DISPLAY_LABELS, DeprecatedLabel, Label, convert_internal_label_to_display
from groundlight.internalapi import InternalApiError, NotFoundError, iq_is_answered
from groundlight.internalapi import InternalApiError, NotFoundError
from groundlight.optional_imports import *
from groundlight.status_codes import is_user_error
from model import (
@@ -652,92 +652,6 @@ def test_submit_numpy_image(gl: Groundlight, detector: Detector):
assert is_valid_display_result(_image_query.result)


@pytest.mark.skip(reason="This test can block development depending on the state of the service")
@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow") # type: ignore
def test_detector_improvement(gl: Groundlight):
# test that we get confidence improvement after sending images in
# Pass two of each type of image in
import random
import time

from PIL import Image, ImageEnhance

random.seed(2741)

name = f"Test test_detector_improvement {datetime.utcnow()}" # Need a unique name
query = "Is there a dog?"
detector = gl.create_detector(name=name, query=query)

def submit_noisy_image(image, label=None):
sharpness = ImageEnhance.Sharpness(image)
noisy_image = sharpness.enhance(random.uniform(0.75, 1.25))
color = ImageEnhance.Color(noisy_image)
noisy_image = color.enhance(random.uniform(0.75, 1))
contrast = ImageEnhance.Contrast(noisy_image)
noisy_image = contrast.enhance(random.uniform(0.75, 1))
brightness = ImageEnhance.Brightness(noisy_image)
noisy_image = brightness.enhance(random.uniform(0.75, 1))
img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER")
if label is not None:
gl.add_label(img_query, label)
return img_query

dog = Image.open("test/assets/dog.jpeg")
cat = Image.open("test/assets/cat.jpeg")

submit_noisy_image(dog, "YES")
submit_noisy_image(dog, "YES")
submit_noisy_image(cat, "NO")
submit_noisy_image(cat, "NO")

# wait to give enough time to train
wait_period = 30 # seconds
num_wait_periods = 4 # 2 minutes total
result_confidence = 0.6
new_dog_query = None
new_cat_query = None
for _ in range(num_wait_periods):
time.sleep(wait_period)
new_dog_query = submit_noisy_image(dog)
new_cat_query = submit_noisy_image(cat)
new_cat_result_confidence = new_cat_query.result.confidence
new_dog_result_confidence = new_dog_query.result.confidence

if (
new_cat_result_confidence and new_cat_result_confidence < result_confidence
) or new_cat_query.result.label == "YES":
# If the new query is not confident enough, we'll try again
continue
elif (
new_dog_result_confidence and new_dog_result_confidence < result_confidence
) or new_dog_query.result.label == "NO":
# If the new query is not confident enough, we'll try again
continue
else:
assert True
return

assert (
False
), f"The detector {detector} quality has not improved after two minutes q.v. {new_dog_query}, {new_cat_query}"


@pytest.mark.skip(
reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result"
)
def test_ask_method_quality(gl: Groundlight, detector: Detector):
# asks for some level of quality on how fast ask_ml is and that we will get a confident result from ask_confident
fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
assert iq_is_answered(fast_always_yes_iq)
name = f"Test {datetime.utcnow()}" # Need a unique name
query = "Is there a dog?"
detector = gl.create_detector(name=name, query=query, confidence_threshold=0.8)
fast_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
assert iq_is_answered(fast_iq)
confident_iq = gl.ask_confident(detector=detector.id, image="test/assets/dog.jpeg", wait=180)
assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD)


@pytest.mark.skip_for_edge_endpoint(reason="The edge-endpoint doesn't support inspection_id")
def test_start_inspection(gl: Groundlight):
inspection_id = gl.start_inspection()
test/integration/test_groundlight_expensive.py: 135 changes (135 additions, 0 deletions)
@@ -0,0 +1,135 @@
"""
We collect various expensive tests here. These tests should not be run regularly.
"""

# Optional star-imports are weird and not usually recommended ...
# ruff: noqa: F403,F405
# pylint: disable=wildcard-import,unused-wildcard-import,redefined-outer-name,import-outside-toplevel
import random
import time
from datetime import datetime

import pytest
from groundlight import Groundlight
from groundlight.internalapi import iq_is_answered, iq_is_confident
from groundlight.optional_imports import *
from model import (
Detector,
)

DEFAULT_CONFIDENCE_THRESHOLD = 0.9
IQ_IMPROVEMENT_THRESHOLD = 0.75


@pytest.fixture(name="gl")
def fixture_gl() -> Groundlight:
"""Creates a Groundlight client object for testing."""
_gl = Groundlight()
_gl.DEFAULT_WAIT = 10
return _gl


@pytest.mark.skip(reason="This test requires a human labeler who does not need to be in the testing loop")
def test_human_label(gl: Groundlight):
detector = gl.create_detector(name=f"Test {datetime.utcnow()}", query="Is there a dog?")
img_query = gl.submit_image_query(
detector=detector.id, image="test/assets/dog.jpeg", wait=60, human_review="ALWAYS"
)

count = 0
sleep_time = 5
total_time = 60
while img_query.result.source == "ALGORITHM" or img_query.result.label == "STILL_PROCESSING":
count += 1
time.sleep(sleep_time)
img_query = gl.get_image_query(img_query.id)
if count > total_time / sleep_time:
assert False, f"Human review is taking too long: {img_query}"

assert iq_is_answered(img_query)
assert iq_is_confident(img_query, confidence_threshold=0.9)


@pytest.mark.skip(reason="This test can block development depending on the state of the service")
@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow") # type: ignore
def test_detector_improvement(gl: Groundlight):
# test that we get confidence improvement after sending images in
# Pass two of each type of image in
import time

from PIL import Image, ImageEnhance

random.seed(2741)

name = f"Test test_detector_improvement {datetime.utcnow()}" # Need a unique name
query = "Is there a dog?"
detector = gl.create_detector(name=name, query=query)

def submit_noisy_image(image, label=None):
sharpness = ImageEnhance.Sharpness(image)
noisy_image = sharpness.enhance(random.uniform(0.75, 1.25))
color = ImageEnhance.Color(noisy_image)
noisy_image = color.enhance(random.uniform(0.75, 1))
contrast = ImageEnhance.Contrast(noisy_image)
noisy_image = contrast.enhance(random.uniform(0.75, 1))
brightness = ImageEnhance.Brightness(noisy_image)
noisy_image = brightness.enhance(random.uniform(0.75, 1))
img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER")
if label is not None:
gl.add_label(img_query, label)
return img_query

dog = Image.open("test/assets/dog.jpeg")
cat = Image.open("test/assets/cat.jpeg")

submit_noisy_image(dog, "YES")
submit_noisy_image(dog, "YES")
submit_noisy_image(cat, "NO")
submit_noisy_image(cat, "NO")

# wait to give enough time to train
wait_period = 30 # seconds
num_wait_periods = 4 # 2 minutes total
result_confidence = 0.6
new_dog_query = None
new_cat_query = None
for _ in range(num_wait_periods):
time.sleep(wait_period)
new_dog_query = submit_noisy_image(dog)
new_cat_query = submit_noisy_image(cat)
new_cat_result_confidence = new_cat_query.result.confidence
new_dog_result_confidence = new_dog_query.result.confidence

if (
new_cat_result_confidence and new_cat_result_confidence < result_confidence
) or new_cat_query.result.label == "YES":
# If the new query is not confident enough, we'll try again
continue
elif (
new_dog_result_confidence and new_dog_result_confidence < result_confidence
) or new_dog_query.result.label == "NO":
# If the new query is not confident enough, we'll try again
continue
else:
assert True
return

assert (
False
), f"The detector {detector} quality has not improved after two minutes q.v. {new_dog_query}, {new_cat_query}"


@pytest.mark.skip(
reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result"
)
def test_ask_method_quality(gl: Groundlight, detector: Detector):
# asks for some level of quality on how fast ask_ml is and that we will get a confident result from ask_confident
fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
assert iq_is_answered(fast_always_yes_iq)
name = f"Test {datetime.utcnow()}" # Need a unique name
query = "Is there a dog?"
detector = gl.create_detector(name=name, query=query, confidence_threshold=0.8)
fast_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0)
assert iq_is_answered(fast_iq)
confident_iq = gl.ask_confident(detector=detector.id, image="test/assets/dog.jpeg", wait=180)
assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD)
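
A usage sketch (assumptions flagged: the detector ID below is made up; submit_image_query, get_image_query, and both helpers appear in this commit's files): client code might poll with iq_is_answered and then gate on iq_is_confident.

    # Hedged sketch: poll a submitted query until it has an answer, then check
    # whether that answer clears a confidence threshold. "det_abc123" is a
    # hypothetical detector ID; the image path is borrowed from the tests above.
    import time

    from groundlight import Groundlight
    from groundlight.internalapi import iq_is_answered, iq_is_confident

    gl = Groundlight()
    iq = gl.submit_image_query(detector="det_abc123", image="test/assets/dog.jpeg",
                               wait=0, human_review="NEVER")

    for _ in range(12):  # poll for up to about a minute
        if iq_is_answered(iq):
            break
        time.sleep(5)
        iq = gl.get_image_query(iq.id)

    print(f"answered={iq_is_answered(iq)} "
          f"confident={iq_is_confident(iq, confidence_threshold=0.9)}")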
