From 9c38fa492d544d27784fb44d2b5ab431127a6165 Mon Sep 17 00:00:00 2001 From: brandon Date: Fri, 11 Oct 2024 14:51:21 -0700 Subject: [PATCH 1/6] Utilize source field for confidence and answered --- src/groundlight/internalapi.py | 14 +- test/integration/test_groundlight.py | 85 ----------- .../integration/test_groundlight_expensive.py | 140 ++++++++++++++++++ 3 files changed, 145 insertions(+), 94 deletions(-) create mode 100644 test/integration/test_groundlight_expensive.py diff --git a/src/groundlight/internalapi.py b/src/groundlight/internalapi.py index f4984810..d40318f2 100644 --- a/src/groundlight/internalapi.py +++ b/src/groundlight/internalapi.py @@ -11,7 +11,7 @@ import requests from groundlight_openapi_client.api_client import ApiClient, ApiException -from model import Detector, ImageQuery +from model import Detector, ImageQuery, Source from groundlight.status_codes import is_ok from groundlight.version import get_version @@ -64,9 +64,6 @@ def iq_is_confident(iq: ImageQuery, confidence_threshold: float) -> bool: The only subtletie here is that currently confidence of None means human label, which is treated as confident. """ - if iq.result.confidence is None: - # Human label - return True return iq.result.confidence >= confidence_threshold @@ -74,11 +71,10 @@ def iq_is_answered(iq: ImageQuery) -> bool: """Returns True if the image query has a ML or human label. 
Placeholder and special labels (out of domain) have confidences exactly 0.5 """ - if iq.result.confidence is None: - # Human label - return True - placeholder_confidence = 0.5 - return iq.result.confidence > placeholder_confidence + if (iq.result.source == Source.STILL_PROCESSING) or (iq.result.source is None): # Should never be None + return False + return True + class InternalApiError(ApiException, RuntimeError): diff --git a/test/integration/test_groundlight.py b/test/integration/test_groundlight.py index 8240b607..a4534b9f 100644 --- a/test/integration/test_groundlight.py +++ b/test/integration/test_groundlight.py @@ -652,91 +652,6 @@ def test_submit_numpy_image(gl: Groundlight, detector: Detector): assert is_valid_display_result(_image_query.result) -@pytest.mark.skip(reason="This test can block development depending on the state of the service") -@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow") # type: ignore -def test_detector_improvement(gl: Groundlight): - # test that we get confidence improvement after sending images in - # Pass two of each type of image in - import random - import time - - from PIL import Image, ImageEnhance - - random.seed(2741) - - name = f"Test test_detector_improvement {datetime.utcnow()}" # Need a unique name - query = "Is there a dog?" 
- detector = gl.create_detector(name=name, query=query) - - def submit_noisy_image(image, label=None): - sharpness = ImageEnhance.Sharpness(image) - noisy_image = sharpness.enhance(random.uniform(0.75, 1.25)) - color = ImageEnhance.Color(noisy_image) - noisy_image = color.enhance(random.uniform(0.75, 1)) - contrast = ImageEnhance.Contrast(noisy_image) - noisy_image = contrast.enhance(random.uniform(0.75, 1)) - brightness = ImageEnhance.Brightness(noisy_image) - noisy_image = brightness.enhance(random.uniform(0.75, 1)) - img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER") - if label is not None: - gl.add_label(img_query, label) - return img_query - - dog = Image.open("test/assets/dog.jpeg") - cat = Image.open("test/assets/cat.jpeg") - - submit_noisy_image(dog, "YES") - submit_noisy_image(dog, "YES") - submit_noisy_image(cat, "NO") - submit_noisy_image(cat, "NO") - - # wait to give enough time to train - wait_period = 30 # seconds - num_wait_periods = 4 # 2 minutes total - result_confidence = 0.6 - new_dog_query = None - new_cat_query = None - for _ in range(num_wait_periods): - time.sleep(wait_period) - new_dog_query = submit_noisy_image(dog) - new_cat_query = submit_noisy_image(cat) - new_cat_result_confidence = new_cat_query.result.confidence - new_dog_result_confidence = new_dog_query.result.confidence - - if ( - new_cat_result_confidence and new_cat_result_confidence < result_confidence - ) or new_cat_query.result.label == "YES": - # If the new query is not confident enough, we'll try again - continue - elif ( - new_dog_result_confidence and new_dog_result_confidence < result_confidence - ) or new_dog_query.result.label == "NO": - # If the new query is not confident enough, we'll try again - continue - else: - assert True - return - - assert ( - False - ), f"The detector {detector} quality has not improved after two minutes q.v. 
{new_dog_query}, {new_cat_query}" - - -@pytest.mark.skip( - reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result" -) -def test_ask_method_quality(gl: Groundlight, detector: Detector): - # asks for some level of quality on how fast ask_ml is and that we will get a confident result from ask_confident - fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0) - assert iq_is_answered(fast_always_yes_iq) - name = f"Test {datetime.utcnow()}" # Need a unique name - query = "Is there a dog?" - detector = gl.create_detector(name=name, query=query, confidence_threshold=0.8) - fast_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0) - assert iq_is_answered(fast_iq) - confident_iq = gl.ask_confident(detector=detector.id, image="test/assets/dog.jpeg", wait=180) - assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD) - @pytest.mark.skip_for_edge_endpoint(reason="The edge-endpoint doesn't support inspection_id") def test_start_inspection(gl: Groundlight): diff --git a/test/integration/test_groundlight_expensive.py b/test/integration/test_groundlight_expensive.py new file mode 100644 index 00000000..4da0b030 --- /dev/null +++ b/test/integration/test_groundlight_expensive.py @@ -0,0 +1,140 @@ +""" +We collect various expensive tests here. These tests should not be run regularly. +""" + +# Optional star-imports are weird and not usually recommended ... 
+# ruff: noqa: F403,F405 +# pylint: disable=wildcard-import,unused-wildcard-import,redefined-outer-name,import-outside-toplevel +import json +import random +import string +import time +from datetime import datetime +from typing import Any, Dict, Optional, Union + +import groundlight_openapi_client +import pytest +from groundlight import Groundlight +from groundlight.binary_labels import VALID_DISPLAY_LABELS, DeprecatedLabel, Label, convert_internal_label_to_display +from groundlight.internalapi import InternalApiError, NotFoundError, iq_is_answered, iq_is_confident +from groundlight.optional_imports import * +from groundlight.status_codes import is_user_error +from model import ( + BinaryClassificationResult, + CountingResult, + Detector, + ImageQuery, + PaginatedDetectorList, + PaginatedImageQueryList, +) + +DEFAULT_CONFIDENCE_THRESHOLD = 0.9 +IQ_IMPROVEMENT_THRESHOLD = 0.75 + +@pytest.fixture(name="gl") +def fixture_gl() -> Groundlight: + """Creates a Groundlight client object for testing.""" + _gl = Groundlight() + _gl.DEFAULT_WAIT = 10 + return _gl + +@pytest.mark.skip(reason="This test requires a human labeler who does not need to be in the testing loop") +def test_human_label(gl: Groundlight): + detector = gl.create_detector(name=f"Test {datetime.utcnow()}", query="Is there a dog?") + img_query = gl.submit_image_query(detector=detector.id, image="test/assets/dog.jpeg", wait=60, human_review="ALWAYS") + + count = 0 + while img_query.result.source == "ALGORITHM" or img_query.result.label == "STILL_PROCESSING": + count += 1 + time.sleep(5) + img_query = gl.get_image_query(img_query.id) + if count > 12: + assert False, f"Human review is taking too long: {img_query}" + + assert iq_is_answered(img_query) + assert iq_is_confident(img_query, confidence_threshold=0.9) + +@pytest.mark.skip(reason="This test can block development depending on the state of the service") +@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow") # type: ignore +def 
test_detector_improvement(gl: Groundlight): + # test that we get confidence improvement after sending images in + # Pass two of each type of image in + import random + import time + + from PIL import Image, ImageEnhance + + random.seed(2741) + + name = f"Test test_detector_improvement {datetime.utcnow()}" # Need a unique name + query = "Is there a dog?" + detector = gl.create_detector(name=name, query=query) + + def submit_noisy_image(image, label=None): + sharpness = ImageEnhance.Sharpness(image) + noisy_image = sharpness.enhance(random.uniform(0.75, 1.25)) + color = ImageEnhance.Color(noisy_image) + noisy_image = color.enhance(random.uniform(0.75, 1)) + contrast = ImageEnhance.Contrast(noisy_image) + noisy_image = contrast.enhance(random.uniform(0.75, 1)) + brightness = ImageEnhance.Brightness(noisy_image) + noisy_image = brightness.enhance(random.uniform(0.75, 1)) + img_query = gl.submit_image_query(detector=detector.id, image=noisy_image, wait=0, human_review="NEVER") + if label is not None: + gl.add_label(img_query, label) + return img_query + + dog = Image.open("test/assets/dog.jpeg") + cat = Image.open("test/assets/cat.jpeg") + + submit_noisy_image(dog, "YES") + submit_noisy_image(dog, "YES") + submit_noisy_image(cat, "NO") + submit_noisy_image(cat, "NO") + + # wait to give enough time to train + wait_period = 30 # seconds + num_wait_periods = 4 # 2 minutes total + result_confidence = 0.6 + new_dog_query = None + new_cat_query = None + for _ in range(num_wait_periods): + time.sleep(wait_period) + new_dog_query = submit_noisy_image(dog) + new_cat_query = submit_noisy_image(cat) + new_cat_result_confidence = new_cat_query.result.confidence + new_dog_result_confidence = new_dog_query.result.confidence + + if ( + new_cat_result_confidence and new_cat_result_confidence < result_confidence + ) or new_cat_query.result.label == "YES": + # If the new query is not confident enough, we'll try again + continue + elif ( + new_dog_result_confidence and 
new_dog_result_confidence < result_confidence + ) or new_dog_query.result.label == "NO": + # If the new query is not confident enough, we'll try again + continue + else: + assert True + return + + assert ( + False + ), f"The detector {detector} quality has not improved after two minutes q.v. {new_dog_query}, {new_cat_query}" + + +@pytest.mark.skip( + reason="We don't yet have an SLA level to test ask_confident against, and the test is flakey as a result" +) +def test_ask_method_quality(gl: Groundlight, detector: Detector): + # asks for some level of quality on how fast ask_ml is and that we will get a confident result from ask_confident + fast_always_yes_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0) + assert iq_is_answered(fast_always_yes_iq) + name = f"Test {datetime.utcnow()}" # Need a unique name + query = "Is there a dog?" + detector = gl.create_detector(name=name, query=query, confidence_threshold=0.8) + fast_iq = gl.ask_ml(detector=detector.id, image="test/assets/dog.jpeg", wait=0) + assert iq_is_answered(fast_iq) + confident_iq = gl.ask_confident(detector=detector.id, image="test/assets/dog.jpeg", wait=180) + assert confident_iq.result.confidence is None or (confident_iq.result.confidence > IQ_IMPROVEMENT_THRESHOLD) From 3f8323749423a014e3c9b0c51664759db83dedc4 Mon Sep 17 00:00:00 2001 From: Auto-format Bot Date: Fri, 11 Oct 2024 21:52:35 +0000 Subject: [PATCH 2/6] Automatically reformatting code --- src/groundlight/internalapi.py | 3 +-- test/integration/test_groundlight.py | 3 +-- .../integration/test_groundlight_expensive.py | 21 +++++++------------ 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/groundlight/internalapi.py b/src/groundlight/internalapi.py index d40318f2..889cafdd 100644 --- a/src/groundlight/internalapi.py +++ b/src/groundlight/internalapi.py @@ -71,12 +71,11 @@ def iq_is_answered(iq: ImageQuery) -> bool: """Returns True if the image query has a ML or human label. 
Placeholder and special labels (out of domain) have confidences exactly 0.5 """ - if (iq.result.source == Source.STILL_PROCESSING) or (iq.result.source is None): # Should never be None + if (iq.result.source == Source.STILL_PROCESSING) or (iq.result.source is None): # Should never be None return False return True - class InternalApiError(ApiException, RuntimeError): # TODO: We should really avoid this double inheritance since # both `ApiException` and `RuntimeError` are subclasses of diff --git a/test/integration/test_groundlight.py b/test/integration/test_groundlight.py index a4534b9f..85a472c2 100644 --- a/test/integration/test_groundlight.py +++ b/test/integration/test_groundlight.py @@ -12,7 +12,7 @@ import pytest from groundlight import Groundlight from groundlight.binary_labels import VALID_DISPLAY_LABELS, DeprecatedLabel, Label, convert_internal_label_to_display -from groundlight.internalapi import InternalApiError, NotFoundError, iq_is_answered +from groundlight.internalapi import InternalApiError, NotFoundError from groundlight.optional_imports import * from groundlight.status_codes import is_user_error from model import ( @@ -652,7 +652,6 @@ def test_submit_numpy_image(gl: Groundlight, detector: Detector): assert is_valid_display_result(_image_query.result) - @pytest.mark.skip_for_edge_endpoint(reason="The edge-endpoint doesn't support inspection_id") def test_start_inspection(gl: Groundlight): inspection_id = gl.start_inspection() diff --git a/test/integration/test_groundlight_expensive.py b/test/integration/test_groundlight_expensive.py index 4da0b030..43958fe4 100644 --- a/test/integration/test_groundlight_expensive.py +++ b/test/integration/test_groundlight_expensive.py @@ -5,32 +5,22 @@ # Optional star-imports are weird and not usually recommended ... 
# ruff: noqa: F403,F405 # pylint: disable=wildcard-import,unused-wildcard-import,redefined-outer-name,import-outside-toplevel -import json import random -import string import time from datetime import datetime -from typing import Any, Dict, Optional, Union -import groundlight_openapi_client import pytest from groundlight import Groundlight -from groundlight.binary_labels import VALID_DISPLAY_LABELS, DeprecatedLabel, Label, convert_internal_label_to_display -from groundlight.internalapi import InternalApiError, NotFoundError, iq_is_answered, iq_is_confident +from groundlight.internalapi import iq_is_answered, iq_is_confident from groundlight.optional_imports import * -from groundlight.status_codes import is_user_error from model import ( - BinaryClassificationResult, - CountingResult, Detector, - ImageQuery, - PaginatedDetectorList, - PaginatedImageQueryList, ) DEFAULT_CONFIDENCE_THRESHOLD = 0.9 IQ_IMPROVEMENT_THRESHOLD = 0.75 + @pytest.fixture(name="gl") def fixture_gl() -> Groundlight: """Creates a Groundlight client object for testing.""" @@ -38,10 +28,13 @@ def fixture_gl() -> Groundlight: _gl.DEFAULT_WAIT = 10 return _gl + @pytest.mark.skip(reason="This test requires a human labeler who does not need to be in the testing loop") def test_human_label(gl: Groundlight): detector = gl.create_detector(name=f"Test {datetime.utcnow()}", query="Is there a dog?") - img_query = gl.submit_image_query(detector=detector.id, image="test/assets/dog.jpeg", wait=60, human_review="ALWAYS") + img_query = gl.submit_image_query( + detector=detector.id, image="test/assets/dog.jpeg", wait=60, human_review="ALWAYS" + ) count = 0 while img_query.result.source == "ALGORITHM" or img_query.result.label == "STILL_PROCESSING": @@ -54,12 +47,12 @@ def test_human_label(gl: Groundlight): assert iq_is_answered(img_query) assert iq_is_confident(img_query, confidence_threshold=0.9) + @pytest.mark.skip(reason="This test can block development depending on the state of the service") 
@pytest.mark.skipif(MISSING_PIL, reason="Needs pillow") # type: ignore def test_detector_improvement(gl: Groundlight): # test that we get confidence improvement after sending images in # Pass two of each type of image in - import random import time from PIL import Image, ImageEnhance From 3bae22d9f1325323d02972cb970d4cdc0e9aeaa0 Mon Sep 17 00:00:00 2001 From: brandon Date: Fri, 11 Oct 2024 16:58:59 -0700 Subject: [PATCH 3/6] fix typo --- src/groundlight/internalapi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groundlight/internalapi.py b/src/groundlight/internalapi.py index d40318f2..cba83c9d 100644 --- a/src/groundlight/internalapi.py +++ b/src/groundlight/internalapi.py @@ -61,7 +61,7 @@ def _generate_request_id(): def iq_is_confident(iq: ImageQuery, confidence_threshold: float) -> bool: """Returns True if the image query's confidence is above threshold. - The only subtletie here is that currently confidence of None means + The only subtlety here is that currently confidence of None means human label, which is treated as confident.
""" return iq.result.confidence >= confidence_threshold From 0ad673f9a313fe534a56037cf545f909d06ee119 Mon Sep 17 00:00:00 2001 From: brandon Date: Fri, 11 Oct 2024 17:02:47 -0700 Subject: [PATCH 4/6] appease the linting gods --- test/integration/test_groundlight_expensive.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/integration/test_groundlight_expensive.py b/test/integration/test_groundlight_expensive.py index 43958fe4..a45888c4 100644 --- a/test/integration/test_groundlight_expensive.py +++ b/test/integration/test_groundlight_expensive.py @@ -37,11 +37,13 @@ def test_human_label(gl: Groundlight): ) count = 0 + sleep_time = 5 + total_time = 60 while img_query.result.source == "ALGORITHM" or img_query.result.label == "STILL_PROCESSING": count += 1 - time.sleep(5) + time.sleep(sleep_time) img_query = gl.get_image_query(img_query.id) - if count > 12: + if count > total_time / sleep_time: assert False, f"Human review is taking too long: {img_query}" assert iq_is_answered(img_query) assert iq_is_confident(img_query, confidence_threshold=0.9) From c516b3b3b7f24dac3981800edff52b670aa0fe25 Mon Sep 17 00:00:00 2001 From: brandon Date: Fri, 11 Oct 2024 17:08:36 -0700 Subject: [PATCH 5/6] bump version to push the fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2b5228ba..10200297 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ packages = [ {include = "**/*.py", from = "src"}, ] readme = "README.md" -version = "0.18.2" +version = "0.18.3" [tool.poetry.dependencies] # For certifi, use ">=" instead of "^" since it upgrades its "major version" every year, not really following semver From 5f0a7f87a64e07887aa8f76e55e9f16971797a43 Mon Sep 17 00:00:00 2001 From: brandon Date: Mon, 14 Oct 2024 11:39:30 -0700 Subject: [PATCH 6/6] another fix got out first, bumping version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10200297..ed154e3c
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ packages = [ {include = "**/*.py", from = "src"}, ] readme = "README.md" -version = "0.18.3" +version = "0.18.4" [tool.poetry.dependencies] # For certifi, use ">=" instead of "^" since it upgrades its "major version" every year, not really following semver