From 8c74816c240635e6a06ce8d751d0e505064c2d43 Mon Sep 17 00:00:00 2001
From: k9845
Date: Wed, 9 Aug 2023 11:35:43 +0545
Subject: [PATCH] Add flake8 in CI

---
 .flake8                                      |   8 +-
 .github/pull_request_template.md             |  25 ++
 .github/workflows/flake8.yml                 |  25 ++
 analysis_module/mockserver.py                | 363 +++++++++---------
 analysis_module/tests/test_apis.py           |   4 +-
 analysis_module/tests/test_tasks.py          |   7 -
 core/tasks/get_data_old.py                   |   1 -
 core/tasks/nlp_mapping.py                    |  11 +-
 core/tasks/queries.py                        |   1 -
 core/tests/test_apis.py                      |   1 -
 core_server/settings.py                      |   1 -
 docker-compose.yml                           |   1 -
 nlp_scripts/model_monitoring/__init__.py     |   4 +-
 nlp_scripts/model_monitoring/featuredrift.py |   2 +-
 .../model_monitoring/generate_outputs.py     |   7 +-
 .../model_monitoring/model_performance.py    |  87 ++++-
 16 files changed, 333 insertions(+), 215 deletions(-)
 create mode 100644 .github/pull_request_template.md
 create mode 100644 .github/workflows/flake8.yml

diff --git a/.flake8 b/.flake8
index 7da1f96..5ea7c75 100644
--- a/.flake8
+++ b/.flake8
@@ -1,2 +1,8 @@
 [flake8]
-max-line-length = 100
+ignore = C901, W504
+max-line-length = 125
+exclude = .git,__pycache__,old,build,dist,*/migrations/*.py
+max-complexity = 10
+per-file-ignores =
+    /**/tests/*_mock_data.py: E501
+    **/snap_test_*.py: E501
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..7e078fe
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,25 @@
+Addresses xxxxxx (e.g. #1 the-deep-nlp/core-server#1) \
+Depends on xxxxxx (e.g. #1 the-deep-nlp/core-server#1)
+
+## Changes
+
+* Detailed list or prose of changes
+* Breaking changes
+* Changes to configurations
+
+Mention related users here if any.
+
+## This PR doesn't introduce any:
+
+- [ ] temporary files, auto-generated files or secret keys
+- [ ] n+1 queries
+- [ ] flake8 issues
+- [ ] `print`
+- [ ] typos
+- [ ] unwanted comments
+
+## This PR contains valid:
+
+- [ ] tests
+- [ ] permission checks (tests here too)
+- [ ] translations
diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml
new file mode 100644
index 0000000..b5951fc
--- /dev/null
+++ b/.github/workflows/flake8.yml
@@ -0,0 +1,25 @@
+name: Flake8 check
+ +on: + pull_request: + push: + branches: + - main + +jobs: + check: + name: Flake8 check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + # Flake8 check + - name: Install flake8 + run: pip install flake8 + - name: Run flake8 + uses: suo/flake8-github-action@releases/v1 + with: + # NOTE: this needs to be the same as the job name + checkName: 'Flake8 check' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/analysis_module/mockserver.py b/analysis_module/mockserver.py index 2a1e6c4..2aea71b 100644 --- a/analysis_module/mockserver.py +++ b/analysis_module/mockserver.py @@ -5,6 +5,7 @@ import logging import random import numpy as np + from random import shuffle from math import ceil from celery import shared_task @@ -19,7 +20,7 @@ from .utils import send_callback_url_request -logger = logging.getLogger('__name__') +logger = logging.getLogger("__name__") logger.setLevel(logging.INFO) @@ -87,183 +88,183 @@ ] MOCK_ENTRY_CLASSIFICATION = { - "classifications": [ - { - "client_id": "5", - "model_preds": { - "2": { - "204": { - "2402": { - "prediction": 0.4069949281240046, - "threshold": 0.489, - "is_selected": False - }, - "2401": { - "prediction": 0.27091098129102825, - "threshold": 0.461, - "is_selected": False - } - } - } - } - }, - { - "client_id": "7", - "model_preds": { - "2": { - "204": { - "2402": { - "prediction": 0.5442236220665992, - "threshold": 0.489, - "is_selected": True - }, - "2401": { - "prediction": 0.4262570897824335, - "threshold": 0.461, - "is_selected": False - } - }, - "202": { - "2206": { - "prediction": 0.25068859880169236, - "threshold": 0.576, - "is_selected": False + "classifications": [ + { + "client_id": "5", + "model_preds": { + "2": { + "204": { + "2402": { + "prediction": 0.4069949281240046, + "threshold": 0.489, + "is_selected": False, + }, + "2401": { + "prediction": 0.27091098129102825, + "threshold": 0.461, + "is_selected": False, + }, + } + } }, - "2201": { - "prediction": 0.5456802809044823, - "threshold": 0.431, - "is_selected": True - } - } }, - "5": { - "503": { - "5303": { - "prediction": 0.12105567270217965, - "threshold": 0.438, - "is_selected": False - }, - "5306": { - "prediction": 0.0934217669913229, - "threshold": 0.424, - "is_selected": False - }, - "5310": { - "prediction": 0.2706523782039786, - "threshold": 0.478, - "is_selected": False - }, - "5302": { - "prediction": 0.10373815047470006, - "threshold": 0.44, - "is_selected": False - }, - "5307": { - "prediction": 0.10675184680643865, - "threshold": 0.414, - "is_selected": False - }, - "5309": { - "prediction": 0.15713495668023825, - "threshold": 0.512, - "is_selected": False - }, - "5308": { - "prediction": 0.2450807941587348, - "threshold": 0.475, - "is_selected": False - }, - "5301": { - "prediction": 0.16692731163052263, - "threshold": 0.488, - "is_selected": False - }, - "5305": { - "prediction": 0.09886651321893601, - "threshold": 0.508, - "is_selected": False + { + "client_id": "7", + "model_preds": { + "2": { + "204": { + "2402": { + "prediction": 0.5442236220665992, + "threshold": 0.489, + "is_selected": True, + }, + "2401": { + "prediction": 0.4262570897824335, + "threshold": 0.461, + "is_selected": False, + }, + }, + "202": { + "2206": { + "prediction": 0.25068859880169236, + "threshold": 0.576, + "is_selected": False, + }, + "2201": { + "prediction": 0.5456802809044823, + "threshold": 0.431, + "is_selected": True, + }, + }, + }, + "5": { + "503": { + "5303": { + "prediction": 0.12105567270217965, + "threshold": 0.438, + "is_selected": False, + }, + 
"5306": { + "prediction": 0.0934217669913229, + "threshold": 0.424, + "is_selected": False, + }, + "5310": { + "prediction": 0.2706523782039786, + "threshold": 0.478, + "is_selected": False, + }, + "5302": { + "prediction": 0.10373815047470006, + "threshold": 0.44, + "is_selected": False, + }, + "5307": { + "prediction": 0.10675184680643865, + "threshold": 0.414, + "is_selected": False, + }, + "5309": { + "prediction": 0.15713495668023825, + "threshold": 0.512, + "is_selected": False, + }, + "5308": { + "prediction": 0.2450807941587348, + "threshold": 0.475, + "is_selected": False, + }, + "5301": { + "prediction": 0.16692731163052263, + "threshold": 0.488, + "is_selected": False, + }, + "5305": { + "prediction": 0.09886651321893601, + "threshold": 0.508, + "is_selected": False, + }, + "5304": { + "prediction": 0.18824445637496742, + "threshold": 0.444, + "is_selected": False, + }, + }, + "501": { + "5102": { + "prediction": 0.21789910171917756, + "threshold": 0.541, + "is_selected": False, + }, + "5109": { + "prediction": 0.3480727123794051, + "threshold": 0.454, + "is_selected": False, + }, + "5106": { + "prediction": 0.23486564947864202, + "threshold": 0.381, + "is_selected": False, + }, + "5108": { + "prediction": 0.05966722541108756, + "threshold": 0.527, + "is_selected": False, + }, + "5111": { + "prediction": 0.46915922655621894, + "threshold": 0.447, + "is_selected": True, + }, + "5107": { + "prediction": 0.3090465321041693, + "threshold": 0.449, + "is_selected": False, + }, + "5101": { + "prediction": 0.015221919587000888, + "threshold": 0.47, + "is_selected": False, + }, + "5103": { + "prediction": 0.3523940058170018, + "threshold": 0.482, + "is_selected": False, + }, + "5104": { + "prediction": 0.003284739766450025, + "threshold": 0.786, + "is_selected": False, + }, + "5105": { + "prediction": 0.22805604930227613, + "threshold": 0.534, + "is_selected": False, + }, + "5110": { + "prediction": 0.20070979371666908, + "threshold": 0.05, + "is_selected": True, + }, + }, + }, + "4": { + "401": { + "4102": { + "prediction": 0.004212768160319299, + "threshold": 0.814, + "is_selected": False, + }, + "4101": { + "prediction": 0.4228575605351778, + "threshold": 0.422, + "is_selected": True, + }, + } + }, }, - "5304": { - "prediction": 0.18824445637496742, - "threshold": 0.444, - "is_selected": False - } - }, - "501": { - "5102": { - "prediction": 0.21789910171917756, - "threshold": 0.541, - "is_selected": False - }, - "5109": { - "prediction": 0.3480727123794051, - "threshold": 0.454, - "is_selected": False - }, - "5106": { - "prediction": 0.23486564947864202, - "threshold": 0.381, - "is_selected": False - }, - "5108": { - "prediction": 0.05966722541108756, - "threshold": 0.527, - "is_selected": False - }, - "5111": { - "prediction": 0.46915922655621894, - "threshold": 0.447, - "is_selected": True - }, - "5107": { - "prediction": 0.3090465321041693, - "threshold": 0.449, - "is_selected": False - }, - "5101": { - "prediction": 0.015221919587000888, - "threshold": 0.47, - "is_selected": False - }, - "5103": { - "prediction": 0.3523940058170018, - "threshold": 0.482, - "is_selected": False - }, - "5104": { - "prediction": 0.003284739766450025, - "threshold": 0.786, - "is_selected": False - }, - "5105": { - "prediction": 0.22805604930227613, - "threshold": 0.534, - "is_selected": False - }, - "5110": { - "prediction": 0.20070979371666908, - "threshold": 0.05, - "is_selected": True - } - } }, - "4": { - "401": { - "4102": { - "prediction": 0.004212768160319299, - "threshold": 0.814, - 
"is_selected": False - }, - "4101": { - "prediction": 0.4228575605351778, - "threshold": 0.422, - "is_selected": True - } - } - } - } - } - ] + ] } @@ -427,7 +428,7 @@ def process_topicmodeling(body) -> Any: shuffle(excerpt_ids) data = [ - excerpt_ids[x: x + ceil(len(excerpt_ids) / clusters)] + excerpt_ids[x:x + ceil(len(excerpt_ids) / clusters)] for x in range(0, len(excerpt_ids), ceil(len(excerpt_ids) / clusters)) ] @@ -490,7 +491,9 @@ def shape_geo_entities(entity: dict, excerpt: str): data.append({"entry_id": int(entry_id), "entities": entities}) filepath = save_data_local_and_get_url( - dir_name="geolocation", client_id=client_id, data=data + dir_name="geolocation", + client_id=client_id, + data=data ) send_callback_url_request( @@ -507,7 +510,9 @@ def geolocation_mock_model(body) -> Any: def text_extraction_mock(body) -> Any: - process_extraction_mock.apply_async(args=(body,), countdown=2) # Trigger task after 2 seconds + process_extraction_mock.apply_async( + args=(body,), countdown=2 + ) # Trigger task after 2 seconds return json.dumps({"status": "Successfully received the request."}), 200 @@ -521,7 +526,9 @@ def process_extraction_mock(body) -> Any: for document in documents: client_id = document["client_id"] random_extracted_text = "This is some random extracted text" - filepath = save_data_local_and_get_url("extraction", client_id, random_extracted_text) + filepath = save_data_local_and_get_url( + "extraction", client_id, random_extracted_text + ) callback_data = { "text_path": filepath, "images_path": [], diff --git a/analysis_module/tests/test_apis.py b/analysis_module/tests/test_apis.py index f4c3e31..e2ac4f5 100644 --- a/analysis_module/tests/test_apis.py +++ b/analysis_module/tests/test_apis.py @@ -492,7 +492,7 @@ def test_extraction_system_request(self, requests_mock): """ documents = [ {"url": "someurl", "client_id": self.CLIENT_ID}, - {"url": "anothersomeurl", "client_id": self.CLIENT_ID+"1"}, + {"url": "anothersomeurl", "client_id": self.CLIENT_ID + "1"}, ] data = { "documents": documents, @@ -529,7 +529,7 @@ def test_extraction_user_request(self, requests_mock): """ documents = [ {"url": "someurl", "client_id": self.CLIENT_ID}, - {"url": "anothersomeurl", "client_id": self.CLIENT_ID+"1"}, + {"url": "anothersomeurl", "client_id": self.CLIENT_ID + "1"}, ] data = { "documents": documents, diff --git a/analysis_module/tests/test_tasks.py b/analysis_module/tests/test_tasks.py index 7770e19..50bacb9 100644 --- a/analysis_module/tests/test_tasks.py +++ b/analysis_module/tests/test_tasks.py @@ -51,14 +51,7 @@ def test_resend_callback_request_no_callback_url(self, requests_patch): assert failed_callback.status == FailedCallback.Status.FAILED def test_get_failed_callbacks(self): - successful_callback = self.create_failed_callback( - status=FailedCallback.Status.SUCCESS - ) now = datetime.now() - retry_maxed_callback = self.create_failed_callback( - status=FailedCallback.Status.RETRY_MAXED_OUT, - created_at=now - timedelta(hours=20), - ) old_callback = self.create_failed_callback( status=FailedCallback.Status.RETRYING, ) diff --git a/core/tasks/get_data_old.py b/core/tasks/get_data_old.py index e850893..cfa038a 100644 --- a/core/tasks/get_data_old.py +++ b/core/tasks/get_data_old.py @@ -145,7 +145,6 @@ def _get_id2label(frame): return t - def _reshape_report(a): a = a.split("-") real = [] diff --git a/core/tasks/nlp_mapping.py b/core/tasks/nlp_mapping.py index 8657bcd..c5d2bda 100644 --- a/core/tasks/nlp_mapping.py +++ b/core/tasks/nlp_mapping.py @@ -146,9 +146,8 @@ def 
get_nlp_outputs( second_last_item = all_items[-2] mapping_row = hum_mapping_sheet[ hum_mapping_sheet.apply( - lambda x: second_last_item - == x["Original first level"] - and last_item == x["Original second level"], + lambda x: second_last_item == x["Original first level"] and + last_item == x["Original second level"], axis=1, ) ].copy() @@ -175,10 +174,8 @@ def get_nlp_outputs( else: first_level_mapped_row = hum_mapping_sheet[ hum_mapping_sheet.apply( - lambda x: x["Original first level"] - == last_item - and str(x["Original second level"]) - == "nan", + lambda x: x["Original first level"] == last_item and + str(x["Original second level"]) == "nan", axis=1, ) ].copy() diff --git a/core/tasks/queries.py b/core/tasks/queries.py index 81d64e9..a420322 100644 --- a/core/tasks/queries.py +++ b/core/tasks/queries.py @@ -175,7 +175,6 @@ e.created_at >= '{}' and e.excerpt is not NULL and e.excerpt <> '' - ) res group by res.id diff --git a/core/tests/test_apis.py b/core/tests/test_apis.py index 8b8b518..5eac313 100644 --- a/core/tests/test_apis.py +++ b/core/tests/test_apis.py @@ -1,5 +1,4 @@ from core_server.base_test import BaseTestCase -from rest_framework.authtoken.models import Token class TestApiAuth(BaseTestCase): diff --git a/core_server/settings.py b/core_server/settings.py index e8d5592..b8e1bb0 100644 --- a/core_server/settings.py +++ b/core_server/settings.py @@ -17,7 +17,6 @@ from sentry_sdk.integrations.celery import CeleryIntegration from sentry_sdk.integrations.redis import RedisIntegration -from core_server.tasks_settings import CELERY_BEAT_SCHEDULE from core_server.env import env # Build paths inside the project like this: BASE_DIR / 'subdir'. diff --git a/docker-compose.yml b/docker-compose.yml index cb79368..5469e42 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -29,7 +29,6 @@ x-server: &base-server-config AWS_S3_SECRET_ACCESS_KEY: ${AWS_S3_SECRET_ACCESS_KEY:-} AWS_S3_BUCKET_NAME: ${AWS_S3_BUCKET_NAME} AWS_S3_REGION_NAME: ${AWS_S3_REGION_NAME} - USE_S3: ${USE_S3:-} # Celery CELERY_BROKER_URL: ${CELERY_BROKER_URL:-redis://redis:6379} CELERY_RESULT_BACKEND: ${CELERY_RESULT_BACKEND:-redis://redis:6379} diff --git a/nlp_scripts/model_monitoring/__init__.py b/nlp_scripts/model_monitoring/__init__.py index abeb9e7..9214196 100644 --- a/nlp_scripts/model_monitoring/__init__.py +++ b/nlp_scripts/model_monitoring/__init__.py @@ -1,8 +1,6 @@ import pandas as pd -from .generate_outputs import ClassificationModelOutput - -from core.models import Entry, ClassificationModel +from core.models import Entry from utils.transformations import batched diff --git a/nlp_scripts/model_monitoring/featuredrift.py b/nlp_scripts/model_monitoring/featuredrift.py index 9ae0a18..b0737d1 100644 --- a/nlp_scripts/model_monitoring/featuredrift.py +++ b/nlp_scripts/model_monitoring/featuredrift.py @@ -1,7 +1,7 @@ import datetime import warnings -from typing import List, Optional, Any +from typing import List, Optional from tqdm import tqdm import numpy as np diff --git a/nlp_scripts/model_monitoring/generate_outputs.py b/nlp_scripts/model_monitoring/generate_outputs.py index 7e7579f..4897ed0 100644 --- a/nlp_scripts/model_monitoring/generate_outputs.py +++ b/nlp_scripts/model_monitoring/generate_outputs.py @@ -1,4 +1,3 @@ -import json import logging from typing import Dict, Optional, Tuple, Any @@ -93,9 +92,9 @@ def get_model_outputs(self, excerpt: list) -> Tuple[list, list]: endpoint_name=self.endpoint_name, ) if ( - self.prediction_generation - and "raw_predictions" in outputs - and "thresholds" 
in outputs + self.prediction_generation and + "raw_predictions" in outputs and + "thresholds" in outputs ): predictions = outputs["raw_predictions"] thresholds = outputs["thresholds"] diff --git a/nlp_scripts/model_monitoring/model_performance.py b/nlp_scripts/model_monitoring/model_performance.py index 1f2082e..bbb62b7 100644 --- a/nlp_scripts/model_monitoring/model_performance.py +++ b/nlp_scripts/model_monitoring/model_performance.py @@ -42,7 +42,19 @@ class ModelPerformance: def __init__(self, dataframe: pl.DataFrame): """ - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + "specific_needs_groups" + ] Input Dataframe: project_id, {category} """ self.dataframe = dataframe @@ -178,7 +190,19 @@ def projectwise_perf_metrics( ) -> pl.DataFrame: """ Calculates the project wise performance metrics - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + "specific_needs_groups" + ] Input Dataframe: project_id, {category}_transformed, {category}_pred_transformed Output Dataframe: project_id, {category}_precision, {category}_recall, {category}_f1score, generated_at """ @@ -224,7 +248,19 @@ def overall_projects_perf_metrics( ) -> Optional[pl.DataFrame]: """ Calculates the overall performance metrics irrespective of the projects - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + "specific_needs_groups" + ] Input Dataframe: {category}_transformed, {category}_pred_transformed Output Dataframe: category, precision, recall, f1score, generated_at category: List of group as mentioned above @@ -258,7 +294,19 @@ def overall_projects_perf_metrics( def per_tag_perf_metrics(self) -> Optional[pl.DataFrame]: """ Calculates the performance metrics per tag - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + "specific_needs_groups" + ] Input Dataframe: {category}_transformed, {category}_pred_transformed Output Dataframe: tags, scores, metrics(type), generated_at tags refer to list of tags @@ -331,7 +379,19 @@ def _tags_matching_ratios( def calculate_ratios(self) -> Optional[pl.DataFrame]: """ Calculate the ratios of the categories - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + 
"specific_needs_groups" + ] Input Dataframe: project_id, {category}_transformed, {category}_pred_transformed Output Dataframe: project_id, completely_matched_{category}, missing_{category}, wrong_{category}, generated_at """ @@ -361,9 +421,22 @@ def calculate_ratios(self) -> Optional[pl.DataFrame]: def per_project_calc_ratios(self) -> Optional[pl.DataFrame]: """ Calculates the per project ratio of the categories - Categories: ["sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "age", "displaced", "gender", "non displaced", "severity", "specific_needs_groups"] + Categories: [ + "sectors", + "pillars_1d", + "pillars_2d", + "subpillars_1d", + "subpillars_2d", + "age", + "displaced", + "gender", + "non displaced", + "severity", + "specific_needs_groups" + ] Input Dataframe: project_id, completely_matched_{category}, missing_{category}, wrong_{category}, - Output Dataframe: project_id, completely_matched_{category}_mean, missing_{category}_mean, wrong_{category}_mean, generated_at + Output Dataframe: project_id, completely_matched_{category}_mean,\ + missing_{category}_mean, wrong_{category}_mean, generated_at """ ratios_df = self.calculate_ratios() if ratios_df is None: