Linting
* validate python formatting on every build with Ruff
* fix lint warnings
larinam authored May 13, 2023
1 parent 168648e commit 962becb
Showing 35 changed files with 271 additions and 246 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
with:
17 changes: 17 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,17 @@
name: Python linting

on:
push:
branches:
- '*'
pull_request:
types: [ opened, synchronize ]

jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- name: Lint with Ruff
uses: chartboost/ruff-action@v1
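
For parity with this check before pushing, a rough local equivalent — assuming Ruff is installed via pip install ruff — is the following sketch:

# Rough local equivalent of the CI job above; assumes `pip install ruff`.
# Ruff picks up the .ruff.toml at the repository root automatically.
import subprocess
import sys

result = subprocess.run(["ruff", "check", "."])
sys.exit(result.returncode)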
2 changes: 2 additions & 0 deletions .ruff.toml
@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120
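
The only setting here raises the line-length limit behind Ruff's E501 rule from the default 88 to 120 characters. As a rough illustration of what that rule checks (a simplification, not Ruff's actual implementation):

# Illustration of what `line-length = 120` means for the E501 rule;
# a simplification, not Ruff's actual implementation.
LIMIT = 120

def overlong_lines(source: str, limit: int = LIMIT) -> list[int]:
    """Return the 1-based numbers of lines longer than `limit` characters."""
    return [i for i, line in enumerate(source.splitlines(), start=1)
            if len(line) > limit]

print(overlong_lines("short\n" + "x" * 121))  # -> [2]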
18 changes: 6 additions & 12 deletions application/app.py
@@ -1,8 +1,9 @@
import asyncio
import datetime
import http.client
import json
import os
import traceback
import asyncio

import dotenv
import requests
@@ -26,10 +26,9 @@
from pymongo import MongoClient
from werkzeug.utils import secure_filename

from core.settings import settings
from error import bad_request
from worker import ingest_worker
from core.settings import settings
import celeryconfig

# os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if settings.LLM_NAME == "openai_chat":
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
llm = ChatOpenAI(openai_api_key=api_key)
llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4"
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
messages_reduce = [
SystemMessagePromptTemplate.from_template(chat_reduce_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
elif settings.LLM_NAME == "manifest":
@@ -226,7 +220,7 @@ def api_answer():
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
except Exception:
pass

# mock result
@@ -295,7 +289,7 @@ def api_feedback():
"feedback": feedback
})
)
return {"status": 'ok'}
return {"status": http.client.responses.get(response.status_code, 'ok')}


@app.route('/api/combine', methods=['GET'])
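
Two of the fixes in this file deserve a note. Ruff's E722 flags the bare except: replaced above, since a bare clause also traps KeyboardInterrupt and SystemExit; and http.client.responses is the stdlib mapping from status code to reason phrase used in the new feedback return value. A short sketch of both:

import http.client

# http.client.responses maps integer status codes to reason phrases,
# with .get() providing the 'ok' fallback used in api_feedback above.
print(http.client.responses[200])            # 'OK'
print(http.client.responses.get(404, 'ok'))  # 'Not Found'
print(http.client.responses.get(999, 'ok'))  # 'ok' (unknown code)

def strip_sources(answer: str) -> str:
    """Illustrative helper mirroring the api_answer fix above."""
    try:
        return answer.split("SOURCES:")[0]
    except Exception:  # E722: a bare `except:` would also swallow KeyboardInterrupt
        return answer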
3 changes: 2 additions & 1 deletion application/celeryconfig.py
@@ -1,7 +1,8 @@
import os

broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")

task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']
accept_content = ['json']
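
celeryconfig.py stays a plain settings module that Celery loads by name; a minimal consumption sketch (the app and broker URLs here are illustrative placeholders, not from this commit):

# Minimal sketch of loading celeryconfig.py by module name; the broker
# URLs below are placeholders for illustration only.
import os
from celery import Celery

os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")

celery = Celery(__name__)
celery.config_from_object("celeryconfig")  # picks up broker_url, serializers, etc.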
3 changes: 2 additions & 1 deletion application/core/settings.py
@@ -1,6 +1,7 @@
from pydantic import BaseSettings
from pathlib import Path

from pydantic import BaseSettings


class Settings(BaseSettings):
LLM_NAME: str = "openai_chat"
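
With pydantic's BaseSettings (the v1 import shown above), each field can be overridden by an environment variable of the same name; a quick sketch:

# Sketch of BaseSettings resolution (pydantic v1 API, as imported above):
# environment variables override field defaults by name.
import os
from pydantic import BaseSettings

class Settings(BaseSettings):
    LLM_NAME: str = "openai_chat"

os.environ["LLM_NAME"] = "openai"
print(Settings().LLM_NAME)  # 'openai' — the env var wins over the default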
10 changes: 6 additions & 4 deletions application/error.py
@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES

def response_error(code_status,message=None):
payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}

def response_error(code_status, message=None):
payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
if message:
payload['message'] = message
response = jsonify(payload)
response.status_code = code_status
return response

def bad_request(status_code=400,message=''):
return response_error(code_status=status_code,message=message)

def bad_request(status_code=400, message=''):
return response_error(code_status=status_code, message=message)
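
Continuing from the module above, a usage sketch — jsonify needs an active Flask app context, and the message value here is purely illustrative:

# Usage sketch for response_error/bad_request; jsonify requires an app context.
from flask import Flask

app = Flask(__name__)
with app.app_context():
    resp = bad_request(message="missing 'question' field")
    print(resp.status_code)  # 400
    print(resp.get_json())   # {'error': 'Bad Request', 'message': "missing 'question' field"}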
1 change: 0 additions & 1 deletion application/parser/file/base.py
@@ -3,7 +3,6 @@
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document


33 changes: 17 additions & 16 deletions application/parser/file/html_parser.py
@@ -9,6 +9,7 @@

from parser.file.base_parser import BaseParser


class HTMLParser(BaseParser):
"""HTML parser."""

@@ -23,38 +24,37 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)

# Removing non ascii characters from isd_el['text']
# Removing non ascii characters from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)

# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)

# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']

# Creating 'Chunks' - List of lists of strings
# each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,19 +64,20 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
Chunks = [[]]
final_chunks = list(list())

for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of length of all the strings in the chunk < 25 #TODO: This value can be a user-defined variable
# Removing all the chunks with sum of length of all the strings in the chunk < 25
# TODO: This value can be a user-defined variable
for chunk in Chunks:
# sum of length of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks
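
The grouping above can be restated compactly; note that filtering into a new list, as in this sketch, sidesteps the remove-while-iterating pitfall in the loop above:

# Condensed restatement of the Title-based chunking above, using plain dicts
# in place of unstructured's ISD elements. Building a new filtered list avoids
# mutating Chunks while iterating over it, as the original loop does.
def chunk_by_title(isd: list[dict], min_chars: int = 25) -> list[str]:
    chunks: list[list[str]] = [[]]
    for el in isd:
        if el["type"] == "Title":
            chunks.append([])
        chunks[-1].append(el["text"])
    joined = (" ".join(c) for c in chunks)
    return [text for text in joined if len(text) >= min_chars]

isd = [{"text": "My Title", "type": "Title"},
       {"text": "My Narrative, long enough to pass the filter", "type": "NarrativeText"}]
print(chunk_by_title(isd))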
25 changes: 13 additions & 12 deletions application/parser/file/markdown_parser.py
@@ -7,8 +7,8 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):
@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -35,8 +35,8 @@ def __init__(
self._max_tokens = max_tokens
# self._remove_tables = remove_tables


def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_heade
else:
tups.append((current_header, current_text))
return tups

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
@@ -115,7 +116,7 @@ def _init_parser(self) -> Dict:
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -130,7 +131,7 @@ def parse_tups(
return markdown_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
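
tups_chunk_append budgets each chunk with tiktoken (the over-limit branch is collapsed in this diff); counting tokens with the same encoding looks like this sketch:

# Token counting as used in tups_chunk_append; cl100k_base is the encoding
# whose count is compared against self._max_tokens (2048 by default).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "## Header\nSome markdown body text."
print(len(enc.encode(text)))  # token count for the current chunk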
36 changes: 18 additions & 18 deletions application/parser/file/rst_parser.py
@@ -5,10 +5,10 @@
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):
"""reStructuredText parser.
@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ def __init__(
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess


def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +55,8 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue
@@ -72,7 +72,7 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

rst_tups.append((current_header, current_text))

#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
@@ -136,7 +136,7 @@ def _init_parser(self) -> Dict:
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -159,7 +159,7 @@ def parse_tups(
return rst_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
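
Header detection in rst_to_tups hinges on the regex for underline rows; a standalone check of that exact pattern:

# Standalone check of the underline regex from rst_to_tups: a row of '-' or
# '=' (optionally padded with non-newline whitespace) marks a section header.
import re

UNDERLINE = re.compile(r"^[^\S\n]*[-=]+[^\S\n]*$")

for line in ["Header", "======", "  ----  ", "not an underline"]:
    print(repr(line), bool(UNDERLINE.match(line)))
# Only '======' and '  ----  ' match.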
14 changes: 7 additions & 7 deletions application/parser/file/tabular_parser.py
@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""

def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
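
The options on PandasCSVParser amount to a read-and-join over a DataFrame; a minimal sketch of that behavior under stated assumptions (the CSV path is a placeholder, and the join logic is inferred from the option names, not shown in this diff):

# Minimal sketch of PandasCSVParser's joining behavior: read a CSV with
# pandas, join each row's cells with col_joiner, then rows with row_joiner.
# "data.csv" is a placeholder path.
import pandas as pd

col_joiner, row_joiner = ", ", "\n"
df = pd.read_csv("data.csv")
rows = [col_joiner.join(str(v) for v in row) for row in df.itertuples(index=False)]
print(row_joiner.join(rows))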