Linting
* validate python formatting on every build with Ruff
* fix lint warnings
larinam authored May 13, 2023
1 parent 168648e commit 962becb
Showing 35 changed files with 271 additions and 246 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
with:
17 changes: 17 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,17 @@
name: Python linting

on:
push:
branches:
- '*'
pull_request:
types: [ opened, synchronize ]

jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- name: Lint with Ruff
uses: chartboost/ruff-action@v1
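
For parity with this check before pushing, a rough local equivalent — assuming Ruff is installed via pip install ruff — is the following sketch:

# Rough local equivalent of the CI job above; assumes `pip install ruff`.
# Ruff picks up the .ruff.toml at the repository root automatically.
import subprocess
import sys

result = subprocess.run(["ruff", "check", "."])
sys.exit(result.returncode)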
2 changes: 2 additions & 0 deletions .ruff.toml
@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120
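
The only setting here raises the line-length limit behind Ruff's E501 rule from the default 88 to 120 characters. As a rough illustration of what that rule checks (a simplification, not Ruff's actual implementation):

# Illustration of what `line-length = 120` means for the E501 rule;
# a simplification, not Ruff's actual implementation.
LIMIT = 120

def overlong_lines(source: str, limit: int = LIMIT) -> list[int]:
    """Return the 1-based numbers of lines longer than `limit` characters."""
    return [i for i, line in enumerate(source.splitlines(), start=1)
            if len(line) > limit]

print(overlong_lines("short\n" + "x" * 121))  # -> [2]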
18 changes: 6 additions & 12 deletions application/app.py
@@ -1,8 +1,9 @@
import asyncio
import datetime
import http.client
import json
import os
import traceback
import asyncio

import dotenv
import requests
@@ -26,10 +26,9 @@
from pymongo import MongoClient
from werkzeug.utils import secure_filename

from core.settings import settings
from error import bad_request
from worker import ingest_worker
from core.settings import settings
import celeryconfig

# os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if settings.LLM_NAME == "openai_chat":
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
llm = ChatOpenAI(openai_api_key=api_key)
llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4"
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
messages_reduce = [
SystemMessagePromptTemplate.from_template(chat_reduce_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
elif settings.LLM_NAME == "manifest":
@@ -226,7 +220,7 @@ def api_answer():
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
except Exception:
pass

# mock result
@@ -295,7 +289,7 @@ def api_feedback():
"feedback": feedback
})
)
return {"status": 'ok'}
return {"status": http.client.responses.get(response.status_code, 'ok')}


@app.route('/api/combine', methods=['GET'])
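
Two of the fixes in this file deserve a note. Ruff's E722 flags the bare except: replaced above, since a bare clause also traps KeyboardInterrupt and SystemExit; and http.client.responses is the stdlib mapping from status code to reason phrase used in the new feedback return value. A short sketch of both:

import http.client

# http.client.responses maps integer status codes to reason phrases,
# with .get() providing the 'ok' fallback used in api_feedback above.
print(http.client.responses[200])            # 'OK'
print(http.client.responses.get(404, 'ok'))  # 'Not Found'
print(http.client.responses.get(999, 'ok'))  # 'ok' (unknown code)

def strip_sources(answer: str) -> str:
    """Illustrative helper mirroring the api_answer fix above."""
    try:
        return answer.split("SOURCES:")[0]
    except Exception:  # E722: a bare `except:` would also swallow KeyboardInterrupt
        return answer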
3 changes: 2 additions & 1 deletion application/celeryconfig.py
@@ -1,7 +1,8 @@
import os

broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")

task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']
accept_content = ['json']
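
celeryconfig.py stays a plain settings module that Celery loads by name; a minimal consumption sketch (the app and broker URLs here are illustrative placeholders, not from this commit):

# Minimal sketch of loading celeryconfig.py by module name; the broker
# URLs below are placeholders for illustration only.
import os
from celery import Celery

os.environ.setdefault("CELERY_BROKER_URL", "redis://localhost:6379/0")
os.environ.setdefault("CELERY_RESULT_BACKEND", "redis://localhost:6379/1")

celery = Celery(__name__)
celery.config_from_object("celeryconfig")  # picks up broker_url, serializers, etc.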
3 changes: 2 additions & 1 deletion application/core/settings.py
@@ -1,6 +1,7 @@
from pydantic import BaseSettings
from pathlib import Path

from pydantic import BaseSettings


class Settings(BaseSettings):
LLM_NAME: str = "openai_chat"
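
With pydantic's BaseSettings (the v1 import shown above), each field can be overridden by an environment variable of the same name; a quick sketch:

# Sketch of BaseSettings resolution (pydantic v1 API, as imported above):
# environment variables override field defaults by name.
import os
from pydantic import BaseSettings

class Settings(BaseSettings):
    LLM_NAME: str = "openai_chat"

os.environ["LLM_NAME"] = "openai"
print(Settings().LLM_NAME)  # 'openai' — the env var wins over the default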
10 changes: 6 additions & 4 deletions application/error.py
@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES

def response_error(code_status,message=None):
payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}

def response_error(code_status, message=None):
payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
if message:
payload['message'] = message
response = jsonify(payload)
response.status_code = code_status
return response

def bad_request(status_code=400,message=''):
return response_error(code_status=status_code,message=message)

def bad_request(status_code=400, message=''):
return response_error(code_status=status_code, message=message)
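
Continuing from the module above, a usage sketch — jsonify needs an active Flask app context, and the message value here is purely illustrative:

# Usage sketch for response_error/bad_request; jsonify requires an app context.
from flask import Flask

app = Flask(__name__)
with app.app_context():
    resp = bad_request(message="missing 'question' field")
    print(resp.status_code)  # 400
    print(resp.get_json())   # {'error': 'Bad Request', 'message': "missing 'question' field"}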
1 change: 0 additions & 1 deletion application/parser/file/base.py
@@ -3,7 +3,6 @@
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document


33 changes: 17 additions & 16 deletions application/parser/file/html_parser.py
@@ -9,6 +9,7 @@

from parser.file.base_parser import BaseParser


class HTMLParser(BaseParser):
"""HTML parser."""

@@ -23,38 +24,37 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)

# Removing non ascii characters from isd_el['text']
# Removing non ascii characters from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)

# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)

# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']

# Creating 'Chunks' - List of lists of strings
# each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
@@ -64,19 +64,20 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
Chunks = [[]]
final_chunks = list(list())

for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of length of all the strings in the chunk < 25 #TODO: This value can be a user-defined variable
# Removing all the chunks with sum of length of all the strings in the chunk < 25
# TODO: This value can be a user-defined variable
for chunk in Chunks:
# sum of length of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks
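
The grouping above can be restated compactly; note that filtering into a new list, as in this sketch, sidesteps the remove-while-iterating pitfall in the loop above:

# Condensed restatement of the Title-based chunking above, using plain dicts
# in place of unstructured's ISD elements. Building a new filtered list avoids
# mutating Chunks while iterating over it, as the original loop does.
def chunk_by_title(isd: list[dict], min_chars: int = 25) -> list[str]:
    chunks: list[list[str]] = [[]]
    for el in isd:
        if el["type"] == "Title":
            chunks.append([])
        chunks[-1].append(el["text"])
    joined = (" ".join(c) for c in chunks)
    return [text for text in joined if len(text) >= min_chars]

isd = [{"text": "My Title", "type": "Title"},
       {"text": "My Narrative, long enough to pass the filter", "type": "NarrativeText"}]
print(chunk_by_title(isd))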
25 changes: 13 additions & 12 deletions application/parser/file/markdown_parser.py
@@ -7,8 +7,8 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):
@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -35,8 +35,8 @@ def __init__(
self._max_tokens = max_tokens
# self._remove_tables = remove_tables


def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_heade
else:
tups.append((current_header, current_text))
return tups

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.
@@ -115,7 +116,7 @@ def _init_parser(self) -> Dict:
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -130,7 +131,7 @@ def parse_tups(
return markdown_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
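
tups_chunk_append budgets each chunk with tiktoken (the over-limit branch is collapsed in this diff); counting tokens with the same encoding looks like this sketch:

# Token counting as used in tups_chunk_append; cl100k_base is the encoding
# whose count is compared against self._max_tokens (2048 by default).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "## Header\nSome markdown body text."
print(len(enc.encode(text)))  # token count for the current chunk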
36 changes: 18 additions & 18 deletions application/parser/file/rst_parser.py
@@ -5,10 +5,10 @@
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):
"""reStructuredText parser.
@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ def __init__(
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess


def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +55,8 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue
@@ -72,7 +72,7 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

rst_tups.append((current_header, current_text))

#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined
@@ -136,7 +136,7 @@ def _init_parser(self) -> Dict:
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
@@ -159,7 +159,7 @@ def parse_tups(
return rst_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
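
Header detection in rst_to_tups hinges on the regex for underline rows; a standalone check of that exact pattern:

# Standalone check of the underline regex from rst_to_tups: a row of '-' or
# '=' (optionally padded with non-newline whitespace) marks a section header.
import re

UNDERLINE = re.compile(r"^[^\S\n]*[-=]+[^\S\n]*$")

for line in ["Header", "======", "  ----  ", "not an underline"]:
    print(repr(line), bool(UNDERLINE.match(line)))
# Only '======' and '  ----  ' match.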
14 changes: 7 additions & 7 deletions application/parser/file/tabular_parser.py
@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""

def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
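
The options on PandasCSVParser amount to a read-and-join over a DataFrame; a minimal sketch of that behavior under stated assumptions (the CSV path is a placeholder, and the join logic is inferred from the option names, not shown in this diff):

# Minimal sketch of PandasCSVParser's joining behavior: read a CSV with
# pandas, join each row's cells with col_joiner, then rows with row_joiner.
# "data.csv" is a placeholder path.
import pandas as pd

col_joiner, row_joiner = ", ", "\n"
df = pd.read_csv("data.csv")
rows = [col_joiner.join(str(v) for v in row) for row in df.itertuples(index=False)]
print(row_joiner.join(rows))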