From cf9d7d49662f0b1d57cf40b1e106c71ad44066e6 Mon Sep 17 00:00:00 2001 From: zilaei Date: Mon, 22 Apr 2024 11:01:38 +0200 Subject: [PATCH 1/5] feat: parse uploaded files --- .../fai-backend/fai_backend/files/service.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fai-rag-app/fai-backend/fai_backend/files/service.py b/fai-rag-app/fai-backend/fai_backend/files/service.py index 0a4e083c..4397d7c1 100644 --- a/fai-rag-app/fai-backend/fai_backend/files/service.py +++ b/fai-rag-app/fai-backend/fai_backend/files/service.py @@ -6,6 +6,7 @@ from fastapi import UploadFile from pydantic import ByteSize +from fai_backend.files.file_parser import ParserFactory from fai_backend.files.models import FileInfo @@ -67,3 +68,22 @@ def get_latest_upload_path(self, project_id: str) -> str | None: latest_directory = sorted(project_directories, key=lambda x: (x.split('_')[2], x.split('_')[3]), reverse=True)[ 0] return os.path.join(self.upload_dir, latest_directory) + + def parse_uploaded_files(self, project_id: str) -> list: + parsed_files = [] + + latest_upload_path = self.get_latest_upload_path(project_id) + if not latest_upload_path: + return parsed_files + + uploaded_files = [] + for file_name in os.listdir(latest_upload_path): + file_path = os.path.join(latest_upload_path, file_name) + if os.path.isfile(file_path): + uploaded_files.append(file_path) + + for file in uploaded_files: + parser = ParserFactory.get_parser(file) + parsed_files.append(parser.parse(file)) + + return parsed_files From 93b8758c19858d4cdafbce73f98c30be6b56a603 Mon Sep 17 00:00:00 2001 From: zilaei Date: Mon, 22 Apr 2024 11:01:38 +0200 Subject: [PATCH 2/5] feat: parse uploaded files --- fai-rag-app/fai-backend/fai_backend/files/service.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fai-rag-app/fai-backend/fai_backend/files/service.py b/fai-rag-app/fai-backend/fai_backend/files/service.py index 4397d7c1..064aece4 100644 --- 
a/fai-rag-app/fai-backend/fai_backend/files/service.py +++ b/fai-rag-app/fai-backend/fai_backend/files/service.py @@ -76,14 +76,10 @@ def parse_uploaded_files(self, project_id: str) -> list: if not latest_upload_path: return parsed_files - uploaded_files = [] - for file_name in os.listdir(latest_upload_path): - file_path = os.path.join(latest_upload_path, file_name) - if os.path.isfile(file_path): - uploaded_files.append(file_path) + uploaded_files = self.list_files(project_id) for file in uploaded_files: - parser = ParserFactory.get_parser(file) - parsed_files.append(parser.parse(file)) + parser = ParserFactory.get_parser(file.path) + parsed_files.append(parser.parse(file.path)) return parsed_files From 03f946a83625adfd77b663e8d83e58caa6a7c95e Mon Sep 17 00:00:00 2001 From: zilaei Date: Thu, 25 Apr 2024 08:50:59 +0200 Subject: [PATCH 3/5] feat: dump parsed file content to JSON --- .../fai-backend/fai_backend/files/service.py | 60 ++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/fai-rag-app/fai-backend/fai_backend/files/service.py b/fai-rag-app/fai-backend/fai_backend/files/service.py index 064aece4..dbbe00d4 100644 --- a/fai-rag-app/fai-backend/fai_backend/files/service.py +++ b/fai-rag-app/fai-backend/fai_backend/files/service.py @@ -1,3 +1,4 @@ +import json import mimetypes import os import uuid @@ -32,19 +33,10 @@ def save_files(self, project_id: str, files: list[UploadFile]) -> str: file_object.write(file.file.read()) return upload_path - def list_files(self, project_id: str) -> list[FileInfo]: - project_directories = [d for d in os.listdir(self.upload_dir) if d.startswith(f'project_{project_id}_')] - if not project_directories: - return [] - - latest_directory = sorted(project_directories, key=lambda x: (x.split('_')[2], x.split('_')[3]), reverse=True)[ - 0] - latest_directory_path = os.path.join(self.upload_dir, latest_directory) - upload_date = datetime.fromtimestamp(os.path.getctime(latest_directory_path)) - + def 
get_file_infos(self, directory_path, upload_date: datetime) -> list[FileInfo]: file_infos = [] - for file_name in os.listdir(latest_directory_path): - file_path = os.path.join(latest_directory_path, file_name) + for file_name in os.listdir(directory_path): + file_path = os.path.join(directory_path, file_name) if os.path.isfile(file_path): stat = os.stat(file_path) mime_type, _ = mimetypes.guess_type(file_path) @@ -60,6 +52,18 @@ def list_files(self, project_id: str) -> list[FileInfo]: return file_infos + def list_files(self, project_id: str) -> list[FileInfo]: + project_directories = [d for d in os.listdir(self.upload_dir) if d.startswith(f'project_{project_id}_')] + if not project_directories: + return [] + + latest_directory = sorted(project_directories, key=lambda x: (x.split('_')[2], x.split('_')[3]), reverse=True)[ + 0] + latest_directory_path = os.path.join(self.upload_dir, latest_directory) + upload_date = datetime.fromtimestamp(os.path.getctime(latest_directory_path)) + + return self.get_file_infos(latest_directory_path, upload_date) + def get_latest_upload_path(self, project_id: str) -> str | None: project_directories = [d for d in os.listdir(self.upload_dir) if d.startswith(f'project_{project_id}_')] if not project_directories: @@ -69,17 +73,33 @@ def get_latest_upload_path(self, project_id: str) -> str | None: 0] return os.path.join(self.upload_dir, latest_directory) - def parse_uploaded_files(self, project_id: str) -> list: + def parse_files(self, src_directory_path: str): parsed_files = [] - latest_upload_path = self.get_latest_upload_path(project_id) - if not latest_upload_path: - return parsed_files + upload_date = datetime.fromtimestamp(os.path.getctime(src_directory_path)) + files = self.get_file_infos(src_directory_path, upload_date) - uploaded_files = self.list_files(project_id) - - for file in uploaded_files: + for file in files: parser = ParserFactory.get_parser(file.path) - parsed_files.append(parser.parse(file.path)) + parsed_file = 
parser.parse(file.path) +        parsed_files.extend(parsed_file) return parsed_files + +    def dump_list_to_json(self, parsed_files: list[str], dest_directory_path: str, dest_file_name: str): +        if not os.path.isabs(dest_directory_path): +            raise ValueError("Destination path must be absolute") + +        os.makedirs(dest_directory_path, exist_ok=True) +        file_path = os.path.join(dest_directory_path, dest_file_name + '.json') +        if os.path.exists(file_path): +            raise FileExistsError(f"The file {file_path} already exists.") + +        try: +            stringify_parsed_files = [str(elem) for elem in parsed_files] +            with open(file_path, 'w') as f: +                json.dump(stringify_parsed_files, f, indent=4) +        except (IOError, OSError) as e: +            raise Exception(f"Failed to write to {file_path}: {str(e)}") +        except TypeError as e: +            raise Exception(f"Data serialization error: {str(e)}") From ab515cb44fee6498501232c5e697a70ae9d202ac Mon Sep 17 00:00:00 2001 From: zilaei Date: Thu, 25 Apr 2024 08:52:06 +0200 Subject: [PATCH 4/5] feat: add route for parsing and saving documents --- .../fai-backend/fai_backend/documents/routes.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fai-rag-app/fai-backend/fai_backend/documents/routes.py b/fai-rag-app/fai-backend/fai_backend/documents/routes.py index 0c964e04..483cc5d8 100644 --- a/fai-rag-app/fai-backend/fai_backend/documents/routes.py +++ b/fai-rag-app/fai-backend/fai_backend/documents/routes.py @@ -92,3 +92,18 @@ def upload_handler( c.FireEvent(event=e.GoToEvent(url='/documents')), _('submit_a_question', 'Create Question'''), ) + + +@router.get('/documents/parse_and_save', response_model=list, response_model_exclude_none=True) def parse_documents( src_directory_path: str, dest_directory_path: str, dest_file_name: str, file_service: FileUploadService = Depends(get_file_upload_service), ) -> list: parsed_files = file_service.parse_files(src_directory_path) stringify_parsed_files = [str(elem) for elem in parsed_files] + + 
file_service.dump_list_to_json(stringify_parsed_files, dest_directory_path, dest_file_name) + + return [] From ce056a135bd4ed1818abcf2eb87fe61da1c5531c Mon Sep 17 00:00:00 2001 From: zilaei Date: Thu, 25 Apr 2024 08:59:27 +0200 Subject: [PATCH 5/5] fix: don't append `.json` to files ending with `.json` --- fai-rag-app/fai-backend/fai_backend/files/service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fai-rag-app/fai-backend/fai_backend/files/service.py b/fai-rag-app/fai-backend/fai_backend/files/service.py index dbbe00d4..9704f628 100644 --- a/fai-rag-app/fai-backend/fai_backend/files/service.py +++ b/fai-rag-app/fai-backend/fai_backend/files/service.py @@ -90,8 +90,12 @@ def dump_list_to_json(self, parsed_files: list[str], dest_directory_path: str, d if not os.path.isabs(dest_directory_path): raise ValueError("Destination path must be absolute") -        os.makedirs(dest_directory_path, exist_ok=True) +        if dest_file_name.endswith('.json'): +            dest_file_name = dest_file_name.removesuffix('.json') + file_path = os.path.join(dest_directory_path, dest_file_name + '.json') +        os.makedirs(dest_directory_path, exist_ok=True) + if os.path.exists(file_path): raise FileExistsError(f"The file {file_path} already exists.")