From d06ea78232df2caac17bea21f94524430f7f4a32 Mon Sep 17 00:00:00 2001 From: Zeke Zhang Date: Mon, 26 Aug 2024 23:09:09 +1000 Subject: [PATCH 01/11] chore: update messaging --- src/SumGPT.py | 144 +++++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 59 deletions(-) diff --git a/src/SumGPT.py b/src/SumGPT.py index b2040ea..c5b46ae 100644 --- a/src/SumGPT.py +++ b/src/SumGPT.py @@ -1,15 +1,18 @@ import asyncio + import streamlit as st import Components.StreamlitSetup as StreamlitSetup + StreamlitSetup.setup() -import Modules.Youtube -from Components.sidebar import sidebar -import Modules.file_io as file_io +import time + import GPT +import Modules.file_io as file_io +import Modules.Youtube import util -import time +from Components.sidebar import sidebar app_header = st.container() @@ -21,7 +24,9 @@ st.title("πŸ“ SumGPT") st.markdown("##### Summarize your text with OpenAI's GPT-3.5 / GPT-4 API") st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)") - st.warning("🚧️ This app is still in beta. Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo.") + st.warning( + "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo." + ) sidebar() @@ -31,19 +36,25 @@ youtube_link_empty = st.empty() upload_file_emtpy = st.empty() - youtube_link = youtube_link_empty.text_input(label="πŸ”— YouTube Link", - placeholder="Enter your YouTube link", - help="Enter your YouTube link to download the video and extract the audio") - upload_file = upload_file_emtpy.file_uploader("πŸ“ Upload your file", type=['txt', 'pdf', 'docx', 'md']) + youtube_link = youtube_link_empty.text_input( + label="πŸ”— YouTube Link", + placeholder="Enter your YouTube link", + help="Enter your YouTube link to download the video and extract the audio", + ) + upload_file = upload_file_emtpy.file_uploader( + "πŸ“ Upload your file", type=["txt", "pdf", "docx", "md"] + ) if youtube_link: upload_file_emtpy.empty() with st.spinner("πŸ” Extracting transcript..."): - transcript, title = Modules.Youtube.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES']) - file_content = {'name': f"{title}.txt", 'content': transcript} + transcript, title = Modules.Youtube.extract_youtube_transcript( + youtube_link, st.session_state["CAPTION_LANGUAGES"] + ) + file_content = {"name": f"{title}.txt", "content": transcript} elif upload_file: youtube_link_empty.empty() with st.spinner("πŸ” Reading file... 
(mp3 file might take a while)"): - file_content = {'name': upload_file.name, 'content': file_io.read(upload_file)} + file_content = {"name": upload_file.name, "content": file_io.read(upload_file)} elif youtube_link and upload_file: st.warning("Please only upload one file at a time") else: @@ -52,50 +63,50 @@ with content_handler: if file_content: with st.expander("File Preview"): - if file_content['name'].endswith(".pdf"): - content = "\n\n".join(file_content['content']) - st.text_area(file_content['name'], content, height=200) + if file_content["name"].endswith(".pdf"): + content = "\n\n".join(file_content["content"]) + st.text_area(file_content["name"], content, height=200) else: - content = file_content['content'] - st.text_area(file_content['name'], content, height=200) + content = file_content["content"] + st.text_area(file_content["name"], content, height=200) with result_handler: if file_content: chunks = [] - content = file_content['content'] - if file_content['name'].endswith(".pdf"): - content = "\n\n".join(file_content['content']) - chunks.extend(util.convert_to_chunks(content, chunk_size=st.session_state['CHUNK_SIZE'])) + content = file_content["content"] + if file_content["name"].endswith(".pdf"): + content = "\n\n".join(file_content["content"]) + chunks.extend(util.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"])) with st.expander(f"Chunks ({len(chunks)})"): for chunk in chunks: st.write(chunk) - token_usage = GPT.misc.predict_token(st.session_state['OPENAI_PARAMS'], chunks) + token_usage = GPT.misc.predict_token(st.session_state["OPENAI_PARAMS"], chunks) param = st.session_state["OPENAI_PARAMS"] - prompt_token = token_usage['prompt'] - completion_token = token_usage['completion'] - if param.model == 'gpt-4': + prompt_token = token_usage["prompt"] + completion_token = token_usage["completion"] + if param.model == "gpt-4": price = round(prompt_token * 0.00003 + completion_token * 0.00006, 5) - st.markdown('**Note:** To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api)' - " if you haven't already received an invitation from OpenAI.") - st.info("ℹ️️ Please keep in mind that GPT-4 is significantly **[more expensive](https://openai.com/pricing#language-models)** than GPT-3.5. 
") - elif param.model == 'gpt-3.5-turbo-16k': - price = round(prompt_token * 0.000003 + completion_token *0.000004, 5) + elif param.model == "gpt-3.5-turbo-16k": + price = round(prompt_token * 0.000003 + completion_token * 0.000004, 5) else: - price = round(prompt_token * 0.0000015 + completion_token * 0.000002 , 5) + price = round(prompt_token * 0.0000015 + completion_token * 0.000002, 5) st.markdown( - f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`") + f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`" + ) # max tokens exceeded warning - exceeded = util.exceeded_token_handler(param=st.session_state['OPENAI_PARAMS'], chunks=chunks) + exceeded = util.exceeded_token_handler( + param=st.session_state["OPENAI_PARAMS"], chunks=chunks + ) # load cached results - if st.session_state['PREVIOUS_RESULTS'] is not None: - rec_responses = st.session_state['PREVIOUS_RESULTS']['rec_responses'] - rec_id = st.session_state['PREVIOUS_RESULTS']['rec_ids'] - final_response = st.session_state['PREVIOUS_RESULTS']['final_response'] - finish_reason_rec = st.session_state['PREVIOUS_RESULTS']['finish_reason_rec'] - finish_reason_final = st.session_state['PREVIOUS_RESULTS']['finish_reason_final'] + if st.session_state["PREVIOUS_RESULTS"] is not None: + rec_responses = st.session_state["PREVIOUS_RESULTS"]["rec_responses"] + rec_id = st.session_state["PREVIOUS_RESULTS"]["rec_ids"] + final_response = st.session_state["PREVIOUS_RESULTS"]["final_response"] + finish_reason_rec = st.session_state["PREVIOUS_RESULTS"]["finish_reason_rec"] + finish_reason_final = st.session_state["PREVIOUS_RESULTS"]["finish_reason_final"] else: rec_responses = None rec_id = None @@ -107,46 +118,61 @@ if st.button("πŸš€ Run", disabled=exceeded): start_time = time.time() st.cache_data.clear() - API_KEY = st.session_state['OPENAI_API_KEY'] + API_KEY = st.session_state["OPENAI_API_KEY"] if not API_KEY and not GPT.misc.validate_api_key(API_KEY): - st.error("❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys).") + st.error( + "❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys)." + ) else: with st.spinner("Summarizing... 
(this might take a while)"): - if st.session_state['LEGACY']: - rec_max_token = st.session_state['OPENAI_PARAMS'].max_tokens_rec - rec_responses, finish_reason_rec = util.recursive_summarize(chunks, rec_max_token) - if st.session_state['FINAL_SUMMARY_MODE']: + if st.session_state["LEGACY"]: + rec_max_token = st.session_state["OPENAI_PARAMS"].max_tokens_rec + rec_responses, finish_reason_rec = util.recursive_summarize( + chunks, rec_max_token + ) + if st.session_state["FINAL_SUMMARY_MODE"]: final_response, finish_reason_final = util.summarize(rec_responses) else: final_response = None else: - completions, final_response = asyncio.run(util.summarize_experimental_concurrently(content, st.session_state['CHUNK_SIZE'])) + completions, final_response = asyncio.run( + util.summarize_experimental_concurrently( + content, st.session_state["CHUNK_SIZE"] + ) + ) rec_responses = [d["content"] for d in completions] rec_ids = [d["chunk_id"] for d in completions] # save previous completions - resp = {'rec_responses': rec_responses, - 'rec_ids': rec_ids, - 'final_response': final_response, - 'finish_reason_rec': finish_reason_rec, - 'finish_reason_final': finish_reason_final} - if resp != st.session_state['PREVIOUS_RESULTS']: - st.session_state['PREVIOUS_RESULTS'] = resp + resp = { + "rec_responses": rec_responses, + "rec_ids": rec_ids, + "final_response": final_response, + "finish_reason_rec": finish_reason_rec, + "finish_reason_final": finish_reason_final, + } + if resp != st.session_state["PREVIOUS_RESULTS"]: + st.session_state["PREVIOUS_RESULTS"] = resp end_time = time.time() st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`") if rec_responses is not None: - with st.expander("Recursive Summaries", expanded=not st.session_state['FINAL_SUMMARY_MODE']): + with st.expander( + "Recursive Summaries", expanded=not st.session_state["FINAL_SUMMARY_MODE"] + ): for i, response in enumerate(rec_responses): - st.info(f'{response}') - if finish_reason_rec == 'length': - st.warning('⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter.') + st.info(f"{response}") + if finish_reason_rec == "length": + st.warning( + "⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter." + ) if final_response is not None: st.header("πŸ“Summary") st.info(final_response) - if finish_reason_final == 'length': + if finish_reason_final == "length": st.warning( - '⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter.') + "⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter." 
+ ) if final_response is not None or rec_responses is not None: util.download_results(rec_responses, final_response) From d9e3af75d5c7b3766d7073888c009b3e2dc66d12 Mon Sep 17 00:00:00 2001 From: Zeke Zhang Date: Mon, 26 Aug 2024 23:13:56 +1000 Subject: [PATCH 02/11] refactor: move helpers under `utils` module --- src/SumGPT.py | 24 ++++++++++++------------ src/utils/__init__.py | 3 +++ src/{util.py => utils/helpers.py} | 0 3 files changed, 15 insertions(+), 12 deletions(-) create mode 100644 src/utils/__init__.py rename src/{util.py => utils/helpers.py} (100%) diff --git a/src/SumGPT.py b/src/SumGPT.py index c5b46ae..355ebd4 100644 --- a/src/SumGPT.py +++ b/src/SumGPT.py @@ -6,13 +6,13 @@ StreamlitSetup.setup() -import time +import time # noqa: E402 -import GPT -import Modules.file_io as file_io -import Modules.Youtube -import util -from Components.sidebar import sidebar +import GPT # noqa: E402 +import Modules.file_io as file_io # noqa: E402 +import Modules.Youtube # noqa: E402 +import utils.helpers as helpers # noqa: E402 +from Components.sidebar import sidebar # noqa: E402 app_header = st.container() @@ -76,7 +76,7 @@ content = file_content["content"] if file_content["name"].endswith(".pdf"): content = "\n\n".join(file_content["content"]) - chunks.extend(util.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"])) + chunks.extend(helpers.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"])) with st.expander(f"Chunks ({len(chunks)})"): for chunk in chunks: @@ -96,7 +96,7 @@ f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`" ) # max tokens exceeded warning - exceeded = util.exceeded_token_handler( + exceeded = helpers.exceeded_token_handler( param=st.session_state["OPENAI_PARAMS"], chunks=chunks ) @@ -127,16 +127,16 @@ with st.spinner("Summarizing... (this might take a while)"): if st.session_state["LEGACY"]: rec_max_token = st.session_state["OPENAI_PARAMS"].max_tokens_rec - rec_responses, finish_reason_rec = util.recursive_summarize( + rec_responses, finish_reason_rec = helpers.recursive_summarize( chunks, rec_max_token ) if st.session_state["FINAL_SUMMARY_MODE"]: - final_response, finish_reason_final = util.summarize(rec_responses) + final_response, finish_reason_final = helpers.summarize(rec_responses) else: final_response = None else: completions, final_response = asyncio.run( - util.summarize_experimental_concurrently( + helpers.summarize_experimental_concurrently( content, st.session_state["CHUNK_SIZE"] ) ) @@ -175,4 +175,4 @@ "⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter." ) if final_response is not None or rec_responses is not None: - util.download_results(rec_responses, final_response) + helpers.download_results(rec_responses, final_response) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..5fe7f8d --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,3 @@ +from utils import helpers + +__all__ = ["helpers"] diff --git a/src/util.py b/src/utils/helpers.py similarity index 100% rename from src/util.py rename to src/utils/helpers.py From 104184d39ad0cdec38b04beb484266f8e4b4c7d1 Mon Sep 17 00:00:00 2001 From: Zeke Zhang Date: Mon, 28 Oct 2024 16:59:35 +1100 Subject: [PATCH 03/11] feat: re-structure code base - Improved code readability and modularity - More organized code structure - Improved & Simplified UI - Support multi-files summarization. - Remove youtube summarization for focus functionality. 
--- .idea/.gitignore | 8 - .idea/inspectionProfiles/Project_Default.xml | 38 --- .../inspectionProfiles/profiles_settings.xml | 6 - .idea/misc.xml | 4 - .idea/modules.xml | 8 - .idea/sumGPT.iml | 24 -- .idea/vcs.xml | 6 - RUN.bat | 2 +- SumGPT/__init__.py | 4 + SumGPT/app/__init__.py | 0 SumGPT/app/body_handler.py | 243 ++++++++++++++++++ SumGPT/app/page.py | 69 +++++ SumGPT/app/sidebar_handler.py | 100 +++++++ SumGPT/core/__init__.py | 0 SumGPT/core/llm.py | 51 ++++ SumGPT/core/tokenizer.py | 20 ++ SumGPT/datamodel/chunk.py | 22 ++ SumGPT/datamodel/llm_model.py | 31 +++ SumGPT/datamodel/llm_params.py | 13 + SumGPT/main.py | 16 ++ SumGPT/manifest.json | 17 ++ SumGPT/models.json | 40 +++ SumGPT/prompt.json | 22 ++ SumGPT/utils/__init__.py | 4 + SumGPT/utils/helpers.py | 35 +++ SumGPT/utils/io.py | 17 ++ requirements.txt | 21 +- src/Components/Info.py | 18 -- src/Components/StreamlitSetup.py | 36 --- src/Components/__init__.py | 4 - src/Components/sidebar.py | 187 -------------- src/Data/__init__.py | 3 - src/Data/caption_languages.py | 6 - src/GPT/__init__.py | 7 - src/GPT/bot.py | 48 ---- src/GPT/embeddings.py | 12 - src/GPT/generate.py | 52 ---- src/GPT/misc.py | 98 ------- src/GPT/param.py | 11 - src/Modules/Youtube.py | 97 ------- src/Modules/__init__.py | 3 - src/Modules/file_io.py | 99 ------- src/SumGPT.py | 178 ------------- src/manifest.json | 17 -- src/utils/__init__.py | 3 - src/utils/helpers.py | 237 ----------------- 46 files changed, 715 insertions(+), 1222 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/sumGPT.iml delete mode 100644 .idea/vcs.xml create mode 100644 SumGPT/__init__.py create mode 100644 SumGPT/app/__init__.py create mode 100644 SumGPT/app/body_handler.py create mode 100644 SumGPT/app/page.py create mode 100644 SumGPT/app/sidebar_handler.py create mode 100644 SumGPT/core/__init__.py create mode 100644 SumGPT/core/llm.py create mode 100644 SumGPT/core/tokenizer.py create mode 100644 SumGPT/datamodel/chunk.py create mode 100644 SumGPT/datamodel/llm_model.py create mode 100644 SumGPT/datamodel/llm_params.py create mode 100644 SumGPT/main.py create mode 100644 SumGPT/manifest.json create mode 100644 SumGPT/models.json create mode 100644 SumGPT/prompt.json create mode 100644 SumGPT/utils/__init__.py create mode 100644 SumGPT/utils/helpers.py create mode 100644 SumGPT/utils/io.py delete mode 100644 src/Components/Info.py delete mode 100644 src/Components/StreamlitSetup.py delete mode 100644 src/Components/__init__.py delete mode 100644 src/Components/sidebar.py delete mode 100644 src/Data/__init__.py delete mode 100644 src/Data/caption_languages.py delete mode 100644 src/GPT/__init__.py delete mode 100644 src/GPT/bot.py delete mode 100644 src/GPT/embeddings.py delete mode 100644 src/GPT/generate.py delete mode 100644 src/GPT/misc.py delete mode 100644 src/GPT/param.py delete mode 100644 src/Modules/Youtube.py delete mode 100644 src/Modules/__init__.py delete mode 100644 src/Modules/file_io.py delete mode 100644 src/SumGPT.py delete mode 100644 src/manifest.json delete mode 100644 src/utils/__init__.py delete mode 100644 src/utils/helpers.py diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# 
Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 5a85bff..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 0594372..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 474a048..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/sumGPT.iml b/.idea/sumGPT.iml deleted file mode 100644 index a29c46d..0000000 --- a/.idea/sumGPT.iml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/RUN.bat b/RUN.bat index 6afd014..7488c88 100644 --- a/RUN.bat +++ b/RUN.bat @@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" ( echo "Requirements file has not been modified. Skipping update." ) -streamlit run src/SumGPT.py \ No newline at end of file +streamlit run SumGPT/main.py \ No newline at end of file diff --git a/SumGPT/__init__.py b/SumGPT/__init__.py new file mode 100644 index 0000000..a576e4b --- /dev/null +++ b/SumGPT/__init__.py @@ -0,0 +1,4 @@ +import streamlit as st + +if "summaries" not in st.session_state: + st.session_state["summaries"] = [] diff --git a/SumGPT/app/__init__.py b/SumGPT/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py new file mode 100644 index 0000000..8becb93 --- /dev/null +++ b/SumGPT/app/body_handler.py @@ -0,0 +1,243 @@ +import asyncio +from typing import Any, Dict, List, Optional, Tuple + +import streamlit as st +import utils.io as io +from core.llm import LLM +from core.tokenizer import Tokenizer +from datamodel.chunk import Chunk +from datamodel.llm_params import LLMParams + + +class BodyHandler: + def file_uploader(self, type: List[str] = ["txt"]) -> List[Dict[str, str]]: + uploaded_files = st.file_uploader("Upload a file", type=type, accept_multiple_files=True) + files = [] + if uploaded_files is None: + st.stop() + st.warning("File is not uploaded.") + for file in uploaded_files: + text = io.read_to_string(file) + filename = file.name + files.append({"filename": filename, "text": text}) + return files + + def segment_text( + self, text: str, chunk_size: int, model: str, input_id: int + ) -> Tuple[List[Chunk], int]: + chunks: List[Chunk] = [] + tokenizer = Tokenizer(model) + total_tokens = tokenizer.tokenize(text) + count = 0 + for i in range(0, len(total_tokens), chunk_size): + chunk_tokens = total_tokens[i : i + chunk_size] + content = tokenizer.detokenize(chunk_tokens) + chunks.append(Chunk(count, content, len(chunk_tokens), input_id)) + count += 1 + return chunks, len(total_tokens) + + def _get_tokens(self, response_meta: 
Dict[str, Any]) -> Tuple[int, int, int]: + completion_tokens = response_meta.get("token_usage", {}).get("completion_tokens", 0) + prompt_tokens = response_meta.get("token_usage", {}).get("prompt_tokens", 0) + cached_tokens = ( + response_meta.get("token_usage", {}) + .get("prompt_tokens_details", {}) + .get("cached_tokens", 0) + ) + return completion_tokens, prompt_tokens, cached_tokens + + def generate( + self, + chunks: List[Chunk], + gpt_params: LLMParams, + role: str, + api_key: Optional[str], + ) -> None: + generate_button = st.button("Generate summary") + if generate_button: + if not api_key: + st.error("❌ Please enter your OpenAI API key in the sidebar.") + return + if not role: + st.error("❌ Please enter a role description in the sidebar.") + return + + st.session_state["summaries"] = [] # Initialize or reset summaries + + progress_text = st.empty() + progress_bar = st.progress(0) + total_chunks = len(chunks) + + # Group chunks by filename + filename_chunks = {} + for chunk in chunks: + if chunk.filename not in filename_chunks: + filename_chunks[chunk.filename] = [] + filename_chunks[chunk.filename].append(chunk) + + llm = LLM(api_key, gpt_params) + processed_chunks = 0 + + # Process chunks by filename + for filename, file_chunks in filename_chunks.items(): + expander = st.expander(f"{filename}") + for chunk in file_chunks: + processed_chunks += 1 + progress_text.write(f"Generating summaries {processed_chunks}/{total_chunks}") + progress_bar.progress(processed_chunks / total_chunks) + + summary = llm.generate(chunk.content, role) + with expander: + with st.chat_message("πŸ€–"): + st.write(summary.content) + completion_tokens, prompt_tokens, cached_tokens = self._get_tokens( + summary.response_metadata + ) + price = round( + llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 + ) + st.write( + f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" + ) + # Store the summary in session state + st.session_state["summaries"].append( + { + "filename": filename, + "content": summary.content, + "tokens": completion_tokens + prompt_tokens, + "price": price, + } + ) + + progress_text.write("βœ… All chunks processed!") + progress_bar.progress(1.0) + else: + # Check if summaries exist in session state and display them + if "summaries" in st.session_state: + # Group summaries by filename + filename_summaries = {} + for summary_data in st.session_state["summaries"]: + filename = summary_data["filename"] + if filename not in filename_summaries: + filename_summaries[filename] = [] + filename_summaries[filename].append(summary_data) + + # Display summaries grouped by filename + for filename, summaries in filename_summaries.items(): + with st.expander(f"{filename}"): + for summary_data in summaries: + with st.chat_message("πŸ€–"): + st.write(summary_data["content"]) + st.write( + f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" + ) + + def agenerate( + self, + chunks: List[Chunk], + gpt_params: LLMParams, + role: str, + api_key: Optional[str], + ) -> None: + generate_button = st.button("Generate summary") + if generate_button: + if not api_key: + st.error("❌ Please enter your OpenAI API key in the sidebar.") + return + if not role: + st.error("❌ Please enter a role description in the sidebar.") + return + + st.session_state["summaries"] = [] # Initialize or reset summaries + + async def process_chunks(): + llm = LLM(api_key, gpt_params) + total_chunks = len(chunks) + progress_text = st.empty() + progress_text.write(f"Generating summaries 
0/{total_chunks}") + total_price_text = st.empty() + total_price = 0 + + progress_bar = st.progress(0) + completed_chunks = 0 + + # Sort chunks by chunk.id + sorted_chunks = sorted(chunks, key=lambda c: c.id) + + # Group chunks by filename + filename_chunks = {} + for chunk in sorted_chunks: + if chunk.filename not in filename_chunks: + filename_chunks[chunk.filename] = [] + filename_chunks[chunk.filename].append(chunk) + + # Create expanders for each file + expanders = { + filename: st.expander(f"{filename}") for filename in filename_chunks.keys() + } + + # Create tasks for all chunks (sorted by chunk.id) + tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks] + + # Run all tasks and get the results in the same order + summaries = await asyncio.gather(*tasks) + + # Process the results in order + for summary, current_chunk in zip(summaries, sorted_chunks): + completed_chunks += 1 + progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}") + progress_bar.progress(completed_chunks / total_chunks) + + with expanders[current_chunk.filename]: + with st.chat_message("ai"): + st.write(summary.content) + completion_tokens, prompt_tokens, cached_tokens = self._get_tokens( + summary.response_metadata + ) + price = round( + llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 + ) + st.write( + f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" + ) + total_price += price + + # Store the summary in session state + st.session_state["summaries"].append( + { + "filename": current_chunk.filename, + "content": summary.content, + "tokens": completion_tokens + prompt_tokens, + "price": price, + } + ) + + progress_text.write("βœ… All chunks processed!") + progress_bar.progress(1.0) + total_price_text.write(f"Total price: `${round(total_price, 6)}`") + + # Run the async processing + asyncio.run(process_chunks()) + else: + # Check if summaries exist in session state and display them + if "summaries" in st.session_state: + total_price = 0 + # Group summaries by filename + filename_summaries = {} + for summary_data in st.session_state["summaries"]: + filename = summary_data["filename"] + if filename not in filename_summaries: + filename_summaries[filename] = [] + filename_summaries[filename].append(summary_data) + + # Display summaries grouped by filename + for filename, summaries in filename_summaries.items(): + with st.expander(f"{filename}"): + for summary_data in summaries: + with st.chat_message("ai"): + st.write(summary_data["content"]) + st.write( + f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" + ) + total_price += summary_data["price"] + st.write(f"Total price: `${round(total_price, 6)}`") diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py new file mode 100644 index 0000000..aed232b --- /dev/null +++ b/SumGPT/app/page.py @@ -0,0 +1,69 @@ +from typing import Dict, List, Optional + +import streamlit as st +from datamodel.llm_params import LLMParams + +from app.body_handler import BodyHandler +from app.sidebar_handler import SidebarHandler + + +class Page: + def __init__(self): + self.chunk_size: Optional[int] = None + self.role: Optional[str] = None + self.api_key: Optional[str] = None + self.llm_params: Optional[LLMParams] = None + + def draw_header(self, version): + st.title(f"πŸ“ SumGPT {version}") + st.markdown("##### Summarize your text with OpenAI's API") + st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)") + st.warning( + "Please [report any 
bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo." + ) + + def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str]]) -> None: + with st.sidebar: + sb = SidebarHandler() + sb.header() + sb.import_config() + self.api_key = sb.api_key_entry() + with st.expander("Role settings"): + self.role = sb.role_settings_panel() + with st.expander("Configuration"): + self.llm_params, self.chunk_size = sb.config_control_panel(models_data) + sb.export_config() + sb.footer(manifest) + + def draw_body(self) -> None: + if not self.chunk_size: + st.error("❌ Please set the chunk size in the sidebar.") + return + if not self.llm_params: + st.error("❌ Please set the model in the sidebar.") + return + if not self.role: + st.error("❌ Please set the role in the sidebar.") + return + + body = BodyHandler() + texts = body.file_uploader(["txt", "md"]) + + total_chunks = [] + filenames = [] + + for idx, text in enumerate(texts): + filename = text["filename"] + filenames.append(filename) + chunks, total_token_size = body.segment_text( + text["text"], self.chunk_size, self.llm_params.model.name, idx + ) + with st.expander(f"`{filename}` **(chunks: {len(chunks)})**"): + for chunk in chunks: + chunk.set_filename_from_list(filenames) + st.write([chunk.to_dict() for chunk in chunks]) + st.write(f"Tokens: `{total_token_size}`") + + total_chunks.extend(chunks) + + body.agenerate(total_chunks, self.llm_params, self.role, self.api_key) diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py new file mode 100644 index 0000000..378e681 --- /dev/null +++ b/SumGPT/app/sidebar_handler.py @@ -0,0 +1,100 @@ +from typing import Any, Dict, List, Tuple + +import streamlit as st +import utils.helpers as helpers +from datamodel.llm_params import LLMModel, LLMParams + + +class SidebarHandler: + def __init__(self): + self.config = {} + self.chunk_size = None + + def header(self): + st.title("SumGPT") + st.markdown("Select the model and parameters for summarization.") + + def api_key_entry(self) -> str: + st.markdown("### API Key") + return st.text_input("Enter your OpenAI API key", type="password") + + def role_settings_panel(self, height=300) -> str: + language = st.selectbox( + "Role language", + ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"], + ) + role = st.text_area( + "Role settings", + self.config.get( + "role", + f"Write a detailed summary in perfect {language} that is concise, clear and coherent while capturing the main ideas the text. " + "The summary should be well-structured and free of grammatical errors.\n\n" + "The summary is to be written in markdown format, with a heading (###) that encapsulate the core concept of the content. It should be concise and specific. 
avoid generic headings like 'Summary' or 'Introduction'.", + ), + height=height, + ) + if role is None: + st.stop() + st.warning("Role settings are not set.") + return role + + def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]: + model_names = helpers.extract_values(models_data, "model") + model_name = st.selectbox("Model", model_names, self.config.get("model_index", 0)) + model = LLMModel.construct_from_dict(self._get_model_dict(models_data, model_name)) + + _param = self._construct_param(models_data, model_name) + + chunk_size = st.number_input( + "Chunk size (tokens)", + 32, + _param["context_window"], + self.config.get("chunk_size", 2048), + step=1024, + ) + max_tokens: int = st.number_input( + "Max output (tokens)", + 32, + _param["max_output_tokens"], + self.config.get("max_tokens", 512), + ) + temperature: float = st.slider("Temperature", 0.0, 1.0, self.config.get("temperature", 0.7)) + return ( + LLMParams( + model=model, + max_tokens=max_tokens, + temperature=temperature, + ), + chunk_size, + ) + + def _get_model_dict(self, models_data, selected_model) -> Dict[str, Any]: + model_index = helpers.extract_dict_index(models_data, "model", selected_model) + return models_data[model_index] + + def _construct_param(self, models_data, selected_model): + model_dict = self._get_model_dict(models_data, selected_model) + param = { + "max_output_tokens": model_dict["max_output_tokens"], + "context_window": model_dict["context_window"], + } + return param + + def import_config(self): + st.markdown("### Import Configuration") + if st.button("Import configuration"): + raise NotImplementedError # TODO: implement + + def export_config(self): + st.markdown("### Export Configuration") + if st.button("Export configuration"): + raise NotImplementedError # TODO: implement + + def footer(self, data: Dict[str, Any]): + st.markdown("---") + st.markdown("### SumGPT") + st.markdown(f"Version: `{data.get('version')}`") + st.markdown(f"Author: {data.get('author')}") + st.markdown(f"[Report a bug]({data['bugs']['url']})") + st.markdown(f"[GitHub repo]({data['repository']['url']})") + st.markdown(f"License: [{data['license']['type']}]({data['license']['url']})") diff --git a/SumGPT/core/__init__.py b/SumGPT/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/core/llm.py b/SumGPT/core/llm.py new file mode 100644 index 0000000..06b7421 --- /dev/null +++ b/SumGPT/core/llm.py @@ -0,0 +1,51 @@ +from datamodel.llm_params import LLMParams +from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage +from langchain_openai import ChatOpenAI +from pydantic.types import SecretStr + + +class LLM: + def __init__(self, api_key: str, gpt_params: LLMParams): + self.api_key: str = api_key + self.llm_params: LLMParams = gpt_params + self.model: ChatOpenAI = self._set_llm() + + def _set_llm(self) -> ChatOpenAI: + return ChatOpenAI( + api_key=SecretStr(self.api_key), + model=self.llm_params.model.name, + max_tokens=self.llm_params.max_tokens, + temperature=self.llm_params.temperature, + ) + + def generate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return _self.model.invoke(messages) + + async def agenerate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return await _self.model.ainvoke(messages) + + def Calc_price( + self, + input_tokens: int, + output_tokens: 
int, + cached_tokens: int = 0, + scale_factor: int = 1000000, + ) -> float: + pricing = self.llm_params.model.pricing + if cached_tokens != 0 and pricing.cached is not None: + input_tokens -= cached_tokens + return ( + input_tokens * pricing.input + + output_tokens * pricing.output + + cached_tokens * pricing.cached + ) / scale_factor + + return (input_tokens * pricing.input + output_tokens * pricing.output) / scale_factor diff --git a/SumGPT/core/tokenizer.py b/SumGPT/core/tokenizer.py new file mode 100644 index 0000000..d0e7d12 --- /dev/null +++ b/SumGPT/core/tokenizer.py @@ -0,0 +1,20 @@ +from typing import List + +import tiktoken + + +class Tokenizer: + def __init__(self, model: str): + self.tokenizer = tiktoken.encoding_for_model(model) + + def tokenize(self, text: str) -> List[int]: + return self.tokenizer.encode(text) + + def detokenize(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + def detokenize_single(self, tokens: List[int]) -> List[str]: + results = [] + for token in tokens: + results.append(self.tokenizer.decode_single_token_bytes(token).decode("utf-8")) + return results diff --git a/SumGPT/datamodel/chunk.py b/SumGPT/datamodel/chunk.py new file mode 100644 index 0000000..64666d2 --- /dev/null +++ b/SumGPT/datamodel/chunk.py @@ -0,0 +1,22 @@ +class Chunk: + def __init__(self, id: int, content: str, tokens: int, input_id: int): + self.id = id + self.content = content + self.tokens = tokens + self.input_id = input_id + self.filename = None + + def __str__(self) -> str: + return f"Chunk(content={self.content}, tokens={self.tokens}, input_id={self.input_id})" + + def set_filename_from_list(self, filenames: list[str]) -> str: + self.filename = filenames[self.input_id] + return self.filename + + def to_dict(self) -> dict: + return { + "id": self.id, + "content": self.content, + "tokens": self.tokens, + "input_id": self.input_id, + } diff --git a/SumGPT/datamodel/llm_model.py b/SumGPT/datamodel/llm_model.py new file mode 100644 index 0000000..3c714c1 --- /dev/null +++ b/SumGPT/datamodel/llm_model.py @@ -0,0 +1,31 @@ +from typing import Optional + + +class LLMModelPricing: + def __init__(self, input: int, output: int, cached: Optional[int] = None): + self.input = input + self.output = output + self.cached = cached + + +class LLMModel: + def __init__( + self, name: str, context_window: int, max_output_tokens: int, pricing: LLMModelPricing + ): + self.name = name + self.context_window = context_window + self.max_output_tokens = max_output_tokens + self.pricing = pricing + + @staticmethod + def construct_from_dict(data: dict) -> "LLMModel": + pricing = LLMModelPricing(data["pricing"]["input"], data["pricing"]["output"]) + if "cached" in data["pricing"]: + pricing.cached = data["pricing"]["cached"] + + return LLMModel( + name=data["model"], + context_window=data["context_window"], + max_output_tokens=data["max_output_tokens"], + pricing=pricing, + ) diff --git a/SumGPT/datamodel/llm_params.py b/SumGPT/datamodel/llm_params.py new file mode 100644 index 0000000..9de6306 --- /dev/null +++ b/SumGPT/datamodel/llm_params.py @@ -0,0 +1,13 @@ +from datamodel.llm_model import LLMModel, LLMModelPricing # noqa: F401 + + +class LLMParams: + def __init__( + self, + model: LLMModel, + max_tokens=2048, + temperature=0.7, + ): + self.model: LLMModel = model + self.max_tokens: int = max_tokens + self.temperature: float = temperature diff --git a/SumGPT/main.py b/SumGPT/main.py new file mode 100644 index 0000000..f33c425 --- /dev/null +++ b/SumGPT/main.py @@ -0,0 +1,16 @@ +from 
app.page import Page +from utils import io + + +def main(): + manifest = io.read_json_file("SumGPT/manifest.json") + models = io.read_json_file("SumGPT/models.json") + + pg = Page() + pg.draw_header(manifest["version"]) + pg.draw_sidebar(manifest, models) + pg.draw_body() + + +if __name__ == "__main__": + main() diff --git a/SumGPT/manifest.json b/SumGPT/manifest.json new file mode 100644 index 0000000..7a33f79 --- /dev/null +++ b/SumGPT/manifest.json @@ -0,0 +1,17 @@ +{ + "name": "SumGPT", + "version": "2.0.0", + "license": { + "type": "MIT", + "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE" + }, + "author": "Zeke Zhang", + "homepage": "https://github.com/sean1832/SumGPT", + "repository": { + "type": "git", + "url": "https://github.com/sean1832/SumGPT" + }, + "bugs": { + "url": "https://github.com/sean1832/SumGPT/issues" + } + } \ No newline at end of file diff --git a/SumGPT/models.json b/SumGPT/models.json new file mode 100644 index 0000000..9a3cd16 --- /dev/null +++ b/SumGPT/models.json @@ -0,0 +1,40 @@ +[ + { + "model": "gpt-4o-mini", + "context_window": 128000, + "max_output_tokens": 16384, + "pricing": { + "input": 0.15, + "output": 0.6, + "cached": 0.075 + } + }, + { + "model": "gpt-4o", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 2.5, + "output": 10, + "cached": 1.25 + } + }, + { + "model": "gpt-4-turbo", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 10, + "output": 30 + } + }, + { + "model": "gpt-3.5-turbo", + "context_window": 16385, + "max_output_tokens": 4096, + "pricing": { + "input": 0.5, + "output": 1.5 + } + } +] \ No newline at end of file diff --git a/SumGPT/prompt.json b/SumGPT/prompt.json new file mode 100644 index 0000000..f76ec60 --- /dev/null +++ b/SumGPT/prompt.json @@ -0,0 +1,22 @@ +[ + { + "type": "recursive", + "legacy": false, + "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + }, + { + "type": "final", + "legacy": false, + "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information. Structured markdown summary with headings in perfect [LANGUAGE] (####): ", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + } +] \ No newline at end of file diff --git a/SumGPT/utils/__init__.py b/SumGPT/utils/__init__.py new file mode 100644 index 0000000..7979e22 --- /dev/null +++ b/SumGPT/utils/__init__.py @@ -0,0 +1,4 @@ +import utils.helpers as helpers +import utils.io as io + +__all__ = ["helpers", "io"] diff --git a/SumGPT/utils/helpers.py b/SumGPT/utils/helpers.py new file mode 100644 index 0000000..25afa4f --- /dev/null +++ b/SumGPT/utils/helpers.py @@ -0,0 +1,35 @@ +def extract_values(dicts, key, parent_key=None): + """ + Extracts values from a list of dictionaries based on a specified key. + If the key is nested, a parent key can be specified. 
+ + :param dicts: List of dictionaries to query + :param key: The key for which values are to be extracted + :param parent_key: Optional parent key if the key is nested within another dictionary + :return: List of values corresponding to the specified key + """ + values = [] + for dict in dicts: + if parent_key: + # Access the nested dictionary and then the key if parent_key is specified + if parent_key in dict and key in dict[parent_key]: + values.append(dict[parent_key][key]) + else: + # Access the key directly if there is no parent_key + if key in dict: + values.append(dict[key]) + return values + +def extract_dict_index(dicts, key, value): + """ + Extracts the index of a dictionary in a list of dictionaries based on a specified key-value pair. + + :param dicts: List of dictionaries to query + :param key: The key to search for + :param value: The value to search for + :return: Index of the dictionary containing the specified key-value pair + """ + for i, dict in enumerate(dicts): + if key in dict and dict[key] == value: + return i + return None \ No newline at end of file diff --git a/SumGPT/utils/io.py b/SumGPT/utils/io.py new file mode 100644 index 0000000..fef6d69 --- /dev/null +++ b/SumGPT/utils/io.py @@ -0,0 +1,17 @@ +import json +from io import StringIO + + +def read_json_file(file): + with open(file, "r") as f: + return json.load(f) + + +def write_json_file(file, data: dict): + with open(file, "w") as f: + json.dump(data, f, indent=4) + + +def read_to_string(file): + stringio = StringIO(file.getvalue().decode("utf-8")) + return stringio.read() diff --git a/requirements.txt b/requirements.txt index 9770e95..8c8c682 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,14 @@ docx==0.2.4 -python_docx==0.8.11 -langchain==0.0.123 +python_docx==1.1.2 langdetect==1.0.9 -numpy==1.24.2 -openai==0.27.2 -pydub==0.25.1 PyPDF4==1.27.0 -pytube==12.1.3 -streamlit==1.20.0 -streamlit_toggle_switch==1.0.2 -tiktoken==0.3.1 -requests==2.29.0 -youtube_transcript_api==0.6.0 +tiktoken==0.8.0 +requests==2.32.3 + +# langchain +langchain==0.3.4 +langchain-openai==0.2.3 + +# streamlit +streamlit==1.39.0 diff --git a/src/Components/Info.py b/src/Components/Info.py deleted file mode 100644 index 72f324f..0000000 --- a/src/Components/Info.py +++ /dev/null @@ -1,18 +0,0 @@ -import streamlit as st -import Modules.file_io as file_io - - -def info(): - info_panel = st.container() - - manifest = 'src/manifest.json' - st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest) - - with info_panel: - st.markdown('---') - st.markdown(f"# {manifest_data['name']}") - st.markdown(f"Version: `{manifest_data['version']}`") - st.markdown(f"Author: {manifest_data['author']}") - st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})") - st.markdown(f"[GitHub repo]({manifest_data['homepage']})") - st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})") \ No newline at end of file diff --git a/src/Components/StreamlitSetup.py b/src/Components/StreamlitSetup.py deleted file mode 100644 index b57f3ce..0000000 --- a/src/Components/StreamlitSetup.py +++ /dev/null @@ -1,36 +0,0 @@ -import streamlit as st -import Data.caption_languages as data -import Modules.file_io as file_io - -def setup(): - st.set_page_config(page_title="SumGPT", page_icon="πŸ“", layout="wide") - - if not st.session_state.get('OPENAI_API_KEY'): - st.session_state['OPENAI_API_KEY'] = None - - if not st.session_state.get('OPENAI_PERSONA_REC'): - st.session_state['OPENAI_PERSONA_REC'] = None 
- - if not st.session_state.get('OPENAI_PERSONA_SUM'): - st.session_state['OPENAI_PERSONA_SUM'] = None - - if not st.session_state.get('CHUNK_SIZE'): - st.session_state['CHUNK_SIZE'] = None - - if not st.session_state.get('OPENAI_PARAMS'): - st.session_state['OPENAI_PARAMS'] = None - - if not st.session_state.get('DELAY'): - st.session_state['DELAY'] = 0 - - if not st.session_state.get('FINAL_SUMMARY_MODE'): - st.session_state['FINAL_SUMMARY_MODE'] = False - - if not st.session_state.get('CAPTION_LANGUAGES'): - st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages - - if not st.session_state.get('PREVIOUS_RESULTS'): - st.session_state['PREVIOUS_RESULTS'] = None - - if not st.session_state.get('MANIFEST'): - st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json") \ No newline at end of file diff --git a/src/Components/__init__.py b/src/Components/__init__.py deleted file mode 100644 index 9391db9..0000000 --- a/src/Components/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from Components import sidebar -from Components import StreamlitSetup -from Components import Info -__all__ = ['sidebar', 'StreamlitSetup', 'Info'] \ No newline at end of file diff --git a/src/Components/sidebar.py b/src/Components/sidebar.py deleted file mode 100644 index b5b3d41..0000000 --- a/src/Components/sidebar.py +++ /dev/null @@ -1,187 +0,0 @@ -import streamlit as st -import GPT -import Modules.file_io as file_io -from streamlit_toggle import st_toggle_switch -import Components -from typing import Any, Dict, List, Tuple, Union -import json - - -def set_openai_api_key(api_key: str): - st.session_state["OPENAI_API_KEY"] = api_key - - -def set_openai_persona(persona_rec: str, persona_sum: str): - st.session_state["OPENAI_PERSONA_REC"] = persona_rec - st.session_state["OPENAI_PERSONA_SUM"] = persona_sum - - -def set_param(params: GPT.param): - st.session_state["OPENAI_PARAMS"] = params - - -def set_chunk_size(size: int): - st.session_state['CHUNK_SIZE'] = size - - -def set_delay(time: int): - st.session_state['DELAY'] = time - - -def set_final_summary_mode(mode: bool): - st.session_state['FINAL_SUMMARY_MODE'] = mode - - -def _set_config(config_file, key: str, default_value): - if config_file: - return file_io.read_json_upload(config_file, key) - else: - return default_value - -def _set_language(language: str): - st.session_state['OUTPUT_LANGUAGE'] = language - -def _set_legacy(enable: bool): - st.session_state['LEGACY'] = enable -def _legacy(enable: bool, legacy, experimental): - if not enable: - return experimental - else: - return legacy -def _extract_prompt(json_data: List[Dict[str,Union[bool, str]]], target_type: str, target_legacy: bool, language: str = "English") -> str | None: - for item in json_data: - if item["type"] == target_type and item["legacy"] == target_legacy: - prompt = item["prompt"] - new_prompt = prompt.replace("[LANGUAGE]", language) - return new_prompt - return None - -def sidebar(): - with st.sidebar: - st.markdown("## How to use\n" - "1. πŸ”‘ Enter your [OpenAI API key](https://beta.openai.com/account/api-keys)\n" - "2. πŸ“ upload your file\n" - "3. 
πŸƒ Run\n" - "---") - - config_file = st.file_uploader("πŸ“ Import Configs", type=['json']) - - api_input = st.text_input(label="πŸ”‘ OpenAI API Key", - placeholder="Enter your OpenAI API key (sk-...)", - type="password", - help="You can get your API key from https://beta.openai.com/account/api-keys", - value=_set_config(config_file, "OPENAI_API_KEY", "")) - - enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False)) - enable_final_summary = st_toggle_switch(label="Enable Final Summary", - default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False)) - if enable_final_summary: - set_final_summary_mode(True) - if st.session_state['FINAL_SUMMARY_MODE'] != enable_final_summary: - set_final_summary_mode(enable_final_summary) - - with st.expander('πŸ€– Bot Persona'): - language_options = ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German'] - language_index = language_options.index(_set_config(config_file, "LANGUAGE", 'English')) - language = st.selectbox('Language', options=language_options, index=language_index) - _set_language(language) - - prompts = file_io.read_json("resources/prompt.json") - - persona_rec_legacy = _extract_prompt(prompts, "recursive", True, language) - persona_rec = _extract_prompt(prompts, "recursive", False, language) - persona_rec = st.text_area('Bot Persona Recursive', - value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, persona_rec_legacy, persona_rec)), - help='System message is a pre-defined message used to instruct the assistant at the ' - 'beginning of a conversation. iterating and ' - 'experimenting with potential improvements can help to generate better outputs.' - 'Make sure to use casual language.', - height=250) - if enable_final_summary: - persona_sum_legacy = _extract_prompt(prompts, "final", True, language) - persona_sum = _extract_prompt(prompts, "final", False, language) - - persona_sum = st.text_area('Bot Persona Total Sum', - value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, persona_sum_legacy, persona_sum)), - help='This is a pre-defined message for total summarization that is used to' - 'instruct the assistant at the beginning of a conversation. 
', - height=300) - else: - persona_sum = "" - - with st.expander('πŸ”₯ Advanced Options'): - model_options = ['gpt-3.5-turbo','gpt-3.5-turbo-16k', 'gpt-4'] - model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo')) - model = st.selectbox("Model", options=model_options, index=model_index) - - if model == 'gpt-4': - max_chunk = 4000 - elif model == 'gpt-3.5-turbo-16k': - max_chunk = 16000 - else: - max_chunk = 2500 - chunk_size = st.slider('Chunk Size (word count)', min_value=0, max_value=max_chunk, step=20, - value=_set_config(config_file, "CHUNK_SIZE", 800)) - max_tokens_rec = st.slider('Max Tokens - Recursive Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_REC", 250)) - if enable_final_summary: - max_tokens_final = st.slider('Max Tokens - Final Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_FINAL", 650)) - else: - max_tokens_final = 0 - temperature = st.slider('Temperature', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TEMPERATURE", 0.7)) - top_p = st.slider('Top P', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TOP_P", 1.0)) - frequency_penalty = st.slider('Frequency Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "FREQUENCY_PENALTY", 0.0)) - presence_penalty = st.slider('Presence Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "PRESENCE_PENALTY", 0.0)) - if st_toggle_switch(label="Delay (free openAI API user)", - default_value=_set_config(config_file, "ENABLE_DELAY", False)): - delay = st.slider('Delay (seconds)', min_value=0, max_value=60, step=1, - value=_set_config(config_file, "DELAY_TIME", 1)) - else: - delay = 0 - param = GPT.param.gpt_param( - model=model, - max_tokens_final=max_tokens_final, - max_tokens_rec=max_tokens_rec, - temperature=temperature, - top_p=top_p, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty - ) - - st.download_button(label="πŸ“₯ Export Configs", - data=json.dumps({ - "OPENAI_API_KEY": api_input, - "FINAL_SUMMARY_MODE": enable_final_summary, - "OPENAI_PERSONA_REC": persona_rec, - "OPENAI_PERSONA_SUM": persona_sum, - "CHUNK_SIZE": chunk_size, - "MAX_TOKENS_REC": max_tokens_rec, - "MAX_TOKENS_FINAL": max_tokens_final, - "TEMPERATURE": temperature, - "TOP_P": top_p, - "FREQUENCY_PENALTY": frequency_penalty, - "PRESENCE_PENALTY": presence_penalty, - "MODEL": model, - "ENABLE_DELAY": delay > 0, - "DELAY_TIME": delay, - "LANGUAGE": language, - "LEGACY": enable_legacy - }, indent=4), - file_name="configs.json") - Components.Info.info() - - if api_input: - set_openai_api_key(api_input) - - if persona_rec: - set_openai_persona(persona_rec, persona_sum) - - set_chunk_size(chunk_size) - set_param(param) - set_delay(delay) - _set_legacy(enable_legacy) \ No newline at end of file diff --git a/src/Data/__init__.py b/src/Data/__init__.py deleted file mode 100644 index 4de9124..0000000 --- a/src/Data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Data import caption_languages - -__all__ = ['caption_languages'] \ No newline at end of file diff --git a/src/Data/caption_languages.py b/src/Data/caption_languages.py deleted file mode 100644 index acec65e..0000000 --- a/src/Data/caption_languages.py +++ /dev/null @@ -1,6 +0,0 @@ -languages = [ - 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th' -] - -auto_languages = ['a.' 
+ _language for _language in languages] - diff --git a/src/GPT/__init__.py b/src/GPT/__init__.py deleted file mode 100644 index 0bcd76d..0000000 --- a/src/GPT/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from GPT import misc -from GPT import embeddings -from GPT import bot -from GPT import param -from GPT import generate - -__all__ = ['misc', 'embeddings', 'bot', 'param', 'generate'] \ No newline at end of file diff --git a/src/GPT/bot.py b/src/GPT/bot.py deleted file mode 100644 index ce8dd86..0000000 --- a/src/GPT/bot.py +++ /dev/null @@ -1,48 +0,0 @@ -import openai -from typing import Any, Dict, List, Tuple, Union - - -class OpenAIChatBot: - """A class to interact with the OpenAI API.""" - - def __init__(self, api_key: str, persona: str, model: str, max_tokens: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - openai.api_key = api_key - self.persona = persona - self.model = model - self.max_tokens = max_tokens - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty - - def chat_stream(self, prompt: str) -> openai.api_resources.chat_completion.ChatCompletion: - """Returns the streamed response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - stream=True, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": prompt} - ]) - return completions - - def chat(self, prompt: str) -> Tuple[str, str]: - """Returns the response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": f"{self.persona} '{prompt}'"} - ]) - return completions['choices'][0]['message']['content'], completions['choices'][0]['finish_reason'] diff --git a/src/GPT/embeddings.py b/src/GPT/embeddings.py deleted file mode 100644 index 3e6cb50..0000000 --- a/src/GPT/embeddings.py +++ /dev/null @@ -1,12 +0,0 @@ -import openai - - -class openAIEmbeddings: - def __init__(self, api_key: str): - openai.api_key = api_key - - def embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> float: - """Returns the embedding vector of a string.""" - response = openai.Embedding.create(input=content, engine=engine) - vector = response['data'][0]['embedding'] - return vector diff --git a/src/GPT/generate.py b/src/GPT/generate.py deleted file mode 100644 index 627faf6..0000000 --- a/src/GPT/generate.py +++ /dev/null @@ -1,52 +0,0 @@ -import GPT.bot -import streamlit as st -import GPT.param -from typing import Any, Dict, List, Tuple, Union - - -def get_answer_stream(content: str): - """Returns a stream of responses from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - previous_char = '' - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - st.session_state["OPENAI_PERSONA"], - params.model, - params.max_tokens_rec, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - responses = bot.chat_stream(content) - response_panel = st.empty() - for response_json in responses: - choice = response_json['choices'][0] 
- if choice['finish_reason'] == 'stop': - break - - # error handling - if choice['finish_reason'] == 'length': - st.warning('⚠️Result cut off due to length. Consider increasing the max tokens parameter.') - break - - delta = choice['delta'] - if 'role' in delta or delta == {}: - char = '' - else: - char = delta['content'] - answer = previous_char + char - response_panel.info(answer) - - -def get_answer(content: str, max_tokens, persona: str) -> Tuple[str, str]: - """Returns a response from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - persona, - params.model, - max_tokens, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - response, finish_reason = bot.chat(content) - return response, finish_reason diff --git a/src/GPT/misc.py b/src/GPT/misc.py deleted file mode 100644 index b93481c..0000000 --- a/src/GPT/misc.py +++ /dev/null @@ -1,98 +0,0 @@ -import openai -from langchain.llms import OpenAI -import os -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -def validate_api_key(api_key: str) -> bool: - """Validates the OpenAI API key by trying to create a completion.""" - openai.api_key = api_key - try: - openai.ChatCompletion.create( - model="gpt-3.5-turbo", - max_tokens=1, - messages=[ - {"role": "user", "content": "Hello!"} - ] - ) - return True - except openai.error.AuthenticationError: - return False - - -def predict_token(param, chunks) -> Dict[str, int]: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - prompt_token_total = 0 - completion_token_total = 0 - for chunk in chunks: - prompt_token = llm.get_num_tokens(chunk['content']) - prompt_token_total += prompt_token - completion_token_total += param.max_tokens_rec - - if st.session_state['FINAL_SUMMARY_MODE']: - completion_token_total += param.max_tokens_final - total_token = prompt_token_total + completion_token_total - token = {'total': total_token, - 'prompt': prompt_token_total, - 'completion': completion_token_total} - - return token - else: - return {'total': 0, 'prompt': 0, 'completion': 0} - - -def predict_token_single(chunk: Dict[str, Union[str, float]] | str, max_tokens: int = None) -> int: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - if isinstance(chunk, str): - chunk_content = chunk - else: - chunk_content = chunk['content'] - chunk_token = llm.get_num_tokens(chunk_content) - if max_tokens is not None: - chunk_token += max_tokens - - return chunk_token - else: - return 0 - - -def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[bool, str]]: - """Checks if the number of tokens used has exceeded the limit.""" - - # check recursive chunks tokens - rec_chunks_token = [] - for chunk in chunks: - chunk_token = predict_token_single(chunk, param.max_tokens_rec) - rec_chunks_token.append(chunk_token) - - - # check final chunks tokens - final_prompt_token = len(chunks) * param.max_tokens_rec - final_completion_token = param.max_tokens_final - final_chunks_token = final_prompt_token + final_completion_token - - # evaluate - if max(rec_chunks_token) > max_token: - return {'exceeded': True, - 'reason': 'recursive', - 'message': f"**[ Recursive summary ]** tokens exceeded. 
Max tokens allowed: {max_token}. Tokens used: {max(rec_chunks_token)}\n" - f"(Prompt: {max(rec_chunks_token) - param.max_tokens_rec}, " - f"Completion: {param.max_tokens_rec})"} - - elif final_chunks_token > max_token and st.session_state['FINAL_SUMMARY_MODE']: - return {'exceeded': True, - 'reason': 'final', - 'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n" - f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"} - - else: - return {'exceeded': False, - 'reason': '', - 'message': ''} diff --git a/src/GPT/param.py b/src/GPT/param.py deleted file mode 100644 index 866f112..0000000 --- a/src/GPT/param.py +++ /dev/null @@ -1,11 +0,0 @@ - -class gpt_param: - def __init__(self, model: str, max_tokens_final: int, max_tokens_rec: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - self.model = model - self.max_tokens_rec = max_tokens_rec - self.max_tokens_final = max_tokens_final - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty diff --git a/src/Modules/Youtube.py b/src/Modules/Youtube.py deleted file mode 100644 index f399cb3..0000000 --- a/src/Modules/Youtube.py +++ /dev/null @@ -1,97 +0,0 @@ -import requests -import re -from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -manifest = st.session_state["MANIFEST"] -def _error_report_msg(youtube_url): - return f"Please create an issue on [GitHub]({manifest['bugs']['url']}). " \ - f"Please include the YouTube URL ({youtube_url}), version number ({manifest['version']}) " \ - f"and all necessary information to replicate the error. " \ - f"**Before creating a new issue, please check if the problem has already been reported.**" - -def _extract_video_id_from_url(url): - video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/)([^?&"\'>]+)' - match = re.search(video_id_pattern, url) - if match: - return match.group(1) - else: - raise ValueError("Invalid YouTube URL") - -def get_video_title(youtube_url): - video_id = _extract_video_id_from_url(youtube_url) - url = f'https://www.youtube.com/watch?v={video_id}' - response = requests.get(url) - title_pattern = r'(.+?) 
- YouTube<\/title>' - match = re.search(title_pattern, response.text) - if match: - title = match.group(1) - return title - else: - return None - -def get_available_subtitle_languages(video_id): - try: - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) - languages = [transcript.language_code for transcript in transcript_list] - return languages - except Exception as e: - print(f"Error fetching available subtitle languages: {e}") - return [] - -def get_video_captions(youtube_url, languages): - video_id = _extract_video_id_from_url(youtube_url) - simplified_url = f'https://www.youtube.com/watch?v={video_id}' - - available_language = get_available_subtitle_languages(video_id) - - if not any(lang in languages for lang in available_language) and available_language != []: - print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.") - st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url)) - st.stop() - - for language in languages: - try: - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language]) - captions = "" - for item in transcript: - captions += item['text'] + "\n" - return captions - - except NoTranscriptFound as e: - if language == languages[-1]: - print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}") - st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n' - f'languages = {available_language}\n\n' - f'language list = {languages}\n\n' - + _error_report_msg(simplified_url)) - st.stop() - else: - continue - - except TranscriptsDisabled: - print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}") - st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---' - f'\n**Instruction:**\n\n' - f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n' - f"2. If you are confident that subtitles are available in the video but could not be retrieved, " - + _error_report_msg(simplified_url)) - st.stop() - raise TranscriptsDisabled - - except Exception as e: - print(e) - st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. 
\n\n' - f'{_error_report_msg(simplified_url)}' - f'\n\nError: \n\n---\n\n{e}') - st.stop() - break - -@st.cache_data(show_spinner=False) -def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]: - """Extracts the transcript from a YouTube video.""" - transcript = get_video_captions(url, lang_code) - title = get_video_title(url) - return transcript, title diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py deleted file mode 100644 index 412ace4..0000000 --- a/src/Modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Modules import file_io - -__all__ = ['file_io'] \ No newline at end of file diff --git a/src/Modules/file_io.py b/src/Modules/file_io.py deleted file mode 100644 index 0f214e8..0000000 --- a/src/Modules/file_io.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -import PyPDF4 -import docx -from typing import Any, Dict, List, Tuple, Union -from pydub import AudioSegment -import math -import json -import streamlit as st - - - -@st.cache_data() -def read_json(file, key: str = None) -> Any: - """Reads a json file and returns the value of a key.""" - with open(file, "r") as f: - data = json.load(f) - if key and isinstance(data, dict): - return data[key] - elif key and isinstance(data, list): - return [d[key] for d in data] - else: - return data - - -@st.cache_data() -def read_json_upload(file, key: str) -> Any: - """Reads a json file and returns the value of a key.""" - if not isinstance(file, str): - f = file.getvalue().decode("utf-8") - data = json.loads(f) - return data[key] - - -@st.cache_data() -def read_txt(file, encoding: str = "utf-8") -> str: - """Reads a text file.""" - return file.read().decode(encoding) - - -@st.cache_data() -def read_pdf(file) -> List[str]: - """Reads a pdf file.""" - pdfReader = PyPDF4.PdfFileReader(file, strict=False) - texts = [] - for page in range(pdfReader.numPages): - text = pdfReader.getPage(page).extractText() - # Merge hyphenated words - text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text) - # Fix newlines in the middle of sentences - text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip()) - # Remove multiple newlines - text = re.sub(r"\n\s*\n", "\n\n", text) - - texts.append(text) - return texts - - -@st.cache_data() -def read_docx(file) -> str: - """Reads a docx file.""" - doc = docx.Document(file) - text = "" - for para in doc.paragraphs: - # Remove multiple newlines - t = re.sub(r"\n\s*\n", "\n\n", para.text) - text += t + "\n" - return text - - -@st.cache_data() -def _split_audio(audio, chunk_size=2) -> List[AudioSegment]: - """Split audio into chunks of 10 minutes.""" - # load audio - audio = AudioSegment.from_file(audio, format="mp3") - # Define the chunk size (10 minutes default) - chunk_size = chunk_size * 60 * 1000 - # calculate the number of chunks - num_chunks = math.ceil(len(audio) / chunk_size) - chunks = [] - # split audio into chunks - for i in range(num_chunks): - start = i * chunk_size - end = start + chunk_size - chunk = audio[start:end] - chunks.append(chunk) - return chunks - - -@st.cache_data() -def read(file) -> str | List[str]: - """Reads a file and returns the content.""" - if file.name.endswith(".txt") or file.name.endswith(".md"): - return read_txt(file) - elif file.name.endswith(".pdf"): - return read_pdf(file) - elif file.name.endswith(".docx"): - return read_docx(file) - else: - raise ValueError("File type not supported") diff --git a/src/SumGPT.py b/src/SumGPT.py deleted file mode 100644 index 355ebd4..0000000 --- a/src/SumGPT.py +++ /dev/null @@ -1,178 +0,0 @@ -import 
asyncio - -import streamlit as st - -import Components.StreamlitSetup as StreamlitSetup - -StreamlitSetup.setup() - -import time # noqa: E402 - -import GPT # noqa: E402 -import Modules.file_io as file_io # noqa: E402 -import Modules.Youtube # noqa: E402 -import utils.helpers as helpers # noqa: E402 -from Components.sidebar import sidebar # noqa: E402 - -app_header = st.container() - -file_handler = st.container() -content_handler = st.container() -result_handler = st.container() - -with app_header: - st.title("πŸ“ SumGPT") - st.markdown("##### Summarize your text with OpenAI's GPT-3.5 / GPT-4 API") - st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)") - st.warning( - "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo." - ) - -sidebar() - -with file_handler: - if st.button("πŸ”ƒ Refresh"): - st.cache_data.clear() - youtube_link_empty = st.empty() - upload_file_emtpy = st.empty() - - youtube_link = youtube_link_empty.text_input( - label="πŸ”— YouTube Link", - placeholder="Enter your YouTube link", - help="Enter your YouTube link to download the video and extract the audio", - ) - upload_file = upload_file_emtpy.file_uploader( - "πŸ“ Upload your file", type=["txt", "pdf", "docx", "md"] - ) - if youtube_link: - upload_file_emtpy.empty() - with st.spinner("πŸ” Extracting transcript..."): - transcript, title = Modules.Youtube.extract_youtube_transcript( - youtube_link, st.session_state["CAPTION_LANGUAGES"] - ) - file_content = {"name": f"{title}.txt", "content": transcript} - elif upload_file: - youtube_link_empty.empty() - with st.spinner("πŸ” Reading file... (mp3 file might take a while)"): - file_content = {"name": upload_file.name, "content": file_io.read(upload_file)} - elif youtube_link and upload_file: - st.warning("Please only upload one file at a time") - else: - file_content = None - -with content_handler: - if file_content: - with st.expander("File Preview"): - if file_content["name"].endswith(".pdf"): - content = "\n\n".join(file_content["content"]) - st.text_area(file_content["name"], content, height=200) - else: - content = file_content["content"] - st.text_area(file_content["name"], content, height=200) - -with result_handler: - if file_content: - chunks = [] - content = file_content["content"] - if file_content["name"].endswith(".pdf"): - content = "\n\n".join(file_content["content"]) - chunks.extend(helpers.convert_to_chunks(content, chunk_size=st.session_state["CHUNK_SIZE"])) - - with st.expander(f"Chunks ({len(chunks)})"): - for chunk in chunks: - st.write(chunk) - - token_usage = GPT.misc.predict_token(st.session_state["OPENAI_PARAMS"], chunks) - param = st.session_state["OPENAI_PARAMS"] - prompt_token = token_usage["prompt"] - completion_token = token_usage["completion"] - if param.model == "gpt-4": - price = round(prompt_token * 0.00003 + completion_token * 0.00006, 5) - elif param.model == "gpt-3.5-turbo-16k": - price = round(prompt_token * 0.000003 + completion_token * 0.000004, 5) - else: - price = round(prompt_token * 0.0000015 + completion_token * 0.000002, 5) - st.markdown( - f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`" - ) - # max tokens exceeded warning - exceeded = helpers.exceeded_token_handler( - param=st.session_state["OPENAI_PARAMS"], chunks=chunks - ) - - # load cached results - if st.session_state["PREVIOUS_RESULTS"] is not None: - rec_responses = st.session_state["PREVIOUS_RESULTS"]["rec_responses"] - rec_id = 
st.session_state["PREVIOUS_RESULTS"]["rec_ids"] - final_response = st.session_state["PREVIOUS_RESULTS"]["final_response"] - finish_reason_rec = st.session_state["PREVIOUS_RESULTS"]["finish_reason_rec"] - finish_reason_final = st.session_state["PREVIOUS_RESULTS"]["finish_reason_final"] - else: - rec_responses = None - rec_id = None - final_response = None - finish_reason_rec = None - finish_reason_final = None - - # finish_reason_rec = None - if st.button("πŸš€ Run", disabled=exceeded): - start_time = time.time() - st.cache_data.clear() - API_KEY = st.session_state["OPENAI_API_KEY"] - if not API_KEY and not GPT.misc.validate_api_key(API_KEY): - st.error( - "❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys)." - ) - else: - with st.spinner("Summarizing... (this might take a while)"): - if st.session_state["LEGACY"]: - rec_max_token = st.session_state["OPENAI_PARAMS"].max_tokens_rec - rec_responses, finish_reason_rec = helpers.recursive_summarize( - chunks, rec_max_token - ) - if st.session_state["FINAL_SUMMARY_MODE"]: - final_response, finish_reason_final = helpers.summarize(rec_responses) - else: - final_response = None - else: - completions, final_response = asyncio.run( - helpers.summarize_experimental_concurrently( - content, st.session_state["CHUNK_SIZE"] - ) - ) - rec_responses = [d["content"] for d in completions] - rec_ids = [d["chunk_id"] for d in completions] - # save previous completions - resp = { - "rec_responses": rec_responses, - "rec_ids": rec_ids, - "final_response": final_response, - "finish_reason_rec": finish_reason_rec, - "finish_reason_final": finish_reason_final, - } - if resp != st.session_state["PREVIOUS_RESULTS"]: - st.session_state["PREVIOUS_RESULTS"] = resp - - end_time = time.time() - st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`") - - if rec_responses is not None: - with st.expander( - "Recursive Summaries", expanded=not st.session_state["FINAL_SUMMARY_MODE"] - ): - for i, response in enumerate(rec_responses): - st.info(f"{response}") - if finish_reason_rec == "length": - st.warning( - "⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter." - ) - - if final_response is not None: - st.header("πŸ“Summary") - st.info(final_response) - if finish_reason_final == "length": - st.warning( - "⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter." 
- ) - if final_response is not None or rec_responses is not None: - helpers.download_results(rec_responses, final_response) diff --git a/src/manifest.json b/src/manifest.json deleted file mode 100644 index 731522c..0000000 --- a/src/manifest.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "name": "SumGPT", - "version": "1.0.8", - "license": { - "type": "MIT", - "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE" - }, - "author": "Zeke Zhang", - "homepage": "https://github.com/sean1832/SumGPT", - "repository": { - "type": "git", - "url": "https://github.com/sean1832/SumGPT" - }, - "bugs": { - "url": "https://github.com/sean1832/SumGPT/issues" - } -} \ No newline at end of file diff --git a/src/utils/__init__.py b/src/utils/__init__.py deleted file mode 100644 index 5fe7f8d..0000000 --- a/src/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from utils import helpers - -__all__ = ["helpers"] diff --git a/src/utils/helpers.py b/src/utils/helpers.py deleted file mode 100644 index 6fd2618..0000000 --- a/src/utils/helpers.py +++ /dev/null @@ -1,237 +0,0 @@ -import os -import asyncio - -import numpy as np -from typing import Any, Dict, List, Tuple, Union - -from GPT.embeddings import openAIEmbeddings -import streamlit as st -import re -import GPT -import textwrap -from langdetect import detect -import time -from datetime import datetime - -from langchain.chat_models import ChatOpenAI -from langchain.docstore.document import Document -from langchain.prompts import PromptTemplate -from langchain.chains.summarize import load_summarize_chain -from langchain.chains import LLMChain - -def _similarity(v1, v2) -> np.ndarray: - """Returns the cosine similarity between two vectors.""" - return np.dot(v1, v2) - -@st.cache_data(show_spinner=False) -def _chunk_spliter(content: str, chunk_size: int = 1000, lang_base: str = 'latin') -> List[str]: - """Splits a string into chunks of a given size.""" - - sentences = re.split(r'(?<=[.?!,γ€‚οΌŒγ€οΌοΌŸΒ·])\s+', content) - if lang_base == 'latin': - chunks = [] - chunk = '' - word_count = 0 - for sentence in sentences: - sentence += ' ' # add space at end to compensate for split - words = sentence.split() - sentence_word_count = len(words) - if word_count + sentence_word_count <= chunk_size: - chunk += sentence - word_count += sentence_word_count - else: - chunks.append(chunk.strip()) - chunk = sentence - word_count = sentence_word_count - # add the last chunk - if chunk: - chunks.append(chunk.strip()) - - new_chunks = [] - for c in chunks: - if c == '': - continue - if len(c.split()) > chunk_size + 25: - words = c.split() - small_chunks = [] - for i in range(0, len(words), chunk_size): - small_chunks.append(' '.join(words[i:i + chunk_size])) - new_chunks.extend(small_chunks) - else: - new_chunks.append(c) - return new_chunks - - else: - chunks = textwrap.wrap(content, width=chunk_size) - return chunks - - -def language_base(string: str) -> str: - try: - lang_code = detect(string) - latin_based = ['en', 'fr-ca', 'es'] - east_asian_based = ['zh', 'ja', 'ko'] - for lang in latin_based: - if lang_code.startswith(lang): - return 'latin' - for lang in east_asian_based: - if lang_code.startswith(lang): - return 'east_asian' - return 'other' - except KeyError: - return 'other' - -@st.cache_data(show_spinner=False) -def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bool = False) -> List[Dict[str, float]]: - """Converts a string into chunks of a given size.""" - chunks_text = _chunk_spliter(content, chunk_size, language_base(content)) - 
chunks = [] - for i, chunk in enumerate(chunks_text): - if enable_embedding: - embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"]) - chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)}) - else: - chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i}) - return chunks - - -def search_chunks(query: str, chunks: List[Dict[str, float]], count: int = 1) -> List[Dict[str, np.ndarray]]: - """Returns the top `count` chunks that are most similar to the query.""" - embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"]) - vectors = embedding.embedding(query) - points = [] - - for chunk in chunks: - point = _similarity(vectors, chunk['vector']) - points.append({'content': chunk['content'], 'point': point}) - - # sort the points in descending order - ordered = sorted(points, key=lambda x: x['point'], reverse=True) - return ordered[0:count] - -@st.cache_data(show_spinner=False) -def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document] | Document: - """Converts a list of chunks into a list of documents.""" - docs = [] - for chunk in chunks: - content = chunk['content'] - metadata = {'chunk_id': chunk['chunk_id']} - doc = Document(page_content=content, metadata=metadata) - docs.append(doc) - return docs - -async def async_generate(chain, chunk)-> Dict[str, Union[str, int]]: - """Generates a summary asynchronously.""" - resp = await chain.arun(text=chunk['content']) - return {'content': resp, 'chunk_id': chunk['chunk_id']} - -async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]: - """Summarizes a string asynchronously.""" - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - params = st.session_state['OPENAI_PARAMS'] - llm_rec = ChatOpenAI(model_name=params.model, - max_tokens=params.max_tokens_rec, - temperature=params.temperature, - top_p=params.top_p, - frequency_penalty=params.frequency_penalty, - presence_penalty=params.presence_penalty) - llm_final = ChatOpenAI(model_name=params.model, - max_tokens=params.max_tokens_final, - temperature=params.temperature, - top_p=params.top_p, - frequency_penalty=params.frequency_penalty, - presence_penalty=params.presence_penalty) - chunks = convert_to_chunks(content, chunk_size) - - REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text']) - chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT) - - tasks = [] - for chunk in chunks: - task = async_generate(chain, chunk) - tasks.append(task) - - outputs_rec = [] - progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}") - count = 1 - for coro in asyncio.as_completed(tasks): - output_rec = await coro - outputs_rec.append(output_rec) - progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}") - count += 1 - rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id']) - if st.session_state['FINAL_SUMMARY_MODE']: - FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text']) - chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT) - docs = convert_to_docs(rec_result) - final_result = chain.run(docs) - else: - final_result = None - return rec_result, final_result - -@st.cache_data(show_spinner=False) -def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]: - """Returns a recursive summary of the given content.""" - 
recursiveSumTexts = [] - finish_reason = '' - chunks_length = len(chunks) - count = 0 - progress_bar = st.progress(0) - for chunk in chunks: - content = chunk['content'] - text, finish_reason = GPT.generate.get_answer(content, - max_tokens=max_tokens, - persona=st.session_state['OPENAI_PERSONA_REC']) - recursiveSumTexts.append(text) - progress_bar.progress((count + 1) / chunks_length) - count += 1 - time.sleep(st.session_state['DELAY']) - - return recursiveSumTexts, finish_reason - - -@st.cache_data(show_spinner=False) -def summarize(message: List[str] | str) -> Tuple[str, str]: - """Returns a summary of the given content.""" - if isinstance(message, list): - join_msg = ' '.join(message) - else: - join_msg = message - - params = st.session_state['OPENAI_PARAMS'] - max_asw_tokens_final = params.max_tokens_final - - answer, finish_reason = GPT.generate.get_answer(join_msg, max_tokens=max_asw_tokens_final, - persona=st.session_state['OPENAI_PERSONA_SUM']) - return answer, finish_reason - - -def download_results(rec_responses, final_response): - """Downloads the results as a txt file.""" - joint_rec_response = f"=====recursive responses=====\n\n" + '\n\n'.join(rec_responses) - joint_final_response = f"{joint_rec_response}\n\n======final response=====\n\n{final_response}" - now = datetime.now() - if final_response is not None: - st.download_button("πŸ“₯ Download Summary", - joint_final_response, - file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md") - else: - st.download_button("πŸ“₯ Download Summary", - joint_rec_response, - file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md") - - -def exceeded_token_handler(param, chunks) -> bool: - """Handles the case where the user has exceeded the number of tokens.""" - if param.model == 'gpt-4': - max_token = 8100 - elif param.model == 'gpt-3.5-turbo-16k': - max_token = 16385 - else: - max_token = 4096 - info = GPT.misc.is_tokens_exceeded(param, chunks, max_token) - if info['exceeded']: - st.error(f"❌ {info['message']}") - return True - else: - return False From f80ccea756ae4d80e902686540a91b2a94612865 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 18:21:16 +1100 Subject: [PATCH 04/11] refactor: move `init` method into `main.py` --- SumGPT/__init__.py | 4 ---- SumGPT/main.py | 9 +++++++++ 2 files changed, 9 insertions(+), 4 deletions(-) delete mode 100644 SumGPT/__init__.py diff --git a/SumGPT/__init__.py b/SumGPT/__init__.py deleted file mode 100644 index a576e4b..0000000 --- a/SumGPT/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -import streamlit as st - -if "summaries" not in st.session_state: - st.session_state["summaries"] = [] diff --git a/SumGPT/main.py b/SumGPT/main.py index f33c425..2e0459d 100644 --- a/SumGPT/main.py +++ b/SumGPT/main.py @@ -1,7 +1,15 @@ +import streamlit as st from app.page import Page from utils import io +def init(): + st.set_page_config("SumGPT", "πŸ“", "wide") + + if "summaries" not in st.session_state: + st.session_state["summaries"] = [] + + def main(): manifest = io.read_json_file("SumGPT/manifest.json") models = io.read_json_file("SumGPT/models.json") @@ -13,4 +21,5 @@ def main(): if __name__ == "__main__": + init() main() From 9f9c1193382ae100fbeb6d668e38866320362b0d Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 20:12:40 +1100 Subject: [PATCH 05/11] feat: implement cookie - implement AES encryption of cookie data for security - implement import & export config --- .gitignore | 3 ++ SumGPT/app/body_handler.py | 31 
++++++++++++++++ SumGPT/app/page.py | 2 +- SumGPT/app/sidebar_handler.py | 48 +++++++++++++++++++++---- SumGPT/core/crypto.py | 67 +++++++++++++++++++++++++++++++++++ SumGPT/main.py | 3 ++ requirements.txt | 7 ++-- 7 files changed, 151 insertions(+), 10 deletions(-) create mode 100644 SumGPT/core/crypto.py diff --git a/.gitignore b/.gitignore index dc2e3ef..b8ec665 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,6 @@ cython_debug/ # test folder .test/ /test/ + +# streamlit +.streamlit/ \ No newline at end of file diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py index 8becb93..8afae6f 100644 --- a/SumGPT/app/body_handler.py +++ b/SumGPT/app/body_handler.py @@ -1,12 +1,16 @@ import asyncio +import datetime +import json from typing import Any, Dict, List, Optional, Tuple import streamlit as st import utils.io as io +from core.crypto import Crypto from core.llm import LLM from core.tokenizer import Tokenizer from datamodel.chunk import Chunk from datamodel.llm_params import LLMParams +from streamlit_cookies_controller import CookieController class BodyHandler: @@ -138,6 +142,7 @@ def agenerate( gpt_params: LLMParams, role: str, api_key: Optional[str], + chunk_size: int, ) -> None: generate_button = st.button("Generate summary") if generate_button: @@ -218,6 +223,22 @@ async def process_chunks(): # Run the async processing asyncio.run(process_chunks()) + config = self._serialize_config( + api_key, + role, + gpt_params.model.name, + chunk_size, + gpt_params.max_tokens, + gpt_params.temperature, + ) + crypto: Crypto = st.session_state["crypto"] + config = crypto.encrypt_b64(json.dumps(config)) + controler = CookieController() + controler.set( + "config", + config, + expires=datetime.datetime.now() + datetime.timedelta(days=30), + ) else: # Check if summaries exist in session state and display them if "summaries" in st.session_state: @@ -241,3 +262,13 @@ async def process_chunks(): ) total_price += summary_data["price"] st.write(f"Total price: `${round(total_price, 6)}`") + + def _serialize_config(self, api_key, role, model, chunk_size, max_tokens, temperature): + return { + "api_key": api_key, + "role": role, + "model": model, + "chunk_size": chunk_size, + "max_tokens": max_tokens, + "temperature": temperature, + } diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py index aed232b..bba3a54 100644 --- a/SumGPT/app/page.py +++ b/SumGPT/app/page.py @@ -66,4 +66,4 @@ def draw_body(self) -> None: total_chunks.extend(chunks) - body.agenerate(total_chunks, self.llm_params, self.role, self.api_key) + body.agenerate(total_chunks, self.llm_params, self.role, self.api_key, self.chunk_size) diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py index 378e681..db35f1b 100644 --- a/SumGPT/app/sidebar_handler.py +++ b/SumGPT/app/sidebar_handler.py @@ -1,22 +1,43 @@ +import json from typing import Any, Dict, List, Tuple import streamlit as st import utils.helpers as helpers +from core.crypto import Crypto from datamodel.llm_params import LLMModel, LLMParams +from streamlit_cookies_controller import CookieController class SidebarHandler: def __init__(self): + self.cookie_controller = CookieController() + self.crypto: Crypto = st.session_state["crypto"] self.config = {} + if self.config == {}: + self._set_config_from_cookie() + self.chunk_size = None + def _set_config_from_cookie(self): + config_binary = self.cookie_controller.get("config") + if config_binary: + try: + self.config = json.loads(self.crypto.decrypt_b64(config_binary)) + except TypeError: + self.config 
= {} + self.cookie_controller.remove("config") # Remove invalid cookie + def header(self): st.title("SumGPT") st.markdown("Select the model and parameters for summarization.") - def api_key_entry(self) -> str: + def api_key_entry(self) -> str | None: st.markdown("### API Key") - return st.text_input("Enter your OpenAI API key", type="password") + api_key = st.text_input( + "Enter your OpenAI API key", type="password", value=self.config.get("api_key", "") + ) + self.config["api_key"] = api_key + return api_key def role_settings_panel(self, height=300) -> str: language = st.selectbox( @@ -36,12 +57,15 @@ def role_settings_panel(self, height=300) -> str: if role is None: st.stop() st.warning("Role settings are not set.") + + self.config["role"] = role return role def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]: model_names = helpers.extract_values(models_data, "model") model_name = st.selectbox("Model", model_names, self.config.get("model_index", 0)) model = LLMModel.construct_from_dict(self._get_model_dict(models_data, model_name)) + self.config["model"] = model_name _param = self._construct_param(models_data, model_name) @@ -52,13 +76,19 @@ def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMPa self.config.get("chunk_size", 2048), step=1024, ) + self.config["chunk_size"] = chunk_size + max_tokens: int = st.number_input( "Max output (tokens)", 32, _param["max_output_tokens"], self.config.get("max_tokens", 512), ) + self.config["max_tokens"] = max_tokens + temperature: float = st.slider("Temperature", 0.0, 1.0, self.config.get("temperature", 0.7)) + self.config["temperature"] = temperature + return ( LLMParams( model=model, @@ -82,13 +112,19 @@ def _construct_param(self, models_data, selected_model): def import_config(self): st.markdown("### Import Configuration") - if st.button("Import configuration"): - raise NotImplementedError # TODO: implement + config_file = st.file_uploader("Upload configuration file", type=["json"]) + if config_file: + config = json.load(config_file) + self.config = config + self.cookie_controller.set("config", self.crypto.encrypt_b64(json.dumps(config))) def export_config(self): st.markdown("### Export Configuration") - if st.button("Export configuration"): - raise NotImplementedError # TODO: implement + st.download_button( + "Export configuration", + data=json.dumps(self.config, indent=2), + file_name="sumgpt_config.json", + ) def footer(self, data: Dict[str, Any]): st.markdown("---") diff --git a/SumGPT/core/crypto.py b/SumGPT/core/crypto.py new file mode 100644 index 0000000..0f8ecd4 --- /dev/null +++ b/SumGPT/core/crypto.py @@ -0,0 +1,67 @@ +import base64 +import os +from typing import Optional + +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +from cryptography.hazmat.primitives.kdf.scrypt import Scrypt + + +class Crypto: + def __init__(self, password: str, salt: Optional[bytes] = None): + """Initialize Crypto class with password and optional salt.""" + self.password = password + self.salt = salt if salt else os.urandom(16) # Generate salt if not provided + self.key = self._generate_key() + + def _generate_key(self) -> bytes: + """Generate a symmetric key from the password using Scrypt KDF.""" + kdf = Scrypt( + salt=self.salt, + length=32, + n=2**14, # CPU/memory cost factor + r=8, # Block size + p=1, # Parallelization factor + backend=default_backend(), + ) + key = kdf.derive(self.password.encode()) + 
return key + + def encrypt(self, data: str) -> bytes: + """Encrypt data using AES GCM mode (authenticated encryption).""" + nonce = os.urandom(12) # 12-byte nonce for GCM mode + cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce), backend=default_backend()) + encryptor = cipher.encryptor() + + ciphertext = encryptor.update(data.encode()) + encryptor.finalize() + # Prepend the salt to the nonce, tag, and ciphertext for storage + return self.salt + nonce + encryptor.tag + ciphertext + + def encrypt_b64(self, data: str) -> str: + """Encrypt data and return as base64 encoded string.""" + encrypted_data = self.encrypt(data) + return base64.b64encode(encrypted_data).decode("utf-8") + + def decrypt(self, encrypted_data: bytes) -> str: + """Decrypt data encrypted with AES GCM mode.""" + # Extract the salt, nonce, tag, and ciphertext + salt = encrypted_data[:16] # First 16 bytes are the salt + nonce = encrypted_data[16:28] # Next 12 bytes are the nonce + tag = encrypted_data[28:44] # Next 16 bytes are the GCM tag + ciphertext = encrypted_data[44:] # Rest is the ciphertext + + # Regenerate the key with the extracted salt + kdf = Scrypt(salt=salt, length=32, n=2**14, r=8, p=1, backend=default_backend()) + key = kdf.derive(self.password.encode()) + + # Initialize the cipher for decryption + cipher = Cipher(algorithms.AES(key), modes.GCM(nonce, tag), backend=default_backend()) + decryptor = cipher.decryptor() + + decrypted_data = decryptor.update(ciphertext) + decryptor.finalize() + return decrypted_data.decode() + + def decrypt_b64(self, encrypted_data: str) -> str: + """Decrypt base64 encoded data.""" + decoded_bytes = base64.b64decode(encrypted_data) + return self.decrypt(decoded_bytes) diff --git a/SumGPT/main.py b/SumGPT/main.py index 2e0459d..70b69c3 100644 --- a/SumGPT/main.py +++ b/SumGPT/main.py @@ -1,5 +1,6 @@ import streamlit as st from app.page import Page +from core.crypto import Crypto from utils import io @@ -8,6 +9,8 @@ def init(): if "summaries" not in st.session_state: st.session_state["summaries"] = [] + if "crypto" not in st.session_state: + st.session_state["crypto"] = Crypto(st.secrets["crypto_key"]) def main(): diff --git a/requirements.txt b/requirements.txt index 8c8c682..85630dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,10 @@ docx==0.2.4 python_docx==1.1.2 -langdetect==1.0.9 PyPDF4==1.27.0 tiktoken==0.8.0 -requests==2.32.3 + +# crypto +cryptography==43.0.3 # langchain langchain==0.3.4 @@ -11,4 +12,4 @@ langchain-openai==0.2.3 # streamlit streamlit==1.39.0 - +streamlit-cookies-controller==0.0.4 From 452404b4186e6abf3d55999d43c2d69a7736bde8 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 20:32:07 +1100 Subject: [PATCH 06/11] style: improve UI with emoji - add button to delete cookie data --- SumGPT/app/body_handler.py | 94 +++-------------------------------- SumGPT/app/page.py | 11 ++-- SumGPT/app/sidebar_handler.py | 24 ++++++--- 3 files changed, 30 insertions(+), 99 deletions(-) diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py index 8afae6f..be33c57 100644 --- a/SumGPT/app/body_handler.py +++ b/SumGPT/app/body_handler.py @@ -15,7 +15,9 @@ class BodyHandler: def file_uploader(self, type: List[str] = ["txt"]) -> List[Dict[str, str]]: - uploaded_files = st.file_uploader("Upload a file", type=type, accept_multiple_files=True) + uploaded_files = st.file_uploader( + "πŸ“ Upload your files", type=type, accept_multiple_files=True + ) files = [] if uploaded_files is None: st.stop() @@ -50,92 
+52,6 @@ def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]: ) return completion_tokens, prompt_tokens, cached_tokens - def generate( - self, - chunks: List[Chunk], - gpt_params: LLMParams, - role: str, - api_key: Optional[str], - ) -> None: - generate_button = st.button("Generate summary") - if generate_button: - if not api_key: - st.error("❌ Please enter your OpenAI API key in the sidebar.") - return - if not role: - st.error("❌ Please enter a role description in the sidebar.") - return - - st.session_state["summaries"] = [] # Initialize or reset summaries - - progress_text = st.empty() - progress_bar = st.progress(0) - total_chunks = len(chunks) - - # Group chunks by filename - filename_chunks = {} - for chunk in chunks: - if chunk.filename not in filename_chunks: - filename_chunks[chunk.filename] = [] - filename_chunks[chunk.filename].append(chunk) - - llm = LLM(api_key, gpt_params) - processed_chunks = 0 - - # Process chunks by filename - for filename, file_chunks in filename_chunks.items(): - expander = st.expander(f"{filename}") - for chunk in file_chunks: - processed_chunks += 1 - progress_text.write(f"Generating summaries {processed_chunks}/{total_chunks}") - progress_bar.progress(processed_chunks / total_chunks) - - summary = llm.generate(chunk.content, role) - with expander: - with st.chat_message("πŸ€–"): - st.write(summary.content) - completion_tokens, prompt_tokens, cached_tokens = self._get_tokens( - summary.response_metadata - ) - price = round( - llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 - ) - st.write( - f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" - ) - # Store the summary in session state - st.session_state["summaries"].append( - { - "filename": filename, - "content": summary.content, - "tokens": completion_tokens + prompt_tokens, - "price": price, - } - ) - - progress_text.write("βœ… All chunks processed!") - progress_bar.progress(1.0) - else: - # Check if summaries exist in session state and display them - if "summaries" in st.session_state: - # Group summaries by filename - filename_summaries = {} - for summary_data in st.session_state["summaries"]: - filename = summary_data["filename"] - if filename not in filename_summaries: - filename_summaries[filename] = [] - filename_summaries[filename].append(summary_data) - - # Display summaries grouped by filename - for filename, summaries in filename_summaries.items(): - with st.expander(f"{filename}"): - for summary_data in summaries: - with st.chat_message("πŸ€–"): - st.write(summary_data["content"]) - st.write( - f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" - ) - def agenerate( self, chunks: List[Chunk], @@ -144,7 +60,9 @@ def agenerate( api_key: Optional[str], chunk_size: int, ) -> None: - generate_button = st.button("Generate summary") + generate_button = st.button( + "πŸš€ Run", + ) if generate_button: if not api_key: st.error("❌ Please enter your OpenAI API key in the sidebar.") diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py index bba3a54..ccfcef5 100644 --- a/SumGPT/app/page.py +++ b/SumGPT/app/page.py @@ -28,11 +28,16 @@ def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str sb.header() sb.import_config() self.api_key = sb.api_key_entry() - with st.expander("Role settings"): + with st.expander("πŸ€– Role settings"): self.role = sb.role_settings_panel() - with st.expander("Configuration"): + with st.expander("βš™οΈ Configuration"): self.llm_params, self.chunk_size = 
sb.config_control_panel(models_data) - sb.export_config() + + cols = st.columns([1, 1]) + with cols[0]: + sb.delete_cookie() + with cols[1]: + sb.export_config() sb.footer(manifest) def draw_body(self) -> None: diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py index db35f1b..519dcd3 100644 --- a/SumGPT/app/sidebar_handler.py +++ b/SumGPT/app/sidebar_handler.py @@ -28,13 +28,17 @@ def _set_config_from_cookie(self): self.cookie_controller.remove("config") # Remove invalid cookie def header(self): - st.title("SumGPT") - st.markdown("Select the model and parameters for summarization.") + st.markdown("### How to use:") + st.markdown( + "1. πŸ”‘ Enter your [OpenAI API Key](https://beta.openai.com/account/api-keys)\n" + "2. πŸ“ Upload your file\n" + "3. πŸš€ Run" + ) + st.markdown("---") def api_key_entry(self) -> str | None: - st.markdown("### API Key") api_key = st.text_input( - "Enter your OpenAI API key", type="password", value=self.config.get("api_key", "") + "πŸ”‘ OpenAI API key", type="password", value=self.config.get("api_key", "") ) self.config["api_key"] = api_key return api_key @@ -111,21 +115,25 @@ def _construct_param(self, models_data, selected_model): return param def import_config(self): - st.markdown("### Import Configuration") - config_file = st.file_uploader("Upload configuration file", type=["json"]) + config_file = st.file_uploader("πŸ“ Import Config", type=["json"]) if config_file: config = json.load(config_file) self.config = config self.cookie_controller.set("config", self.crypto.encrypt_b64(json.dumps(config))) def export_config(self): - st.markdown("### Export Configuration") st.download_button( - "Export configuration", + "Export Config", data=json.dumps(self.config, indent=2), file_name="sumgpt_config.json", ) + def delete_cookie(self): + if st.button("Delete cookie"): + self.cookie_controller.remove("config") + self.config = {} + st.rerun() + def footer(self, data: Dict[str, Any]): st.markdown("---") st.markdown("### SumGPT") From 26a9d3e804bc904b3d1f58787ed3ad732ec50a05 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 21:47:37 +1100 Subject: [PATCH 07/11] feat: implement download summaries button --- SumGPT/app/body_handler.py | 42 ++++++++++++++++++++++++++++++++------ SumGPT/app/page.py | 1 + 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py index be33c57..98de452 100644 --- a/SumGPT/app/body_handler.py +++ b/SumGPT/app/body_handler.py @@ -63,6 +63,11 @@ def agenerate( generate_button = st.button( "πŸš€ Run", ) + total_chunks = len(chunks) + progress_text = st.empty() + + total_price_text = st.empty() + if generate_button: if not api_key: st.error("❌ Please enter your OpenAI API key in the sidebar.") @@ -75,14 +80,10 @@ def agenerate( async def process_chunks(): llm = LLM(api_key, gpt_params) - total_chunks = len(chunks) - progress_text = st.empty() - progress_text.write(f"Generating summaries 0/{total_chunks}") - total_price_text = st.empty() total_price = 0 - progress_bar = st.progress(0) completed_chunks = 0 + progress_text.write(f"Generating summaries 0/{total_chunks}") # Sort chunks by chunk.id sorted_chunks = sorted(chunks, key=lambda c: c.id) @@ -179,7 +180,36 @@ async def process_chunks(): f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" ) total_price += summary_data["price"] - st.write(f"Total price: `${round(total_price, 6)}`") + total_price_text.write(f"Total price: `${round(total_price, 6)}`") 
+ + def download_summaries(self): + if "summaries" in st.session_state: + summaries = st.session_state["summaries"] + if not summaries: + return + st.download_button( + "πŸ“₯ Download summaries", + self._serialize_summaries(summaries), + "summaries.md", + mime="application/markdown", + ) + + def _serialize_summaries(self, summaries): + markdown = "" + + markdown_by_filename = {} + for summary in summaries: + filename = summary["filename"] + if filename not in markdown_by_filename: + markdown_by_filename[filename] = [] + markdown_by_filename[filename].append(summary["content"]) + + for filename, content in markdown_by_filename.items(): + markdown += f"# {filename}\n" + markdown += "\n\n".join(content) + markdown += "\n\n" + + return markdown def _serialize_config(self, api_key, role, model, chunk_size, max_tokens, temperature): return { diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py index ccfcef5..e29b634 100644 --- a/SumGPT/app/page.py +++ b/SumGPT/app/page.py @@ -72,3 +72,4 @@ def draw_body(self) -> None: total_chunks.extend(chunks) body.agenerate(total_chunks, self.llm_params, self.role, self.api_key, self.chunk_size) + body.download_summaries() From 94638fd16d6d89ab5cbe0b5dc7c1f46cc30ed92e Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 22:16:36 +1100 Subject: [PATCH 08/11] fix: language does not change after loading coockie. --- SumGPT/app/body_handler.py | 24 +++--------------------- SumGPT/app/page.py | 4 +++- SumGPT/app/sidebar_handler.py | 12 ++++++++++-- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py index 98de452..4027928 100644 --- a/SumGPT/app/body_handler.py +++ b/SumGPT/app/body_handler.py @@ -58,7 +58,7 @@ def agenerate( gpt_params: LLMParams, role: str, api_key: Optional[str], - chunk_size: int, + config: Dict[str, Any], ) -> None: generate_button = st.button( "πŸš€ Run", @@ -142,20 +142,12 @@ async def process_chunks(): # Run the async processing asyncio.run(process_chunks()) - config = self._serialize_config( - api_key, - role, - gpt_params.model.name, - chunk_size, - gpt_params.max_tokens, - gpt_params.temperature, - ) crypto: Crypto = st.session_state["crypto"] - config = crypto.encrypt_b64(json.dumps(config)) + config_binary = crypto.encrypt_b64(json.dumps(config)) controler = CookieController() controler.set( "config", - config, + config_binary, expires=datetime.datetime.now() + datetime.timedelta(days=30), ) else: @@ -210,13 +202,3 @@ def _serialize_summaries(self, summaries): markdown += "\n\n" return markdown - - def _serialize_config(self, api_key, role, model, chunk_size, max_tokens, temperature): - return { - "api_key": api_key, - "role": role, - "model": model, - "chunk_size": chunk_size, - "max_tokens": max_tokens, - "temperature": temperature, - } diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py index e29b634..f75f985 100644 --- a/SumGPT/app/page.py +++ b/SumGPT/app/page.py @@ -13,6 +13,7 @@ def __init__(self): self.role: Optional[str] = None self.api_key: Optional[str] = None self.llm_params: Optional[LLMParams] = None + self.config: Dict[str, str] = {} def draw_header(self, version): st.title(f"πŸ“ SumGPT {version}") @@ -39,6 +40,7 @@ def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str with cols[1]: sb.export_config() sb.footer(manifest) + self.config = sb.get_config() def draw_body(self) -> None: if not self.chunk_size: @@ -71,5 +73,5 @@ def draw_body(self) -> None: total_chunks.extend(chunks) - 
body.agenerate(total_chunks, self.llm_params, self.role, self.api_key, self.chunk_size) + body.agenerate(total_chunks, self.llm_params, self.role, self.api_key, self.config) body.download_summaries() diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py index 519dcd3..b7493ef 100644 --- a/SumGPT/app/sidebar_handler.py +++ b/SumGPT/app/sidebar_handler.py @@ -18,6 +18,9 @@ def __init__(self): self.chunk_size = None + def get_config(self) -> Dict[str, Any]: + return self.config + def _set_config_from_cookie(self): config_binary = self.cookie_controller.get("config") if config_binary: @@ -44,15 +47,17 @@ def api_key_entry(self) -> str | None: return api_key def role_settings_panel(self, height=300) -> str: + language_list = ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"] language = st.selectbox( "Role language", - ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"], + language_list, + language_list.index(self.config.get("role_language", "English")), ) role = st.text_area( "Role settings", self.config.get( "role", - f"Write a detailed summary in perfect {language} that is concise, clear and coherent while capturing the main ideas the text. " + "Write a detailed summary in perfect $(LANGUAGE) that is concise, clear and coherent while capturing the main ideas the text. " "The summary should be well-structured and free of grammatical errors.\n\n" "The summary is to be written in markdown format, with a heading (###) that encapsulate the core concept of the content. It should be concise and specific. avoid generic headings like 'Summary' or 'Introduction'.", ), @@ -62,7 +67,10 @@ def role_settings_panel(self, height=300) -> str: st.stop() st.warning("Role settings are not set.") + self.config["role_language"] = language self.config["role"] = role + + role = role.replace("$(LANGUAGE)", language) return role def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]: From 131219a03b5440630e3f687b232312a8209fd341 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 22:36:31 +1100 Subject: [PATCH 09/11] doc: update readme --- README.md | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c3bca46..6b8bb00 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SumGPT [![python](https://img.shields.io/badge/python-3.11-blue)](https://www.python.org/downloads/release/python-3112/) -Achieve detailed summarization of extensive documents through πŸš€ultra-fast parallelized predictions, utilizing [GPT-3.5](https://platform.openai.com/docs/models/gpt-3-5) and [GPT-4](https://platform.openai.com/docs/models/gpt-4) APIs provided by [OpenAI](https://openai.com/). +Achieve detailed summarization of extensive documents through πŸš€ultra-fast parallelized completion with APIs provided by [OpenAI](https://openai.com/). 🌐 Web App: [https://sumgpt.streamlit.app](https://sumgpt.streamlit.app/) @@ -13,22 +13,35 @@ Achieve detailed summarization of extensive documents through πŸš€ultra-fast par --- ### 🌟 Features -- πŸ“„ Summarize document (.pdf, .docx, .txt, .md). -- πŸŽ₯ Summarize YouTube video with subtitles. +- πŸ“„ Summarize document (.txt, .md). - πŸ€– Customizable parameters and bot persona for refined response generation. -- πŸš€ Facilitates parallel processing of chunks, enabling ultra-fast generation speeds. +- πŸš€ Facilitates parallel processing of chunks. 
- πŸ’Ό Export & import configs for easy sharing and reuse. -- 🧠 Supports GPT-3.5 and GPT-4. +- 🌍 Encrypted browser cookies ensure configuration settings are preserved across sessions. +- 🧠 Supports multiple models: + - `gpt-4o-mini` + - `gpt-4o` + - `gpt-4-turbo` + - `gpt-3.5-turbo` ### πŸ’‘ What you need - πŸ”‘ OpenAI **[API keys](https://platform.openai.com/account/api-keys)** -> ***Note: To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api) if you haven't already received an invitation from OpenAI.*** - ### πŸ’» Running Locally - Make sure you have **[python 3.11](https://www.python.org/downloads)** | [python installation tutorial (YouTube)](https://youtu.be/HBxCHonP6Ro?t=105) 1. Clone the repository ```bash git clone https://github.com/sean1832/SumGPT +cd SumGPT +``` + +2. Create a `secrets.toml` file under the `.streamlit\` directory. Replace `your_secure_key` with your own password for browser cookie encryption. +```bash +mkdir .streamlit +echo "crypto_key = 'your_secure_key'" > .streamlit/secrets.toml +``` + +3. Execute `RUN.bat` +```bash +./RUN.bat +``` -2. Execute `RUN.bat` From ce4bb1a904776e40ea2f040bbe062045cfd09cb3 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 22:36:37 +1100 Subject: [PATCH 10/11] chore: remove unused prompt --- resources/prompt.json | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 resources/prompt.json diff --git a/resources/prompt.json b/resources/prompt.json deleted file mode 100644 index 3be18b8..0000000 --- a/resources/prompt.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "type": "recursive", - "legacy": true, - "prompt": "Provide a detailed and comprehensive summary of the following content in flawless [LANGUAGE], ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information of the content. Make sure it is answered in [LANGUAGE].", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "recursive", - "legacy": false, - "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "final", - "legacy": true, - "prompt": "identify headings in the transcript and summarise them into five headings. Use #### headings in markdown. Under headings, summarise at least 3 key points and then provide detail explanation of the concept based on the following text in the way that can be read fluently, make sense and avoid repetition. Make sure to include all information. Write it in beautiful and structured markdown in perfect [LANGUAGE]. ", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "final", - "legacy": false, - "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information.
Structured markdown summary with headings in perfect [LANGUAGE] (####): ", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - } -] \ No newline at end of file From ae18b10b7ebe6b61d64ecd60247b0430c9511107 Mon Sep 17 00:00:00 2001 From: Zeke Zhang <sean1832725142@gmail.com> Date: Mon, 28 Oct 2024 22:37:04 +1100 Subject: [PATCH 11/11] remove redundant script --- tools/get-requirement.bat | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 tools/get-requirement.bat diff --git a/tools/get-requirement.bat b/tools/get-requirement.bat deleted file mode 100644 index a5c88e2..0000000 --- a/tools/get-requirement.bat +++ /dev/null @@ -1,16 +0,0 @@ -@echo off -cd.. -echo Activating Virtural environment... -call .\venv\Scripts\activate - -echo upgrading pip... -python -m pip install --upgrade pip - - -echo Installing pipreqs... -pip install pipreqs - -echo Export to requirements.txt -pipreqs . --force --encoding utf-8 - -pause \ No newline at end of file
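
A minimal usage sketch of the cookie-encryption flow these patches introduce: PATCH 05 adds `SumGPT/core/crypto.py`, PATCH 05/08 store the encrypted sidebar config in a browser cookie, and `main.py` reads the encryption password from `st.secrets["crypto_key"]`. The import path, password, and sample config below are illustrative assumptions for running the helper outside Streamlit; only the `Crypto` API (`encrypt_b64` / `decrypt_b64`) comes from the patches themselves.

```python
# Hedged sketch: round-trip a config dict through the Crypto helper from PATCH 05.
# Assumes the SumGPT/ directory is on PYTHONPATH so `core.crypto` resolves;
# the password and sample config are placeholders, not values from the repository.
import json

from core.crypto import Crypto

config = {"api_key": "sk-placeholder", "model": "gpt-4o-mini", "chunk_size": 2048}

crypto = Crypto("my-cookie-password")           # same secret main.py loads via st.secrets["crypto_key"]
token = crypto.encrypt_b64(json.dumps(config))  # base64(salt + nonce + GCM tag + ciphertext)

# decrypt_b64 re-derives the key from the salt stored inside the token, so a fresh
# Crypto instance built with the same password can read the cookie back.
restored = json.loads(Crypto("my-cookie-password").decrypt_b64(token))
assert restored == config
```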