diff --git a/.gitignore b/.gitignore index dc2e3ef..b8ec665 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,6 @@ cython_debug/ # test folder .test/ /test/ + +# streamlit +.streamlit/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 5a85bff..0000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 0594372..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 474a048..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/sumGPT.iml b/.idea/sumGPT.iml deleted file mode 100644 index a29c46d..0000000 --- a/.idea/sumGPT.iml +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/README.md b/README.md index c3bca46..6b8bb00 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SumGPT [![python](https://img.shields.io/badge/python-3.11-blue)](https://www.python.org/downloads/release/python-3112/) -Achieve detailed summarization of extensive documents through πŸš€ultra-fast parallelized predictions, utilizing [GPT-3.5](https://platform.openai.com/docs/models/gpt-3-5) and [GPT-4](https://platform.openai.com/docs/models/gpt-4) APIs provided by [OpenAI](https://openai.com/). +Achieve detailed summarization of extensive documents through πŸš€ultra-fast parallelized completion with APIs provided by [OpenAI](https://openai.com/). 🌐 Web App: [https://sumgpt.streamlit.app](https://sumgpt.streamlit.app/) @@ -13,22 +13,35 @@ Achieve detailed summarization of extensive documents through πŸš€ultra-fast par --- ### 🌟 Features -- πŸ“„ Summarize document (.pdf, .docx, .txt, .md). -- πŸŽ₯ Summarize YouTube video with subtitles. +- πŸ“„ Summarize document (.txt, .md). - πŸ€– Customizable parameters and bot persona for refined response generation. -- πŸš€ Facilitates parallel processing of chunks, enabling ultra-fast generation speeds. +- πŸš€ Facilitates parallel processing of chunks. - πŸ’Ό Export & import configs for easy sharing and reuse. -- 🧠 Supports GPT-3.5 and GPT-4. +- 🌍 Encrypted browser cookies ensure configuration settings are preserved across sessions. 
+- 🧠 Supports multiple models: + - `gpt-4o-mini` + - `gpt-4o` + - `gpt-4-turbo` + - `gpt-3.5-turbo` ### πŸ’‘ What you need - πŸ”‘ OpenAI **[API keys](https://platform.openai.com/account/api-keys)** -> ***Note: To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api) if you haven't already received an invitation from OpenAI.*** - ### πŸ’» Running Locally - Make sure you have **[python 3.11](https://www.python.org/downloads)** | [python installation tutorial (YouTube)](https://youtu.be/HBxCHonP6Ro?t=105) 1. Clone the repository ```bash git clone https://github.com/sean1832/SumGPT +cd SumGPT +``` + +2. Create a `secrets.toml` file under the `.streamlit/` directory. Replace `your_secure_key` with your own password for browser cookie encryption. +```bash +mkdir .streamlit +echo "crypto_key = 'your_secure_key'" > .streamlit/secrets.toml +``` + +3. Execute `RUN.bat` +```bash +./RUN.bat ``` -2. Execute `RUN.bat` diff --git a/RUN.bat b/RUN.bat index 6afd014..7488c88 100644 --- a/RUN.bat +++ b/RUN.bat @@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" ( echo "Requirements file has not been modified. Skipping update." ) -streamlit run src/SumGPT.py \ No newline at end of file +streamlit run SumGPT/main.py \ No newline at end of file diff --git a/SumGPT/app/__init__.py b/SumGPT/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/app/body_handler.py b/SumGPT/app/body_handler.py new file mode 100644 index 0000000..4027928 --- /dev/null +++ b/SumGPT/app/body_handler.py @@ -0,0 +1,204 @@ +import asyncio +import datetime +import json +from typing import Any, Dict, List, Optional, Tuple + +import streamlit as st +import utils.io as io +from core.crypto import Crypto +from core.llm import LLM +from core.tokenizer import Tokenizer +from datamodel.chunk import Chunk +from datamodel.llm_params import LLMParams +from streamlit_cookies_controller import CookieController + + +class BodyHandler: + def file_uploader(self, type: List[str] = ["txt"]) -> List[Dict[str, str]]: + uploaded_files = st.file_uploader( + "πŸ“ Upload your files", type=type, accept_multiple_files=True + ) + files = [] + if uploaded_files is None: + st.warning("File is not uploaded.") + st.stop() + for file in uploaded_files: + text = io.read_to_string(file) + filename = file.name + files.append({"filename": filename, "text": text}) + return files + + def segment_text( + self, text: str, chunk_size: int, model: str, input_id: int + ) -> Tuple[List[Chunk], int]: + chunks: List[Chunk] = [] + tokenizer = Tokenizer(model) + total_tokens = tokenizer.tokenize(text) + count = 0 + for i in range(0, len(total_tokens), chunk_size): + chunk_tokens = total_tokens[i : i + chunk_size] + content = tokenizer.detokenize(chunk_tokens) + chunks.append(Chunk(count, content, len(chunk_tokens), input_id)) + count += 1 + return chunks, len(total_tokens) + + def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]: + completion_tokens = response_meta.get("token_usage", {}).get("completion_tokens", 0) + prompt_tokens = response_meta.get("token_usage", {}).get("prompt_tokens", 0) + cached_tokens = ( + response_meta.get("token_usage", {}) + .get("prompt_tokens_details", {}) + .get("cached_tokens", 0) + ) + return completion_tokens, prompt_tokens, cached_tokens + + def agenerate( + self, + chunks: List[Chunk], + gpt_params: LLMParams, + role: str, + api_key: Optional[str], + config: Dict[str, Any], + ) -> None: + generate_button = st.button( + "πŸš€ Run", + ) + total_chunks = len(chunks)
+ progress_text = st.empty() + + total_price_text = st.empty() + + if generate_button: + if not api_key: + st.error("❌ Please enter your OpenAI API key in the sidebar.") + return + if not role: + st.error("❌ Please enter a role description in the sidebar.") + return + + st.session_state["summaries"] = [] # Initialize or reset summaries + + async def process_chunks(): + llm = LLM(api_key, gpt_params) + total_price = 0 + progress_bar = st.progress(0) + completed_chunks = 0 + progress_text.write(f"Generating summaries 0/{total_chunks}") + + # Sort chunks by chunk.id + sorted_chunks = sorted(chunks, key=lambda c: c.id) + + # Group chunks by filename + filename_chunks = {} + for chunk in sorted_chunks: + if chunk.filename not in filename_chunks: + filename_chunks[chunk.filename] = [] + filename_chunks[chunk.filename].append(chunk) + + # Create expanders for each file + expanders = { + filename: st.expander(f"{filename}") for filename in filename_chunks.keys() + } + + # Create tasks for all chunks (sorted by chunk.id) + tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks] + + # Run all tasks and get the results in the same order + summaries = await asyncio.gather(*tasks) + + # Process the results in order + for summary, current_chunk in zip(summaries, sorted_chunks): + completed_chunks += 1 + progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}") + progress_bar.progress(completed_chunks / total_chunks) + + with expanders[current_chunk.filename]: + with st.chat_message("ai"): + st.write(summary.content) + completion_tokens, prompt_tokens, cached_tokens = self._get_tokens( + summary.response_metadata + ) + price = round( + llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6 + ) + st.write( + f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`" + ) + total_price += price + + # Store the summary in session state + st.session_state["summaries"].append( + { + "filename": current_chunk.filename, + "content": summary.content, + "tokens": completion_tokens + prompt_tokens, + "price": price, + } + ) + + progress_text.write("βœ… All chunks processed!") + progress_bar.progress(1.0) + total_price_text.write(f"Total price: `${round(total_price, 6)}`") + + # Run the async processing + asyncio.run(process_chunks()) + crypto: Crypto = st.session_state["crypto"] + config_binary = crypto.encrypt_b64(json.dumps(config)) + controler = CookieController() + controler.set( + "config", + config_binary, + expires=datetime.datetime.now() + datetime.timedelta(days=30), + ) + else: + # Check if summaries exist in session state and display them + if "summaries" in st.session_state: + total_price = 0 + # Group summaries by filename + filename_summaries = {} + for summary_data in st.session_state["summaries"]: + filename = summary_data["filename"] + if filename not in filename_summaries: + filename_summaries[filename] = [] + filename_summaries[filename].append(summary_data) + + # Display summaries grouped by filename + for filename, summaries in filename_summaries.items(): + with st.expander(f"{filename}"): + for summary_data in summaries: + with st.chat_message("ai"): + st.write(summary_data["content"]) + st.write( + f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`" + ) + total_price += summary_data["price"] + total_price_text.write(f"Total price: `${round(total_price, 6)}`") + + def download_summaries(self): + if "summaries" in st.session_state: + summaries = st.session_state["summaries"] + if not summaries: + return + 
st.download_button( + "πŸ“₯ Download summaries", + self._serialize_summaries(summaries), + "summaries.md", + mime="application/markdown", + ) + + def _serialize_summaries(self, summaries): + markdown = "" + + markdown_by_filename = {} + for summary in summaries: + filename = summary["filename"] + if filename not in markdown_by_filename: + markdown_by_filename[filename] = [] + markdown_by_filename[filename].append(summary["content"]) + + for filename, content in markdown_by_filename.items(): + markdown += f"# {filename}\n" + markdown += "\n\n".join(content) + markdown += "\n\n" + + return markdown diff --git a/SumGPT/app/page.py b/SumGPT/app/page.py new file mode 100644 index 0000000..f75f985 --- /dev/null +++ b/SumGPT/app/page.py @@ -0,0 +1,77 @@ +from typing import Dict, List, Optional + +import streamlit as st +from datamodel.llm_params import LLMParams + +from app.body_handler import BodyHandler +from app.sidebar_handler import SidebarHandler + + +class Page: + def __init__(self): + self.chunk_size: Optional[int] = None + self.role: Optional[str] = None + self.api_key: Optional[str] = None + self.llm_params: Optional[LLMParams] = None + self.config: Dict[str, str] = {} + + def draw_header(self, version): + st.title(f"πŸ“ SumGPT {version}") + st.markdown("##### Summarize your text with OpenAI's API") + st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)") + st.warning( + "Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo." + ) + + def draw_sidebar(self, manifest: Dict[str, str], models_data: List[Dict[str, str]]) -> None: + with st.sidebar: + sb = SidebarHandler() + sb.header() + sb.import_config() + self.api_key = sb.api_key_entry() + with st.expander("πŸ€– Role settings"): + self.role = sb.role_settings_panel() + with st.expander("βš™οΈ Configuration"): + self.llm_params, self.chunk_size = sb.config_control_panel(models_data) + + cols = st.columns([1, 1]) + with cols[0]: + sb.delete_cookie() + with cols[1]: + sb.export_config() + sb.footer(manifest) + self.config = sb.get_config() + + def draw_body(self) -> None: + if not self.chunk_size: + st.error("❌ Please set the chunk size in the sidebar.") + return + if not self.llm_params: + st.error("❌ Please set the model in the sidebar.") + return + if not self.role: + st.error("❌ Please set the role in the sidebar.") + return + + body = BodyHandler() + texts = body.file_uploader(["txt", "md"]) + + total_chunks = [] + filenames = [] + + for idx, text in enumerate(texts): + filename = text["filename"] + filenames.append(filename) + chunks, total_token_size = body.segment_text( + text["text"], self.chunk_size, self.llm_params.model.name, idx + ) + with st.expander(f"`{filename}` **(chunks: {len(chunks)})**"): + for chunk in chunks: + chunk.set_filename_from_list(filenames) + st.write([chunk.to_dict() for chunk in chunks]) + st.write(f"Tokens: `{total_token_size}`") + + total_chunks.extend(chunks) + + body.agenerate(total_chunks, self.llm_params, self.role, self.api_key, self.config) + body.download_summaries() diff --git a/SumGPT/app/sidebar_handler.py b/SumGPT/app/sidebar_handler.py new file mode 100644 index 0000000..b7493ef --- /dev/null +++ b/SumGPT/app/sidebar_handler.py @@ -0,0 +1,152 @@ +import json +from typing import Any, Dict, List, Tuple + +import streamlit as st +import utils.helpers as helpers +from core.crypto import Crypto +from datamodel.llm_params import LLMModel, LLMParams +from streamlit_cookies_controller import CookieController + + +class SidebarHandler: + def 
__init__(self): + self.cookie_controller = CookieController() + self.crypto: Crypto = st.session_state["crypto"] + self.config = {} + if self.config == {}: + self._set_config_from_cookie() + + self.chunk_size = None + + def get_config(self) -> Dict[str, Any]: + return self.config + + def _set_config_from_cookie(self): + config_binary = self.cookie_controller.get("config") + if config_binary: + try: + self.config = json.loads(self.crypto.decrypt_b64(config_binary)) + except TypeError: + self.config = {} + self.cookie_controller.remove("config") # Remove invalid cookie + + def header(self): + st.markdown("### How to use:") + st.markdown( + "1. πŸ”‘ Enter your [OpenAI API Key](https://beta.openai.com/account/api-keys)\n" + "2. πŸ“ Upload your file\n" + "3. πŸš€ Run" + ) + st.markdown("---") + + def api_key_entry(self) -> str | None: + api_key = st.text_input( + "πŸ”‘ OpenAI API key", type="password", value=self.config.get("api_key", "") + ) + self.config["api_key"] = api_key + return api_key + + def role_settings_panel(self, height=300) -> str: + language_list = ["English", "Chinese", "Japanese", "Spanish", "French", "German", "Italian"] + language = st.selectbox( + "Role language", + language_list, + language_list.index(self.config.get("role_language", "English")), + ) + role = st.text_area( + "Role settings", + self.config.get( + "role", + "Write a detailed summary in perfect $(LANGUAGE) that is concise, clear and coherent while capturing the main ideas of the text. " + "The summary should be well-structured and free of grammatical errors.\n\n" + "The summary is to be written in markdown format, with a heading (###) that encapsulates the core concept of the content. It should be concise and specific. Avoid generic headings like 'Summary' or 'Introduction'.", + ), + height=height, + ) + if role is None: + st.warning("Role settings are not set.") + st.stop() + + self.config["role_language"] = language + self.config["role"] = role + + role = role.replace("$(LANGUAGE)", language) + return role + + def config_control_panel(self, models_data: List[Dict[str, str]]) -> Tuple[LLMParams, int]: + model_names = helpers.extract_values(models_data, "model") + model_name = st.selectbox("Model", model_names, self.config.get("model_index", 0)) + model = LLMModel.construct_from_dict(self._get_model_dict(models_data, model_name)) + self.config["model"] = model_name + + _param = self._construct_param(models_data, model_name) + + chunk_size = st.number_input( + "Chunk size (tokens)", + 32, + _param["context_window"], + self.config.get("chunk_size", 2048), + step=1024, + ) + self.config["chunk_size"] = chunk_size + + max_tokens: int = st.number_input( + "Max output (tokens)", + 32, + _param["max_output_tokens"], + self.config.get("max_tokens", 512), + ) + self.config["max_tokens"] = max_tokens + + temperature: float = st.slider("Temperature", 0.0, 1.0, self.config.get("temperature", 0.7)) + self.config["temperature"] = temperature + + return ( + LLMParams( + model=model, + max_tokens=max_tokens, + temperature=temperature, + ), + chunk_size, + ) + + def _get_model_dict(self, models_data, selected_model) -> Dict[str, Any]: + model_index = helpers.extract_dict_index(models_data, "model", selected_model) + return models_data[model_index] + + def _construct_param(self, models_data, selected_model): + model_dict = self._get_model_dict(models_data, selected_model) + param = { + "max_output_tokens": model_dict["max_output_tokens"], + "context_window": model_dict["context_window"], + } + return param + + def
import_config(self): + config_file = st.file_uploader("πŸ“ Import Config", type=["json"]) + if config_file: + config = json.load(config_file) + self.config = config + self.cookie_controller.set("config", self.crypto.encrypt_b64(json.dumps(config))) + + def export_config(self): + st.download_button( + "Export Config", + data=json.dumps(self.config, indent=2), + file_name="sumgpt_config.json", + ) + + def delete_cookie(self): + if st.button("Delete cookie"): + self.cookie_controller.remove("config") + self.config = {} + st.rerun() + + def footer(self, data: Dict[str, Any]): + st.markdown("---") + st.markdown("### SumGPT") + st.markdown(f"Version: `{data.get('version')}`") + st.markdown(f"Author: {data.get('author')}") + st.markdown(f"[Report a bug]({data['bugs']['url']})") + st.markdown(f"[GitHub repo]({data['repository']['url']})") + st.markdown(f"License: [{data['license']['type']}]({data['license']['url']})") diff --git a/SumGPT/core/__init__.py b/SumGPT/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/SumGPT/core/crypto.py b/SumGPT/core/crypto.py new file mode 100644 index 0000000..0f8ecd4 --- /dev/null +++ b/SumGPT/core/crypto.py @@ -0,0 +1,67 @@ +import base64 +import os +from typing import Optional + +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +from cryptography.hazmat.primitives.kdf.scrypt import Scrypt + + +class Crypto: + def __init__(self, password: str, salt: Optional[bytes] = None): + """Initialize Crypto class with password and optional salt.""" + self.password = password + self.salt = salt if salt else os.urandom(16) # Generate salt if not provided + self.key = self._generate_key() + + def _generate_key(self) -> bytes: + """Generate a symmetric key from the password using Scrypt KDF.""" + kdf = Scrypt( + salt=self.salt, + length=32, + n=2**14, # CPU/memory cost factor + r=8, # Block size + p=1, # Parallelization factor + backend=default_backend(), + ) + key = kdf.derive(self.password.encode()) + return key + + def encrypt(self, data: str) -> bytes: + """Encrypt data using AES GCM mode (authenticated encryption).""" + nonce = os.urandom(12) # 12-byte nonce for GCM mode + cipher = Cipher(algorithms.AES(self.key), modes.GCM(nonce), backend=default_backend()) + encryptor = cipher.encryptor() + + ciphertext = encryptor.update(data.encode()) + encryptor.finalize() + # Prepend the salt to the nonce, tag, and ciphertext for storage + return self.salt + nonce + encryptor.tag + ciphertext + + def encrypt_b64(self, data: str) -> str: + """Encrypt data and return as base64 encoded string.""" + encrypted_data = self.encrypt(data) + return base64.b64encode(encrypted_data).decode("utf-8") + + def decrypt(self, encrypted_data: bytes) -> str: + """Decrypt data encrypted with AES GCM mode.""" + # Extract the salt, nonce, tag, and ciphertext + salt = encrypted_data[:16] # First 16 bytes are the salt + nonce = encrypted_data[16:28] # Next 12 bytes are the nonce + tag = encrypted_data[28:44] # Next 16 bytes are the GCM tag + ciphertext = encrypted_data[44:] # Rest is the ciphertext + + # Regenerate the key with the extracted salt + kdf = Scrypt(salt=salt, length=32, n=2**14, r=8, p=1, backend=default_backend()) + key = kdf.derive(self.password.encode()) + + # Initialize the cipher for decryption + cipher = Cipher(algorithms.AES(key), modes.GCM(nonce, tag), backend=default_backend()) + decryptor = cipher.decryptor() + + decrypted_data = decryptor.update(ciphertext) + 
decryptor.finalize() + return decrypted_data.decode() + + def decrypt_b64(self, encrypted_data: str) -> str: + """Decrypt base64 encoded data.""" + decoded_bytes = base64.b64decode(encrypted_data) + return self.decrypt(decoded_bytes) diff --git a/SumGPT/core/llm.py b/SumGPT/core/llm.py new file mode 100644 index 0000000..06b7421 --- /dev/null +++ b/SumGPT/core/llm.py @@ -0,0 +1,51 @@ +from datamodel.llm_params import LLMParams +from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage +from langchain_openai import ChatOpenAI +from pydantic.types import SecretStr + + +class LLM: + def __init__(self, api_key: str, gpt_params: LLMParams): + self.api_key: str = api_key + self.llm_params: LLMParams = gpt_params + self.model: ChatOpenAI = self._set_llm() + + def _set_llm(self) -> ChatOpenAI: + return ChatOpenAI( + api_key=SecretStr(self.api_key), + model=self.llm_params.model.name, + max_tokens=self.llm_params.max_tokens, + temperature=self.llm_params.temperature, + ) + + def generate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return _self.model.invoke(messages) + + async def agenerate(_self, prompt: str, system: str = "") -> BaseMessage: + messages = [ + SystemMessage(content=system), + HumanMessage(content=prompt), + ] + return await _self.model.ainvoke(messages) + + def Calc_price( + self, + input_tokens: int, + output_tokens: int, + cached_tokens: int = 0, + scale_factor: int = 1000000, + ) -> float: + pricing = self.llm_params.model.pricing + if cached_tokens != 0 and pricing.cached is not None: + input_tokens -= cached_tokens + return ( + input_tokens * pricing.input + + output_tokens * pricing.output + + cached_tokens * pricing.cached + ) / scale_factor + + return (input_tokens * pricing.input + output_tokens * pricing.output) / scale_factor diff --git a/SumGPT/core/tokenizer.py b/SumGPT/core/tokenizer.py new file mode 100644 index 0000000..d0e7d12 --- /dev/null +++ b/SumGPT/core/tokenizer.py @@ -0,0 +1,20 @@ +from typing import List + +import tiktoken + + +class Tokenizer: + def __init__(self, model: str): + self.tokenizer = tiktoken.encoding_for_model(model) + + def tokenize(self, text: str) -> List[int]: + return self.tokenizer.encode(text) + + def detokenize(self, tokens: List[int]) -> str: + return self.tokenizer.decode(tokens) + + def detokenize_single(self, tokens: List[int]) -> List[str]: + results = [] + for token in tokens: + results.append(self.tokenizer.decode_single_token_bytes(token).decode("utf-8")) + return results diff --git a/SumGPT/datamodel/chunk.py b/SumGPT/datamodel/chunk.py new file mode 100644 index 0000000..64666d2 --- /dev/null +++ b/SumGPT/datamodel/chunk.py @@ -0,0 +1,22 @@ +class Chunk: + def __init__(self, id: int, content: str, tokens: int, input_id: int): + self.id = id + self.content = content + self.tokens = tokens + self.input_id = input_id + self.filename = None + + def __str__(self) -> str: + return f"Chunk(content={self.content}, tokens={self.tokens}, input_id={self.input_id})" + + def set_filename_from_list(self, filenames: list[str]) -> str: + self.filename = filenames[self.input_id] + return self.filename + + def to_dict(self) -> dict: + return { + "id": self.id, + "content": self.content, + "tokens": self.tokens, + "input_id": self.input_id, + } diff --git a/SumGPT/datamodel/llm_model.py b/SumGPT/datamodel/llm_model.py new file mode 100644 index 0000000..3c714c1 --- /dev/null +++ b/SumGPT/datamodel/llm_model.py @@ -0,0 +1,31 @@ +from 
typing import Optional + + +class LLMModelPricing: + def __init__(self, input: int, output: int, cached: Optional[int] = None): + self.input = input + self.output = output + self.cached = cached + + +class LLMModel: + def __init__( + self, name: str, context_window: int, max_output_tokens: int, pricing: LLMModelPricing + ): + self.name = name + self.context_window = context_window + self.max_output_tokens = max_output_tokens + self.pricing = pricing + + @staticmethod + def construct_from_dict(data: dict) -> "LLMModel": + pricing = LLMModelPricing(data["pricing"]["input"], data["pricing"]["output"]) + if "cached" in data["pricing"]: + pricing.cached = data["pricing"]["cached"] + + return LLMModel( + name=data["model"], + context_window=data["context_window"], + max_output_tokens=data["max_output_tokens"], + pricing=pricing, + ) diff --git a/SumGPT/datamodel/llm_params.py b/SumGPT/datamodel/llm_params.py new file mode 100644 index 0000000..9de6306 --- /dev/null +++ b/SumGPT/datamodel/llm_params.py @@ -0,0 +1,13 @@ +from datamodel.llm_model import LLMModel, LLMModelPricing # noqa: F401 + + +class LLMParams: + def __init__( + self, + model: LLMModel, + max_tokens=2048, + temperature=0.7, + ): + self.model: LLMModel = model + self.max_tokens: int = max_tokens + self.temperature: float = temperature diff --git a/SumGPT/main.py b/SumGPT/main.py new file mode 100644 index 0000000..70b69c3 --- /dev/null +++ b/SumGPT/main.py @@ -0,0 +1,28 @@ +import streamlit as st +from app.page import Page +from core.crypto import Crypto +from utils import io + + +def init(): + st.set_page_config("SumGPT", "πŸ“", "wide") + + if "summaries" not in st.session_state: + st.session_state["summaries"] = [] + if "crypto" not in st.session_state: + st.session_state["crypto"] = Crypto(st.secrets["crypto_key"]) + + +def main(): + manifest = io.read_json_file("SumGPT/manifest.json") + models = io.read_json_file("SumGPT/models.json") + + pg = Page() + pg.draw_header(manifest["version"]) + pg.draw_sidebar(manifest, models) + pg.draw_body() + + +if __name__ == "__main__": + init() + main() diff --git a/SumGPT/manifest.json b/SumGPT/manifest.json new file mode 100644 index 0000000..7a33f79 --- /dev/null +++ b/SumGPT/manifest.json @@ -0,0 +1,17 @@ +{ + "name": "SumGPT", + "version": "2.0.0", + "license": { + "type": "MIT", + "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE" + }, + "author": "Zeke Zhang", + "homepage": "https://github.com/sean1832/SumGPT", + "repository": { + "type": "git", + "url": "https://github.com/sean1832/SumGPT" + }, + "bugs": { + "url": "https://github.com/sean1832/SumGPT/issues" + } + } \ No newline at end of file diff --git a/SumGPT/models.json b/SumGPT/models.json new file mode 100644 index 0000000..9a3cd16 --- /dev/null +++ b/SumGPT/models.json @@ -0,0 +1,40 @@ +[ + { + "model": "gpt-4o-mini", + "context_window": 128000, + "max_output_tokens": 16384, + "pricing": { + "input": 0.15, + "output": 0.6, + "cached": 0.075 + } + }, + { + "model": "gpt-4o", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 2.5, + "output": 10, + "cached": 1.25 + } + }, + { + "model": "gpt-4-turbo", + "context_window": 128000, + "max_output_tokens": 4096, + "pricing": { + "input": 10, + "output": 30 + } + }, + { + "model": "gpt-3.5-turbo", + "context_window": 16385, + "max_output_tokens": 4096, + "pricing": { + "input": 0.5, + "output": 1.5 + } + } +] \ No newline at end of file diff --git a/SumGPT/prompt.json b/SumGPT/prompt.json new file mode 100644 index 0000000..f76ec60 --- 
/dev/null +++ b/SumGPT/prompt.json @@ -0,0 +1,22 @@ +[ + { + "type": "recursive", + "legacy": false, + "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + }, + { + "type": "final", + "legacy": false, + "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information. Structured markdown summary with headings in perfect [LANGUAGE] (####): ", + "variables": [ + { + "name": "[LANGUAGE]" + } + ] + } +] \ No newline at end of file diff --git a/SumGPT/utils/__init__.py b/SumGPT/utils/__init__.py new file mode 100644 index 0000000..7979e22 --- /dev/null +++ b/SumGPT/utils/__init__.py @@ -0,0 +1,4 @@ +import utils.helpers as helpers +import utils.io as io + +__all__ = ["helpers", "io"] diff --git a/SumGPT/utils/helpers.py b/SumGPT/utils/helpers.py new file mode 100644 index 0000000..25afa4f --- /dev/null +++ b/SumGPT/utils/helpers.py @@ -0,0 +1,35 @@ +def extract_values(dicts, key, parent_key=None): + """ + Extracts values from a list of dictionaries based on a specified key. + If the key is nested, a parent key can be specified. + + :param dicts: List of dictionaries to query + :param key: The key for which values are to be extracted + :param parent_key: Optional parent key if the key is nested within another dictionary + :return: List of values corresponding to the specified key + """ + values = [] + for dict in dicts: + if parent_key: + # Access the nested dictionary and then the key if parent_key is specified + if parent_key in dict and key in dict[parent_key]: + values.append(dict[parent_key][key]) + else: + # Access the key directly if there is no parent_key + if key in dict: + values.append(dict[key]) + return values + +def extract_dict_index(dicts, key, value): + """ + Extracts the index of a dictionary in a list of dictionaries based on a specified key-value pair. 
+ + :param dicts: List of dictionaries to query + :param key: The key to search for + :param value: The value to search for + :return: Index of the dictionary containing the specified key-value pair + """ + for i, dict in enumerate(dicts): + if key in dict and dict[key] == value: + return i + return None \ No newline at end of file diff --git a/SumGPT/utils/io.py b/SumGPT/utils/io.py new file mode 100644 index 0000000..fef6d69 --- /dev/null +++ b/SumGPT/utils/io.py @@ -0,0 +1,17 @@ +import json +from io import StringIO + + +def read_json_file(file): + with open(file, "r") as f: + return json.load(f) + + +def write_json_file(file, data: dict): + with open(file, "w") as f: + json.dump(data, f, indent=4) + + +def read_to_string(file): + stringio = StringIO(file.getvalue().decode("utf-8")) + return stringio.read() diff --git a/requirements.txt b/requirements.txt index 9770e95..85630dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ docx==0.2.4 -python_docx==0.8.11 -langchain==0.0.123 -langdetect==1.0.9 -numpy==1.24.2 -openai==0.27.2 -pydub==0.25.1 +python_docx==1.1.2 PyPDF4==1.27.0 -pytube==12.1.3 -streamlit==1.20.0 -streamlit_toggle_switch==1.0.2 -tiktoken==0.3.1 -requests==2.29.0 -youtube_transcript_api==0.6.0 +tiktoken==0.8.0 +# crypto +cryptography==43.0.3 + +# langchain +langchain==0.3.4 +langchain-openai==0.2.3 + +# streamlit +streamlit==1.39.0 +streamlit-cookies-controller==0.0.4 diff --git a/resources/prompt.json b/resources/prompt.json deleted file mode 100644 index 3be18b8..0000000 --- a/resources/prompt.json +++ /dev/null @@ -1,42 +0,0 @@ -[ - { - "type": "recursive", - "legacy": true, - "prompt": "Provide a detailed and comprehensive summary of the following content in flawless [LANGUAGE], ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information of the content. Make sure it is answered in [LANGUAGE].", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "recursive", - "legacy": false, - "prompt": "Write a detailed and comprehensive explanation of the following in perfect [LANGUAGE] with no grammar issues, ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:\n\n{text}\n\nStructured markdown summary with heading (###) in fluent [LANGUAGE]:", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "final", - "legacy": true, - "prompt": "identify headings in the transcript and summarise them into five headings. Use #### headings in markdown. Under headings, summarise at least 3 key points and then provide detail explanation of the concept based on the following text in the way that can be read fluently, make sense and avoid repetition. Make sure to include all information. Write it in beautiful and structured markdown in perfect [LANGUAGE]. ", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - }, - { - "type": "final", - "legacy": false, - "prompt": "Write a detailed summary of the following in [LANGUAGE]:\n\n{text}\n\nIdentify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information. 
Structured markdown summary with headings in perfect [LANGUAGE] (####): ", - "variables": [ - { - "name": "[LANGUAGE]" - } - ] - } -] \ No newline at end of file diff --git a/src/Components/Info.py b/src/Components/Info.py deleted file mode 100644 index 72f324f..0000000 --- a/src/Components/Info.py +++ /dev/null @@ -1,18 +0,0 @@ -import streamlit as st -import Modules.file_io as file_io - - -def info(): - info_panel = st.container() - - manifest = 'src/manifest.json' - st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest) - - with info_panel: - st.markdown('---') - st.markdown(f"# {manifest_data['name']}") - st.markdown(f"Version: `{manifest_data['version']}`") - st.markdown(f"Author: {manifest_data['author']}") - st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})") - st.markdown(f"[GitHub repo]({manifest_data['homepage']})") - st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})") \ No newline at end of file diff --git a/src/Components/StreamlitSetup.py b/src/Components/StreamlitSetup.py deleted file mode 100644 index b57f3ce..0000000 --- a/src/Components/StreamlitSetup.py +++ /dev/null @@ -1,36 +0,0 @@ -import streamlit as st -import Data.caption_languages as data -import Modules.file_io as file_io - -def setup(): - st.set_page_config(page_title="SumGPT", page_icon="πŸ“", layout="wide") - - if not st.session_state.get('OPENAI_API_KEY'): - st.session_state['OPENAI_API_KEY'] = None - - if not st.session_state.get('OPENAI_PERSONA_REC'): - st.session_state['OPENAI_PERSONA_REC'] = None - - if not st.session_state.get('OPENAI_PERSONA_SUM'): - st.session_state['OPENAI_PERSONA_SUM'] = None - - if not st.session_state.get('CHUNK_SIZE'): - st.session_state['CHUNK_SIZE'] = None - - if not st.session_state.get('OPENAI_PARAMS'): - st.session_state['OPENAI_PARAMS'] = None - - if not st.session_state.get('DELAY'): - st.session_state['DELAY'] = 0 - - if not st.session_state.get('FINAL_SUMMARY_MODE'): - st.session_state['FINAL_SUMMARY_MODE'] = False - - if not st.session_state.get('CAPTION_LANGUAGES'): - st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages - - if not st.session_state.get('PREVIOUS_RESULTS'): - st.session_state['PREVIOUS_RESULTS'] = None - - if not st.session_state.get('MANIFEST'): - st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json") \ No newline at end of file diff --git a/src/Components/__init__.py b/src/Components/__init__.py deleted file mode 100644 index 9391db9..0000000 --- a/src/Components/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from Components import sidebar -from Components import StreamlitSetup -from Components import Info -__all__ = ['sidebar', 'StreamlitSetup', 'Info'] \ No newline at end of file diff --git a/src/Components/sidebar.py b/src/Components/sidebar.py deleted file mode 100644 index b5b3d41..0000000 --- a/src/Components/sidebar.py +++ /dev/null @@ -1,187 +0,0 @@ -import streamlit as st -import GPT -import Modules.file_io as file_io -from streamlit_toggle import st_toggle_switch -import Components -from typing import Any, Dict, List, Tuple, Union -import json - - -def set_openai_api_key(api_key: str): - st.session_state["OPENAI_API_KEY"] = api_key - - -def set_openai_persona(persona_rec: str, persona_sum: str): - st.session_state["OPENAI_PERSONA_REC"] = persona_rec - st.session_state["OPENAI_PERSONA_SUM"] = persona_sum - - -def set_param(params: GPT.param): - st.session_state["OPENAI_PARAMS"] = params - - -def set_chunk_size(size: int): - 
st.session_state['CHUNK_SIZE'] = size - - -def set_delay(time: int): - st.session_state['DELAY'] = time - - -def set_final_summary_mode(mode: bool): - st.session_state['FINAL_SUMMARY_MODE'] = mode - - -def _set_config(config_file, key: str, default_value): - if config_file: - return file_io.read_json_upload(config_file, key) - else: - return default_value - -def _set_language(language: str): - st.session_state['OUTPUT_LANGUAGE'] = language - -def _set_legacy(enable: bool): - st.session_state['LEGACY'] = enable -def _legacy(enable: bool, legacy, experimental): - if not enable: - return experimental - else: - return legacy -def _extract_prompt(json_data: List[Dict[str,Union[bool, str]]], target_type: str, target_legacy: bool, language: str = "English") -> str | None: - for item in json_data: - if item["type"] == target_type and item["legacy"] == target_legacy: - prompt = item["prompt"] - new_prompt = prompt.replace("[LANGUAGE]", language) - return new_prompt - return None - -def sidebar(): - with st.sidebar: - st.markdown("## How to use\n" - "1. πŸ”‘ Enter your [OpenAI API key](https://beta.openai.com/account/api-keys)\n" - "2. πŸ“ upload your file\n" - "3. πŸƒ Run\n" - "---") - - config_file = st.file_uploader("πŸ“ Import Configs", type=['json']) - - api_input = st.text_input(label="πŸ”‘ OpenAI API Key", - placeholder="Enter your OpenAI API key (sk-...)", - type="password", - help="You can get your API key from https://beta.openai.com/account/api-keys", - value=_set_config(config_file, "OPENAI_API_KEY", "")) - - enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False)) - enable_final_summary = st_toggle_switch(label="Enable Final Summary", - default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False)) - if enable_final_summary: - set_final_summary_mode(True) - if st.session_state['FINAL_SUMMARY_MODE'] != enable_final_summary: - set_final_summary_mode(enable_final_summary) - - with st.expander('πŸ€– Bot Persona'): - language_options = ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German'] - language_index = language_options.index(_set_config(config_file, "LANGUAGE", 'English')) - language = st.selectbox('Language', options=language_options, index=language_index) - _set_language(language) - - prompts = file_io.read_json("resources/prompt.json") - - persona_rec_legacy = _extract_prompt(prompts, "recursive", True, language) - persona_rec = _extract_prompt(prompts, "recursive", False, language) - persona_rec = st.text_area('Bot Persona Recursive', - value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, persona_rec_legacy, persona_rec)), - help='System message is a pre-defined message used to instruct the assistant at the ' - 'beginning of a conversation. iterating and ' - 'experimenting with potential improvements can help to generate better outputs.' - 'Make sure to use casual language.', - height=250) - if enable_final_summary: - persona_sum_legacy = _extract_prompt(prompts, "final", True, language) - persona_sum = _extract_prompt(prompts, "final", False, language) - - persona_sum = st.text_area('Bot Persona Total Sum', - value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, persona_sum_legacy, persona_sum)), - help='This is a pre-defined message for total summarization that is used to' - 'instruct the assistant at the beginning of a conversation. 
', - height=300) - else: - persona_sum = "" - - with st.expander('πŸ”₯ Advanced Options'): - model_options = ['gpt-3.5-turbo','gpt-3.5-turbo-16k', 'gpt-4'] - model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo')) - model = st.selectbox("Model", options=model_options, index=model_index) - - if model == 'gpt-4': - max_chunk = 4000 - elif model == 'gpt-3.5-turbo-16k': - max_chunk = 16000 - else: - max_chunk = 2500 - chunk_size = st.slider('Chunk Size (word count)', min_value=0, max_value=max_chunk, step=20, - value=_set_config(config_file, "CHUNK_SIZE", 800)) - max_tokens_rec = st.slider('Max Tokens - Recursive Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_REC", 250)) - if enable_final_summary: - max_tokens_final = st.slider('Max Tokens - Final Summary', min_value=0, max_value=4090, step=20, - value=_set_config(config_file, "MAX_TOKENS_FINAL", 650)) - else: - max_tokens_final = 0 - temperature = st.slider('Temperature', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TEMPERATURE", 0.7)) - top_p = st.slider('Top P', min_value=0.0, max_value=1.0, step=0.05, - value=_set_config(config_file, "TOP_P", 1.0)) - frequency_penalty = st.slider('Frequency Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "FREQUENCY_PENALTY", 0.0)) - presence_penalty = st.slider('Presence Penalty', min_value=0.0, max_value=2.0, step=0.1, - value=_set_config(config_file, "PRESENCE_PENALTY", 0.0)) - if st_toggle_switch(label="Delay (free openAI API user)", - default_value=_set_config(config_file, "ENABLE_DELAY", False)): - delay = st.slider('Delay (seconds)', min_value=0, max_value=60, step=1, - value=_set_config(config_file, "DELAY_TIME", 1)) - else: - delay = 0 - param = GPT.param.gpt_param( - model=model, - max_tokens_final=max_tokens_final, - max_tokens_rec=max_tokens_rec, - temperature=temperature, - top_p=top_p, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty - ) - - st.download_button(label="πŸ“₯ Export Configs", - data=json.dumps({ - "OPENAI_API_KEY": api_input, - "FINAL_SUMMARY_MODE": enable_final_summary, - "OPENAI_PERSONA_REC": persona_rec, - "OPENAI_PERSONA_SUM": persona_sum, - "CHUNK_SIZE": chunk_size, - "MAX_TOKENS_REC": max_tokens_rec, - "MAX_TOKENS_FINAL": max_tokens_final, - "TEMPERATURE": temperature, - "TOP_P": top_p, - "FREQUENCY_PENALTY": frequency_penalty, - "PRESENCE_PENALTY": presence_penalty, - "MODEL": model, - "ENABLE_DELAY": delay > 0, - "DELAY_TIME": delay, - "LANGUAGE": language, - "LEGACY": enable_legacy - }, indent=4), - file_name="configs.json") - Components.Info.info() - - if api_input: - set_openai_api_key(api_input) - - if persona_rec: - set_openai_persona(persona_rec, persona_sum) - - set_chunk_size(chunk_size) - set_param(param) - set_delay(delay) - _set_legacy(enable_legacy) \ No newline at end of file diff --git a/src/Data/__init__.py b/src/Data/__init__.py deleted file mode 100644 index 4de9124..0000000 --- a/src/Data/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Data import caption_languages - -__all__ = ['caption_languages'] \ No newline at end of file diff --git a/src/Data/caption_languages.py b/src/Data/caption_languages.py deleted file mode 100644 index acec65e..0000000 --- a/src/Data/caption_languages.py +++ /dev/null @@ -1,6 +0,0 @@ -languages = [ - 'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th' -] - -auto_languages = ['a.' 
+ _language for _language in languages] - diff --git a/src/GPT/__init__.py b/src/GPT/__init__.py deleted file mode 100644 index 0bcd76d..0000000 --- a/src/GPT/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from GPT import misc -from GPT import embeddings -from GPT import bot -from GPT import param -from GPT import generate - -__all__ = ['misc', 'embeddings', 'bot', 'param', 'generate'] \ No newline at end of file diff --git a/src/GPT/bot.py b/src/GPT/bot.py deleted file mode 100644 index ce8dd86..0000000 --- a/src/GPT/bot.py +++ /dev/null @@ -1,48 +0,0 @@ -import openai -from typing import Any, Dict, List, Tuple, Union - - -class OpenAIChatBot: - """A class to interact with the OpenAI API.""" - - def __init__(self, api_key: str, persona: str, model: str, max_tokens: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - openai.api_key = api_key - self.persona = persona - self.model = model - self.max_tokens = max_tokens - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty - - def chat_stream(self, prompt: str) -> openai.api_resources.chat_completion.ChatCompletion: - """Returns the streamed response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - stream=True, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": prompt} - ]) - return completions - - def chat(self, prompt: str) -> Tuple[str, str]: - """Returns the response from the OpenAI API.""" - completions = openai.ChatCompletion.create( - model=self.model, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, - frequency_penalty=self.frequency_penalty, - presence_penalty=self.presence_penalty, - messages=[ - {"role": "system", "content": self.persona}, - {"role": "user", "content": f"{self.persona} '{prompt}'"} - ]) - return completions['choices'][0]['message']['content'], completions['choices'][0]['finish_reason'] diff --git a/src/GPT/embeddings.py b/src/GPT/embeddings.py deleted file mode 100644 index 3e6cb50..0000000 --- a/src/GPT/embeddings.py +++ /dev/null @@ -1,12 +0,0 @@ -import openai - - -class openAIEmbeddings: - def __init__(self, api_key: str): - openai.api_key = api_key - - def embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> float: - """Returns the embedding vector of a string.""" - response = openai.Embedding.create(input=content, engine=engine) - vector = response['data'][0]['embedding'] - return vector diff --git a/src/GPT/generate.py b/src/GPT/generate.py deleted file mode 100644 index 627faf6..0000000 --- a/src/GPT/generate.py +++ /dev/null @@ -1,52 +0,0 @@ -import GPT.bot -import streamlit as st -import GPT.param -from typing import Any, Dict, List, Tuple, Union - - -def get_answer_stream(content: str): - """Returns a stream of responses from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - previous_char = '' - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - st.session_state["OPENAI_PERSONA"], - params.model, - params.max_tokens_rec, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - responses = bot.chat_stream(content) - response_panel = st.empty() - for response_json in responses: - choice = response_json['choices'][0] 
- if choice['finish_reason'] == 'stop': - break - - # error handling - if choice['finish_reason'] == 'length': - st.warning('⚠️Result cut off due to length. Consider increasing the max tokens parameter.') - break - - delta = choice['delta'] - if 'role' in delta or delta == {}: - char = '' - else: - char = delta['content'] - answer = previous_char + char - response_panel.info(answer) - - -def get_answer(content: str, max_tokens, persona: str) -> Tuple[str, str]: - """Returns a response from the OpenAI API.""" - params = st.session_state["OPENAI_PARAMS"] - bot = GPT.bot.OpenAIChatBot(st.session_state["OPENAI_API_KEY"], - persona, - params.model, - max_tokens, - params.temperature, - params.top_p, - params.frequency_penalty, - params.presence_penalty) - response, finish_reason = bot.chat(content) - return response, finish_reason diff --git a/src/GPT/misc.py b/src/GPT/misc.py deleted file mode 100644 index b93481c..0000000 --- a/src/GPT/misc.py +++ /dev/null @@ -1,98 +0,0 @@ -import openai -from langchain.llms import OpenAI -import os -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -def validate_api_key(api_key: str) -> bool: - """Validates the OpenAI API key by trying to create a completion.""" - openai.api_key = api_key - try: - openai.ChatCompletion.create( - model="gpt-3.5-turbo", - max_tokens=1, - messages=[ - {"role": "user", "content": "Hello!"} - ] - ) - return True - except openai.error.AuthenticationError: - return False - - -def predict_token(param, chunks) -> Dict[str, int]: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - prompt_token_total = 0 - completion_token_total = 0 - for chunk in chunks: - prompt_token = llm.get_num_tokens(chunk['content']) - prompt_token_total += prompt_token - completion_token_total += param.max_tokens_rec - - if st.session_state['FINAL_SUMMARY_MODE']: - completion_token_total += param.max_tokens_final - total_token = prompt_token_total + completion_token_total - token = {'total': total_token, - 'prompt': prompt_token_total, - 'completion': completion_token_total} - - return token - else: - return {'total': 0, 'prompt': 0, 'completion': 0} - - -def predict_token_single(chunk: Dict[str, Union[str, float]] | str, max_tokens: int = None) -> int: - """predict how many tokens to generate.""" - if st.session_state["OPENAI_API_KEY"] is not None: - os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] - llm = OpenAI() - if isinstance(chunk, str): - chunk_content = chunk - else: - chunk_content = chunk['content'] - chunk_token = llm.get_num_tokens(chunk_content) - if max_tokens is not None: - chunk_token += max_tokens - - return chunk_token - else: - return 0 - - -def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[bool, str]]: - """Checks if the number of tokens used has exceeded the limit.""" - - # check recursive chunks tokens - rec_chunks_token = [] - for chunk in chunks: - chunk_token = predict_token_single(chunk, param.max_tokens_rec) - rec_chunks_token.append(chunk_token) - - - # check final chunks tokens - final_prompt_token = len(chunks) * param.max_tokens_rec - final_completion_token = param.max_tokens_final - final_chunks_token = final_prompt_token + final_completion_token - - # evaluate - if max(rec_chunks_token) > max_token: - return {'exceeded': True, - 'reason': 'recursive', - 'message': f"**[ Recursive summary ]** tokens exceeded. 
Max tokens allowed: {max_token}. Tokens used: {max(rec_chunks_token)}\n" - f"(Prompt: {max(rec_chunks_token) - param.max_tokens_rec}, " - f"Completion: {param.max_tokens_rec})"} - - elif final_chunks_token > max_token and st.session_state['FINAL_SUMMARY_MODE']: - return {'exceeded': True, - 'reason': 'final', - 'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n" - f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"} - - else: - return {'exceeded': False, - 'reason': '', - 'message': ''} diff --git a/src/GPT/param.py b/src/GPT/param.py deleted file mode 100644 index 866f112..0000000 --- a/src/GPT/param.py +++ /dev/null @@ -1,11 +0,0 @@ - -class gpt_param: - def __init__(self, model: str, max_tokens_final: int, max_tokens_rec: int, temperature: float, top_p: float, - frequency_penalty: float, presence_penalty: float): - self.model = model - self.max_tokens_rec = max_tokens_rec - self.max_tokens_final = max_tokens_final - self.temperature = temperature - self.top_p = top_p - self.frequency_penalty = frequency_penalty - self.presence_penalty = presence_penalty diff --git a/src/Modules/Youtube.py b/src/Modules/Youtube.py deleted file mode 100644 index f399cb3..0000000 --- a/src/Modules/Youtube.py +++ /dev/null @@ -1,97 +0,0 @@ -import requests -import re -from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound -import streamlit as st -from typing import Any, Dict, List, Tuple, Union - - -manifest = st.session_state["MANIFEST"] -def _error_report_msg(youtube_url): - return f"Please create an issue on [GitHub]({manifest['bugs']['url']}). " \ - f"Please include the YouTube URL ({youtube_url}), version number ({manifest['version']}) " \ - f"and all necessary information to replicate the error. " \ - f"**Before creating a new issue, please check if the problem has already been reported.**" - -def _extract_video_id_from_url(url): - video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/)([^?&"\'>]+)' - match = re.search(video_id_pattern, url) - if match: - return match.group(1) - else: - raise ValueError("Invalid YouTube URL") - -def get_video_title(youtube_url): - video_id = _extract_video_id_from_url(youtube_url) - url = f'https://www.youtube.com/watch?v={video_id}' - response = requests.get(url) - title_pattern = r'(.+?) 
- YouTube<\/title>' - match = re.search(title_pattern, response.text) - if match: - title = match.group(1) - return title - else: - return None - -def get_available_subtitle_languages(video_id): - try: - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) - languages = [transcript.language_code for transcript in transcript_list] - return languages - except Exception as e: - print(f"Error fetching available subtitle languages: {e}") - return [] - -def get_video_captions(youtube_url, languages): - video_id = _extract_video_id_from_url(youtube_url) - simplified_url = f'https://www.youtube.com/watch?v={video_id}' - - available_language = get_available_subtitle_languages(video_id) - - if not any(lang in languages for lang in available_language) and available_language != []: - print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.") - st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url)) - st.stop() - - for language in languages: - try: - transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language]) - captions = "" - for item in transcript: - captions += item['text'] + "\n" - return captions - - except NoTranscriptFound as e: - if language == languages[-1]: - print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}") - st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n' - f'languages = {available_language}\n\n' - f'language list = {languages}\n\n' - + _error_report_msg(simplified_url)) - st.stop() - else: - continue - - except TranscriptsDisabled: - print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}") - st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---' - f'\n**Instruction:**\n\n' - f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n' - f"2. If you are confident that subtitles are available in the video but could not be retrieved, " - + _error_report_msg(simplified_url)) - st.stop() - raise TranscriptsDisabled - - except Exception as e: - print(e) - st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. 
\n\n' - f'{_error_report_msg(simplified_url)}' - f'\n\nError: \n\n---\n\n{e}') - st.stop() - break - -@st.cache_data(show_spinner=False) -def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]: - """Extracts the transcript from a YouTube video.""" - transcript = get_video_captions(url, lang_code) - title = get_video_title(url) - return transcript, title diff --git a/src/Modules/__init__.py b/src/Modules/__init__.py deleted file mode 100644 index 412ace4..0000000 --- a/src/Modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from Modules import file_io - -__all__ = ['file_io'] \ No newline at end of file diff --git a/src/Modules/file_io.py b/src/Modules/file_io.py deleted file mode 100644 index 0f214e8..0000000 --- a/src/Modules/file_io.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -import PyPDF4 -import docx -from typing import Any, Dict, List, Tuple, Union -from pydub import AudioSegment -import math -import json -import streamlit as st - - - -@st.cache_data() -def read_json(file, key: str = None) -> Any: - """Reads a json file and returns the value of a key.""" - with open(file, "r") as f: - data = json.load(f) - if key and isinstance(data, dict): - return data[key] - elif key and isinstance(data, list): - return [d[key] for d in data] - else: - return data - - -@st.cache_data() -def read_json_upload(file, key: str) -> Any: - """Reads a json file and returns the value of a key.""" - if not isinstance(file, str): - f = file.getvalue().decode("utf-8") - data = json.loads(f) - return data[key] - - -@st.cache_data() -def read_txt(file, encoding: str = "utf-8") -> str: - """Reads a text file.""" - return file.read().decode(encoding) - - -@st.cache_data() -def read_pdf(file) -> List[str]: - """Reads a pdf file.""" - pdfReader = PyPDF4.PdfFileReader(file, strict=False) - texts = [] - for page in range(pdfReader.numPages): - text = pdfReader.getPage(page).extractText() - # Merge hyphenated words - text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text) - # Fix newlines in the middle of sentences - text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip()) - # Remove multiple newlines - text = re.sub(r"\n\s*\n", "\n\n", text) - - texts.append(text) - return texts - - -@st.cache_data() -def read_docx(file) -> str: - """Reads a docx file.""" - doc = docx.Document(file) - text = "" - for para in doc.paragraphs: - # Remove multiple newlines - t = re.sub(r"\n\s*\n", "\n\n", para.text) - text += t + "\n" - return text - - -@st.cache_data() -def _split_audio(audio, chunk_size=2) -> List[AudioSegment]: - """Split audio into chunks of 10 minutes.""" - # load audio - audio = AudioSegment.from_file(audio, format="mp3") - # Define the chunk size (10 minutes default) - chunk_size = chunk_size * 60 * 1000 - # calculate the number of chunks - num_chunks = math.ceil(len(audio) / chunk_size) - chunks = [] - # split audio into chunks - for i in range(num_chunks): - start = i * chunk_size - end = start + chunk_size - chunk = audio[start:end] - chunks.append(chunk) - return chunks - - -@st.cache_data() -def read(file) -> str | List[str]: - """Reads a file and returns the content.""" - if file.name.endswith(".txt") or file.name.endswith(".md"): - return read_txt(file) - elif file.name.endswith(".pdf"): - return read_pdf(file) - elif file.name.endswith(".docx"): - return read_docx(file) - else: - raise ValueError("File type not supported") diff --git a/src/SumGPT.py b/src/SumGPT.py deleted file mode 100644 index b2040ea..0000000 --- a/src/SumGPT.py +++ /dev/null @@ -1,152 +0,0 @@ -import 
diff --git a/src/SumGPT.py b/src/SumGPT.py
deleted file mode 100644
index b2040ea..0000000
--- a/src/SumGPT.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import asyncio
-import streamlit as st
-
-import Components.StreamlitSetup as StreamlitSetup
-StreamlitSetup.setup()
-
-import Modules.Youtube
-from Components.sidebar import sidebar
-import Modules.file_io as file_io
-import GPT
-import util
-import time
-
-app_header = st.container()
-
-file_handler = st.container()
-content_handler = st.container()
-result_handler = st.container()
-
-with app_header:
-    st.title("πŸ“ SumGPT")
-    st.markdown("##### Summarize your text with OpenAI's GPT-3.5 / GPT-4 API")
-    st.markdown("##### [GitHub repo](https://github.com/sean1832/SumGPT)")
-    st.warning("🚧️ This app is still in beta. Please [report any bugs](https://github.com/sean1832/SumGPT/issues) to the GitHub repo.")
-
-sidebar()
-
-with file_handler:
-    if st.button("πŸ”ƒ Refresh"):
-        st.cache_data.clear()
-    youtube_link_empty = st.empty()
-    upload_file_emtpy = st.empty()
-
-    youtube_link = youtube_link_empty.text_input(label="πŸ”— YouTube Link",
-                                                 placeholder="Enter your YouTube link",
-                                                 help="Enter your YouTube link to download the video and extract the audio")
-    upload_file = upload_file_emtpy.file_uploader("πŸ“ Upload your file", type=['txt', 'pdf', 'docx', 'md'])
-    if youtube_link:
-        upload_file_emtpy.empty()
-        with st.spinner("πŸ” Extracting transcript..."):
-            transcript, title = Modules.Youtube.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES'])
-            file_content = {'name': f"{title}.txt", 'content': transcript}
-    elif upload_file:
-        youtube_link_empty.empty()
-        with st.spinner("πŸ” Reading file... (mp3 file might take a while)"):
-            file_content = {'name': upload_file.name, 'content': file_io.read(upload_file)}
-    elif youtube_link and upload_file:
-        st.warning("Please only upload one file at a time")
-    else:
-        file_content = None
-
-with content_handler:
-    if file_content:
-        with st.expander("File Preview"):
-            if file_content['name'].endswith(".pdf"):
-                content = "\n\n".join(file_content['content'])
-                st.text_area(file_content['name'], content, height=200)
-            else:
-                content = file_content['content']
-                st.text_area(file_content['name'], content, height=200)
-
-with result_handler:
-    if file_content:
-        chunks = []
-        content = file_content['content']
-        if file_content['name'].endswith(".pdf"):
-            content = "\n\n".join(file_content['content'])
-        chunks.extend(util.convert_to_chunks(content, chunk_size=st.session_state['CHUNK_SIZE']))
-
-        with st.expander(f"Chunks ({len(chunks)})"):
-            for chunk in chunks:
-                st.write(chunk)
-
-        token_usage = GPT.misc.predict_token(st.session_state['OPENAI_PARAMS'], chunks)
-        param = st.session_state["OPENAI_PARAMS"]
-        prompt_token = token_usage['prompt']
-        completion_token = token_usage['completion']
-        if param.model == 'gpt-4':
-            price = round(prompt_token * 0.00003 + completion_token * 0.00006, 5)
-            st.markdown('**Note:** To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api)'
-                        " if you haven't already received an invitation from OpenAI.")
-            st.info("ℹ️️ Please keep in mind that GPT-4 is significantly **[more expensive](https://openai.com/pricing#language-models)** than GPT-3.5. ")
") - elif param.model == 'gpt-3.5-turbo-16k': - price = round(prompt_token * 0.000003 + completion_token *0.000004, 5) - else: - price = round(prompt_token * 0.0000015 + completion_token * 0.000002 , 5) - st.markdown( - f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`") - # max tokens exceeded warning - exceeded = util.exceeded_token_handler(param=st.session_state['OPENAI_PARAMS'], chunks=chunks) - - # load cached results - if st.session_state['PREVIOUS_RESULTS'] is not None: - rec_responses = st.session_state['PREVIOUS_RESULTS']['rec_responses'] - rec_id = st.session_state['PREVIOUS_RESULTS']['rec_ids'] - final_response = st.session_state['PREVIOUS_RESULTS']['final_response'] - finish_reason_rec = st.session_state['PREVIOUS_RESULTS']['finish_reason_rec'] - finish_reason_final = st.session_state['PREVIOUS_RESULTS']['finish_reason_final'] - else: - rec_responses = None - rec_id = None - final_response = None - finish_reason_rec = None - finish_reason_final = None - - # finish_reason_rec = None - if st.button("πŸš€ Run", disabled=exceeded): - start_time = time.time() - st.cache_data.clear() - API_KEY = st.session_state['OPENAI_API_KEY'] - if not API_KEY and not GPT.misc.validate_api_key(API_KEY): - st.error("❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys).") - else: - with st.spinner("Summarizing... (this might take a while)"): - if st.session_state['LEGACY']: - rec_max_token = st.session_state['OPENAI_PARAMS'].max_tokens_rec - rec_responses, finish_reason_rec = util.recursive_summarize(chunks, rec_max_token) - if st.session_state['FINAL_SUMMARY_MODE']: - final_response, finish_reason_final = util.summarize(rec_responses) - else: - final_response = None - else: - completions, final_response = asyncio.run(util.summarize_experimental_concurrently(content, st.session_state['CHUNK_SIZE'])) - rec_responses = [d["content"] for d in completions] - rec_ids = [d["chunk_id"] for d in completions] - # save previous completions - resp = {'rec_responses': rec_responses, - 'rec_ids': rec_ids, - 'final_response': final_response, - 'finish_reason_rec': finish_reason_rec, - 'finish_reason_final': finish_reason_final} - if resp != st.session_state['PREVIOUS_RESULTS']: - st.session_state['PREVIOUS_RESULTS'] = resp - - end_time = time.time() - st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`") - - if rec_responses is not None: - with st.expander("Recursive Summaries", expanded=not st.session_state['FINAL_SUMMARY_MODE']): - for i, response in enumerate(rec_responses): - st.info(f'{response}') - if finish_reason_rec == 'length': - st.warning('⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter.') - - if final_response is not None: - st.header("πŸ“Summary") - st.info(final_response) - if finish_reason_final == 'length': - st.warning( - '⚠️Result cut off due to length. 
diff --git a/src/manifest.json b/src/manifest.json
deleted file mode 100644
index 731522c..0000000
--- a/src/manifest.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "name": "SumGPT",
-  "version": "1.0.8",
-  "license": {
-    "type": "MIT",
-    "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
-  },
-  "author": "Zeke Zhang",
-  "homepage": "https://github.com/sean1832/SumGPT",
-  "repository": {
-    "type": "git",
-    "url": "https://github.com/sean1832/SumGPT"
-  },
-  "bugs": {
-    "url": "https://github.com/sean1832/SumGPT/issues"
-  }
-}
\ No newline at end of file
diff --git a/src/util.py b/src/util.py
deleted file mode 100644
index 6fd2618..0000000
--- a/src/util.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import os
-import asyncio
-
-import numpy as np
-from typing import Any, Dict, List, Tuple, Union
-
-from GPT.embeddings import openAIEmbeddings
-import streamlit as st
-import re
-import GPT
-import textwrap
-from langdetect import detect
-import time
-from datetime import datetime
-
-from langchain.chat_models import ChatOpenAI
-from langchain.docstore.document import Document
-from langchain.prompts import PromptTemplate
-from langchain.chains.summarize import load_summarize_chain
-from langchain.chains import LLMChain
-
-def _similarity(v1, v2) -> np.ndarray:
-    """Returns the cosine similarity between two vectors."""
-    return np.dot(v1, v2)
-
-@st.cache_data(show_spinner=False)
-def _chunk_spliter(content: str, chunk_size: int = 1000, lang_base: str = 'latin') -> List[str]:
-    """Splits a string into chunks of a given size."""
-
-    sentences = re.split(r'(?<=[.?!,γ€‚οΌŒγ€οΌοΌŸΒ·])\s+', content)
-    if lang_base == 'latin':
-        chunks = []
-        chunk = ''
-        word_count = 0
-        for sentence in sentences:
-            sentence += ' ' # add space at end to compensate for split
-            words = sentence.split()
-            sentence_word_count = len(words)
-            if word_count + sentence_word_count <= chunk_size:
-                chunk += sentence
-                word_count += sentence_word_count
-            else:
-                chunks.append(chunk.strip())
-                chunk = sentence
-                word_count = sentence_word_count
-        # add the last chunk
-        if chunk:
-            chunks.append(chunk.strip())
-
-        new_chunks = []
-        for c in chunks:
-            if c == '':
-                continue
-            if len(c.split()) > chunk_size + 25:
-                words = c.split()
-                small_chunks = []
-                for i in range(0, len(words), chunk_size):
-                    small_chunks.append(' '.join(words[i:i + chunk_size]))
-                new_chunks.extend(small_chunks)
-            else:
-                new_chunks.append(c)
-        return new_chunks
-
-    else:
-        chunks = textwrap.wrap(content, width=chunk_size)
-        return chunks
-
-
-def language_base(string: str) -> str:
-    try:
-        lang_code = detect(string)
-        latin_based = ['en', 'fr-ca', 'es']
-        east_asian_based = ['zh', 'ja', 'ko']
-        for lang in latin_based:
-            if lang_code.startswith(lang):
-                return 'latin'
-        for lang in east_asian_based:
-            if lang_code.startswith(lang):
-                return 'east_asian'
-        return 'other'
-    except KeyError:
-        return 'other'
-
-@st.cache_data(show_spinner=False)
-def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bool = False) -> List[Dict[str, float]]:
-    """Converts a string into chunks of a given size."""
-    chunks_text = _chunk_spliter(content, chunk_size, language_base(content))
-    chunks = []
-    for i, chunk in enumerate(chunks_text):
-        if enable_embedding:
-            embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
-            chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)})
-        else:
-            chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i})
-    return chunks
-
-
-def search_chunks(query: str, chunks: List[Dict[str, float]], count: int = 1) -> List[Dict[str, np.ndarray]]:
-    """Returns the top `count` chunks that are most similar to the query."""
-    embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
-    vectors = embedding.embedding(query)
-    points = []
-
-    for chunk in chunks:
-        point = _similarity(vectors, chunk['vector'])
-        points.append({'content': chunk['content'], 'point': point})
-
-    # sort the points in descending order
-    ordered = sorted(points, key=lambda x: x['point'], reverse=True)
-    return ordered[0:count]
-
-@st.cache_data(show_spinner=False)
-def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document] | Document:
-    """Converts a list of chunks into a list of documents."""
-    docs = []
-    for chunk in chunks:
-        content = chunk['content']
-        metadata = {'chunk_id': chunk['chunk_id']}
-        doc = Document(page_content=content, metadata=metadata)
-        docs.append(doc)
-    return docs
-
-async def async_generate(chain, chunk)-> Dict[str, Union[str, int]]:
-    """Generates a summary asynchronously."""
-    resp = await chain.arun(text=chunk['content'])
-    return {'content': resp, 'chunk_id': chunk['chunk_id']}
-
-async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]:
-    """Summarizes a string asynchronously."""
-    os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
-    params = st.session_state['OPENAI_PARAMS']
-    llm_rec = ChatOpenAI(model_name=params.model,
-                         max_tokens=params.max_tokens_rec,
-                         temperature=params.temperature,
-                         top_p=params.top_p,
-                         frequency_penalty=params.frequency_penalty,
-                         presence_penalty=params.presence_penalty)
-    llm_final = ChatOpenAI(model_name=params.model,
-                           max_tokens=params.max_tokens_final,
-                           temperature=params.temperature,
-                           top_p=params.top_p,
-                           frequency_penalty=params.frequency_penalty,
-                           presence_penalty=params.presence_penalty)
-    chunks = convert_to_chunks(content, chunk_size)
-
-    REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text'])
-    chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT)
-
-    tasks = []
-    for chunk in chunks:
-        task = async_generate(chain, chunk)
-        tasks.append(task)
-
-    outputs_rec = []
-    progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}")
-    count = 1
-    for coro in asyncio.as_completed(tasks):
-        output_rec = await coro
-        outputs_rec.append(output_rec)
-        progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}")
-        count += 1
-    rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id'])
-    if st.session_state['FINAL_SUMMARY_MODE']:
-        FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text'])
-        chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT)
-        docs = convert_to_docs(rec_result)
-        final_result = chain.run(docs)
-    else:
-        final_result = None
-    return rec_result, final_result
-
-@st.cache_data(show_spinner=False)
-def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]:
-    """Returns a recursive summary of the given content."""
-    recursiveSumTexts = []
-    finish_reason = ''
-    chunks_length = len(chunks)
-    count = 0
-    progress_bar = st.progress(0)
-    for chunk in chunks:
-        content = chunk['content']
-        text, finish_reason = GPT.generate.get_answer(content,
-                                                      max_tokens=max_tokens,
-                                                      persona=st.session_state['OPENAI_PERSONA_REC'])
-        recursiveSumTexts.append(text)
-        progress_bar.progress((count + 1) / chunks_length)
-        count += 1
-        time.sleep(st.session_state['DELAY'])
-
-    return recursiveSumTexts, finish_reason
-
-
-@st.cache_data(show_spinner=False)
-def summarize(message: List[str] | str) -> Tuple[str, str]:
-    """Returns a summary of the given content."""
-    if isinstance(message, list):
-        join_msg = ' '.join(message)
-    else:
-        join_msg = message
-
-    params = st.session_state['OPENAI_PARAMS']
-    max_asw_tokens_final = params.max_tokens_final
-
-    answer, finish_reason = GPT.generate.get_answer(join_msg, max_tokens=max_asw_tokens_final,
-                                                    persona=st.session_state['OPENAI_PERSONA_SUM'])
-    return answer, finish_reason
-
-
-def download_results(rec_responses, final_response):
-    """Downloads the results as a txt file."""
-    joint_rec_response = f"=====recursive responses=====\n\n" + '\n\n'.join(rec_responses)
-    joint_final_response = f"{joint_rec_response}\n\n======final response=====\n\n{final_response}"
-    now = datetime.now()
-    if final_response is not None:
-        st.download_button("πŸ“₯ Download Summary",
-                           joint_final_response,
-                           file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
-    else:
-        st.download_button("πŸ“₯ Download Summary",
-                           joint_rec_response,
-                           file_name=f"summary_{now.strftime('%Y-%m-%d_%H-%M')}.md")
-
-
-def exceeded_token_handler(param, chunks) -> bool:
-    """Handles the case where the user has exceeded the number of tokens."""
-    if param.model == 'gpt-4':
-        max_token = 8100
-    elif param.model == 'gpt-3.5-turbo-16k':
-        max_token = 16385
-    else:
-        max_token = 4096
-    info = GPT.misc.is_tokens_exceeded(param, chunks, max_token)
-    if info['exceeded']:
-        st.error(f"❌ {info['message']}")
-        return True
-    else:
-        return False
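
The heart of the deleted util.py is the fan-out/fan-in in summarize_experimental_concurrently: one LLMChain, one awaitable per chunk, and results re-ordered by chunk_id. A condensed sketch of that pattern using the same LangChain-era imports the old module relied on; the persona template and chunk list here are placeholders:

```python
# Condensed sketch of the removed concurrent map step (same LangChain APIs as the
# deleted util.py). The prompt template and chunk dicts are placeholder inputs.
import asyncio

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate


async def summarize_chunks(chunks, persona_template):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", max_tokens=500)
    prompt = PromptTemplate(template=persona_template, input_variables=["text"])
    chain = LLMChain(llm=llm, prompt=prompt)
    # fan out: one task per chunk, run concurrently instead of sequentially
    tasks = [chain.arun(text=chunk["content"]) for chunk in chunks]
    results = await asyncio.gather(*tasks)
    # fan in: asyncio.gather preserves task order, mirroring the old sort on chunk_id
    return [{"chunk_id": c["chunk_id"], "content": r} for c, r in zip(chunks, results)]


# chunks = convert_to_chunks(text)  # e.g. [{'content': '...', 'chunk_id': 0}, ...]
# summaries = asyncio.run(summarize_chunks(chunks, "Summarize the following:\n{text}"))
```
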
diff --git a/tools/get-requirement.bat b/tools/get-requirement.bat
deleted file mode 100644
index a5c88e2..0000000
--- a/tools/get-requirement.bat
+++ /dev/null
@@ -1,16 +0,0 @@
-@echo off
-cd..
-echo Activating Virtural environment...
-call .\venv\Scripts\activate
-
-echo upgrading pip...
-python -m pip install --upgrade pip
-
-
-echo Installing pipreqs...
-pip install pipreqs
-
-echo Export to requirements.txt
-pipreqs . --force --encoding utf-8
-
-pause
\ No newline at end of file