Skip to content

Commit

Permalink
Merge pull request #23 from sean1832/2.0.0
Browse files Browse the repository at this point in the history
Major Restructure with New Features: Download Summaries, Cookie Management with AES Encryption, and Focus on Summarization
  • Loading branch information
sean1832 authored Oct 28, 2024
2 parents 5590572 + ae18b10 commit 12ce3d1
Show file tree
Hide file tree
Showing 49 changed files with 837 additions and 1,260 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,6 @@ cython_debug/
# test folder
.test/
/test/

# streamlit
.streamlit/
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

38 changes: 0 additions & 38 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

24 changes: 0 additions & 24 deletions .idea/sumGPT.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

29 changes: 21 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SumGPT
[![python](https://img.shields.io/badge/python-3.11-blue)](https://www.python.org/downloads/release/python-3112/)

Achieve detailed summarization of extensive documents through 🚀ultra-fast parallelized predictions, utilizing [GPT-3.5](https://platform.openai.com/docs/models/gpt-3-5) and [GPT-4](https://platform.openai.com/docs/models/gpt-4) APIs provided by [OpenAI](https://openai.com/).
Achieve detailed summarization of extensive documents through 🚀ultra-fast parallelized completion with APIs provided by [OpenAI](https://openai.com/).

🌐 Web App: [https://sumgpt.streamlit.app](https://sumgpt.streamlit.app/)

Expand All @@ -13,22 +13,35 @@ Achieve detailed summarization of extensive documents through 🚀ultra-fast par
---

### 🌟 Features
- 📄 Summarize document (.pdf, .docx, .txt, .md).
- 🎥 Summarize YouTube video with subtitles.
- 📄 Summarize document (.txt, .md).
- 🤖 Customizable parameters and bot persona for refined response generation.
- 🚀 Facilitates parallel processing of chunks, enabling ultra-fast generation speeds.
- 🚀 Facilitates parallel processing of chunks.
- 💼 Export & import configs for easy sharing and reuse.
- 🧠 Supports GPT-3.5 and GPT-4.
- 🌍 Encrypted browser cookies ensure configuration settings are preserved across sessions.
- 🧠 Supports multiple models:
- `gpt-4o-mini`
- `gpt-4o`
- `gpt-4-turbo`
- `gpt-3.5-turbo`

### 💡 What you need
- 🔑 OpenAI **[API keys](https://platform.openai.com/account/api-keys)**

> ***Note: To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api) if you haven't already received an invitation from OpenAI.***
### 💻 Running Locally
- Make sure you have **[python 3.11](https://www.python.org/downloads)** | [python installation tutorial (YouTube)](https://youtu.be/HBxCHonP6Ro?t=105)
1. Clone the repository
```bash
git clone https://github.com/sean1832/SumGPT
cd SumGPT
```

2. Create a `secrets.toml` file under the `.streamlit/` directory. Replace `your_secure_key` with your own password for browser cookie encryption.
```bash
mkdir .streamlit
echo "crypto_key = 'your_secure_key'" > .streamlit/secrets.toml
```

3. Execute `RUN.bat`
```bash
./RUN.bat
```
2. Execute `RUN.bat`
2 changes: 1 addition & 1 deletion RUN.bat
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" (
echo "Requirements file has not been modified. Skipping update."
)

streamlit run src/SumGPT.py
streamlit run SumGPT/main.py
Empty file added SumGPT/app/__init__.py
Empty file.
204 changes: 204 additions & 0 deletions SumGPT/app/body_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import asyncio
import datetime
import json
from typing import Any, Dict, List, Optional, Tuple

import streamlit as st
import utils.io as io
from core.crypto import Crypto
from core.llm import LLM
from core.tokenizer import Tokenizer
from datamodel.chunk import Chunk
from datamodel.llm_params import LLMParams
from streamlit_cookies_controller import CookieController


class BodyHandler:
    """Streamlit UI handler for the summarization workflow.

    Renders the file uploader, splits uploaded text into token chunks,
    drives parallel LLM summarization, caches results in session state,
    and offers the combined summaries as a markdown download.
    """

    def file_uploader(self, type: Optional[List[str]] = None) -> List[Dict[str, str]]:
        """Render the uploader and return one {"filename", "text"} dict per file.

        Halts the current Streamlit script run (st.stop) when nothing is
        uploaded. `type` defaults to ["txt"] when omitted.
        """
        if type is None:
            type = ["txt"]  # avoid a shared mutable default argument
        uploaded_files = st.file_uploader(
            "📁 Upload your files", type=type, accept_multiple_files=True
        )
        if uploaded_files is None:
            # Warn BEFORE stopping: st.stop() aborts the script run, so the
            # original order (stop, then warn) never displayed the warning.
            st.warning("File is not uploaded.")
            st.stop()
        files = []
        for file in uploaded_files:
            files.append({"filename": file.name, "text": io.read_to_string(file)})
        return files

    def segment_text(
        self, text: str, chunk_size: int, model: str, input_id: int
    ) -> "Tuple[List[Chunk], int]":
        """Split *text* into chunks of at most *chunk_size* tokens.

        Tokenization is model-specific. Returns (chunks, total_token_count);
        chunk ids are sequential within this call.
        """
        tokenizer = Tokenizer(model)
        tokens = tokenizer.tokenize(text)
        chunks: List[Chunk] = []
        for index, start in enumerate(range(0, len(tokens), chunk_size)):
            piece = tokens[start : start + chunk_size]
            chunks.append(Chunk(index, tokenizer.detokenize(piece), len(piece), input_id))
        return chunks, len(tokens)

    def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]:
        """Extract (completion, prompt, cached) token counts from response metadata.

        Missing keys default to 0, so partial/empty metadata is safe.
        """
        usage = response_meta.get("token_usage", {})
        completion_tokens = usage.get("completion_tokens", 0)
        prompt_tokens = usage.get("prompt_tokens", 0)
        cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
        return completion_tokens, prompt_tokens, cached_tokens

    def agenerate(
        self,
        chunks: "List[Chunk]",
        gpt_params: "LLMParams",
        role: str,
        api_key: Optional[str],
        config: Dict[str, Any],
    ) -> None:
        """Render the Run button and, when clicked, summarize all chunks.

        One async task is created per chunk and they run concurrently via
        asyncio.gather. Results are shown in per-file expanders, cached in
        st.session_state["summaries"], and the current config is persisted
        to an encrypted browser cookie. When the button is not clicked,
        previously cached summaries are re-displayed instead.
        """
        generate_button = st.button(
            "🚀 Run",
        )
        total_chunks = len(chunks)
        progress_text = st.empty()

        total_price_text = st.empty()

        if not generate_button:
            # No new run requested: re-display any cached summaries.
            self._render_cached_summaries(total_price_text)
            return

        if not api_key:
            st.error("❌ Please enter your OpenAI API key in the sidebar.")
            return
        if not role:
            st.error("❌ Please enter a role description in the sidebar.")
            return

        st.session_state["summaries"] = []  # Initialize or reset summaries

        async def process_chunks():
            llm = LLM(api_key, gpt_params)
            total_price = 0
            progress_bar = st.progress(0)
            completed_chunks = 0
            progress_text.write(f"Generating summaries 0/{total_chunks}")

            # Deterministic processing/display order: sort chunks by id.
            sorted_chunks = sorted(chunks, key=lambda c: c.id)

            # Group chunks by filename so each file gets one expander.
            filename_chunks: Dict[str, list] = {}
            for chunk in sorted_chunks:
                filename_chunks.setdefault(chunk.filename, []).append(chunk)

            # One expander per file, labeled with the filename.
            expanders = {filename: st.expander(filename) for filename in filename_chunks}

            # Fire off every summarization request concurrently...
            tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks]
            # ...and collect the results in the same (sorted) order.
            summaries = await asyncio.gather(*tasks)

            for summary, current_chunk in zip(summaries, sorted_chunks):
                completed_chunks += 1
                progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}")
                progress_bar.progress(completed_chunks / total_chunks)

                with expanders[current_chunk.filename]:
                    with st.chat_message("ai"):
                        st.write(summary.content)
                        completion_tokens, prompt_tokens, cached_tokens = self._get_tokens(
                            summary.response_metadata
                        )
                        price = round(
                            llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6
                        )
                        st.write(
                            f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`"
                        )
                        total_price += price

                        # Cache the summary so later reruns can re-display it.
                        st.session_state["summaries"].append(
                            {
                                "filename": current_chunk.filename,
                                "content": summary.content,
                                "tokens": completion_tokens + prompt_tokens,
                                "price": price,
                            }
                        )

            progress_text.write("✅ All chunks processed!")
            progress_bar.progress(1.0)
            total_price_text.write(f"Total price: `${round(total_price, 6)}`")

        # Run the async processing
        asyncio.run(process_chunks())

        # Persist the encrypted config in a browser cookie for 30 days.
        crypto: Crypto = st.session_state["crypto"]
        config_binary = crypto.encrypt_b64(json.dumps(config))
        controller = CookieController()
        controller.set(
            "config",
            config_binary,
            expires=datetime.datetime.now() + datetime.timedelta(days=30),
        )

    def _render_cached_summaries(self, total_price_text) -> None:
        """Re-display summaries cached in session state by a previous run."""
        if "summaries" not in st.session_state:
            return
        total_price = 0
        # Group cached summaries by filename, preserving insertion order.
        filename_summaries: Dict[str, list] = {}
        for summary_data in st.session_state["summaries"]:
            filename_summaries.setdefault(summary_data["filename"], []).append(summary_data)

        # Display summaries grouped by filename.
        for filename, summaries in filename_summaries.items():
            with st.expander(filename):
                for summary_data in summaries:
                    with st.chat_message("ai"):
                        st.write(summary_data["content"])
                        st.write(
                            f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`"
                        )
                        total_price += summary_data["price"]
        total_price_text.write(f"Total price: `${round(total_price, 6)}`")

    def download_summaries(self):
        """Offer all cached summaries as a single markdown file download."""
        if "summaries" not in st.session_state:
            return
        summaries = st.session_state["summaries"]
        if not summaries:
            return
        st.download_button(
            "📥 Download summaries",
            self._serialize_summaries(summaries),
            "summaries.md",
            mime="text/markdown",  # registered markdown media type (RFC 7763)
        )

    def _serialize_summaries(self, summaries) -> str:
        """Render summaries as markdown: one H1 section per source file."""
        markdown_by_filename: Dict[str, List[str]] = {}
        for summary in summaries:
            markdown_by_filename.setdefault(summary["filename"], []).append(summary["content"])

        markdown = ""
        for filename, content in markdown_by_filename.items():
            markdown += f"# {filename}\n"
            markdown += "\n\n".join(content)
            markdown += "\n\n"

        return markdown
Loading

0 comments on commit 12ce3d1

Please sign in to comment.