Skip to content

Commit

Permalink
Merge pull request #23 from sean1832/2.0.0
Browse files Browse the repository at this point in the history
Major Restructure with New Features: Download Summaries, Cookie Management with AES Encryption, and Focus on Summarization
  • Loading branch information
sean1832 authored Oct 28, 2024
2 parents 5590572 + ae18b10 commit 12ce3d1
Show file tree
Hide file tree
Showing 49 changed files with 837 additions and 1,260 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,6 @@ cython_debug/
# test folder
.test/
/test/

# streamlit
.streamlit/
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

38 changes: 0 additions & 38 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

24 changes: 0 additions & 24 deletions .idea/sumGPT.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

29 changes: 21 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SumGPT
[![python](https://img.shields.io/badge/python-3.11-blue)](https://www.python.org/downloads/release/python-3112/)

Achieve detailed summarization of extensive documents through 🚀ultra-fast parallelized predictions, utilizing [GPT-3.5](https://platform.openai.com/docs/models/gpt-3-5) and [GPT-4](https://platform.openai.com/docs/models/gpt-4) APIs provided by [OpenAI](https://openai.com/).
Achieve detailed summarization of extensive documents through 🚀ultra-fast parallelized completion with APIs provided by [OpenAI](https://openai.com/).

🌐 Web App: [https://sumgpt.streamlit.app](https://sumgpt.streamlit.app/)

Expand All @@ -13,22 +13,35 @@ Achieve detailed summarization of extensive documents through 🚀ultra-fast par
---

### 🌟 Features
- 📄 Summarize document (.pdf, .docx, .txt, .md).
- 🎥 Summarize YouTube video with subtitles.
- 📄 Summarize document (.txt, .md).
- 🤖 Customizable parameters and bot persona for refined response generation.
- 🚀 Facilitates parallel processing of chunks, enabling ultra-fast generation speeds.
- 🚀 Facilitates parallel processing of chunks.
- 💼 Export & import configs for easy sharing and reuse.
- 🧠 Supports GPT-3.5 and GPT-4.
- 🌍 Encrypted browser cookies ensure configuration settings are preserved across sessions.
- 🧠 Supports multiple models:
- `gpt-4o-mini`
- `gpt-4o`
- `gpt-4-turbo`
- `gpt-3.5-turbo`

### 💡 What you need
- 🔑 OpenAI **[API keys](https://platform.openai.com/account/api-keys)**

> ***Note: To access GPT-4, please [join the waitlist](https://openai.com/waitlist/gpt-4-api) if you haven't already received an invitation from OpenAI.***
### 💻 Running Locally
- Make sure you have **[python 3.11](https://www.python.org/downloads)** | [python installation tutorial (YouTube)](https://youtu.be/HBxCHonP6Ro?t=105)
1. Clone the repository
```bash
git clone https://github.com/sean1832/SumGPT
cd SumGPT
```

2. Create a `secrets.toml` file under the `.streamlit/` directory. Replace `your_secure_key` with your own password for browser cookie encryption.
```bash
mkdir .streamlit
echo "crypto_key = 'your_secure_key'" > .streamlit/secrets.toml
```

3. Execute `RUN.bat`
```bash
./RUN.bat
```
2. Execute `RUN.bat`
2 changes: 1 addition & 1 deletion RUN.bat
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ if "%mod_date%" neq "%last_mod_date%" (
echo "Requirements file has not been modified. Skipping update."
)

streamlit run src/SumGPT.py
streamlit run SumGPT/main.py
Empty file added SumGPT/app/__init__.py
Empty file.
204 changes: 204 additions & 0 deletions SumGPT/app/body_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import asyncio
import datetime
import json
from typing import Any, Dict, List, Optional, Tuple

import streamlit as st
import utils.io as io
from core.crypto import Crypto
from core.llm import LLM
from core.tokenizer import Tokenizer
from datamodel.chunk import Chunk
from datamodel.llm_params import LLMParams
from streamlit_cookies_controller import CookieController


class BodyHandler:
    """Streamlit UI handler for the summarization workflow.

    Renders the file uploader, splits uploaded text into token chunks,
    drives parallel LLM summarization, caches results in session state,
    and offers the combined summaries as a markdown download.
    """

    def file_uploader(self, type: Optional[List[str]] = None) -> List[Dict[str, str]]:
        """Render the uploader and return one {"filename", "text"} dict per file.

        Halts the current Streamlit script run (st.stop) when nothing is
        uploaded. `type` defaults to ["txt"] when omitted.
        """
        if type is None:
            type = ["txt"]  # avoid a shared mutable default argument
        uploaded_files = st.file_uploader(
            "📁 Upload your files", type=type, accept_multiple_files=True
        )
        if uploaded_files is None:
            # Warn BEFORE stopping: st.stop() aborts the script run, so the
            # original order (stop, then warn) never displayed the warning.
            st.warning("File is not uploaded.")
            st.stop()
        files = []
        for file in uploaded_files:
            files.append({"filename": file.name, "text": io.read_to_string(file)})
        return files

    def segment_text(
        self, text: str, chunk_size: int, model: str, input_id: int
    ) -> "Tuple[List[Chunk], int]":
        """Split *text* into chunks of at most *chunk_size* tokens.

        Tokenization is model-specific. Returns (chunks, total_token_count);
        chunk ids are sequential within this call.
        """
        tokenizer = Tokenizer(model)
        tokens = tokenizer.tokenize(text)
        chunks: List[Chunk] = []
        for index, start in enumerate(range(0, len(tokens), chunk_size)):
            piece = tokens[start : start + chunk_size]
            chunks.append(Chunk(index, tokenizer.detokenize(piece), len(piece), input_id))
        return chunks, len(tokens)

    def _get_tokens(self, response_meta: Dict[str, Any]) -> Tuple[int, int, int]:
        """Extract (completion, prompt, cached) token counts from response metadata.

        Missing keys default to 0, so partial/empty metadata is safe.
        """
        usage = response_meta.get("token_usage", {})
        completion_tokens = usage.get("completion_tokens", 0)
        prompt_tokens = usage.get("prompt_tokens", 0)
        cached_tokens = usage.get("prompt_tokens_details", {}).get("cached_tokens", 0)
        return completion_tokens, prompt_tokens, cached_tokens

    def agenerate(
        self,
        chunks: "List[Chunk]",
        gpt_params: "LLMParams",
        role: str,
        api_key: Optional[str],
        config: Dict[str, Any],
    ) -> None:
        """Render the Run button and, when clicked, summarize all chunks.

        One async task is created per chunk and they run concurrently via
        asyncio.gather. Results are shown in per-file expanders, cached in
        st.session_state["summaries"], and the current config is persisted
        to an encrypted browser cookie. When the button is not clicked,
        previously cached summaries are re-displayed instead.
        """
        generate_button = st.button(
            "🚀 Run",
        )
        total_chunks = len(chunks)
        progress_text = st.empty()

        total_price_text = st.empty()

        if not generate_button:
            # No new run requested: re-display any cached summaries.
            self._render_cached_summaries(total_price_text)
            return

        if not api_key:
            st.error("❌ Please enter your OpenAI API key in the sidebar.")
            return
        if not role:
            st.error("❌ Please enter a role description in the sidebar.")
            return

        st.session_state["summaries"] = []  # Initialize or reset summaries

        async def process_chunks():
            llm = LLM(api_key, gpt_params)
            total_price = 0
            progress_bar = st.progress(0)
            completed_chunks = 0
            progress_text.write(f"Generating summaries 0/{total_chunks}")

            # Deterministic processing/display order: sort chunks by id.
            sorted_chunks = sorted(chunks, key=lambda c: c.id)

            # Group chunks by filename so each file gets one expander.
            filename_chunks: Dict[str, list] = {}
            for chunk in sorted_chunks:
                filename_chunks.setdefault(chunk.filename, []).append(chunk)

            # One expander per file, labeled with the filename.
            expanders = {filename: st.expander(filename) for filename in filename_chunks}

            # Fire off every summarization request concurrently...
            tasks = [llm.agenerate(chunk.content, role) for chunk in sorted_chunks]
            # ...and collect the results in the same (sorted) order.
            summaries = await asyncio.gather(*tasks)

            for summary, current_chunk in zip(summaries, sorted_chunks):
                completed_chunks += 1
                progress_text.write(f"Generating summaries {completed_chunks}/{total_chunks}")
                progress_bar.progress(completed_chunks / total_chunks)

                with expanders[current_chunk.filename]:
                    with st.chat_message("ai"):
                        st.write(summary.content)
                        completion_tokens, prompt_tokens, cached_tokens = self._get_tokens(
                            summary.response_metadata
                        )
                        price = round(
                            llm.Calc_price(prompt_tokens, completion_tokens, cached_tokens), 6
                        )
                        st.write(
                            f"Tokens: `{completion_tokens + prompt_tokens}`, price: `${price}`"
                        )
                        total_price += price

                        # Cache the summary so later reruns can re-display it.
                        st.session_state["summaries"].append(
                            {
                                "filename": current_chunk.filename,
                                "content": summary.content,
                                "tokens": completion_tokens + prompt_tokens,
                                "price": price,
                            }
                        )

            progress_text.write("✅ All chunks processed!")
            progress_bar.progress(1.0)
            total_price_text.write(f"Total price: `${round(total_price, 6)}`")

        # Run the async processing
        asyncio.run(process_chunks())

        # Persist the encrypted config in a browser cookie for 30 days.
        crypto: Crypto = st.session_state["crypto"]
        config_binary = crypto.encrypt_b64(json.dumps(config))
        controller = CookieController()
        controller.set(
            "config",
            config_binary,
            expires=datetime.datetime.now() + datetime.timedelta(days=30),
        )

    def _render_cached_summaries(self, total_price_text) -> None:
        """Re-display summaries cached in session state by a previous run."""
        if "summaries" not in st.session_state:
            return
        total_price = 0
        # Group cached summaries by filename, preserving insertion order.
        filename_summaries: Dict[str, list] = {}
        for summary_data in st.session_state["summaries"]:
            filename_summaries.setdefault(summary_data["filename"], []).append(summary_data)

        # Display summaries grouped by filename.
        for filename, summaries in filename_summaries.items():
            with st.expander(filename):
                for summary_data in summaries:
                    with st.chat_message("ai"):
                        st.write(summary_data["content"])
                        st.write(
                            f"Tokens: `{summary_data['tokens']}`, price: `${summary_data['price']}`"
                        )
                        total_price += summary_data["price"]
        total_price_text.write(f"Total price: `${round(total_price, 6)}`")

    def download_summaries(self):
        """Offer all cached summaries as a single markdown file download."""
        if "summaries" not in st.session_state:
            return
        summaries = st.session_state["summaries"]
        if not summaries:
            return
        st.download_button(
            "📥 Download summaries",
            self._serialize_summaries(summaries),
            "summaries.md",
            mime="text/markdown",  # registered markdown media type (RFC 7763)
        )

    def _serialize_summaries(self, summaries) -> str:
        """Render summaries as markdown: one H1 section per source file."""
        markdown_by_filename: Dict[str, List[str]] = {}
        for summary in summaries:
            markdown_by_filename.setdefault(summary["filename"], []).append(summary["content"])

        markdown = ""
        for filename, content in markdown_by_filename.items():
            markdown += f"# {filename}\n"
            markdown += "\n\n".join(content)
            markdown += "\n\n"

        return markdown
Loading

0 comments on commit 12ce3d1

Please sign in to comment.