Commit
BREAKING CHANGE: introduce experimental async summary generation.
- feat: legacy mode
- feat: language mode for prompt
- feat: elapsed-time counter
- prompt: restructured prompts
sean1832 committed Mar 25, 2023
1 parent e4650b4 commit ba92f8a
Showing 7 changed files with 165 additions and 49 deletions.
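In short: unless the new Legacy toggle is enabled, summarization no longer loops over chunks sequentially; it fans the chunk requests out concurrently with asyncio and re-orders the results by chunk_id once they complete. A minimal sketch of driving the new entry point outside Streamlit (hedged: in the app, Streamlit session state must already hold the API key, model parameters and prompt templates, which sidebar.py sets up; the input file name here is hypothetical):

import asyncio
import util  # src/util.py as of this commit

content = open('transcript.txt').read()  # hypothetical input
completions, final_summary = asyncio.run(
    util.summarize_experimental_concurrently(content, chunk_size=1000))
rec_responses = [d['content'] for d in completions]  # already sorted by chunk_id

Using asyncio.as_completed rather than asyncio.gather is what lets the progress bar advance as each chunk finishes instead of only reporting at the end.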
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
docx==0.2.4
langchain==0.0.123
langdetect==1.0.9
numpy==1.24.2
openai==0.27.2
55 changes: 38 additions & 17 deletions src/Components/sidebar.py
@@ -37,6 +37,17 @@ def _set_config(config_file, key: str, default_value):
else:
return default_value

def _set_language(language: str):
st.session_state['OUTPUT_LANGUAGE'] = language

def _set_legacy(enable: bool):
st.session_state['LEGACY'] = enable
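
# Route between the legacy and the experimental variant, depending on the
# sidebar toggle; used below to pick which default prompt template is shown.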
def _legacy(enable: bool, legacy, experimental):
if not enable:
return experimental
else:
return legacy


def sidebar():
with st.sidebar:
@@ -55,6 +66,7 @@ def sidebar():
help="You can get your API key from https://beta.openai.com/account/api-keys",
value=_set_config(config_file, "OPENAI_API_KEY", ""))

enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False))
enable_final_summary = st_toggle_switch(label="Enable Final Summary",
default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False))
if enable_final_summary:
@@ -63,40 +75,48 @@ def sidebar():
set_final_summary_mode(enable_final_summary)

with st.expander('🤖 Bot Persona'):
language = st.selectbox('Language', ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German'])
default_persona_rec_legacy = 'Provide a detailed and comprehensive summary of the following content in flawless ' \
f'{language}, ensuring all key points are covered. Create a markdown heading (###) that ' \
f'encapsulates the core information of the content. Make sure it is answered in {language}.'
default_persona_rec = f"""Write a detailed and comprehensive explanation of the following in perfect {language} with no grammar issues,
ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:""" + \
"""
{text}
Structured markdown summary with heading (###): """
persona_rec = st.text_area('Bot Persona Recursive',
                           value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, default_persona_rec_legacy, default_persona_rec)),
                           help='System message is a pre-defined message used to instruct the assistant at the '
                                'beginning of a conversation. Iterating and '
                                'experimenting with potential improvements can help to generate better outputs. '
                                'Make sure to use casual language.',
                           height=250)
if enable_final_summary:
default_persona_sum_legacy = 'identify headings in the transcript and summarise them into five ' \
'headings. Use #### headings in markdown. Under headings, summarise at least ' \
'3 key points and then provide detail explanation of the concept based on the ' \
'following text in the way that can be read fluently, make sense and avoid ' \
'repetition. Make sure to include all information. Write it in beautiful and ' \
f'structured markdown in perfect {language}. '
default_persona_sum = """Write a detailed summary of the following:
{text}
Identify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information.""" + \
f"""Structured markdown summary with headings in perfect {language} (####): """

persona_sum = st.text_area('Bot Persona Total Sum',
                           value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, default_persona_sum_legacy, default_persona_sum)),
                           help='This is a pre-defined message for total summarization that is used to '
                                'instruct the assistant at the beginning of a conversation.',
                           height=300)
else:
persona_sum = ""

with st.expander('🔥 Advanced Options'):
model_options = ['gpt-3.5-turbo', 'gpt-4']
model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo'))
model = st.selectbox("Model", options=model_options, index=model_index)

@@ -162,7 +182,8 @@ def sidebar():

if persona_rec:
set_openai_persona(persona_rec, persona_sum)

_set_language(language)
set_chunk_size(chunk_size)
set_param(param)
set_delay(delay)
_set_legacy(enable_legacy)
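Note that the new default personas embed a literal {text} placeholder: they are no longer plain system messages but LangChain prompt templates. A minimal sketch of how such a template is consumed downstream (assuming the langchain==0.0.123 API pinned in requirements.txt; the template text is abbreviated):

from langchain.prompts import PromptTemplate

template = ('Write a detailed summary of the following:\n'
            '{text}\n'
            'Structured markdown summary with headings (####): ')
prompt = PromptTemplate(template=template, input_variables=['text'])
print(prompt.format(text='example chunk'))  # placeholder substituted here

This is also why the legacy personas (which have no {text} variable) are kept separate and selected via _legacy().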
1 change: 1 addition & 0 deletions src/GPT/misc.py
@@ -91,6 +91,7 @@ def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[
'reason': 'final',
'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n"
f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"}

else:
return {'exceeded': False,
'reason': '',
41 changes: 27 additions & 14 deletions src/SumGPT.py
@@ -1,9 +1,11 @@
import asyncio
import Components
import streamlit as st
from Components.sidebar import sidebar
import Modules.file_io as file_io
import GPT
import util
import time

Components.StreamlitSetup.setup()

@@ -82,37 +84,48 @@
f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`")
# max tokens exceeded warning
exceeded = util.exceeded_token_handler(param=st.session_state['OPENAI_PARAMS'], chunks=chunks)

rec_responses = []
final_response = None
finish_reason_rec = None
finish_reason_final = None

if st.button("🚀 Run", disabled=exceeded):
start_time = time.time()
st.cache_data.clear()
API_KEY = st.session_state['OPENAI_API_KEY']
if not API_KEY or not GPT.misc.validate_api_key(API_KEY):
st.error("❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys).")
else:
with st.spinner("Summarizing... (this might take a while)"):
    if st.session_state['LEGACY']:
        rec_max_token = st.session_state['OPENAI_PARAMS'].max_tokens_rec
        rec_responses, finish_reason_rec = util.recursive_summarize(chunks, rec_max_token)
        if st.session_state['FINAL_SUMMARY_MODE']:
            final_response, finish_reason_final = util.summarize(rec_responses)
        else:
            final_response = None
    else:
        completions, final_response = asyncio.run(
            util.summarize_experimental_concurrently(content, st.session_state['CHUNK_SIZE']))
        rec_responses = [d["content"] for d in completions]
        rec_ids = [d["chunk_id"] for d in completions]

end_time = time.time()
st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`")

if rec_responses:
    with st.expander("Recursive Summaries", expanded=not st.session_state['FINAL_SUMMARY_MODE']):
        for response in rec_responses:
            st.info(response)
if finish_reason_rec == 'length':
st.warning('⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter.')

if final_response:
    st.header("📝Summary")
    st.info(final_response)
    if finish_reason_final == 'length':
        st.warning(
            '⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter.')
if rec_responses != [] or final_response is not None:
util.download_results(rec_responses, final_response)
2 changes: 1 addition & 1 deletion src/manifest.json
@@ -1,6 +1,6 @@
{
"name": "SumGPT",
"version": "0.6.5",
"version": "1.0.0",
"license": "MIT",
"author": "Zeke Zhang",
"homepage": "https://github.com/sean1832/SumGPT",
69 changes: 68 additions & 1 deletion src/util.py
@@ -1,3 +1,6 @@
import os
import asyncio

import numpy as np
from typing import Any, Dict, List, Tuple, Union
from GPT.embeddings import openAIEmbeddings
@@ -11,6 +14,11 @@
import xml.etree.ElementTree as ET
from datetime import datetime

from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import LLMChain

def _is_auto_lang(lang_code: str) -> bool:
"""Checks if the language code is an auto language."""
@@ -153,7 +161,7 @@ def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bo
embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"])
chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)})
else:
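        # Tag each chunk with its position so summaries that complete out of
        # order can be re-sorted later (see summarize_experimental_concurrently).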
chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i})
return chunks


@@ -171,6 +179,65 @@ def search_chunks(query: str, chunks: List[Dict[str, float]], count: int = 1) ->
ordered = sorted(points, key=lambda x: x['point'], reverse=True)
return ordered[0:count]

@st.cache_data(show_spinner=False)
def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document]:
"""Converts a list of chunks into a list of documents."""
docs = []
for chunk in chunks:
content = chunk['content']
metadata = {'chunk_id': chunk['chunk_id']}
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
return docs

async def async_generate(chain, chunk) -> Dict[str, Union[str, int]]:
"""Generates a summary asynchronously."""
resp = await chain.arun(text=chunk['content'])
return {'content': resp, 'chunk_id': chunk['chunk_id']}

async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]:
"""Summarizes a string asynchronously."""
os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"]
params = st.session_state['OPENAI_PARAMS']
llm_rec = ChatOpenAI(model_name=params.model,
max_tokens=params.max_tokens_rec,
temperature=params.temperature,
top_p=params.top_p,
frequency_penalty=params.frequency_penalty,
presence_penalty=params.presence_penalty)
llm_final = ChatOpenAI(model_name=params.model,
max_tokens=params.max_tokens_final,
temperature=params.temperature,
top_p=params.top_p,
frequency_penalty=params.frequency_penalty,
presence_penalty=params.presence_penalty)
chunks = convert_to_chunks(content, chunk_size)

REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text'])
FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text'])
chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT)

tasks = []
for chunk in chunks:
task = async_generate(chain, chunk)
tasks.append(task)

outputs_rec = []
progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}")
count = 1
for coro in asyncio.as_completed(tasks):
output_rec = await coro
outputs_rec.append(output_rec)
progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}")
count += 1
rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id'])
if st.session_state['FINAL_SUMMARY_MODE']:
chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT)
docs = convert_to_docs(rec_result)
final_result = chain.run(docs)
else:
final_result = ''
return rec_result, final_result

@st.cache_data(show_spinner=False)
def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]:
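For context, the final-summary step above relies on LangChain's 'stuff' chain type, which concatenates all recursive summaries into a single prompt. A minimal standalone sketch (hedged: assumes langchain==0.0.123 as pinned above and valid OpenAI credentials; the key, model name and document texts are placeholders):

import os
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain

os.environ.setdefault('OPENAI_API_KEY', 'sk-...')  # placeholder key
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
FINAL_PROMPT = PromptTemplate(
    template='Write a detailed summary of the following:\n{text}\n',
    input_variables=['text'])
chain = load_summarize_chain(llm, chain_type='stuff', prompt=FINAL_PROMPT)
docs = [Document(page_content='summary of chunk one'),
        Document(page_content='summary of chunk two')]
print(chain.run(docs))

'stuff' is the simplest chain type: it only works while the combined summaries fit in the model's context window, which is why is_tokens_exceeded() in src/GPT/misc.py guards the run.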
44 changes: 29 additions & 15 deletions test/test.py
@@ -1,19 +1,33 @@
from pytube import YouTube
import xml.etree.ElementTree as ET
import asyncio

async def coroutine1():
await asyncio.sleep(2)
return {"id": 1, "content": "result 1"}

async def coroutine2():
await asyncio.sleep(1)
return {"id": 2, "content": "result 2"}

async def main():
tasks = [coroutine1(), coroutine2()]
completed_count = 0
results = []

for coro in asyncio.as_completed(tasks):
result = await coro
results.append(result)
completed_count += 1
print(f"Coroutine {result['id']} completed ({completed_count}/{len(tasks)}).")

print("All coroutines completed.")
print("Results:", sorted(results, key=lambda x: x['id']))

asyncio.run(main())


def extract_xml_caption(xml: str) -> str:
"""Extracts the text content from the <s> elements of an XML string."""
root = ET.fromstring(xml)
text_content = ''
for child in root.iter('s'):
text_content += child.text
return text_content.strip()


def get_transcript(url: str, lang_code: str = 'a.en') -> str:
"""Extracts the transcript from a YouTube video."""
yt = YouTube(url)
caption = yt.captions[lang_code]
xml_caption = caption.xml_captions
return extract_xml_caption(xml_caption)
