From ba92f8a639434b7dd286f7d4c02d0ff24bd646e3 Mon Sep 17 00:00:00 2001 From: sean1832 Date: Sun, 26 Mar 2023 06:46:53 +1100 Subject: [PATCH] BREAKING CHANGE: introducing experimental async summary generation. - feat: legacy mode - feat: language mode for prompt - feat: time elapse counter - prompt: change of prompt structure --- requirements.txt | 2 +- src/Components/sidebar.py | 55 +++++++++++++++++++++---------- src/GPT/misc.py | 1 + src/SumGPT.py | 41 +++++++++++++++-------- src/manifest.json | 2 +- src/util.py | 69 ++++++++++++++++++++++++++++++++++++++- test/test.py | 44 ++++++++++++++++--------- 7 files changed, 165 insertions(+), 49 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5928838..39f646e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ docx==0.2.4 -langchain==0.0.112 +langchain==0.0.123 langdetect==1.0.9 numpy==1.24.2 openai==0.27.2 diff --git a/src/Components/sidebar.py b/src/Components/sidebar.py index 02ff29c..bca3af9 100644 --- a/src/Components/sidebar.py +++ b/src/Components/sidebar.py @@ -37,6 +37,17 @@ def _set_config(config_file, key: str, default_value): else: return default_value +def _set_language(language: str): + st.session_state['OUTPUT_LANGUAGE'] = language + +def _set_legacy(enable: bool): + st.session_state['LEGACY'] = enable +def _legacy(enable: bool, legacy, experimental): + if not enable: + return experimental + else: + return legacy + def sidebar(): with st.sidebar: @@ -55,6 +66,7 @@ def sidebar(): help="You can get your API key from https://beta.openai.com/account/api-keys", value=_set_config(config_file, "OPENAI_API_KEY", "")) + enable_legacy = st_toggle_switch(label="Legacy", default_value=_set_config(config_file, "LEGACY", False)) enable_final_summary = st_toggle_switch(label="Enable Final Summary", default_value=_set_config(config_file, "FINAL_SUMMARY_MODE", False)) if enable_final_summary: @@ -63,40 +75,48 @@ def sidebar(): set_final_summary_mode(enable_final_summary) with st.expander('🤖 Bot Persona'): - default_persona_rec = 'Provide a detailed and comprehensive summary of the following content in flawless ' \ - 'English, ensuring all key points are covered. Create a markdown heading (###) that ' \ - 'encapsulates the core information of the content. ' + language = st.selectbox('Language', ['English', 'Chinese', 'Japanese', 'Korean', 'Spanish', 'French', 'German']) + default_persona_rec_legacy = 'Provide a detailed and comprehensive summary of the following content in flawless ' \ + f'{language}, ensuring all key points are covered. Create a markdown heading (###) that ' \ + f'encapsulates the core information of the content. Make sure it is answered in {language}.' + default_persona_rec = f"""Write a detailed and comprehensive explanation of the following in perfect {language} with no grammar issues, +ensuring all key points are covered. Create a markdown heading (###) that encapsulates the core information:""" + \ +""" + +{text} + +Structured markdown summary with heading (###): """ persona_rec = st.text_area('Bot Persona Recursive', - value=_set_config(config_file, "OPENAI_PERSONA_REC", default_persona_rec), + value=_set_config(config_file, "OPENAI_PERSONA_REC", _legacy(enable_legacy, default_persona_rec_legacy, default_persona_rec)), help='System message is a pre-defined message used to instruct the assistant at the ' 'beginning of a conversation. iterating and ' 'experimenting with potential improvements can help to generate better outputs.' 'Make sure to use casual language.', - height=140) + height=250) if enable_final_summary: - default_persona_sum_a = 'Provide detail explanation and summary of the following transcript ' \ - 'into comprehensive and cohesive article in markdown format with ' \ - 'perfect english while making sure all the key points are included. ' \ - 'Rephrase and restructure the text so that it can be read fluently, make sense ' \ - 'and avoid repetition.' \ - - default_persona_sum_b = 'identify headings in the transcript and summarise them into five ' \ + default_persona_sum_legacy = 'identify headings in the transcript and summarise them into five ' \ 'headings. Use #### headings in markdown. Under headings, summarise at least ' \ '3 key points and then provide detail explanation of the concept based on the ' \ 'following text in the way that can be read fluently, make sense and avoid ' \ 'repetition. Make sure to include all information. Write it in beautiful and ' \ - 'structured markdown. ' + f'structured markdown in perfect {language}. ' + default_persona_sum = """Write a detailed summary of the following: + +{text} + +Identify and summarise them into five headings. Use #### headings in markdown. Under headings, summarize a list of key points that best encapsulate the core information.""" + \ +f"""Structured markdown summary with headings in perfect {language} (####): """ persona_sum = st.text_area('Bot Persona Total Sum', - value=_set_config(config_file, "OPENAI_PERSONA_SUM", default_persona_sum_b), + value=_set_config(config_file, "OPENAI_PERSONA_SUM", _legacy(enable_legacy, default_persona_sum_legacy, default_persona_sum)), help='This is a pre-defined message for total summarization that is used to' 'instruct the assistant at the beginning of a conversation. ', - height=240) + height=300) else: persona_sum = "" with st.expander('🔥 Advanced Options'): - model_options = ['gpt-3.5-turbo', 'gpt-3.5-turbo-0301', 'gpt-4'] + model_options = ['gpt-3.5-turbo', 'gpt-4'] model_index = model_options.index(_set_config(config_file, "MODEL", 'gpt-3.5-turbo')) model = st.selectbox("Model", options=model_options, index=model_index) @@ -162,7 +182,8 @@ def sidebar(): if persona_rec: set_openai_persona(persona_rec, persona_sum) - + _set_language(language) set_chunk_size(chunk_size) set_param(param) set_delay(delay) + _set_legacy(enable_legacy) \ No newline at end of file diff --git a/src/GPT/misc.py b/src/GPT/misc.py index d566e06..b93481c 100644 --- a/src/GPT/misc.py +++ b/src/GPT/misc.py @@ -91,6 +91,7 @@ def is_tokens_exceeded(param, chunks, max_token: int = 4096) -> Dict[str, Union[ 'reason': 'final', 'message': f"**[ Final summary ]** tokens exceeded. Max tokens allowed: {max_token}. Tokens used: {final_chunks_token}\n" f"(Prompt: {final_prompt_token}, Completion: {final_completion_token})"} + else: return {'exceeded': False, 'reason': '', diff --git a/src/SumGPT.py b/src/SumGPT.py index 19bf2a1..a122084 100644 --- a/src/SumGPT.py +++ b/src/SumGPT.py @@ -1,9 +1,11 @@ +import asyncio import Components import streamlit as st from Components.sidebar import sidebar import Modules.file_io as file_io import GPT import util +import time Components.StreamlitSetup.setup() @@ -82,37 +84,48 @@ f"Price Prediction: `${price}` || Total Prompt: `{prompt_token}`, Total Completion: `{completion_token}`") # max tokens exceeded warning exceeded = util.exceeded_token_handler(param=st.session_state['OPENAI_PARAMS'], chunks=chunks) - rec_responses = [] final_response = None finish_reason_rec = None + finish_reason_final = None + + # finish_reason_rec = None if st.button("🚀 Run", disabled=exceeded): + start_time = time.time() st.cache_data.clear() API_KEY = st.session_state['OPENAI_API_KEY'] if not API_KEY and GPT.misc.validate_api_key(API_KEY): st.error("❌ Please enter a valid [OpenAI API key](https://beta.openai.com/account/api-keys).") else: with st.spinner("Summarizing... (this might take a while)"): - rec_max_token = st.session_state['OPENAI_PARAMS'].max_tokens_rec - rec_responses, finish_reason_rec = util.recursive_summarize(chunks, rec_max_token) - if st.session_state['FINAL_SUMMARY_MODE']: - final_response, finish_reason_final = util.summarize(rec_responses) + if st.session_state['LEGACY']: + rec_max_token = st.session_state['OPENAI_PARAMS'].max_tokens_rec + rec_responses, finish_reason_rec = util.recursive_summarize(chunks, rec_max_token) + if st.session_state['FINAL_SUMMARY_MODE']: + final_response, finish_reason_final = util.summarize(rec_responses) + else: + final_response = None else: - final_response = None + completions, final_response = asyncio.run(util.summarize_experimental_concurrently(content, st.session_state['CHUNK_SIZE'])) + rec_responses = [d["content"] for d in completions] + rec_ids = [d["chunk_id"] for d in completions] + # final_response = completion['output_text'] + + end_time = time.time() + st.markdown(f"⏱️ Time taken: `{round(end_time - start_time, 2)}s`") if rec_responses is not []: with st.expander("Recursive Summaries", expanded=not st.session_state['FINAL_SUMMARY_MODE']): - for response in rec_responses: - st.info(response) + for i, response in enumerate(rec_responses): + st.info(f'{response}') if finish_reason_rec == 'length': st.warning('⚠️Result cut off due to length. Consider increasing the [Max Tokens Chunks] parameter.') if final_response is not None: - if st.session_state['FINAL_SUMMARY_MODE']: - st.header("📝Summary") - st.info(final_response) - if finish_reason_final == 'length': - st.warning( - '⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter.') + st.header("📝Summary") + st.info(final_response) + if finish_reason_final == 'length': + st.warning( + '⚠️Result cut off due to length. Consider increasing the [Max Tokens Summary] parameter.') if rec_responses != [] or final_response is not None: util.download_results(rec_responses, final_response) diff --git a/src/manifest.json b/src/manifest.json index 17815f7..3deea71 100644 --- a/src/manifest.json +++ b/src/manifest.json @@ -1,6 +1,6 @@ { "name": "SumGPT", - "version": "0.6.5", + "version": "1.0.0", "license": "MIT", "author": "Zeke Zhang", "homepage": "https://github.com/sean1832/SumGPT", diff --git a/src/util.py b/src/util.py index 19f349a..7d00ded 100644 --- a/src/util.py +++ b/src/util.py @@ -1,3 +1,6 @@ +import os +import asyncio + import numpy as np from typing import Any, Dict, List, Tuple, Union from GPT.embeddings import openAIEmbeddings @@ -11,6 +14,11 @@ import xml.etree.ElementTree as ET from datetime import datetime +from langchain.chat_models import ChatOpenAI +from langchain.docstore.document import Document +from langchain.prompts import PromptTemplate +from langchain.chains.summarize import load_summarize_chain +from langchain.chains import LLMChain def _is_auto_lang(lang_code: str) -> bool: """Checks if the language code is an auto language.""" @@ -153,7 +161,7 @@ def convert_to_chunks(content: str, chunk_size: int = 1000, enable_embedding: bo embedding = openAIEmbeddings(st.session_state["OPENAI_API_KEY"]) chunks.append({'content': chunk, 'vector': embedding.embedding(chunk)}) else: - chunks.append({'content': chunk, 'language_based': language_base(chunk)}) + chunks.append({'content': chunk, 'language_based': language_base(chunk), 'chunk_id': i}) return chunks @@ -171,6 +179,65 @@ def search_chunks(query: str, chunks: List[Dict[str, float]], count: int = 1) -> ordered = sorted(points, key=lambda x: x['point'], reverse=True) return ordered[0:count] +@st.cache_data(show_spinner=False) +def convert_to_docs(chunks: List[Dict[str, Union[str, float]]]) -> List[Document] | Document: + """Converts a list of chunks into a list of documents.""" + docs = [] + for chunk in chunks: + content = chunk['content'] + metadata = {'chunk_id': chunk['chunk_id']} + doc = Document(page_content=content, metadata=metadata) + docs.append(doc) + return docs + +async def async_generate(chain, chunk)-> Dict[str, Union[str, int]]: + """Generates a summary asynchronously.""" + resp = await chain.arun(text=chunk['content']) + return {'content': resp, 'chunk_id': chunk['chunk_id']} + +async def summarize_experimental_concurrently(content: str, chunk_size: int = 1000) -> Tuple[List[Dict[str, Union[str, int]]], str]: + """Summarizes a string asynchronously.""" + os.environ['OPENAI_API_KEY'] = st.session_state["OPENAI_API_KEY"] + params = st.session_state['OPENAI_PARAMS'] + llm_rec = ChatOpenAI(model_name=params.model, + max_tokens=params.max_tokens_rec, + temperature=params.temperature, + top_p=params.top_p, + frequency_penalty=params.frequency_penalty, + presence_penalty=params.presence_penalty) + llm_final = ChatOpenAI(model_name=params.model, + max_tokens=params.max_tokens_final, + temperature=params.temperature, + top_p=params.top_p, + frequency_penalty=params.frequency_penalty, + presence_penalty=params.presence_penalty) + chunks = convert_to_chunks(content, chunk_size) + + REC_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_REC'], input_variables=['text']) + FINAL_PROMPT = PromptTemplate(template=st.session_state['OPENAI_PERSONA_SUM'], input_variables=['text']) + chain = LLMChain(llm=llm_rec, prompt=REC_PROMPT) + + tasks = [] + for chunk in chunks: + task = async_generate(chain, chunk) + tasks.append(task) + + outputs_rec = [] + progress_bar = st.progress(0, f"Generating summary 0/{len(chunks)}") + count = 1 + for coro in asyncio.as_completed(tasks): + output_rec = await coro + outputs_rec.append(output_rec) + progress_bar.progress(count / len(chunks), f"Generating summary {count}/{len(chunks)}") + count += 1 + rec_result = sorted(outputs_rec, key=lambda x: x['chunk_id']) + if st.session_state['FINAL_SUMMARY_MODE']: + chain = load_summarize_chain(llm_final, chain_type='stuff', prompt=FINAL_PROMPT) + docs = convert_to_docs(rec_result) + final_result = chain.run(docs) + else: + final_result = '' + return rec_result, final_result @st.cache_data(show_spinner=False) def recursive_summarize(chunks: List[Dict[str, Union[str, float]]], max_tokens) -> Tuple[List[str], str]: diff --git a/test/test.py b/test/test.py index 71da6db..5befa39 100644 --- a/test/test.py +++ b/test/test.py @@ -1,19 +1,33 @@ -from pytube import YouTube -import xml.etree.ElementTree as ET +import asyncio + +async def coroutine1(): + await asyncio.sleep(2) + return {"id": 1, "content": "result 1"} + +async def coroutine2(): + await asyncio.sleep(1) + return {"id": 2, "content": "result 2"} + +async def main(): + tasks = [coroutine1(), coroutine2()] + completed_count = 0 + results = [] + + for coro in asyncio.as_completed(tasks): + result = await coro + results.append(result) + completed_count += 1 + print(f"Coroutine {result['id']} completed ({completed_count}/{len(tasks)}).") + + print("All coroutines completed.") + print("Results:", sorted(results, key=lambda x: x['id'])) + +asyncio.run(main()) + + + + -def extract_xml_caption(xml: str) -> str: - """Extracts the text content from the elements of an XML string.""" - root = ET.fromstring(xml) - text_content = '' - for child in root.iter('s'): - text_content += child.text - return text_content.strip() -def get_transcript(url: str, lang_code: str = 'a.en') -> str: - """Extracts the transcript from a YouTube video.""" - yt = YouTube(url) - caption = yt.captions[lang_code] - xml_caption = caption.xml_captions - return extract_xml_caption(xml_caption) \ No newline at end of file