Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DST-199: base code for chunking #24

Merged
merged 14 commits into from
May 22, 2024
158 changes: 158 additions & 0 deletions 04-call-summaries/chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from langchain_text_splitters import (
RecursiveCharacterTextSplitter,
NLTKTextSplitter,
SpacyTextSplitter,
)
from langchain_core.prompts import PromptTemplate
from langchain.docstore.document import Document
from llm import google_gemini_client, claude_client, gpt_client, ollama_client
from run import get_transcript


# split text into chunks
def get_text_chunks(text, chunk_size, chunk_overlap, text_splitter_choice):
    """Split raw text into overlapping chunks and wrap each in a Document.

    Args:
        text: The raw transcript text to split.
        chunk_size: Target maximum size of each chunk, passed to every
            splitter (previously only the recursive splitter honored it).
        chunk_overlap: Amount of text shared between adjacent chunks so
            context is not lost at chunk boundaries.
        text_splitter_choice: "2" selects the NLTK sentence splitter,
            "3" selects the spaCy splitter, anything else falls back to
            the recursive character splitter.

    Returns:
        list[Document]: one Document per chunk, in original order.
    """
    if text_splitter_choice == "2":
        # chunk_size/chunk_overlap are accepted by the TextSplitter base
        # class, so forward them here too instead of using the defaults.
        text_splitter = NLTKTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
    elif text_splitter_choice == "3":
        text_splitter = SpacyTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    texts = text_splitter.split_text(text)

    return [Document(page_content=t) for t in texts]


# Prompt sent once per chunk: {text} is the current transcript chunk and
# {template} is the (partially filled) summary template from earlier chunks.
# The model is told not to overwrite fields a previous chunk already filled.
CHUNKING_PROMPT = """
You are a helpful AI assistant tasked with summarizing transcripts, however we can only process the transcripts in pieces.
Fill out the fields with the text given {text}. If the following template already has the field filled out, do not overwrite this information.
Please fill out the data with the following template: {template}
"""

# Starting (empty) call-summary template. It is threaded through the chunk
# loop in chunking_ingest: each LLM call receives the latest version and
# returns it with more fields filled in.
initial_temp = """
1. Caller Information:
- Name
- Contact Information
- Availability
- Household Information
2. Reason/Type of Call: e.g., Applying for benefits, Follow-ups
3. Previous Benefits History:
- Applied for
- Receives
- Denied
4. Benefits Discussion: Prefix the discussed benefit with a hashtag (e.g., #SNAP, #LIHEAP)
5. Discussion Points:
- Key information points
6. Documents Needed: e.g., Income verification, Housing documentation
7. Next Steps for Client
8. Next Steps for Agent
"""


def chunking_ingest(transcript, prompt):
    """Summarize a transcript chunk-by-chunk with an interactively chosen LLM.

    The transcript is split into overlapping chunks; each chunk is sent to
    the selected model together with the template filled in so far, so the
    summary accumulates across chunks.

    Args:
        transcript: Full call transcript text.
        prompt: Prompt string containing {text} and {template} placeholders.

    Returns:
        The final filled-in summary template returned by the last chunk call.
    """
    text_chunks = get_text_chunks(
        transcript, chunk_size=750, chunk_overlap=300, text_splitter_choice="2"
    )
    prompt_template = PromptTemplate.from_template(prompt)
    template = initial_temp

    print("""
    Select an llm
    1. openhermes (default)
    2. dolphin
    3. gemini
    4. gpt 4
    5. gpt 4o
    6. claude 3
    """)

    llm = input() or "1"

    # Build one (banner, runner) pair per choice instead of duplicating the
    # chunk loop six times. Each runner takes a formatted prompt and returns
    # the model's text, using a client created once up front.
    if llm == "2":
        banner = "Dolphin"
        client = ollama_client(model_name="dolphin-mistral")
        run = lambda p: ollama_client(client=client, prompt=p)
    elif llm == "3":
        banner = "Gemini Flash 1.5"
        client = google_gemini_client()
        run = lambda p: google_gemini_client(client=client, prompt=p)
    elif llm == "4":
        banner = "GPT 4"
        client = gpt_client()
        run = lambda p: gpt_client(client=client, model_choice="gpt4", prompt=p)
    elif llm == "5":
        banner = "GPT 4o"
        client = gpt_client()
        run = lambda p: gpt_client(client=client, model_choice="gpt-4o", prompt=p)
    elif llm == "6":
        banner = "Claude 3"
        client = claude_client()
        run = lambda p: claude_client(client=client, prompt=p)
    else:
        banner = "Openhermes"
        client = ollama_client(model_name="openhermes")
        run = lambda p: ollama_client(client=client, prompt=p)

    print(f"""----------
    {banner}
    """)

    for text in text_chunks:
        formatted_prompt = prompt_template.format(
            text=text.page_content, template=template
        )
        print("----------------------")
        # Feed the evolving template back in so fields filled by earlier
        # chunks are preserved (the prompt forbids overwriting them).
        template = run(formatted_prompt)
    return template


if __name__ == "__main__":
    # Only run the interactive end-to-end summarization when executed as a
    # script, so importing this module (e.g. for tests) has no side effects.
    print(chunking_ingest(transcript=get_transcript(), prompt=CHUNKING_PROMPT))
88 changes: 48 additions & 40 deletions 04-call-summaries/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@ def get_transcript(file_path="./transcript.txt"):


def ollama_client(
model_name=None,
prompt=None,
callbacks=None,
settings=None,
model_name=None, prompt=None, callbacks=None, settings=None, client=None
):
if not settings:
settings = {
Expand All @@ -32,13 +29,14 @@ def ollama_client(

print("LLM settings:", model_name, settings)
# To connect via another URL: Ollama(base_url='http://localhost:11434', ...)
return Ollama(model=model_name, callbacks=callbacks, **settings).invoke(prompt)

if client:
return client.invoke(prompt)
return Ollama(model=model_name, callbacks=callbacks, **settings)


def google_gemini_client(
model_name="gemini-pro",
prompt=None,
settings=None,
model_name="gemini-1.5-flash-latest", prompt=None, settings=None, client=None
):
# Get a Google API key by following the steps after clicking on Get an API key button
# at https://ai.google.dev/tutorials/setup
Expand All @@ -49,39 +47,49 @@ def google_gemini_client(
genai.configure(api_key=GOOGLE_API_KEY)
if settings:
genai.GenerationConfig(**settings)
model = genai.GenerativeModel(model_name)
return model.generate_content(prompt)


def gpt3_5(prompt, model="gpt-3.5-turbo"):
# Get API key from https://platform.openai.com/api-keys
OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")
openai_client = OpenAI(api_key=OPEN_AI_API_KEY) # Uses OPENAI_API_KEY
return (
openai_client.chat.completions.create(
model=model, messages=[{"role": "user", "content": prompt}]
if client:
return client.generate_content(prompt).text
return genai.GenerativeModel(model_name)


def gpt_client(prompt=None, model_choice="gpt-4o", client=None):
if client:
if model_choice == "gpt3":
model = "gpt-3.5-turbo"
elif model_choice == "gpt4":
model = "gpt-4-turbo"
else:
model = model_choice
return (
client.chat.completions.create(
model=model, messages=[{"role": "user", "content": prompt}]
)
.choices[0]
.message.content
)
.choices[0]
.message.content
)


def gpt_4_turbo(prompt):
return gpt3_5(prompt, model="gpt-4-turbo")
else:
# Get API key from https://platform.openai.com/api-keys
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
return OpenAI(api_key=OPENAI_API_KEY) # Uses OPENAI_API_KEY


def claude(prompt, model="claude-3-opus-20240229", max_tokens=1024):
# Get API key from https://console.anthropic.com/settings/keys
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

client = anthropic.Anthropic(
api_key=ANTHROPIC_API_KEY,
)
generated_response = client.messages.create(
model=model,
max_tokens=max_tokens,
messages=[{"role": "user", "content": prompt}],
).content
text_response = "\n".join([text_block.text for text_block in generated_response])
def claude_client(
prompt=None, model="claude-3-opus-20240229", max_tokens=1024, client=None
):
if client:
generated_response = client.messages.create(
model=model,
max_tokens=max_tokens,
messages=[{"role": "user", "content": prompt}],
).content
text_response = "\n".join(
[text_block.text for text_block in generated_response]
)

return text_response
return text_response
else:
# Get API key from https://console.anthropic.com/settings/keys
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
return anthropic.Anthropic(
api_key=ANTHROPIC_API_KEY,
)
7 changes: 6 additions & 1 deletion 04-call-summaries/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,9 @@ google-generativeai
tokenizers
langchain
langchain_community
openai
openai
sentence-transformers
nltk
spacy==3.7.4
langchain-text-splitters
langchain_openai
Loading