DST-199: base code for chunking #24

Merged: 14 commits, May 22, 2024
134 changes: 134 additions & 0 deletions 04-call-summaries/chunking.py
@@ -0,0 +1,134 @@
import datetime
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    NLTKTextSplitter,
    SpacyTextSplitter,
)
from langchain_core.prompts import PromptTemplate
from langchain.docstore.document import Document
from llm import LLM
from run import get_transcript


# split text into chunks
def get_text_chunks(text, chunk_size, chunk_overlap, text_splitter_choice):
    if text_splitter_choice == "2":
        text_splitter = NLTKTextSplitter()
Reviewer comment on lines +15 to +16: Note that the current code always uses "2", so the chunk_size and chunk_overlap are not used.

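A minimal sketch in the spirit of that comment (hypothetical build_splitter helper; assumes LangChain's NLTK and spaCy splitters accept chunk_size and chunk_overlap through the shared TextSplitter base class):

# Hypothetical helper, not part of this diff: honor the sizing parameters
# for every splitter choice, not just the recursive default.
def build_splitter(text_splitter_choice, chunk_size, chunk_overlap):
    if text_splitter_choice == "2":
        return NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    if text_splitter_choice == "3":
        return SpacyTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
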
    elif text_splitter_choice == "3":
        text_splitter = SpacyTextSplitter()
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    texts = text_splitter.split_text(text)

    docs = [
        Document(
            page_content=t,
        )
        for t in texts
    ]

    return docs

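A quick usage sketch (example values; any choice other than "2" or "3" falls through to the recursive splitter):

# Sketch, not part of this diff: chunk a transcript into overlapping windows.
docs = get_text_chunks(
    get_transcript("./transcript.txt"),
    chunk_size=750,
    chunk_overlap=300,
    text_splitter_choice="1",
)
print(len(docs), docs[0].page_content[:80])
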

CHUNKING_PROMPT = """
You are a helpful AI assistant tasked with summarizing transcripts; however, we can only process the transcript in pieces.
Please fill out and return the following template: {template} with data from the text: {text}
If the template already has a field filled out, do not overwrite that information.
"""

initial_temp = """
1. Caller Information:
- Name
- Contact Information
- Availability
- Household Information
2. Reason/Type of Call: e.g., Applying for benefits, Follow-ups
3. Previous Benefits History:
- Applied for
- Receives
- Denied
4. Benefits Discussion: Prefix the discussed benefit with a hashtag (e.g., #SNAP, #LIHEAP)
5. Discussion Points:
- Key information points
6. Documents Needed: e.g., Income verification, Housing documentation
7. Next Steps for Client
8. Next Steps for Agent
"""


def chunking_ingest(transcript, prompt):
    text_chunks = get_text_chunks(
        transcript, chunk_size=750, chunk_overlap=300, text_splitter_choice="2"
    )
    prompt_template = PromptTemplate.from_template(prompt)
    template = initial_temp

print("""
Select an llm
1. openhermes (default)
2. dolphin
3. gemini
4. gpt 4
5. gpt 4o
6. claude 3
""")

llm = input() or "1"

if llm == "2":
client = LLM(client_name="ollama", model_name="dolphin-mistral")
print("""----------
Dolphin
""")

elif llm == "3":
client = LLM(client_name="gemini")
print("""----------
Gemini Flash 1.5
""")
elif llm == "4":
print("""----------
GPT 4
""")
client = LLM(client_name="gpt", model_name="gpt4")
elif llm == "5":
print("""----------
GPT 4o
""")
client = LLM(client_name="gpt", model_name="gpt-4o")
elif llm == "6":
print("""----------
Claude 3
""")
client = LLM(client_name="claude")
else:
print("""
Openhermes
""")
client = LLM(client_name="ollama", model_name="openhermes")

    client.init_client()
    ct = datetime.datetime.now()
    print("current time:", ct)
    for text in text_chunks:
        formatted_prompt = prompt_template.format(
            text=text.page_content, template=template
        )
        print("Processing Text Chunk")
        template = client.generate_text(prompt=formatted_prompt)
        print("Complete")
    return template


if __name__ == "__main__":
print(
chunking_ingest(
transcript=get_transcript("./son_calling_behalf_mother_transcript.txt"),
prompt=CHUNKING_PROMPT,
)
)
ct = datetime.datetime.now()
print(ct)
169 changes: 93 additions & 76 deletions 04-call-summaries/llm.py
@@ -9,79 +9,96 @@
dotenv.load_dotenv()


def get_transcript(file_path="./transcript.txt"):
    file = open(file_path, encoding="utf-8")
    content = file.read()
    return content


def ollama_client(
    model_name=None,
    prompt=None,
    callbacks=None,
    settings=None,
):
    if not settings:
        settings = {
            # "temperature": 0.1,
            # "system": "",
            # "template": "",
            # See https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/ollama.py
            "stop": None
        }

    print("LLM settings:", model_name, settings)
    # To connect via another URL: Ollama(base_url='http://localhost:11434', ...)
    return Ollama(model=model_name, callbacks=callbacks, **settings).invoke(prompt)


def google_gemini_client(
    model_name="gemini-pro",
    prompt=None,
    settings=None,
):
    # Get a Google API key by following the steps after clicking on Get an API key button
    # at https://ai.google.dev/tutorials/setup
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

    print("LLM settings:", model_name, settings)

    genai.configure(api_key=GOOGLE_API_KEY)
    if settings:
        genai.GenerationConfig(**settings)
    model = genai.GenerativeModel(model_name)
    return model.generate_content(prompt)


def gpt3_5(prompt, model="gpt-3.5-turbo"):
    # Get API key from https://platform.openai.com/api-keys
    OPEN_AI_API_KEY = os.environ.get("OPEN_AI_API_KEY")
    openai_client = OpenAI(api_key=OPEN_AI_API_KEY)
    return (
        openai_client.chat.completions.create(
            model=model, messages=[{"role": "user", "content": prompt}]
        )
        .choices[0]
        .message.content
    )


def gpt_4_turbo(prompt):
    return gpt3_5(prompt, model="gpt-4-turbo")


def claude(prompt, model="claude-3-opus-20240229", max_tokens=1024):
    # Get API key from https://console.anthropic.com/settings/keys
    ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

    client = anthropic.Anthropic(
        api_key=ANTHROPIC_API_KEY,
    )
    generated_response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}],
    ).content
    text_response = "\n".join([text_block.text for text_block in generated_response])

    return text_response
class LLM:
    def __init__(
        self,
        client_name=None,
        model_name=None,
        max_tokens=1024,
        settings=None,
    ):
        self.client_name = client_name
        """Name of llm selection"""
        self.model_name = model_name
        """User friendly model name"""
        self.model_version = model_name
        """Exact model name being passed into the initializer"""
        self.max_tokens = max_tokens
        self.client = None
        self.settings = settings

    def init_client(self):
        """Retrieves the llm client"""
        if self.client_name == "ollama":
            if self.settings is None:
                self.settings = {
                    # "temperature": 0.1,
                    # "system": "",
                    # "template": "",
                    # See https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/ollama.py
                    "stop": None
                }
            if self.model_name is None:
                self.model_name = "openhermes"
            # Keep model_version in sync: it was copied from model_name in
            # __init__ and would otherwise still be None here.
            self.model_version = self.model_name
            # To connect via another URL: Ollama(base_url='http://localhost:11434', ...)
            self.client = Ollama(model=self.model_version, **self.settings)

        elif self.client_name == "gemini":
            # Get a Google API key by following the steps after clicking on Get an API key button
            # at https://ai.google.dev/tutorials/setup
            GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
            if self.model_name == "gemini-pro":
                self.model_version = "gemini-1.5-pro-latest"
            else:
                self.model_version = "gemini-1.5-flash-latest"

            genai.configure(api_key=GOOGLE_API_KEY)
            if self.settings is not None:
                genai.GenerationConfig(**self.settings)
            self.client = genai.GenerativeModel(self.model_version)

        elif self.client_name == "gpt":
            if self.model_name == "gpt3":
                self.model_version = "gpt-3.5-turbo"
            elif self.model_name == "gpt4":
                self.model_version = "gpt-4-turbo"
            else:
                self.model_version = "gpt-4o"

            # Get API key from https://platform.openai.com/api-keys
            OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
            self.client = OpenAI(api_key=OPENAI_API_KEY)  # Uses OPENAI_API_KEY

        elif self.client_name == "claude":
            self.model_version = "claude-3-opus-20240229"
            ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
            self.client = anthropic.Anthropic(
                api_key=ANTHROPIC_API_KEY,
            )

    def generate_text(self, prompt=None):
        """Generates response given prompt"""
        if self.client_name == "ollama":
            return self.client.invoke(prompt)
        elif self.client_name == "gemini":
            return self.client.generate_content(prompt).text
        elif self.client_name == "gpt":
            return (
                self.client.chat.completions.create(
                    model=self.model_version,
                    messages=[{"role": "user", "content": prompt}],
                )
                .choices[0]
                .message.content
            )
        elif self.client_name == "claude":
            generated_response = self.client.messages.create(
                model=self.model_version,
                max_tokens=self.max_tokens,
                messages=[{"role": "user", "content": prompt}],
            ).content
            text_response = "\n".join(
                [text_block.text for text_block in generated_response]
            )

            return text_response
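
A usage sketch for the new wrapper (assumes the matching API key is set in .env; client and model names as handled in init_client above):

# Sketch, not part of this diff: exercise the LLM wrapper end to end.
llm = LLM(client_name="gpt", model_name="gpt4")  # resolves to "gpt-4-turbo"
llm.init_client()
print(llm.generate_text(prompt="Summarize this call in one sentence: ..."))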
7 changes: 6 additions & 1 deletion 04-call-summaries/requirements.txt
@@ -3,4 +3,9 @@ google-generativeai
tokenizers
langchain
langchain_community
openai
sentence-transformers
nltk
spacy==3.7.4
langchain-text-splitters
langchain_openai