"""
computer_assistant.py: Interactive Computer Assistant

Listens for speech, transcribes it with Whisper, retrieves relevant context
from a SingleStoreDB vector store, asks GPT-4o to answer using that context
plus a screenshot of the current screen, and speaks the reply with OpenAI
text-to-speech.
"""
import base64
import os
import time

import numpy as np
import pyautogui
import sounddevice as sd
import soundfile as sf
import whisper
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores.singlestoredb import SingleStoreDB
from openai import OpenAI
from playsound import playsound
def encode_image(image_path):
    """Return the base64-encoded contents of the image file at image_path."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

def assistant(llm_input, llm_history, client, context):
    """Answer llm_input using the retrieved context plus a fresh screenshot."""
    # Capture the current screen so the model can see what the user sees.
    screenshot = pyautogui.screenshot()
    screenshot.save("screenshot.png")
    base64_image = encode_image("screenshot.png")
    # Note: llm_history is accepted but not yet included in the request.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "Only use the provided prompt text and image to answer the question. If it is not in the document, do not answer.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "User Question (if the answer is not in the context document, say \"I don't know\"). "
                            "Answer in 3 sentences max. Provide no code in your answer: "
                            + llm_input
                            + "\nContext Document: "
                            + context
                        ),
                    },
                    {
                        "type": "image_url",
                        # The screenshot is saved as PNG, so label the data URL as PNG.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            },
        ],
        max_tokens=300,
    )
    return response.choices[0].message.content
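
# A possible extension (not implemented above): thread llm_history into the
# request so follow-up questions have conversational context. Assuming the
# history entries are {"role": ..., "content": ...} dicts like those built in
# main(), the messages list could become:
#
#     messages = [system_message] + llm_history + [user_message]
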
def detect_and_record_audio(threshold=0.03, record_duration=5, samplerate=44100, channels=1):
    """Block until the microphone level exceeds threshold, then record for record_duration seconds."""
    print("Listening for speech...")
    started = False

    def callback(indata, frames, time, status):
        nonlocal started
        # Compare absolute amplitude so negative-going samples also trigger.
        if np.any(np.abs(indata) > threshold):
            if not started:
                print("Starting recording...")
                started = True
            raise sd.CallbackAbort

    # Detect speech: the stream runs until the callback aborts it.
    with sd.InputStream(callback=callback, channels=channels, samplerate=samplerate):
        while not started:
            sd.sleep(100)

    # Record for the specified duration after speech detection.
    audio_data = sd.rec(
        int(record_duration * samplerate),
        samplerate=samplerate,
        channels=channels,
        dtype="float32",
    )
    sd.wait()  # Wait until the recording is finished.
    sf.write("voice_input.wav", audio_data, samplerate)
    print("Audio saved as voice_input.wav")
def main():
    llm_history = []
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    os.environ["SINGLESTOREDB_URL"] = "ayush:Test1234@svc-3482219c-a389-4079-b18b-d50662524e8a-shared-dml.aws-virginia-6.svc.singlestore.com:3333/database_79fb0"

    # Load and chunk the reference document.
    loader = TextLoader("pytorch_docs.txt")
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    # Generate embeddings and create a document search database.
    embeddings = OpenAIEmbeddings(api_key=os.environ.get("OPENAI_API_KEY"))
    vectorDB = SingleStoreDB.from_documents(docs, embeddings, table_name="data")

    # Load the Whisper model once rather than on every loop iteration.
    model = whisper.load_model("base")

    while True:
        detect_and_record_audio()
        result = model.transcribe("voice_input.wav")
        llm_input = result["text"]
        print(llm_input)

        # Retrieve the chunk most relevant to the transcribed question.
        docs = vectorDB.similarity_search(llm_input)
        context = docs[0].page_content

        llm_output = assistant(llm_input, llm_history, client, context)
        llm_history = llm_history + [{"role": "assistant", "content": llm_output}]
        print(llm_output)

        # Speak the answer aloud with OpenAI text-to-speech.
        response = client.audio.speech.create(
            model="tts-1",
            voice="fable",
            input=llm_output,
        )
        response.stream_to_file("output.mp3")
        playsound("output.mp3")
        time.sleep(2)


if __name__ == "__main__":
    main()
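
# Run with: python computer_assistant.py
# Speak a question about the loaded document (here, the PyTorch docs) and the
# assistant answers aloud, referencing whatever is currently on screen. The
# loop runs indefinitely; stop it with Ctrl+C.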