-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcribe_speech.py
110 lines (90 loc) · 3.52 KB
/
transcribe_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import whisper
from typing import Optional, List, Any
import tempfile
import pyaudio
import wave
import threading
def record_audio(frames: List[Any], stream: pyaudio.Stream, CHUNK: int, stop_recording: threading.Event) -> None:
    """
    Record audio chunks from *stream* into *frames* until signalled to stop.

    Runs on a background thread; the caller sets *stop_recording* to end the loop.

    Args:
        frames (List[Any]): Mutable list that accumulates raw audio byte chunks.
        stream (pyaudio.Stream): Open input stream to read from.
        CHUNK (int): Number of frames to read per iteration.
        stop_recording (threading.Event): Set by the caller to stop recording.
    """
    while not stop_recording.is_set():
        # exception_on_overflow=False keeps an input-buffer overflow (common when
        # the Python thread falls briefly behind the audio driver) from raising
        # OSError and silently killing this thread mid-recording; the overflowed
        # samples are dropped instead.
        data = stream.read(CHUNK, exception_on_overflow=False)
        frames.append(data)
def capture_audio() -> Optional[str]:
    """
    Capture spoken input from the microphone and save it to a temporary WAV file.

    Recording starts immediately and stops when the user presses Enter.

    Returns:
        Optional[str]: Path of the temporary .wav file, or None if saving failed.
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1  # mono
    RATE = 16000  # 16 kHz sample rate
    CHUNK = 1024  # frames per buffer read

    audio = pyaudio.PyAudio()
    # Start recording
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                        input=True, frames_per_buffer=CHUNK)
    print("Speak now... (Press Enter to stop recording.)")
    frames: List[Any] = []
    stop_recording = threading.Event()
    record_thread = threading.Thread(target=record_audio,
                                     args=(frames, stream, CHUNK, stop_recording))
    record_thread.start()
    try:
        input()  # Wait for Enter key to be pressed
    finally:
        # Always stop the recorder thread and release the audio device, even if
        # input() is interrupted (e.g. Ctrl-C), so the stream is never leaked.
        stop_recording.set()
        record_thread.join()
        stream.stop_stream()
        stream.close()
        audio.terminate()

    # Save the recorded audio to a temporary file
    try:
        # mkstemp (with the descriptor closed immediately) avoids holding two
        # handles on the same path — NamedTemporaryFile plus wave.open — which
        # fails on Windows while the first handle is still open.
        fd, path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        # wave.Wave_write is a context manager, so the file is closed even if
        # writing raises.
        with wave.open(path, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b"".join(frames))
        return path
    except Exception as e:
        print(f"Error capturing audio: {e}")
        return None
def transcribe_speech(filename: str, whisper_model: str, language: str) -> Optional[str]:
    """
    Run the Whisper ASR model over an audio file and return its transcript.

    Args:
        filename (str): Path of the audio file to transcribe.
        whisper_model (str): Name of the Whisper model to load.
        language (str): Expected language of the recording.

    Returns:
        Optional[str]: The transcribed text, or None if transcription failed.
    """
    try:
        asr = whisper.load_model(whisper_model)
        # fp16=False forces fp32 inference, which also works on CPU-only hosts.
        return asr.transcribe(filename, language=language, fp16=False)["text"]
    except Exception as e:
        print(f"Error transcribing speech: {e}")
        return None
def main(whisper_model: str, language: str) -> Optional[str]:
    """
    Capture the user's spoken input, transcribe it, print and return the transcript.

    Args:
        whisper_model (str): The whisper model to be used.
        language (str): The intended language of the audio.

    Returns:
        Optional[str]: The transcript, or None if capture or transcription failed.
        (The original annotated -> None while returning the transcript; the
        annotation is corrected here, which is backward-compatible for callers.)
    """
    # Capture user's spoken input
    audio_file = capture_audio()
    print("Recording complete.")
    if not audio_file:
        # Recording failed; capture_audio already reported the error.
        return None
    try:
        # Transcribe the spoken input
        transcript = transcribe_speech(audio_file, whisper_model, language)
    finally:
        # Always delete the temporary WAV file, even if transcription raises.
        os.remove(audio_file)
    if transcript:
        # The documented contract is to print the transcript as well as return it.
        print(transcript)
    return transcript