Release voice agent API (#497)

* Agent API Early Access * adds changes for API GA * additional changes for API GA * reverts FunctionCallingMessage to FunctionCalling * resolved code review * adds InjectionRefused * updates readme with agent examples * adds 3rd party TTS options * resolves linter errors in Readme * runs make lint * readme lint fixes --------- Co-authored-by: David vonThenen <[email protected]>
deepgram · Feb 3, 2025 · 203733a · 203733a
1 parent 25dadca
commit 203733a
Show file tree

Hide file tree

Showing 23 changed files with 2,800 additions and 25 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,8 @@ venv/
 venv.bak/
 .vscode/
 .DS_Store
+Pipfile
+Pipfile.lock
 
 # python artifacts
 __pycache__
@@ -18,3 +20,4 @@ dist/
 # build
 build/
 poetry.lock
+
diff --git a/README.md b/README.md
@@ -175,19 +175,26 @@ Before running any of these examples, then you need to take a look at the README
 pip install -r examples/requirements-examples.txt
 ```
 
-Text to Speech:
+To run each example, set `DEEPGRAM_API_KEY` as an environment variable, then `cd` into the example's folder and run it with `python main.py` or `python3 main.py`.
+
+### Agent
+
+- Simple - [examples/agent/simple](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/agent/simple/main.py)
+- Async Simple - [examples/agent/async_simple](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/agent/async_simple/main.py)
+
+### Text to Speech
 
 - Asynchronous - [examples/text-to-speech](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/text-to-speech/rest/file/async_hello_world/main.py)
 - Synchronous - [examples/text-to-speech](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/text-to-speech/rest/file/hello_world/main.py)
 
-Analyze Text:
+### Analyze Text
 
 - Intent Recognition - [examples/analyze/intent](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/analyze/intent/main.py)
 - Sentiment Analysis - [examples/analyze/sentiment](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/analyze/sentiment/main.py)
 - Summarization - [examples/analyze/summary](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/analyze/summary/main.py)
 - Topic Detection - [examples/analyze/topic](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/analyze/topic/main.py)
 
-PreRecorded Audio:
+### PreRecorded Audio
 
 - Transcription From an Audio File - [examples/prerecorded/file](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/file/main.py)
 - Transcription From an URL - [examples/prerecorded/url](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/url/main.py)
@@ -196,7 +203,7 @@ PreRecorded Audio:
 - Summarization - [examples/speech-to-text/rest/summary](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/summary/main.py)
 - Topic Detection - [examples/speech-to-text/rest/topic](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/topic/main.py)
 
-Live Audio Transcription:
+### Live Audio Transcription
 
 - From a Microphone - [examples/streaming/microphone](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/stream_file/main.py)
 - From an HTTP Endpoint - [examples/streaming/http](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/rest/async_url/main.py)
@@ -211,8 +218,6 @@ Management API exercise the full [CRUD](https://en.wikipedia.org/wiki/Create,_re
 - Scopes - [examples/manage/scopes](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/manage/scopes/main.py)
 - Usage - [examples/manage/usage](https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/manage/usage/main.py)
 
-To run each example set the `DEEPGRAM_API_KEY` as an environment variable, then `cd` into each example folder and execute the example: `go run main.py`.
-
 ## Logging
 
 This SDK provides logging as a means to troubleshoot and debug issues encountered. By default, this SDK will enable `Information` level messages and higher (ie `Warning`, `Error`, etc) when you initialize the library as follows:

diff --git a/deepgram/__init__.py b/deepgram/__init__.py
@@ -34,7 +34,7 @@
 from .errors import DeepgramApiKeyError
 
 # listen/read/speak/agent routers
-from .client import Listen, Read
+from .client import ListenRouter, ReadRouter, SpeakRouter, AgentRouter
 
 # common
 from .client import (
@@ -302,6 +302,60 @@
     AsyncSelfHostedClient,
 )
 
+
+# agent
+from .client import AgentWebSocketEvents
+
+# websocket
+from .client import (
+    AgentWebSocketClient,
+    AsyncAgentWebSocketClient,
+)
+
+from .client import (
+    #### common websocket response
+    # OpenResponse,
+    # CloseResponse,
+    # ErrorResponse,
+    # UnhandledResponse,
+    #### unique
+    WelcomeResponse,
+    SettingsAppliedResponse,
+    ConversationTextResponse,
+    UserStartedSpeakingResponse,
+    AgentThinkingResponse,
+    FunctionCalling,
+    FunctionCallRequest,
+    AgentStartedSpeakingResponse,
+    AgentAudioDoneResponse,
+    InjectionRefusedResponse,
+)
+
+from .client import (
+    # top level
+    SettingsConfigurationOptions,
+    UpdateInstructionsOptions,
+    UpdateSpeakOptions,
+    InjectAgentMessageOptions,
+    FunctionCallResponse,
+    AgentKeepAlive,
+    # sub level
+    Listen,
+    Speak,
+    Header,
+    Item,
+    Properties,
+    Parameters,
+    Function,
+    Provider,
+    Think,
+    Agent,
+    Input,
+    Output,
+    Audio,
+    Context,
+)
+
 # utilities
 # pylint: disable=wrong-import-position
 from .audio import Microphone, DeepgramMicrophoneError

diff --git a/deepgram/audio/microphone/microphone.py b/deepgram/audio/microphone/microphone.py
@@ -9,6 +9,7 @@
 import logging
 
 from ...utils import verboselogs
+
 from .constants import LOGGING, CHANNELS, RATE, CHUNK
 
 if TYPE_CHECKING:

diff --git a/deepgram/audio/speaker/speaker.py b/deepgram/audio/speaker/speaker.py
@@ -50,7 +50,6 @@ class Speaker:  # pylint: disable=too-many-instance-attributes
     # _asyncio_loop: asyncio.AbstractEventLoop
     # _asyncio_thread: threading.Thread
     _receiver_thread: Optional[threading.Thread] = None
-
     _loop: Optional[asyncio.AbstractEventLoop] = None
 
     _push_callback_org: Optional[Callable] = None
@@ -265,6 +264,7 @@ async def _start_asyncio_receiver(self):
                     await self._push_callback(message)
                 elif isinstance(message, bytes):
                     self._logger.verbose("Received audio data...")
+                    await self._push_callback(message)
                     self.add_audio_to_queue(message)
         except websockets.exceptions.ConnectionClosedOK as e:
             self._logger.debug("send() exiting gracefully: %d", e.code)
@@ -297,6 +297,7 @@ def _start_threaded_receiver(self):
                     self._push_callback(message)
                 elif isinstance(message, bytes):
                     self._logger.verbose("Received audio data...")
+                    self._push_callback(message)
                     self.add_audio_to_queue(message)
         except Exception as e:  # pylint: disable=broad-except
             self._logger.notice("_start_threaded_receiver exception: %s", str(e))
@@ -365,6 +366,7 @@ def _play(self, audio_out, stream, stop):
                                 "LastPlay delta is greater than threshold. Unmute!"
                             )
                             self._microphone.unmute()
+
                 data = audio_out.get(True, TIMEOUT)
                 with self._lock_wait:
                     self._last_datagram = datetime.now()

diff --git a/deepgram/client.py b/deepgram/client.py
@@ -55,7 +55,7 @@
 )
 
 # listen/read/speak/agent routers
-from .clients import Listen, Read, Speak
+from .clients import ListenRouter, ReadRouter, SpeakRouter, AgentRouter
 
 # speech-to-text
 from .clients import LiveClient, AsyncLiveClient  # backward compat
@@ -308,6 +308,61 @@
     AsyncSelfHostedClient,
 )
 
+
+# agent
+from .clients import AgentWebSocketEvents
+
+# websocket
+from .clients import (
+    AgentWebSocketClient,
+    AsyncAgentWebSocketClient,
+)
+
+from .clients import (
+    #### common websocket response
+    # OpenResponse,
+    # CloseResponse,
+    # ErrorResponse,
+    # UnhandledResponse,
+    #### unique
+    WelcomeResponse,
+    SettingsAppliedResponse,
+    ConversationTextResponse,
+    UserStartedSpeakingResponse,
+    AgentThinkingResponse,
+    FunctionCalling,
+    FunctionCallRequest,
+    AgentStartedSpeakingResponse,
+    AgentAudioDoneResponse,
+    InjectionRefusedResponse,
+)
+
+from .clients import (
+    # top level
+    SettingsConfigurationOptions,
+    UpdateInstructionsOptions,
+    UpdateSpeakOptions,
+    InjectAgentMessageOptions,
+    FunctionCallResponse,
+    AgentKeepAlive,
+    # sub level
+    Listen,
+    Speak,
+    Header,
+    Item,
+    Properties,
+    Parameters,
+    Function,
+    Provider,
+    Think,
+    Agent,
+    Input,
+    Output,
+    Audio,
+    Context,
+)
+
+
 # client errors and options
 from .options import DeepgramClientOptions, ClientOptionsFromEnv
 from .errors import DeepgramApiKeyError
@@ -397,21 +452,21 @@ def listen(self):
         """
         Returns a Listen dot-notation router for interacting with Deepgram's transcription services.
         """
-        return Listen(self._config)
+        return ListenRouter(self._config)
 
     @property
     def read(self):
         """
         Returns a Read dot-notation router for interacting with Deepgram's read services.
         """
-        return Read(self._config)
+        return ReadRouter(self._config)
 
     @property
     def speak(self):
         """
         Returns a Speak dot-notation router for interacting with Deepgram's speak services.
         """
-        return Speak(self._config)
+        return SpeakRouter(self._config)
 
     @property
     @deprecation.deprecated(
@@ -480,6 +535,13 @@ def asyncselfhosted(self):
         """
         return self.Version(self._config, "asyncselfhosted")
 
+    @property
+    def agent(self):
+        """
+        Returns an Agent dot-notation router for interacting with Deepgram's agent services.
+        """
+        return AgentRouter(self._config)
+
     # INTERNAL CLASSES
     class Version:
         """

diff --git a/deepgram/clients/__init__.py b/deepgram/clients/__init__.py
@@ -48,9 +48,10 @@
 )
 from .errors import DeepgramModuleError
 
-from .listen_router import Listen
-from .read_router import Read
-from .speak_router import Speak
+from .listen_router import ListenRouter
+from .read_router import ReadRouter
+from .speak_router import SpeakRouter
+from .agent_router import AgentRouter
 
 # listen
 from .listen import LiveTranscriptionEvents
@@ -318,3 +319,56 @@
     SelfHostedClient,
     AsyncSelfHostedClient,
 )
+
+# agent
+from .agent import AgentWebSocketEvents
+
+# websocket
+from .agent import (
+    AgentWebSocketClient,
+    AsyncAgentWebSocketClient,
+)
+
+from .agent import (
+    #### common websocket response
+    # OpenResponse,
+    # CloseResponse,
+    # ErrorResponse,
+    # UnhandledResponse,
+    #### unique
+    WelcomeResponse,
+    SettingsAppliedResponse,
+    ConversationTextResponse,
+    UserStartedSpeakingResponse,
+    AgentThinkingResponse,
+    FunctionCalling,
+    FunctionCallRequest,
+    AgentStartedSpeakingResponse,
+    AgentAudioDoneResponse,
+    InjectionRefusedResponse,
+)
+
+from .agent import (
+    # top level
+    SettingsConfigurationOptions,
+    UpdateInstructionsOptions,
+    UpdateSpeakOptions,
+    InjectAgentMessageOptions,
+    FunctionCallResponse,
+    AgentKeepAlive,
+    # sub level
+    Listen,
+    Speak,
+    Header,
+    Item,
+    Properties,
+    Parameters,
+    Function,
+    Provider,
+    Think,
+    Agent,
+    Input,
+    Output,
+    Audio,
+    Context,
+)