Skip to content

Latest commit

 

History

History
1199 lines (840 loc) · 23.7 KB

File metadata and controls

1199 lines (840 loc) · 23.7 KB

WebSocket Reference

Listen V1 Connect

client.listen.v1.connect(...)

📝 Description

Transcribe audio and video using Deepgram's speech-to-text WebSocket

🔌 Usage

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import ListenV1SocketClientResponse

client = DeepgramClient(
    api_key="YOUR_API_KEY",
)

with client.listen.v1.connect(model="nova-3") as connection:
    def on_message(message: ListenV1SocketClientResponse) -> None:
        msg_type = getattr(message, "type", "Unknown")
        print(f"Received {msg_type} event")

    connection.on(EventType.OPEN, lambda _: print("Connection opened"))
    connection.on(EventType.MESSAGE, on_message)
    connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
    connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

    # Start listening
    connection.start_listening()

    # Send audio data
    from deepgram.extensions.types.sockets import ListenV1MediaMessage
    connection.send_media(ListenV1MediaMessage(audio_bytes))

    # Send control messages
    from deepgram.extensions.types.sockets import ListenV1ControlMessage
    connection.send_control(ListenV1ControlMessage(type="KeepAlive"))

🔌 Async Usage

import asyncio
from deepgram import AsyncDeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import ListenV1SocketClientResponse

client = AsyncDeepgramClient(
    api_key="YOUR_API_KEY",
)

async def main():
    async with client.listen.v1.connect(model="nova-3") as connection:
        def on_message(message: ListenV1SocketClientResponse) -> None:
            msg_type = getattr(message, "type", "Unknown")
            print(f"Received {msg_type} event")

        connection.on(EventType.OPEN, lambda _: print("Connection opened"))
        connection.on(EventType.MESSAGE, on_message)
        connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
        connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

        # Start listening
        await connection.start_listening()

        # Send audio data
        from deepgram.extensions.types.sockets import ListenV1MediaMessage
        await connection.send_media(ListenV1MediaMessage(audio_bytes))

        # Send control messages
        from deepgram.extensions.types.sockets import ListenV1ControlMessage
        await connection.send_control(ListenV1ControlMessage(type="KeepAlive"))

asyncio.run(main())

📤 Send Methods

send_media(message) — Send binary audio data for transcription

  • ListenV1MediaMessage(audio_bytes)

send_control(message) — Send control messages to manage the connection

  • ListenV1ControlMessage(type="KeepAlive") — Keep the connection alive
  • ListenV1ControlMessage(type="Finalize") — Finalize the transcription

⚙️ Parameters

model: str — AI model to use for the transcription

callback: typing.Optional[str] — URL to which we'll make the callback request

callback_method: typing.Optional[str] — HTTP method by which the callback request will be made

channels: typing.Optional[str] — Number of independent audio channels contained in submitted audio

diarize: typing.Optional[str] — Recognize speaker changes. Each word in the transcript will be assigned a speaker number starting at 0

dictation: typing.Optional[str] — Dictation mode for controlling formatting with dictated speech

encoding: typing.Optional[str] — Specify the expected encoding of your submitted audio

endpointing: typing.Optional[str] — Control when speech recognition ends

extra: typing.Optional[str] — Arbitrary key-value pairs that are attached to the API response

filler_words: typing.Optional[str] — Include filler words like "uh" and "um" in transcripts

interim_results: typing.Optional[str] — Return partial transcripts as audio is being processed

keyterm: typing.Optional[str] — Key term prompting can boost or suppress specialized terminology and brands

keywords: typing.Optional[str] — Keywords can boost or suppress specialized terminology and brands

language: typing.Optional[str] — BCP-47 language tag that hints at the primary spoken language

mip_opt_out: typing.Optional[str] — Opt requests out of the Deepgram Model Improvement Program

multichannel: typing.Optional[str] — Transcribe each audio channel independently

numerals: typing.Optional[str] — Convert numbers from written format to numerical format

profanity_filter: typing.Optional[str] — Remove profanity from transcripts

punctuate: typing.Optional[str] — Add punctuation and capitalization to the transcript

redact: typing.Optional[str] — Redaction removes sensitive information from your transcripts

replace: typing.Optional[str] — Search for terms or phrases in submitted audio and replace them

sample_rate: typing.Optional[str] — Sample rate of the submitted audio

search: typing.Optional[str] — Search for terms or phrases in submitted audio

smart_format: typing.Optional[str] — Apply formatting to transcript output for improved readability

tag: typing.Optional[str] — Label your requests for the purpose of identification during usage reporting

utterance_end_ms: typing.Optional[str] — Duration of silence, in milliseconds, to wait before finalizing speech

vad_events: typing.Optional[str] — Return Voice Activity Detection events via the websocket

version: typing.Optional[str] — Version of the model to use

authorization: typing.Optional[str] — Use your API key for authentication, or alternatively generate a temporary token and pass it via the token query parameter.

Example: token %DEEPGRAM_API_KEY% or bearer %DEEPGRAM_TOKEN%

request_options: typing.Optional[RequestOptions] — Request-specific configuration.

Listen V2 Connect

client.listen.v2.connect(...)

📝 Description

Real-time conversational speech recognition with contextual turn detection for natural voice conversations

🔌 Usage

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import ListenV2SocketClientResponse

client = DeepgramClient(
    api_key="YOUR_API_KEY",
)

with client.listen.v2.connect(
    model="flux-general-en",
    encoding="linear16",
    sample_rate="16000"
) as connection:
    def on_message(message: ListenV2SocketClientResponse) -> None:
        msg_type = getattr(message, "type", "Unknown")
        print(f"Received {msg_type} event")

    connection.on(EventType.OPEN, lambda _: print("Connection opened"))
    connection.on(EventType.MESSAGE, on_message)
    connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
    connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

    # Start listening
    connection.start_listening()

    # Send audio data
    from deepgram.extensions.types.sockets import ListenV2MediaMessage
    connection.send_media(ListenV2MediaMessage(data=audio_bytes))

    # Send control messages
    from deepgram.extensions.types.sockets import ListenV2ControlMessage
    connection.send_control(ListenV2ControlMessage(type="CloseStream"))

🔌 Async Usage

import asyncio
from deepgram import AsyncDeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import ListenV2SocketClientResponse

client = AsyncDeepgramClient(
    api_key="YOUR_API_KEY",
)

async def main():
    async with client.listen.v2.connect(
        model="flux-general-en",
        encoding="linear16",
        sample_rate="16000"
    ) as connection:
        def on_message(message: ListenV2SocketClientResponse) -> None:
            msg_type = getattr(message, "type", "Unknown")
            print(f"Received {msg_type} event")

        connection.on(EventType.OPEN, lambda _: print("Connection opened"))
        connection.on(EventType.MESSAGE, on_message)
        connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
        connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

        # Start listening
        await connection.start_listening()

        # Send audio data
        from deepgram.extensions.types.sockets import ListenV2MediaMessage
        await connection.send_media(ListenV2MediaMessage(data=audio_bytes))

        # Send control messages
        from deepgram.extensions.types.sockets import ListenV2ControlMessage
        await connection.send_control(ListenV2ControlMessage(type="CloseStream"))

asyncio.run(main())

📤 Send Methods

send_media(message) — Send binary audio data for transcription

  • ListenV2MediaMessage(data=audio_bytes)

send_control(message) — Send control messages to manage the connection

  • ListenV2ControlMessage(type="CloseStream") — Close the audio stream

⚙️ Parameters

model: str — AI model used to process submitted audio

encoding: str — Specify the expected encoding of your submitted audio

sample_rate: str — Sample rate of the submitted audio

eager_eot_threshold: typing.Optional[str] — Threshold for eager end-of-turn detection

eot_threshold: typing.Optional[str] — Threshold for end-of-turn detection

eot_timeout_ms: typing.Optional[str] — Timeout in milliseconds for end-of-turn detection

keyterm: typing.Optional[str] — Key term prompting can boost or suppress specialized terminology and brands

mip_opt_out: typing.Optional[str] — Opt requests out of the Deepgram Model Improvement Program

tag: typing.Optional[str] — Label your requests for the purpose of identification during usage reporting

authorization: typing.Optional[str] — Use your API key for authentication, or alternatively generate a temporary token and pass it via the token query parameter.

Example: token %DEEPGRAM_API_KEY% or bearer %DEEPGRAM_TOKEN%

request_options: typing.Optional[RequestOptions] — Request-specific configuration.

Speak V1 Connect

client.speak.v1.connect(...)

📝 Description

Convert text into natural-sounding speech using Deepgram's TTS WebSocket

🔌 Usage

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import SpeakV1SocketClientResponse

client = DeepgramClient(
    api_key="YOUR_API_KEY",
)

with client.speak.v1.connect(
    model="aura-2-asteria-en",
    encoding="linear16",
    sample_rate=24000
) as connection:
    def on_message(message: SpeakV1SocketClientResponse) -> None:
        if isinstance(message, bytes):
            print("Received audio event")
        else:
            msg_type = getattr(message, "type", "Unknown")
            print(f"Received {msg_type} event")

    connection.on(EventType.OPEN, lambda _: print("Connection opened"))
    connection.on(EventType.MESSAGE, on_message)
    connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
    connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

    # Start listening
    connection.start_listening()

    # Send text to be converted to speech
    from deepgram.extensions.types.sockets import SpeakV1TextMessage
    connection.send_text(SpeakV1TextMessage(text="Hello, world!"))

    # Send control messages
    from deepgram.extensions.types.sockets import SpeakV1ControlMessage
    connection.send_control(SpeakV1ControlMessage(type="Flush"))
    connection.send_control(SpeakV1ControlMessage(type="Close"))

🔌 Async Usage

import asyncio
from deepgram import AsyncDeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import SpeakV1SocketClientResponse

client = AsyncDeepgramClient(
    api_key="YOUR_API_KEY",
)

async def main():
    async with client.speak.v1.connect(
        model="aura-2-asteria-en",
        encoding="linear16",
        sample_rate=24000
    ) as connection:
        def on_message(message: SpeakV1SocketClientResponse) -> None:
            if isinstance(message, bytes):
                print("Received audio event")
            else:
                msg_type = getattr(message, "type", "Unknown")
                print(f"Received {msg_type} event")

        connection.on(EventType.OPEN, lambda _: print("Connection opened"))
        connection.on(EventType.MESSAGE, on_message)
        connection.on(EventType.CLOSE, lambda _: print("Connection closed"))
        connection.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

        # Start listening
        await connection.start_listening()

        # Send text to be converted to speech
        from deepgram.extensions.types.sockets import SpeakV1TextMessage
        await connection.send_text(SpeakV1TextMessage(text="Hello, world!"))

        # Send control messages
        from deepgram.extensions.types.sockets import SpeakV1ControlMessage
        await connection.send_control(SpeakV1ControlMessage(type="Flush"))
        await connection.send_control(SpeakV1ControlMessage(type="Close"))

asyncio.run(main())

📤 Send Methods

send_text(message) — Send text to be converted to speech

  • SpeakV1TextMessage(text="Hello, world!")

send_control(message) — Send control messages to manage speech synthesis

  • SpeakV1ControlMessage(type="Flush") — Process all queued text immediately
  • SpeakV1ControlMessage(type="Clear") — Clear the text queue
  • SpeakV1ControlMessage(type="Close") — Close the connection

⚙️ Parameters

encoding: typing.Optional[str] — Specify the expected encoding of your output audio

mip_opt_out: typing.Optional[str] — Opt requests out of the Deepgram Model Improvement Program

model: typing.Optional[str] — AI model used to process submitted text

sample_rate: typing.Optional[str] — Sample rate for the output audio

authorization: typing.Optional[str] — Use your API key for authentication, or alternatively generate a temporary token and pass it via the token query parameter.

Example: token %DEEPGRAM_API_KEY% or bearer %DEEPGRAM_TOKEN%

request_options: typing.Optional[RequestOptions] — Request-specific configuration.

Agent V1 Connect

client.agent.v1.connect(...)

📝 Description

Build a conversational voice agent using Deepgram's Voice Agent WebSocket

🔌 Usage

from deepgram import DeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import (
    AgentV1Agent,
    AgentV1AudioConfig,
    AgentV1AudioInput,
    AgentV1DeepgramSpeakProvider,
    AgentV1Listen,
    AgentV1ListenProvider,
    AgentV1OpenAiThinkProvider,
    AgentV1SettingsMessage,
    AgentV1SocketClientResponse,
    AgentV1SpeakProviderConfig,
    AgentV1Think,
)

client = DeepgramClient(
    api_key="YOUR_API_KEY",
)

with client.agent.v1.connect() as agent:
    # Configure the agent
    settings = AgentV1SettingsMessage(
        audio=AgentV1AudioConfig(
            input=AgentV1AudioInput(
                encoding="linear16",
                sample_rate=44100,
            )
        ),
        agent=AgentV1Agent(
            listen=AgentV1Listen(
                provider=AgentV1ListenProvider(
                    type="deepgram",
                    model="nova-3",
                    smart_format=True,
                )
            ),
            think=AgentV1Think(
                provider=AgentV1OpenAiThinkProvider(
                    type="open_ai",
                    model="gpt-4o-mini",
                    temperature=0.7,
                ),
                prompt='Reply only and explicitly with "OK".',
            ),
            speak=AgentV1SpeakProviderConfig(
                provider=AgentV1DeepgramSpeakProvider(
                    type="deepgram",
                    model="aura-2-asteria-en",
                )
            ),
        ),
    )

    agent.send_settings(settings)

    def on_message(message: AgentV1SocketClientResponse) -> None:
        if isinstance(message, bytes):
            print("Received audio event")
        else:
            msg_type = getattr(message, "type", "Unknown")
            print(f"Received {msg_type} event")

    agent.on(EventType.OPEN, lambda _: print("Connection opened"))
    agent.on(EventType.MESSAGE, on_message)
    agent.on(EventType.CLOSE, lambda _: print("Connection closed"))
    agent.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

    # Start listening
    agent.start_listening()

    # Send audio data
    from deepgram.extensions.types.sockets import AgentV1MediaMessage
    agent.send_media(AgentV1MediaMessage(data=audio_bytes))

    # Send control messages
    from deepgram.extensions.types.sockets import AgentV1ControlMessage
    agent.send_control(AgentV1ControlMessage(type="KeepAlive"))

🔌 Async Usage

import asyncio
from deepgram import AsyncDeepgramClient
from deepgram.core.events import EventType
from deepgram.extensions.types.sockets import (
    AgentV1Agent,
    AgentV1AudioConfig,
    AgentV1AudioInput,
    AgentV1DeepgramSpeakProvider,
    AgentV1Listen,
    AgentV1ListenProvider,
    AgentV1OpenAiThinkProvider,
    AgentV1SettingsMessage,
    AgentV1SocketClientResponse,
    AgentV1SpeakProviderConfig,
    AgentV1Think,
)

client = AsyncDeepgramClient(
    api_key="YOUR_API_KEY",
)

async def main():
    async with client.agent.v1.connect() as agent:
        # Configure the agent
        settings = AgentV1SettingsMessage(
            audio=AgentV1AudioConfig(
                input=AgentV1AudioInput(
                    encoding="linear16",
                    sample_rate=16000,
                )
            ),
            agent=AgentV1Agent(
                listen=AgentV1Listen(
                    provider=AgentV1ListenProvider(
                        type="deepgram",
                        model="nova-3",
                        smart_format=True,
                    )
                ),
                think=AgentV1Think(
                    provider=AgentV1OpenAiThinkProvider(
                        type="open_ai",
                        model="gpt-4o-mini",
                        temperature=0.7,
                    )
                ),
                speak=AgentV1SpeakProviderConfig(
                    provider=AgentV1DeepgramSpeakProvider(
                        type="deepgram",
                        model="aura-2-asteria-en",
                    )
                ),
            ),
        )

        await agent.send_settings(settings)

        def on_message(message: AgentV1SocketClientResponse) -> None:
            if isinstance(message, bytes):
                print("Received audio event")
            else:
                msg_type = getattr(message, "type", "Unknown")
                print(f"Received {msg_type} event")

        agent.on(EventType.OPEN, lambda _: print("Connection opened"))
        agent.on(EventType.MESSAGE, on_message)
        agent.on(EventType.CLOSE, lambda _: print("Connection closed"))
        agent.on(EventType.ERROR, lambda error: print(f"Caught: {error}"))

        # Start listening
        await agent.start_listening()

        # Send audio data
        from deepgram.extensions.types.sockets import AgentV1MediaMessage
        await agent.send_media(AgentV1MediaMessage(data=audio_bytes))

        # Send control messages
        from deepgram.extensions.types.sockets import AgentV1ControlMessage
        await agent.send_control(AgentV1ControlMessage(type="KeepAlive"))

asyncio.run(main())

⚙️ Parameters

authorization: typing.Optional[str] — Use your API key for authentication, or alternatively generate a temporary token and pass it via the token query parameter.

Example: token %DEEPGRAM_API_KEY% or bearer %DEEPGRAM_TOKEN%

request_options: typing.Optional[RequestOptions] — Request-specific configuration.

📤 Send Methods

send_settings(message) — Send initial agent configuration settings

  • AgentV1SettingsMessage(...) — Configure audio, listen, think, and speak providers

send_media(message) — Send binary audio data to the agent

  • AgentV1MediaMessage(data=audio_bytes)

send_control(message) — Send control messages (KeepAlive, etc.)

  • AgentV1ControlMessage(type="KeepAlive")

send_update_speak(message) — Update the agent's speech synthesis settings

  • AgentV1UpdateSpeakMessage(...) — Modify TTS configuration during conversation

send_update_prompt(message) — Update the agent's system prompt

  • AgentV1UpdatePromptMessage(...) — Change the agent's behavior instructions

send_inject_user_message(message) — Inject a user message into the conversation

  • AgentV1InjectUserMessageMessage(...) — Add a simulated user input

send_inject_agent_message(message) — Inject an agent message into the conversation

  • AgentV1InjectAgentMessageMessage(...) — Add a simulated agent response

send_function_call_response(message) — Send the result of a function call back to the agent

  • AgentV1FunctionCallResponseMessage(...) — Provide function execution results