6 changes: 5 additions & 1 deletion requirements.txt
@@ -34,4 +34,8 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
+pydub
+anthropic
+google-generativeai
+assemblyai
+gTTS
2 changes: 1 addition & 1 deletion run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})

             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'google':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'
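Reviewer note: this conditional now grows by one `or` per MP3-producing backend. A behavior-equivalent sketch using a membership test (a suggestion, not part of this diff):

```python
# Backends that write MP3, per the branch above; everything else falls back to WAV.
MP3_TTS_MODELS = {'openai', 'elevenlabs', 'melotts', 'cartesia', 'google'}

output_file = 'output.mp3' if Config.TTS_MODEL in MP3_TTS_MODELS else 'output.wav'
```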
1 change: 1 addition & 0 deletions voice_assistant/__init__.py
@@ -0,0 +1 @@
+# voice_assistant/__init__.py
10 changes: 7 additions & 3 deletions voice_assistant/api_key_manager.py
@@ -6,16 +6,20 @@
     "transcription":{
         "openai": Config.OPENAI_API_KEY,
         "groq": Config.GROQ_API_KEY,
-        "deepgram": Config.DEEPGRAM_API_KEY
+        "deepgram": Config.DEEPGRAM_API_KEY,
+        "assemblyai": Config.ASSEMBLYAI_API_KEY
     },
     "response":{
         "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "anthropic": Config.ANTHROPIC_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     },
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     }
 }

22 changes: 15 additions & 7 deletions voice_assistant/config.py
@@ -21,8 +21,8 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram'  # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai'  # possible values: openai, groq, ollama
+    TRANSCRIPTION_MODEL = 'deepgram'  # possible values: openai, groq, deepgram, fastwhisperapi, assemblyai
+    RESPONSE_MODEL = 'openai'  # possible values: openai, groq, ollama, anthropic, google
     TTS_MODEL = 'openai'  # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper

     # Piper Server configuration
@@ -33,9 +33,11 @@ class Config:
     # https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install

     # LLM Selection
-    OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    OLLAMA_LLM="llama3.1:8b"
+    GROQ_LLM="llama-3.1-8b-instant"
     OPENAI_LLM="gpt-4o"
+    ANTHROPIC_LLM="claude-3-5-sonnet-latest"
+    GOOGLE_LLM="gemini-1.5-flash-002"

     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -44,6 +46,9 @@
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+    ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")

     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -60,18 +65,21 @@ def validate_config():
             ValueError: If a required environment variable is not set.
         """
         Config._validate_model('TRANSCRIPTION_MODEL', [
-            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
+            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local', 'assemblyai'])
         Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'local', 'anthropic', 'google'])
         Config._validate_model('TTS_MODEL', [
-            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'google'])

         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
+        Config._validate_api_key('TRANSCRIPTION_MODEL', 'assemblyai', 'ASSEMBLYAI_API_KEY')

         Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'anthropic', 'ANTHROPIC_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'google', 'GOOGLE_API_KEY')

         Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
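For reviewers trying the new providers locally, a minimal sketch of the expected wiring (the placeholder key value is hypothetical, and validate_config is assumed to be callable directly on the class, as the existing Config._validate_* calls suggest):

```python
import os

# Hypothetical placeholder; real keys come from your environment or a .env file.
os.environ["ASSEMBLYAI_API_KEY"] = "your-assemblyai-key"

from voice_assistant.config import Config

Config.TRANSCRIPTION_MODEL = 'assemblyai'
Config.validate_config()  # raises ValueError if the selected model's key is unset
```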
67 changes: 66 additions & 1 deletion voice_assistant/response_generation.py
@@ -5,6 +5,8 @@
 from openai import OpenAI
 from groq import Groq
 import ollama
+import anthropic
+import google.generativeai as genai

 from voice_assistant.config import Config

@@ -29,6 +31,10 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
         return _generate_groq_response(api_key, chat_history)
     elif model == 'ollama':
         return _generate_ollama_response(chat_history)
+    elif model == 'anthropic':
+        return _generate_anthropic_response(api_key, chat_history)
+    elif model == 'google':
+        return _generate_google_response(api_key, chat_history)
     elif model == 'local':
         # Placeholder for local LLM response generation
         return "Generated response from local model"
@@ -61,4 +67,63 @@ def _generate_ollama_response(chat_history):
         model=Config.OLLAMA_LLM,
         messages=chat_history,
     )
-    return response['message']['content']
+    return response['message']['content']
+
+def _generate_anthropic_response(api_key, chat_history):
+    client = anthropic.Anthropic(api_key=api_key)
+    # Convert chat_history to the Anthropic format:
+    # messages is a list of {"role": "user" | "assistant", "content": "..."};
+    # the system message is passed separately via the `system` parameter.
+
+    system_prompt = ""
+    messages = []
+    for msg in chat_history:
+        if msg['role'] == 'system':
+            system_prompt = msg['content']
+        else:
+            messages.append(msg)
+
+    message = client.messages.create(
+        model=Config.ANTHROPIC_LLM,
+        max_tokens=1024,
+        system=system_prompt,
+        messages=messages
+    )
+    return message.content[0].text
+
+def _generate_google_response(api_key, chat_history):
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel(Config.GOOGLE_LLM)
+
+    # Google Generative AI (Gemini) chat history format:
+    # history = [
+    #     {"role": "user", "parts": "Hello"},
+    #     {"role": "model", "parts": "Great to meet you. What would you like to know?"}
+    # ]
+    # Gemini has no 'system' role in the history; a system prompt is passed as
+    # system_instruction when the model is created (supported on Gemini 1.5),
+    # so it is pulled out of chat_history and the model is re-created below.
+
+    system_instruction = None
+    history = []
+
+    last_message = chat_history[-1]['content']
+
+    # Prepare the history, excluding the last message, which is the new prompt,
+    # and extracting the system prompt along the way.
+
+    for msg in chat_history[:-1]:
+        if msg['role'] == 'system':
+            system_instruction = msg['content']
+        elif msg['role'] == 'user':
+            history.append({"role": "user", "parts": msg['content']})
+        elif msg['role'] == 'assistant':
+            history.append({"role": "model", "parts": msg['content']})
+
+    # Re-initialize the model with the system instruction if present
+    if system_instruction:
+        model = genai.GenerativeModel(Config.GOOGLE_LLM, system_instruction=system_instruction)
+
+    chat = model.start_chat(history=history)
+    response = chat.send_message(last_message)
+    return response.text
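A quick usage sketch of the two new branches (assuming the relevant keys are set; the chat history uses the OpenAI-style role/content dicts this module already passes around):

```python
from voice_assistant.config import Config
from voice_assistant.response_generation import generate_response

chat_history = [
    {"role": "system", "content": "You are a concise voice assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

# Anthropic: the system message is split out and sent via the `system` parameter.
print(generate_response('anthropic', Config.ANTHROPIC_API_KEY, chat_history))

# Google: roles are remapped to user/model "parts"; the system message becomes
# a system_instruction on the GenerativeModel.
print(generate_response('google', Config.GOOGLE_API_KEY, chat_history))
```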
21 changes: 13 additions & 8 deletions voice_assistant/text_to_speech.py
@@ -7,9 +7,10 @@
 import requests

 from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
 from elevenlabs.client import ElevenLabs
 from cartesia import Cartesia
+from gtts import gTTS

 from voice_assistant.config import Config
 from voice_assistant.local_tts_generation import generate_audio_file_melotts
@@ -41,11 +42,11 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca

     elif model == 'deepgram':
         client = DeepgramClient(api_key=api_key)
-        options = SpeakOptions(
-            model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-            encoding="linear16",
-            container="wav"
-        )
+        options = {
+            "model": "aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
+            "encoding": "linear16",
+            "container": "wav"
+        }
         SPEAK_OPTIONS = {"text": text}
         response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)

@@ -55,7 +56,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             text=text,
             voice="Paul J.",
             output_format="mp3_22050_32",
-            model="eleven_turbo_v2"
+            model="eleven_flash_v2"
         )
         elevenlabs.save(audio, output_file_path)

@@ -66,7 +67,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
         voice = client.voices.get(id=voice_id)

         # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-        model_id = "sonic-english"
+        model_id = "sonic-2"

         # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
         output_format = {
@@ -122,6 +123,10 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
         except Exception as e:
             logging.error(f"Piper TTS request failed: {e}")

+    elif model == 'google':
+        tts = gTTS(text=text, lang='en')
+        tts.save(output_file_path)
+
     elif model == 'local':
         with open(output_file_path, "wb") as f:
             f.write(b"Local TTS audio data")
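Note for reviewers: gTTS emits MP3, which is why run_voice_assistant.py now maps the 'google' TTS model to output.mp3. A minimal usage sketch (gTTS itself needs no API key, so passing None here is an assumption of this sketch):

```python
from voice_assistant.text_to_speech import text_to_speech

# The gTTS branch never touches api_key; None is a placeholder for this sketch.
text_to_speech(model='google', api_key=None, text="Hello from gTTS!",
               output_file_path='output.mp3')
```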
21 changes: 17 additions & 4 deletions voice_assistant/transcription.py
@@ -8,7 +8,8 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
+import assemblyai as aai

 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False
@@ -48,6 +49,8 @@ def transcribe_audio(model, api_key, audio_file_path, local_model_path=None):
         return _transcribe_with_deepgram(api_key, audio_file_path)
     elif model == 'fastwhisperapi':
         return _transcribe_with_fastwhisperapi(audio_file_path)
+    elif model == 'assemblyai':
+        return _transcribe_with_assemblyai(api_key, audio_file_path)
     elif model == 'local':
         # Placeholder for local STT model transcription
         return "Transcribed text from local model"
@@ -72,7 +75,7 @@ def _transcribe_with_groq(api_key, audio_file_path):
     client = Groq(api_key=api_key)
     with open(audio_file_path, "rb") as audio_file:
         transcription = client.audio.transcriptions.create(
-            model="whisper-large-v3",
+            model="whisper-large-v3-turbo",
             file=audio_file,
             language='en'
         )
@@ -86,7 +89,7 @@
         buffer_data = file.read()

     payload = {"buffer": buffer_data}
-    options = PrerecordedOptions(model="nova-2", smart_format=True)
+    options = {"model": "nova-2", "smart_format": True}
     response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
     data = json.loads(response.to_json())

@@ -112,4 +115,14 @@

     response = requests.post(endpoint, files=files, data=data, headers=headers)
     response_json = response.json()
-    return response_json.get('text', 'No text found in the response.')
+    return response_json.get('text', 'No text found in the response.')
+
+def _transcribe_with_assemblyai(api_key, audio_file_path):
+    aai.settings.api_key = api_key
+    transcriber = aai.Transcriber()
+    transcript = transcriber.transcribe(audio_file_path)
+
+    if transcript.status == aai.TranscriptStatus.error:
+        raise Exception(f"AssemblyAI Transcription Error: {transcript.error}")
+
+    return transcript.text
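A usage sketch for the new AssemblyAI path (the audio filename is a placeholder; transcribe() blocks until the transcript is ready, and the helper above raises on an error status):

```python
from voice_assistant.config import Config
from voice_assistant.transcription import transcribe_audio

# 'test.wav' is a hypothetical local file; the SDK also accepts URLs.
text = transcribe_audio('assemblyai', Config.ASSEMBLYAI_API_KEY, 'test.wav')
print(text)
```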