6 changes: 5 additions & 1 deletion requirements.txt
@@ -34,4 +34,8 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
+pydub
+anthropic
+google-generativeai
+assemblyai
+gTTS
2 changes: 1 addition & 1 deletion run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})

             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'google':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'
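Reviewer note: this conditional now grows by one `or` per MP3-producing backend. A behavior-equivalent sketch using a membership test (a suggestion, not part of this diff):

```python
# Backends that write MP3, per the branch above; everything else falls back to WAV.
MP3_TTS_MODELS = {'openai', 'elevenlabs', 'melotts', 'cartesia', 'google'}

output_file = 'output.mp3' if Config.TTS_MODEL in MP3_TTS_MODELS else 'output.wav'
```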
1 change: 1 addition & 0 deletions voice_assistant/__init__.py
@@ -0,0 +1 @@
+# voice_assistant/__init__.py
10 changes: 7 additions & 3 deletions voice_assistant/api_key_manager.py
@@ -6,16 +6,20 @@
     "transcription":{
         "openai": Config.OPENAI_API_KEY,
         "groq": Config.GROQ_API_KEY,
-        "deepgram": Config.DEEPGRAM_API_KEY
+        "deepgram": Config.DEEPGRAM_API_KEY,
+        "assemblyai": Config.ASSEMBLYAI_API_KEY
     },
     "response":{
         "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "anthropic": Config.ANTHROPIC_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     },
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     }
 }

22 changes: 15 additions & 7 deletions voice_assistant/config.py
@@ -21,8 +21,8 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram'  # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai'  # possible values: openai, groq, ollama
+    TRANSCRIPTION_MODEL = 'deepgram'  # possible values: openai, groq, deepgram, fastwhisperapi, assemblyai
+    RESPONSE_MODEL = 'openai'  # possible values: openai, groq, ollama, anthropic, google
     TTS_MODEL = 'openai'  # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper

     # Piper Server configuration
@@ -33,9 +33,11 @@ class Config:
     # https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install

     # LLM Selection
-    OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    OLLAMA_LLM="llama3.1:8b"
+    GROQ_LLM="llama-3.1-8b-instant"
     OPENAI_LLM="gpt-4o"
+    ANTHROPIC_LLM="claude-3-5-sonnet-latest"
+    GOOGLE_LLM="gemini-1.5-flash-002"

     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -44,6 +46,9 @@
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+    ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")

     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -60,18 +65,21 @@ def validate_config():
             ValueError: If a required environment variable is not set.
         """
         Config._validate_model('TRANSCRIPTION_MODEL', [
-            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
+            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local', 'assemblyai'])
         Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'local', 'anthropic', 'google'])
         Config._validate_model('TTS_MODEL', [
-            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'google'])

         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
+        Config._validate_api_key('TRANSCRIPTION_MODEL', 'assemblyai', 'ASSEMBLYAI_API_KEY')

         Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'anthropic', 'ANTHROPIC_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'google', 'GOOGLE_API_KEY')

         Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
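For reviewers trying the new providers locally, a minimal sketch of the expected wiring (the placeholder key value is hypothetical, and validate_config is assumed to be callable directly on the class, as the existing Config._validate_* calls suggest):

```python
import os

# Hypothetical placeholder; real keys come from your environment or a .env file.
os.environ["ASSEMBLYAI_API_KEY"] = "your-assemblyai-key"

from voice_assistant.config import Config

Config.TRANSCRIPTION_MODEL = 'assemblyai'
Config.validate_config()  # raises ValueError if the selected model's key is unset
```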
67 changes: 66 additions & 1 deletion voice_assistant/response_generation.py
@@ -5,6 +5,8 @@
 from openai import OpenAI
 from groq import Groq
 import ollama
+import anthropic
+import google.generativeai as genai

 from voice_assistant.config import Config

@@ -29,6 +31,10 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
         return _generate_groq_response(api_key, chat_history)
     elif model == 'ollama':
         return _generate_ollama_response(chat_history)
+    elif model == 'anthropic':
+        return _generate_anthropic_response(api_key, chat_history)
+    elif model == 'google':
+        return _generate_google_response(api_key, chat_history)
     elif model == 'local':
         # Placeholder for local LLM response generation
         return "Generated response from local model"
@@ -61,4 +67,63 @@ def _generate_ollama_response(chat_history):
         model=Config.OLLAMA_LLM,
         messages=chat_history,
     )
-    return response['message']['content']
+    return response['message']['content']
+
+def _generate_anthropic_response(api_key, chat_history):
+    client = anthropic.Anthropic(api_key=api_key)
+    # Convert chat_history to the Anthropic format:
+    # messages is a list of {"role": "user" | "assistant", "content": "..."};
+    # the system message is passed separately via the `system` parameter.
+
+    system_prompt = ""
+    messages = []
+    for msg in chat_history:
+        if msg['role'] == 'system':
+            system_prompt = msg['content']
+        else:
+            messages.append(msg)
+
+    message = client.messages.create(
+        model=Config.ANTHROPIC_LLM,
+        max_tokens=1024,
+        system=system_prompt,
+        messages=messages
+    )
+    return message.content[0].text
+
+def _generate_google_response(api_key, chat_history):
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel(Config.GOOGLE_LLM)
+
+    # Google Generative AI (Gemini) chat history format:
+    # history = [
+    #     {"role": "user", "parts": "Hello"},
+    #     {"role": "model", "parts": "Great to meet you. What would you like to know?"}
+    # ]
+    # Gemini has no 'system' role in the history; a system prompt is passed as
+    # system_instruction when the model is created (supported on Gemini 1.5),
+    # so it is pulled out of chat_history and the model is re-created below.
+
+    system_instruction = None
+    history = []
+
+    last_message = chat_history[-1]['content']
+
+    # Prepare the history, excluding the last message, which is the new prompt,
+    # and extracting the system prompt along the way.
+
+    for msg in chat_history[:-1]:
+        if msg['role'] == 'system':
+            system_instruction = msg['content']
+        elif msg['role'] == 'user':
+            history.append({"role": "user", "parts": msg['content']})
+        elif msg['role'] == 'assistant':
+            history.append({"role": "model", "parts": msg['content']})
+
+    # Re-initialize the model with the system instruction if present
+    if system_instruction:
+        model = genai.GenerativeModel(Config.GOOGLE_LLM, system_instruction=system_instruction)
+
+    chat = model.start_chat(history=history)
+    response = chat.send_message(last_message)
+    return response.text
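A quick usage sketch of the two new branches (assuming the relevant keys are set; the chat history uses the OpenAI-style role/content dicts this module already passes around):

```python
from voice_assistant.config import Config
from voice_assistant.response_generation import generate_response

chat_history = [
    {"role": "system", "content": "You are a concise voice assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

# Anthropic: the system message is split out and sent via the `system` parameter.
print(generate_response('anthropic', Config.ANTHROPIC_API_KEY, chat_history))

# Google: roles are remapped to user/model "parts"; the system message becomes
# a system_instruction on the GenerativeModel.
print(generate_response('google', Config.GOOGLE_API_KEY, chat_history))
```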
21 changes: 13 additions & 8 deletions voice_assistant/text_to_speech.py
@@ -7,9 +7,10 @@
 import requests

 from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
 from elevenlabs.client import ElevenLabs
 from cartesia import Cartesia
+from gtts import gTTS

 from voice_assistant.config import Config
 from voice_assistant.local_tts_generation import generate_audio_file_melotts
@@ -41,11 +42,11 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca

     elif model == 'deepgram':
         client = DeepgramClient(api_key=api_key)
-        options = SpeakOptions(
-            model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-            encoding="linear16",
-            container="wav"
-        )
+        options = {
+            "model": "aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
+            "encoding": "linear16",
+            "container": "wav"
+        }
         SPEAK_OPTIONS = {"text": text}
         response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)

@@ -55,7 +56,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             text=text,
             voice="Paul J.",
             output_format="mp3_22050_32",
-            model="eleven_turbo_v2"
+            model="eleven_flash_v2"
         )
         elevenlabs.save(audio, output_file_path)

@@ -66,7 +67,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
         voice = client.voices.get(id=voice_id)

         # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-        model_id = "sonic-english"
+        model_id = "sonic-2"

         # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
         output_format = {
@@ -122,6 +123,10 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
         except Exception as e:
             logging.error(f"Piper TTS request failed: {e}")

+    elif model == 'google':
+        tts = gTTS(text=text, lang='en')
+        tts.save(output_file_path)
+
     elif model == 'local':
         with open(output_file_path, "wb") as f:
             f.write(b"Local TTS audio data")
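Note for reviewers: gTTS emits MP3, which is why run_voice_assistant.py now maps the 'google' TTS model to output.mp3. A minimal usage sketch (gTTS itself needs no API key, so passing None here is an assumption of this sketch):

```python
from voice_assistant.text_to_speech import text_to_speech

# The gTTS branch never touches api_key; None is a placeholder for this sketch.
text_to_speech(model='google', api_key=None, text="Hello from gTTS!",
               output_file_path='output.mp3')
```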
21 changes: 17 additions & 4 deletions voice_assistant/transcription.py
@@ -8,7 +8,8 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
+import assemblyai as aai

 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False
@@ -48,6 +49,8 @@ def transcribe_audio(model, api_key, audio_file_path, local_model_path=None):
         return _transcribe_with_deepgram(api_key, audio_file_path)
     elif model == 'fastwhisperapi':
         return _transcribe_with_fastwhisperapi(audio_file_path)
+    elif model == 'assemblyai':
+        return _transcribe_with_assemblyai(api_key, audio_file_path)
     elif model == 'local':
         # Placeholder for local STT model transcription
         return "Transcribed text from local model"
@@ -72,7 +75,7 @@ def _transcribe_with_groq(api_key, audio_file_path):
     client = Groq(api_key=api_key)
     with open(audio_file_path, "rb") as audio_file:
         transcription = client.audio.transcriptions.create(
-            model="whisper-large-v3",
+            model="whisper-large-v3-turbo",
             file=audio_file,
             language='en'
         )
@@ -86,7 +89,7 @@
         buffer_data = file.read()

     payload = {"buffer": buffer_data}
-    options = PrerecordedOptions(model="nova-2", smart_format=True)
+    options = {"model": "nova-2", "smart_format": True}
     response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
     data = json.loads(response.to_json())

@@ -112,4 +115,14 @@

     response = requests.post(endpoint, files=files, data=data, headers=headers)
     response_json = response.json()
-    return response_json.get('text', 'No text found in the response.')
+    return response_json.get('text', 'No text found in the response.')
+
+def _transcribe_with_assemblyai(api_key, audio_file_path):
+    aai.settings.api_key = api_key
+    transcriber = aai.Transcriber()
+    transcript = transcriber.transcribe(audio_file_path)
+
+    if transcript.status == aai.TranscriptStatus.error:
+        raise Exception(f"AssemblyAI Transcription Error: {transcript.error}")
+
+    return transcript.text
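A usage sketch for the new AssemblyAI path (the audio filename is a placeholder; transcribe() blocks until the transcript is ready, and the helper above raises on an error status):

```python
from voice_assistant.config import Config
from voice_assistant.transcription import transcribe_audio

# 'test.wav' is a hypothetical local file; the SDK also accepts URLs.
text = transcribe_audio('assemblyai', Config.ASSEMBLYAI_API_KEY, 'test.wav')
print(text)
```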