diff --git a/requirements.txt b/requirements.txt
index d533fbb..3ec8e2a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,4 +34,8 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
\ No newline at end of file
+pydub
+anthropic
+google-generativeai
+assemblyai
+gTTS
diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 418e1c0..1621f75 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})
 
             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'google':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'
diff --git a/voice_assistant/__init__.py b/voice_assistant/__init__.py
new file mode 100644
index 0000000..037dd1a
--- /dev/null
+++ b/voice_assistant/__init__.py
@@ -0,0 +1 @@
+# voice_assistant/__init__.py
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 68668e3..075d9a4 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -6,16 +6,20 @@
     "transcription":{
         "openai": Config.OPENAI_API_KEY,
         "groq": Config.GROQ_API_KEY,
-        "deepgram": Config.DEEPGRAM_API_KEY
+        "deepgram": Config.DEEPGRAM_API_KEY,
+        "assemblyai": Config.ASSEMBLYAI_API_KEY
     },
     "response":{
         "openai":Config.OPENAI_API_KEY,
-        "groq": Config.GROQ_API_KEY
+        "groq": Config.GROQ_API_KEY,
+        "anthropic": Config.ANTHROPIC_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     },
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "google": Config.GOOGLE_API_KEY
     }
 }
 
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 73fbc87..69c036d 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -21,8 +21,8 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
+    TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi, assemblyai
+    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama, anthropic, google
     TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
 
     # Piper Server configuration
@@ -33,9 +33,11 @@ class Config:
     # https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install
 
     # LLM Selection
-    OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    OLLAMA_LLM="llama3.1:8b"
+    GROQ_LLM="llama-3.1-8b-instant"
     OPENAI_LLM="gpt-4o"
+    ANTHROPIC_LLM="claude-3-5-sonnet-latest"
+    GOOGLE_LLM="gemini-1.5-flash-002"
 
     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -44,6 +46,9 @@ class Config:
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+    ASSEMBLYAI_API_KEY = os.getenv("ASSEMBLYAI_API_KEY")
 
     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -60,18 +65,21 @@ def validate_config():
             ValueError: If a required environment variable is not set.
         """
         Config._validate_model('TRANSCRIPTION_MODEL', [
-            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local'])
+            'openai', 'groq', 'deepgram', 'fastwhisperapi', 'local', 'assemblyai'])
         Config._validate_model('RESPONSE_MODEL', [
-            'openai', 'groq', 'ollama', 'local'])
+            'openai', 'groq', 'ollama', 'local', 'anthropic', 'google'])
         Config._validate_model('TTS_MODEL', [
-            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'google'])
 
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
+        Config._validate_api_key('TRANSCRIPTION_MODEL', 'assemblyai', 'ASSEMBLYAI_API_KEY')
 
         Config._validate_api_key('RESPONSE_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('RESPONSE_MODEL', 'groq', 'GROQ_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'anthropic', 'ANTHROPIC_API_KEY')
+        Config._validate_api_key('RESPONSE_MODEL', 'google', 'GOOGLE_API_KEY')
 
         Config._validate_api_key('TTS_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
diff --git a/voice_assistant/response_generation.py b/voice_assistant/response_generation.py
index e6e3891..f3f3ed0 100644
--- a/voice_assistant/response_generation.py
+++ b/voice_assistant/response_generation.py
@@ -5,6 +5,8 @@
 from openai import OpenAI
 from groq import Groq
 import ollama
+import anthropic
+import google.generativeai as genai
 
 from voice_assistant.config import Config
 
@@ -29,6 +31,10 @@ def generate_response(model:str, api_key:str, chat_history:list, local_model_pat
         return _generate_groq_response(api_key, chat_history)
     elif model == 'ollama':
         return _generate_ollama_response(chat_history)
+    elif model == 'anthropic':
+        return _generate_anthropic_response(api_key, chat_history)
+    elif model == 'google':
+        return _generate_google_response(api_key, chat_history)
     elif model == 'local':
         # Placeholder for local LLM response generation
         return "Generated response from local model"
@@ -61,4 +67,63 @@ def _generate_ollama_response(chat_history):
         model=Config.OLLAMA_LLM,
         messages=chat_history,
     )
-    return response['message']['content']
\ No newline at end of file
+    return response['message']['content']
+
+def _generate_anthropic_response(api_key, chat_history):
+    client = anthropic.Anthropic(api_key=api_key)
+    # Convert chat_history to Anthropic format if needed
+    # Anthropic expects: [{"role": "user", "content": "..."}] or [{"role": "assistant", "content": "..."}]
+    # System message is passed separately in `system` parameter.
+
+    system_prompt = ""
+    messages = []
+    for msg in chat_history:
+        if msg['role'] == 'system':
+            system_prompt = msg['content']
+        else:
+            messages.append(msg)
+
+    message = client.messages.create(
+        model=Config.ANTHROPIC_LLM,
+        max_tokens=1024,
+        system=system_prompt,
+        messages=messages
+    )
+    return message.content[0].text
+
+def _generate_google_response(api_key, chat_history):
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel(Config.GOOGLE_LLM)
+
+    # Google Generative AI (Gemini) chat history format:
+    # history = [
+    #     {"role": "user", "parts": "Hello"},
+    #     {"role": "model", "parts": "Great to meet you. What would you like to know?"}
+    # ]
+    # Note: 'system' role is supported via system_instruction in model creation,
+    # but for simplicity here we might prepend it to the first message or just use the chat structure.
+    # Gemini 1.5 supports system instructions.
+
+    system_instruction = None
+    history = []
+
+    last_message = chat_history[-1]['content']
+
+    # Prepare history excluding the last message which is the new prompt
+    # Also handle system prompt
+
+    for msg in chat_history[:-1]:
+        if msg['role'] == 'system':
+            system_instruction = msg['content']
+        elif msg['role'] == 'user':
+            history.append({"role": "user", "parts": msg['content']})
+        elif msg['role'] == 'assistant':
+            history.append({"role": "model", "parts": msg['content']})
+
+    # Re-initialize model with system instruction if present
+    if system_instruction:
+        model = genai.GenerativeModel(Config.GOOGLE_LLM, system_instruction=system_instruction)
+
+    chat = model.start_chat(history=history)
+    response = chat.send_message(last_message)
+    return response.text
\ No newline at end of file
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index be3ee96..a2216b8 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -7,9 +7,10 @@
 import requests
 
 from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
 from elevenlabs.client import ElevenLabs
 from cartesia import Cartesia
+from gtts import gTTS
 
 from voice_assistant.config import Config
 from voice_assistant.local_tts_generation import generate_audio_file_melotts
@@ -41,11 +42,11 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
 
         elif model == 'deepgram':
             client = DeepgramClient(api_key=api_key)
-            options = SpeakOptions(
-                model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-                encoding="linear16",
-                container="wav"
-            )
+            options = {
+                "model": "aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
+                "encoding": "linear16",
+                "container": "wav"
+            }
             SPEAK_OPTIONS = {"text": text}
             response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)
 
@@ -55,7 +56,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                 text=text,
                 voice="Paul J.",
                 output_format="mp3_22050_32",
-                model="eleven_turbo_v2"
+                model="eleven_flash_v2"
             )
             elevenlabs.save(audio, output_file_path)
 
@@ -66,7 +67,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             voice = client.voices.get(id=voice_id)
 
             # You can check out our models at https://docs.cartesia.ai/getting-started/available-models
-            model_id = "sonic-english"
+            model_id = "sonic-2"
 
             # You can find the supported `output_format`s at https://docs.cartesia.ai/api-reference/endpoints/stream-speech-server-sent-events
             output_format = {
@@ -122,6 +123,10 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             except Exception as e:
                 logging.error(f"Piper TTS request failed: {e}")
 
+        elif model == 'google':
+            tts = gTTS(text=text, lang='en')
+            tts.save(output_file_path)
+
         elif model == 'local':
             with open(output_file_path, "wb") as f:
                 f.write(b"Local TTS audio data")
diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index 2caa38b..004d2f7 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,8 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
+import assemblyai as aai
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False
@@ -48,6 +49,8 @@ def transcribe_audio(model, api_key, audio_file_path, local_model_path=None):
         return _transcribe_with_deepgram(api_key, audio_file_path)
     elif model == 'fastwhisperapi':
         return _transcribe_with_fastwhisperapi(audio_file_path)
+    elif model == 'assemblyai':
+        return _transcribe_with_assemblyai(api_key, audio_file_path)
     elif model == 'local':
         # Placeholder for local STT model transcription
         return "Transcribed text from local model"
@@ -72,7 +75,7 @@ def _transcribe_with_groq(api_key, audio_file_path):
     client = Groq(api_key=api_key)
     with open(audio_file_path, "rb") as audio_file:
         transcription = client.audio.transcriptions.create(
-            model="whisper-large-v3",
+            model="whisper-large-v3-turbo",
             file=audio_file,
             language='en'
         )
@@ -86,7 +89,7 @@ def _transcribe_with_deepgram(api_key, audio_file_path):
         buffer_data = file.read()
 
     payload = {"buffer": buffer_data}
-    options = PrerecordedOptions(model="nova-2", smart_format=True)
+    options = {"model": "nova-2", "smart_format": True}
     response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
 
     data = json.loads(response.to_json())
@@ -112,4 +115,14 @@ def _transcribe_with_fastwhisperapi(audio_file_path):
 
     response = requests.post(endpoint, files=files, data=data, headers=headers)
     response_json = response.json()
-    return response_json.get('text', 'No text found in the response.')
\ No newline at end of file
+    return response_json.get('text', 'No text found in the response.')
+
+def _transcribe_with_assemblyai(api_key, audio_file_path):
+    aai.settings.api_key = api_key
+    transcriber = aai.Transcriber()
+    transcript = transcriber.transcribe(audio_file_path)
+
+    if transcript.status == aai.TranscriptStatus.error:
+        raise Exception(f"AssemblyAI Transcription Error: {transcript.error}")
+
+    return transcript.text
\ No newline at end of file
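
A minimal end-to-end smoke test for the providers wired up in this patch. This is a sketch, not part of the diff: it assumes the signatures visible in the hunks above (transcribe_audio, generate_response, text_to_speech, and a class-level validate_config), that ASSEMBLYAI_API_KEY / ANTHROPIC_API_KEY / GOOGLE_API_KEY are exported in the environment, and a hypothetical local recording test.wav.

from voice_assistant.config import Config
from voice_assistant.transcription import transcribe_audio
from voice_assistant.response_generation import generate_response
from voice_assistant.text_to_speech import text_to_speech

# Route each stage through one of the newly added providers.
Config.TRANSCRIPTION_MODEL = 'assemblyai'
Config.RESPONSE_MODEL = 'anthropic'   # or 'google'
Config.TTS_MODEL = 'google'           # gTTS writes MP3, matching the output.mp3 branch above
Config.validate_config()

# 'test.wav' is a placeholder clip; the trailing None is the unused local_model_path.
user_text = transcribe_audio(Config.TRANSCRIPTION_MODEL, Config.ASSEMBLYAI_API_KEY, 'test.wav', None)
reply = generate_response(Config.RESPONSE_MODEL, Config.ANTHROPIC_API_KEY,
                          [{"role": "system", "content": "You are a helpful assistant."},
                           {"role": "user", "content": user_text}], None)
text_to_speech(Config.TTS_MODEL, Config.GOOGLE_API_KEY, reply, 'output.mp3', None)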