diff --git a/README.md b/README.md
index bf834a5..d67360b 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ Edit config.py to select the models you want to use:
     # Model selection
     TRANSCRIPTION_MODEL = 'groq'   # Options: 'openai', 'groq', 'deepgram', 'fastwhisperapi' 'local'
     RESPONSE_MODEL = 'groq'        # Options: 'openai', 'groq', 'ollama', 'local'
-    TTS_MODEL = 'deepgram'         # Options: 'openai', 'deepgram', 'elevenlabs', 'local', 'melotts'
+    TTS_MODEL = 'deepgram'         # Options: 'openai', 'deepgram', 'elevenlabs', 'cartesia', 'local', 'melotts', 'fastxttsapi'
 
     # API keys and paths
     OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -130,6 +130,9 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
    ```shell
    fastapi run main.py
    ```
+   > **Note:** To allow Verbi to query the FastWhisperAPI for transcription, ensure that the FastAPI server is kept running in a separate terminal.
+
+
    ***Alternative Setup and Run Methods***
 
    The API can also run directly on a Docker container or in Google Colab.
@@ -166,6 +169,22 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
    ```
    You can run the main file to start using verbi with local models.
 
+9. 🔊 **Alternative Local TTS - Coqui XTTS v2**
+
+   _Optional step if you need a local Text-to-Speech model_
+
+   ***Install Coqui XTTS from GitHub***
+
+   To set up the TTS server, follow the instructions in the [FastXttsAPI](https://github.com/3choff/FastXttsAPI) repository.
+
+   ***Usage***
+
+   The multilingual TTS model is queried through a FastAPI app that exposes a "/v1/speech" endpoint for speech generation, with both streaming and non-streaming responses. To list all 62 available studio voices, query the "/voices" endpoint. Performance is best when the model runs in a Docker container, but the server can also be run in the [Google Colab](https://github.com/3choff/FastXttsAPI/blob/main/FastXttsAPI_notebook.ipynb) notebook provided in the repository.
+
+   > **Note:** To allow Verbi to query the FastXttsAPI for speech synthesis, ensure that the FastAPI server is kept running in a separate terminal.
+
+   > **Fun Tip:** Explore voice cloning with FastXttsAPI! You can clone a voice using an audio clip of at least 10 seconds. Simply add the voice’s embedding to the 'studio_speakers' folder in FastXttsAPI, and enjoy interacting with a personalized Verbi chatbot.
+
 
 ## Model Options ⚙️
 
 #### Transcription Models 🎤
diff --git a/requirements.txt b/requirements.txt
index 9b7025e..816c596 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,6 @@ python-dotenv
 colorama
 requests
 keyboard
-elevenlabs
 fastapi
 uvicorn
 numpy
diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 23ba620..bb86b77 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -72,11 +72,12 @@ def main():
         text_to_speech(Config.TTS_MODEL, tts_api_key, response_text, output_file, Config.LOCAL_MODEL_PATH)
 
         # Play the generated speech audio
-        play_audio(output_file)
-
-        # Clean up audio files
-        # delete_file(Config.INPUT_AUDIO)
-        # delete_file(output_file)
+        if Config.TTS_MODEL not in ['fastxttsapi', 'elevenlabs']:
+            play_audio(output_file)
+
+        # Clean up audio files
+        # delete_file(Config.INPUT_AUDIO)
+        # delete_file(output_file)
 
     except Exception as e:
         logging.error(Fore.RED + f"An error occurred: {e}" + Fore.RESET)
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 176ccfd..2a8a5b7 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -43,4 +43,6 @@ def get_tts_api_key():
         return Config.DEEPGRAM_API_KEY
     elif Config.TTS_MODEL == 'elevenlabs':
         return Config.ELEVENLABS_API_KEY
+    elif Config.TTS_MODEL == 'cartesia':
+        return Config.CARTESIA_API_KEY
     return None
diff --git a/voice_assistant/audio.py b/voice_assistant/audio.py
index 5f90212..273868e 100644
--- a/voice_assistant/audio.py
+++ b/voice_assistant/audio.py
@@ -5,9 +5,16 @@ import time
 import logging
 import pydub
+import subprocess
+import shutil
 from io import BytesIO
 from pydub import AudioSegment
 
+
+def is_installed(command):
+    from shutil import which
+    return which(command) is not None
+
 
 def record_audio(file_path, timeout=10, phrase_time_limit=None, retries=3, energy_threshold=2000, pause_threshold=1, phrase_threshold=0.1, dynamic_energy_threshold=True, calibration_duration=1):
     """
     Record audio from the microphone and save it as an MP3 file.
@@ -67,4 +74,32 @@ def play_audio(file_path):
     except pygame.error as e:
         logging.error(f"Failed to play audio: {e}")
     except Exception as e:
-        logging.error(f"An unexpected error occurred while playing audio: {e}")
\ No newline at end of file
+        logging.error(f"An unexpected error occurred while playing audio: {e}")
+def play_audio_stream(audio_stream):
+    """
+    Play an audio stream using ffplay.
+
+    Args:
+        audio_stream (generator): The audio stream to play.
+    """
+    # Use subprocess to pipe the audio data to ffplay and play it
+    if not is_installed("ffplay"):
+        raise ValueError("ffplay not found, necessary to stream audio.")
+    ffplay_cmd = ['ffplay', '-probesize', '512', '-autoexit', '-', "-nodisp"]
+    ffplay_proc = subprocess.Popen(
+        ffplay_cmd,
+        stdin=subprocess.PIPE,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+
+    try:
+        for chunk in audio_stream:
+            ffplay_proc.stdin.write(chunk)
+            # print("Received and played a chunk")
+    except Exception as e:
+        logging.error(f"An error occurred: {e}")
+    finally:
+        if ffplay_proc.stdin:
+            ffplay_proc.stdin.close()
+        ffplay_proc.wait()
\ No newline at end of file
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 8d55241..db5c09f 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -13,7 +13,7 @@ class Config:
     Attributes:
         TRANSCRIPTION_MODEL (str): The model to use for transcription ('openai', 'groq', 'deepgram', 'fastwhisperapi', 'local').
        RESPONSE_MODEL (str): The model to use for response generation ('openai', 'groq', 'local').
-        TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'local').
+        TTS_MODEL (str): The model to use for text-to-speech ('openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local').
         OPENAI_API_KEY (str): API key for OpenAI services.
         GROQ_API_KEY (str): API key for Groq services.
         DEEPGRAM_API_KEY (str): API key for Deepgram services.
@@ -22,8 +22,8 @@ class Config:
     """
     # Model selection
     TRANSCRIPTION_MODEL = 'deepgram'    # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'ollama'           # possible values: openai, groq, ollama
-    TTS_MODEL = 'deepgram'              # possible values: openai, deepgram, elevenlabs, melotts, cartesia
+    RESPONSE_MODEL = 'groq'             # possible values: openai, groq, ollama
+    TTS_MODEL = 'fastxttsapi'           # possible values: openai, deepgram, elevenlabs, melotts, cartesia, fastxttsapi
 
     # currently using the MeloTTS for local models. here is how to get started:
     # https://github.com/myshell-ai/MeloTTS/blob/main/docs/install.md#linux-and-macos-install
@@ -59,8 +59,8 @@ def validate_config():
         raise ValueError("Invalid TRANSCRIPTION_MODEL. Must be one of ['openai', 'groq', 'deepgram', 'fastwhisperapi', 'local']")
     if Config.RESPONSE_MODEL not in ['openai', 'groq', 'ollama', 'local']:
         raise ValueError("Invalid RESPONSE_MODEL. Must be one of ['openai', 'groq', 'local']")
-    if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']:
-        raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local']")
+    if Config.TTS_MODEL not in ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']:
+        raise ValueError("Invalid TTS_MODEL. Must be one of ['openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'fastxttsapi', 'local']")
 
     if Config.TRANSCRIPTION_MODEL == 'openai' and not Config.OPENAI_API_KEY:
         raise ValueError("OPENAI_API_KEY is required for OpenAI models")
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index 9475e46..07b1190 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -1,11 +1,11 @@
 # voice_assistant/text_to_speech.py
 
 import logging
-import elevenlabs
+import requests
 from openai import OpenAI
 from deepgram import DeepgramClient, SpeakOptions
-from elevenlabs.client import ElevenLabs
 from cartesia.tts import CartesiaTTS
+from voice_assistant.audio import play_audio_stream
 import soundfile as sf
 import json
 
@@ -17,7 +17,7 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None):
     Convert text to speech using the specified model.
 
     Args:
-        model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'local').
+        model (str): The model to use for TTS ('openai', 'deepgram', 'elevenlabs', 'cartesia', 'melotts', 'fastxttsapi', 'local').
         api_key (str): The API key for the TTS service.
         text (str): The text to convert to speech.
         output_file_path (str): The path to save the generated speech audio file.
@@ -48,11 +48,24 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None):
         response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)
     elif model == 'elevenlabs':
         ELEVENLABS_VOICE_ID = "Paul J."
-        client = ElevenLabs(api_key=api_key)
-        audio = client.generate(
-            text=text, voice=ELEVENLABS_VOICE_ID, output_format="mp3_22050_32", model="eleven_turbo_v2"
-        )
-        elevenlabs.save(audio, output_file_path)
+        ELEVENLABS_URL = f'https://api.elevenlabs.io/v1/text-to-speech/{ELEVENLABS_VOICE_ID}/stream'
+        headers = {
+            'accept': '*/*',
+            'xi-api-key': api_key,
+            'Content-Type': 'application/json'
+        }
+        data = {
+            'text': text,
+            'voice_settings': {
+                'stability': 0.50,
+                'similarity_boost': 0.75
+            },
+            "output_format": "mp3_22050_32"
+        }
+
+        with requests.post(ELEVENLABS_URL, headers=headers, json=data, stream=True) as r:
+            audio_stream = r.iter_content(chunk_size=512)
+            play_audio_stream(audio_stream)
     elif model == "cartesia":
         # config
         with open('Barbershop Man.json') as f:
@@ -73,9 +86,20 @@ def text_to_speech(model, api_key, text, output_file_path, local_model_path=None):
             rate = output["sampling_rate"]
 
             sf.write(output_file_path, buffer, rate)
-
     elif model == "melotts": # this is a local model
         generate_audio_file_melotts(text=text, filename=output_file_path)
+    elif model == "fastxttsapi":
+        # Set the URL for FastXttsAPI; change it to the address where the API is running, either locally or on a server
+        FASTXTTSAPI_URL = 'https://localhost:8000'
+        payload = {
+            "text": text,
+            "language": "en",
+            "voice": "Dionisio Schuyler",  # Query the endpoint https://localhost:8000/voices to get the list of available voices
+            "stream": True,
+        }
+        with requests.post(FASTXTTSAPI_URL + "/v1/speech", json=payload, verify=False) as r:
+            audio_stream = r.iter_content(chunk_size=512)
+            play_audio_stream(audio_stream)
     elif model == 'local':
         # Placeholder for local TTS model
         with open(output_file_path, "wb") as f:
diff --git a/voice_samples/sample1.mp3 b/voice_samples/ElevenLabs_sample1.mp3
similarity index 100%
rename from voice_samples/sample1.mp3
rename to voice_samples/ElevenLabs_sample1.mp3
diff --git a/voice_samples/sample2.mp3 b/voice_samples/ElevenLabs_sample2.mp3
similarity index 100%
rename from voice_samples/sample2.mp3
rename to voice_samples/ElevenLabs_sample2.mp3
diff --git a/voice_samples/sample3.mp3 b/voice_samples/ElevenLabs_sample3.mp3
similarity index 100%
rename from voice_samples/sample3.mp3
rename to voice_samples/ElevenLabs_sample3.mp3
diff --git a/voice_samples/Xtts_sample1.wav b/voice_samples/Xtts_sample1.wav
new file mode 100644
index 0000000..ca512f8
Binary files /dev/null and b/voice_samples/Xtts_sample1.wav differ
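
For anyone trying the new `fastxttsapi` path on its own, here is a minimal sketch (not part of the diff) of the two endpoints the README addition describes. It assumes the FastXttsAPI server is reachable at https://localhost:8000 with a self-signed certificate (hence `verify=False`), that `voice_assistant.audio.play_audio_stream` is importable from the repository root, and the voice name is only an example:

```python
# Minimal sketch: query FastXttsAPI directly, outside of Verbi.
# Assumes the server runs at https://localhost:8000, matching text_to_speech.py above.
import requests

from voice_assistant.audio import play_audio_stream  # ffplay-based streaming playback

FASTXTTSAPI_URL = "https://localhost:8000"  # change to wherever the API is running

# "/voices" lists the available studio voices.
voices = requests.get(f"{FASTXTTSAPI_URL}/voices", verify=False).json()
print(voices)

# "/v1/speech" generates speech; "stream": True in the payload asks the API to
# stream, and stream=True on the request keeps chunks flowing into ffplay.
payload = {
    "text": "Hello from Verbi!",
    "language": "en",
    "voice": "Dionisio Schuyler",  # any voice returned by /voices
    "stream": True,
}
with requests.post(f"{FASTXTTSAPI_URL}/v1/speech", json=payload, verify=False, stream=True) as r:
    play_audio_stream(r.iter_content(chunk_size=512))
```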
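
Because the streaming backends ('elevenlabs' and 'fastxttsapi') bypass pygame and pipe audio into ffplay, ffplay must be on the PATH. A small pre-flight check, sketched with the same `shutil.which` approach as the `is_installed` helper added in voice_assistant/audio.py:

```python
# Sketch of a pre-flight check for the streaming TTS backends (assumed usage, not in the diff).
from shutil import which

from voice_assistant.config import Config

if Config.TTS_MODEL in ('elevenlabs', 'fastxttsapi') and which("ffplay") is None:
    raise RuntimeError("ffplay (part of FFmpeg) is required to stream TTS audio; install it first.")
```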