diff --git a/demos/audio/README.md b/demos/audio/README.md index 963ced134a..4ed083ac2f 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -45,6 +45,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh > **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance. +### Speaker embeddings + +Instead of generating speech with the default model voice, you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py): +```bash +curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py" +python create_speaker_embedding.py +mv speaker_embedding.bin models/ +``` +The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding. 
+Now you need to add speaker embedding path to graph.pbtxt file of text2speech graph: +``` +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "T2sExecutor" + input_side_packet: "TTS_NODE_RESOURCES:t2s_servable" + calculator: "T2sCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { + models_path: "./", + plugin_config: '{ "NUM_STREAMS": "1" }', + target_device: "CPU", + voices: [ + { + name: "voice", + path: "/models/speaker_embedding.bin", + } + ] + } + } +} +``` + ### Deployment **CPU** diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc index e943251421..f43da70e42 100644 --- a/src/audio/text_to_speech/t2s_calculator.cc +++ b/src/audio/text_to_speech/t2s_calculator.cc @@ -103,8 +103,24 @@ class T2sCalculator : public CalculatorBase { if (streamIt != payload.parsedJson->MemberEnd()) { return absl::InvalidArgumentError("streaming is not supported"); } + // Map the optional "voice" request field to a loaded speaker embedding; an absent or non-string field selects the default voice. + auto speakerIt = pipe->voices.end(); + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { + speakerIt = pipe->voices.find(voiceIt->value.GetString()); + if (speakerIt == pipe->voices.end()) { + return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceIt->value.GetString())); + } + } + ov::genai::Text2SpeechDecodedResults generatedSpeech; std::unique_lock lock(pipe->ttsPipelineMutex); - auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); + + // Reuse the iterator found above instead of looking the voice up a second time. + if (speakerIt != pipe->voices.end()) { + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerIt->second); + } else { + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); + } auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth(); auto speechSize 
= generatedSpeech.speeches[0].get_size(); ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape()); diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto index 47845d9028..efea722c3d 100644 --- a/src/audio/text_to_speech/t2s_calculator.proto +++ b/src/audio/text_to_speech/t2s_calculator.proto @@ -31,4 +31,13 @@ message T2sCalculatorOptions { required string models_path = 1; optional string target_device = 2; optional string plugin_config = 3; + + message SpeakerEmbeddings { + // Speaker name. + required string name = 1; + + // Path to speaker embeddings file. + required string path = 2; + } + repeated SpeakerEmbeddings voices = 4; } diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp index 38dce75f00..465930079a 100644 --- a/src/audio/text_to_speech/t2s_servable.hpp +++ b/src/audio/text_to_speech/t2s_servable.hpp @@ -34,12 +34,34 @@ namespace ovms { +// Reads a speaker embedding stored as exactly 512 raw floats and returns it as an f32 tensor of shape {1, 512}. +// OPENVINO_ASSERT fails if the file cannot be opened, has an unexpected size, or cannot be read completely. +inline ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) { + std::ifstream input(file_path, std::ios::binary); + OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string()); + // Get file size + input.seekg(0, std::ios::end); + size_t buffer_size = static_cast<size_t>(input.tellg()); + input.seekg(0, std::ios::beg); + // Check size is multiple of float + OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size."); + size_t num_floats = buffer_size / sizeof(float); + OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats."); + + ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats}); + input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size); + // Check the stream only after reading, so that a short or failed read is reported. + OPENVINO_ASSERT(input, "Failed to read all data from file."); + return floats_tensor; +} + struct TtsServable { std::filesystem::path parsedModelsPath; std::shared_ptr ttsPipeline; std::mutex 
ttsPipelineMutex; + std::unordered_map voices; - TtsServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField& graphVoices, const std::string& graphPath) { auto fsModelsPath = std::filesystem::path(modelDir); if (fsModelsPath.is_relative()) { parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); @@ -47,6 +69,13 @@ struct TtsServable { parsedModelsPath = fsModelsPath.string(); } ttsPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + // Load every speaker embedding listed in the graph options, keyed by voice name; a missing file aborts construction. + for (const auto& voice : graphVoices) { + if (!std::filesystem::exists(voice.path())) { + throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()}; + } + voices[voice.name()] = read_speaker_embedding(voice.path()); + } } }; diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index a4323806ae..2df634d3f9 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -608,7 +608,7 @@ Status MediapipeGraphDefinition::initializeNodes() { } mediapipe::T2sCalculatorOptions nodeOptions; config.node(i).node_options(0).UnpackTo(&nodeOptions); - std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath()); ttsServableMap.insert(std::pair>(nodeName, std::move(servable))); ttsServablesCleaningGuard.disableCleaning(); }