From 67788d2961bf2fd440646592baef2e9720a245e2 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 25 Nov 2025 12:56:11 +0100 Subject: [PATCH 1/3] Add speaker embeddings support --- src/audio/text_to_speech/t2s_calculator.cc | 33 +++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc index e943251421..d909c7db20 100644 --- a/src/audio/text_to_speech/t2s_calculator.cc +++ b/src/audio/text_to_speech/t2s_calculator.cc @@ -51,6 +51,27 @@ namespace mediapipe { const std::string TTS_SESSION_SIDE_PACKET_TAG = "TTS_NODE_RESOURCES"; +ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) { + std::ifstream input(file_path, std::ios::binary); + OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string()); + + // Get file size + input.seekg(0, std::ios::end); + size_t buffer_size = static_cast(input.tellg()); + input.seekg(0, std::ios::beg); + + // Check size is multiple of float + OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size."); + size_t num_floats = buffer_size / sizeof(float); + OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats."); + + OPENVINO_ASSERT(input, "Failed to read all data from file."); + ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats}); + input.read(reinterpret_cast(floats_tensor.data()), buffer_size); + + return floats_tensor; +} + class T2sCalculator : public CalculatorBase { static const std::string INPUT_TAG_NAME; static const std::string OUTPUT_TAG_NAME; @@ -103,8 +124,18 @@ class T2sCalculator : public CalculatorBase { if (streamIt != payload.parsedJson->MemberEnd()) { return absl::InvalidArgumentError("streaming is not supported"); } + ov::genai::Text2SpeechDecodedResults generatedSpeech; + std::string voiceEmbeddingsPath = std::string(pipe->parsedModelsPath.c_str()) +
std::string("speaker_embedding.bin"); std::unique_lock lock(pipe->ttsPipelineMutex); - auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); + if(std::filesystem::exists(voiceEmbeddingsPath)){ + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings file found"); + auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath); + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding); + } + else{ + SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings not found"); + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); + } auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth(); auto speechSize = generatedSpeech.speeches[0].get_size(); ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape()); From a8930808421f2fc8c0978c975caeac164ba81633 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 12 Dec 2025 09:41:45 +0100 Subject: [PATCH 2/3] fix --- demos/audio/README.md | 35 +++++++++++++++++++ src/audio/text_to_speech/t2s_calculator.cc | 19 +++++++--- src/audio/text_to_speech/t2s_calculator.proto | 9 +++++ src/audio/text_to_speech/t2s_servable.hpp | 7 +++- .../mediapipegraphdefinition.cpp | 2 +- 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/demos/audio/README.md b/demos/audio/README.md index 963ced134a..4ed083ac2f 100644 --- a/demos/audio/README.md +++ b/demos/audio/README.md @@ -45,6 +45,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh > **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance. 
+### Speaker embeddings + +Instead of generating speech with the default model voice, you can create a speaker embedding with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py): +```bash +curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py" +python create_speaker_embedding.py +mv speaker_embedding.bin models/ +``` +The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding. +Next, add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph: +``` +input_stream: "HTTP_REQUEST_PAYLOAD:input" +output_stream: "HTTP_RESPONSE_PAYLOAD:output" +node { + name: "T2sExecutor" + input_side_packet: "TTS_NODE_RESOURCES:t2s_servable" + calculator: "T2sCalculator" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + node_options: { + [type.googleapis.com / mediapipe.T2sCalculatorOptions]: { + models_path: "./", + plugin_config: '{ "NUM_STREAMS": "1" }', + target_device: "CPU", + voices: [ + { + name: "voice", + path: "/models/speaker_embedding.bin", + } + ] + } + } +} +``` + ### Deployment **CPU** diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc index d909c7db20..135c4ff49e 100644 --- a/src/audio/text_to_speech/t2s_calculator.cc +++ b/src/audio/text_to_speech/t2s_calculator.cc @@ -124,16 +124,27 @@ class T2sCalculator : public CalculatorBase { if (streamIt != payload.parsedJson->MemberEnd()) { return absl::InvalidArgumentError("streaming is not supported"); } + std::optional voice; + auto voiceIt = payload.parsedJson->FindMember("voice"); + if (voiceIt != payload.parsedJson->MemberEnd()
&& voiceIt->value.IsString()) { + voice = voiceIt->value.GetString(); + } + std::string voiceEmbeddingsPath; + if(voice.has_value()){ + if (pipe->voices.find(voice.value()) == pipe->voices.end()) + return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri)); + if (!std::filesystem::exists(pipe->voices[voice.value()])) + return absl::InvalidArgumentError(absl::StrCat("Requested voice speaker embeddings file does not exist: ", pipe->voices[voice.value()])); + voiceEmbeddingsPath = pipe->voices[voice.value()]; + } ov::genai::Text2SpeechDecodedResults generatedSpeech; - std::string voiceEmbeddingsPath = std::string(pipe->parsedModelsPath.c_str()) + std::string("speaker_embedding.bin"); std::unique_lock lock(pipe->ttsPipelineMutex); - if(std::filesystem::exists(voiceEmbeddingsPath)){ - SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings file found"); + + if(voice.has_value()){ auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath); generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding); } else{ - SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Voice embeddings not found"); generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); } auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth(); diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto index 47845d9028..efea722c3d 100644 --- a/src/audio/text_to_speech/t2s_calculator.proto +++ b/src/audio/text_to_speech/t2s_calculator.proto @@ -31,4 +31,13 @@ message T2sCalculatorOptions { required string models_path = 1; optional string target_device = 2; optional string plugin_config = 3; + + message SpeakerEmbeddings { + // Speaker name. + required string name = 1; + + // Path to speaker embeddings file. 
+ required string path = 2; + } + repeated SpeakerEmbeddings voices = 4; } diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp index 38dce75f00..581c08e75e 100644 --- a/src/audio/text_to_speech/t2s_servable.hpp +++ b/src/audio/text_to_speech/t2s_servable.hpp @@ -38,8 +38,9 @@ struct TtsServable { std::filesystem::path parsedModelsPath; std::shared_ptr ttsPipeline; std::mutex ttsPipelineMutex; + std::unordered_map voices; - TtsServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) { + TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField& graphVoices, const std::string& graphPath) { auto fsModelsPath = std::filesystem::path(modelDir); if (fsModelsPath.is_relative()) { parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); @@ -47,6 +48,10 @@ struct TtsServable { parsedModelsPath = fsModelsPath.string(); } ttsPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + for(auto voice : graphVoices){ + voices[voice.name()] = voice.path(); + SPDLOG_ERROR("{} : {}", voice.name(), voice.path()); + } } }; diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index a4323806ae..2df634d3f9 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -608,7 +608,7 @@ Status MediapipeGraphDefinition::initializeNodes() { } mediapipe::T2sCalculatorOptions nodeOptions; config.node(i).node_options(0).UnpackTo(&nodeOptions); - std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath()); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath()); ttsServableMap.insert(std::pair>(nodeName, std::move(servable))); 
ttsServablesCleaningGuard.disableCleaning(); } From ec35a1ae03ddcfb3d3b060c59acd47968aeb2b57 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 12 Dec 2025 16:16:36 +0100 Subject: [PATCH 3/3] fix --- src/audio/text_to_speech/t2s_calculator.cc | 28 +--------------------- src/audio/text_to_speech/t2s_servable.hpp | 28 +++++++++++++++++++--- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc index 135c4ff49e..f43da70e42 100644 --- a/src/audio/text_to_speech/t2s_calculator.cc +++ b/src/audio/text_to_speech/t2s_calculator.cc @@ -51,27 +51,6 @@ namespace mediapipe { const std::string TTS_SESSION_SIDE_PACKET_TAG = "TTS_NODE_RESOURCES"; -ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) { - std::ifstream input(file_path, std::ios::binary); - OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string()); - - // Get file size - input.seekg(0, std::ios::end); - size_t buffer_size = static_cast(input.tellg()); - input.seekg(0, std::ios::beg); - - // Check size is multiple of float - OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size."); - size_t num_floats = buffer_size / sizeof(float); - OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats."); - - OPENVINO_ASSERT(input, "Failed to read all data from file."); - ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats}); - input.read(reinterpret_cast(floats_tensor.data()), buffer_size); - - return floats_tensor; -} - class T2sCalculator : public CalculatorBase { static const std::string INPUT_TAG_NAME; static const std::string OUTPUT_TAG_NAME; @@ -129,20 +108,15 @@ class T2sCalculator : public CalculatorBase { if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) { voice = voiceIt->value.GetString(); } - std::string voiceEmbeddingsPath; if(voice.has_value()){ if 
(pipe->voices.find(voice.value()) == pipe->voices.end()) return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri)); - if (!std::filesystem::exists(pipe->voices[voice.value()])) - return absl::InvalidArgumentError(absl::StrCat("Requested voice speaker embeddings file does not exist: ", pipe->voices[voice.value()])); - voiceEmbeddingsPath = pipe->voices[voice.value()]; } ov::genai::Text2SpeechDecodedResults generatedSpeech; std::unique_lock lock(pipe->ttsPipelineMutex); if(voice.has_value()){ - auto speakerEmbedding = read_speaker_embedding(voiceEmbeddingsPath); - generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding); + generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voice.value()]); } else{ generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString()); diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp index 581c08e75e..465930079a 100644 --- a/src/audio/text_to_speech/t2s_servable.hpp +++ b/src/audio/text_to_speech/t2s_servable.hpp @@ -34,11 +34,32 @@ namespace ovms { +static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) { + std::ifstream input(file_path, std::ios::binary); + OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string()); + + // Get file size + input.seekg(0, std::ios::end); + size_t buffer_size = static_cast(input.tellg()); + input.seekg(0, std::ios::beg); + + // Check size is multiple of float + OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size."); + size_t num_floats = buffer_size / sizeof(float); + OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats."); + + OPENVINO_ASSERT(input, "Failed to read all data from file."); + ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats}); + input.read(reinterpret_cast(floats_tensor.data()), 
buffer_size); + + return floats_tensor; +} + struct TtsServable { std::filesystem::path parsedModelsPath; std::shared_ptr ttsPipeline; std::mutex ttsPipelineMutex; - std::unordered_map voices; + std::unordered_map voices; TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField& graphVoices, const std::string& graphPath) { auto fsModelsPath = std::filesystem::path(modelDir); @@ -49,8 +70,9 @@ struct TtsServable { } ttsPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); for(auto voice : graphVoices){ - voices[voice.name()] = voice.path(); - SPDLOG_ERROR("{} : {}", voice.name(), voice.path()); + if (!std::filesystem::exists(voice.path())) + throw std::runtime_error{"Requested voice speaker embeddings file does not exist."}; + voices[voice.name()] = read_speaker_embedding(voice.path()); } } };