openvinotoolkit · michalkulakowski · Nov 25, 2025 · Dec 12, 2025 · Dec 12, 2025
diff --git a/demos/audio/README.md b/demos/audio/README.md
@@ -45,6 +45,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh
 
 > **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance.
 
+### Speaker embeddings
+
+Instead of generating speech with the default model voice, you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
+```bash
+curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
+python create_speaker_embedding.py
+mv speaker_embedding.bin models/
+```
+The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
+Next, add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
+```
+input_stream: "HTTP_REQUEST_PAYLOAD:input"
+output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+node {
+  name: "T2sExecutor"
+  input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
+  calculator: "T2sCalculator"
+  input_stream: "HTTP_REQUEST_PAYLOAD:input"
+  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+  node_options: {
+    [type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
+      models_path: "./",
+      plugin_config: '{ "NUM_STREAMS": "1" }',
+      target_device: "CPU",
+      voices: [
+        {
+          name: "voice",
+          path: "/models/speaker_embedding.bin",
+        }
+      ]
+    }
+  }
+}
+```
+
 ### Deployment
 
 **CPU**

diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc
@@ -103,8 +103,24 @@ class T2sCalculator : public CalculatorBase {
             if (streamIt != payload.parsedJson->MemberEnd()) {
                 return absl::InvalidArgumentError("streaming is not supported");
             }
+            std::optional<std::string> voice;
+            auto voiceIt = payload.parsedJson->FindMember("voice");
+            if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
+                voice = voiceIt->value.GetString();
+            }
+            if(voice.has_value()){
+                if (pipe->voices.find(voice.value()) == pipe->voices.end())
+                    return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri));
+            }
+            ov::genai::Text2SpeechDecodedResults generatedSpeech;
             std::unique_lock lock(pipe->ttsPipelineMutex);
-            auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
+
+            if(voice.has_value()){
+                generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voice.value()]);
+            }
+            else{
+                generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
+            }
             auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
             auto speechSize = generatedSpeech.speeches[0].get_size();
             ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());

diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto
@@ -31,4 +31,13 @@ message T2sCalculatorOptions {
     required string models_path = 1;
     optional string target_device = 2;
     optional string plugin_config = 3;
+
+    // A named voice: pairs a speaker name with the file holding its speaker embedding.
+    message SpeakerEmbeddings {
+      // Speaker name; identifies this voice among the entries of `voices`.
+      required string name = 1;
+
+      // Path to the file that stores this speaker's embedding.
+      required string path = 2;
+    }
+    repeated SpeakerEmbeddings voices = 4;
 }
diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp
@@ -34,19 +34,46 @@
 
 namespace ovms {
 
+// Reads a speaker embedding stored as raw 32-bit floats into an f32 tensor of shape {1, 512}.
+//
+// Each validation step is enforced with OPENVINO_ASSERT: the file must open, its size must be
+// obtainable, it must hold exactly 512 floats, and the whole payload must be read successfully.
+//
+// file_path: location of the binary embedding file.
+// Returns the tensor filled with the file contents.
+static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
+    std::ifstream input(file_path, std::ios::binary);
+    OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string());
+
+    // Get file size
+    input.seekg(0, std::ios::end);
+    const std::streamoff file_size = input.tellg();
+    input.seekg(0, std::ios::beg);
+    // tellg() reports -1 on failure; reject it before it is converted to an unsigned size.
+    OPENVINO_ASSERT(input && file_size >= 0, "Failed to determine size of file: " + file_path.string());
+    const size_t buffer_size = static_cast<size_t>(file_size);
+
+    // Check size is multiple of float
+    OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size.");
+    const size_t num_floats = buffer_size / sizeof(float);
+    OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats.");
+
+    ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
+    input.read(reinterpret_cast<char*>(floats_tensor.data()), static_cast<std::streamsize>(buffer_size));
+    // The stream must be checked after the read: extracting fewer bytes than requested sets failbit.
+    OPENVINO_ASSERT(input, "Failed to read all data from file.");
+
+    return floats_tensor;
+}
+
 struct TtsServable {
     std::filesystem::path parsedModelsPath;
     std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
     std::mutex ttsPipelineMutex;
+    std::unordered_map<std::string, ov::Tensor> voices;
 
-    TtsServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) {
+    TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& graphPath) {
         auto fsModelsPath = std::filesystem::path(modelDir);
         if (fsModelsPath.is_relative()) {
             parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
         } else {
             parsedModelsPath = fsModelsPath.string();
         }
         ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice);
+        for(auto voice : graphVoices){
+            if (!std::filesystem::exists(voice.path()))
+                throw std::runtime_error{"Requested voice speaker embeddings file does not exist."};
+            voices[voice.name()] = read_speaker_embedding(voice.path());
+        }
     }
 };
 

diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp
@@ -608,7 +608,7 @@ Status MediapipeGraphDefinition::initializeNodes() {
             }
             mediapipe::T2sCalculatorOptions nodeOptions;
             config.node(i).node_options(0).UnpackTo(&nodeOptions);
-            std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath());
+            std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath());
             ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
             ttsServablesCleaningGuard.disableCleaning();
         }