Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions demos/audio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh

> **Note:** Change the `--weight-format` to quantize the model to `fp16` or `int8` precision to reduce memory consumption and improve performance.

### Speaker embeddings

Instead of generating speech with the model's default voice, you can create your own speaker embedding with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
```bash
curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
python create_speaker_embedding.py
mv speaker_embedding.bin models/
```
The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
Next, add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
```
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
name: "T2sExecutor"
input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
calculator: "T2sCalculator"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
models_path: "./",
plugin_config: '{ "NUM_STREAMS": "1" }',
target_device: "CPU",
voices: [
{
name: "voice",
path: "/models/speaker_embedding.bin",
}
]
}
}
}
```

### Deployment

**CPU**
Expand Down
18 changes: 17 additions & 1 deletion src/audio/text_to_speech/t2s_calculator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,24 @@ class T2sCalculator : public CalculatorBase {
if (streamIt != payload.parsedJson->MemberEnd()) {
return absl::InvalidArgumentError("streaming is not supported");
}
std::optional<std::string> voice;
auto voiceIt = payload.parsedJson->FindMember("voice");
if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
voice = voiceIt->value.GetString();
}
if(voice.has_value()){
if (pipe->voices.find(voice.value()) == pipe->voices.end())
return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", payload.uri));
}
ov::genai::Text2SpeechDecodedResults generatedSpeech;
std::unique_lock lock(pipe->ttsPipelineMutex);
auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());

if(voice.has_value()){
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voice.value()]);
}
else{
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
}
auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
auto speechSize = generatedSpeech.speeches[0].get_size();
ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
Expand Down
9 changes: 9 additions & 0 deletions src/audio/text_to_speech/t2s_calculator.proto
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,13 @@ message T2sCalculatorOptions {
required string models_path = 1;
optional string target_device = 2;
optional string plugin_config = 3;

// One named speaker embedding ("voice") made available by a text2speech graph.
// Entries are listed in the `voices` field of T2sCalculatorOptions.
message SpeakerEmbeddings {
// Speaker name. The servable stores the loaded embedding under this name, and
// the calculator looks up the request's "voice" string among those names.
required string name = 1;

// Path to speaker embeddings file. The servable checks that the file exists
// and loads it when it is constructed; the loader in t2s_servable.hpp expects
// a raw binary file holding exactly 512 32-bit floats.
required string path = 2;
}
repeated SpeakerEmbeddings voices = 4;
}
29 changes: 28 additions & 1 deletion src/audio/text_to_speech/t2s_servable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,46 @@

namespace ovms {

/**
 * Loads a speaker embedding from a raw binary file into an f32 tensor.
 *
 * The file must hold exactly 512 32-bit floats; they are read verbatim into a
 * tensor of shape {1, 512}.
 *
 * @param file_path Path to the binary speaker embedding file.
 * @return Tensor of element type f32 and shape {1, 512} filled with the file contents.
 * @throws Whatever OPENVINO_ASSERT raises when the file cannot be opened, its
 *         size cannot be determined or is not 512 floats, or the read is short.
 */
static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
    std::ifstream input(file_path, std::ios::binary);
    OPENVINO_ASSERT(input, "Failed to open file: " + file_path.string());

    // Get file size. tellg() yields -1 on failure, so validate it before the
    // value is converted to an unsigned size.
    input.seekg(0, std::ios::end);
    const std::streamoff file_size = static_cast<std::streamoff>(input.tellg());
    OPENVINO_ASSERT(file_size >= 0, "Failed to determine size of file: " + file_path.string());
    const size_t buffer_size = static_cast<size_t>(file_size);
    input.seekg(0, std::ios::beg);

    // Check size is multiple of float
    OPENVINO_ASSERT(buffer_size % sizeof(float) == 0, "File size is not a multiple of float size.");
    const size_t num_floats = buffer_size / sizeof(float);
    OPENVINO_ASSERT(num_floats == 512, "File must contain speaker embedding including 512 32-bit floats.");

    ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
    input.read(reinterpret_cast<char*>(floats_tensor.data()), static_cast<std::streamsize>(buffer_size));
    // The stream state only reflects the outcome of read() after the call, so
    // the completeness check has to come last.
    OPENVINO_ASSERT(input, "Failed to read all data from file.");

    return floats_tensor;
}

struct TtsServable {
std::filesystem::path parsedModelsPath;
std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
std::mutex ttsPipelineMutex;
std::unordered_map<std::string, ov::Tensor> voices;

TtsServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath) {
// Builds the text-to-speech servable for one graph node.
//
// modelDir     - model directory; a relative path is resolved against graphPath.
// targetDevice - device name forwarded to the Text2SpeechPipeline constructor.
// graphVoices  - speaker embedding entries from the node options; each one is
//                loaded with read_speaker_embedding() and stored in `voices`
//                under its name (a repeated name overwrites the earlier entry).
// graphPath    - base directory of the graph, used only to resolve modelDir.
//
// Throws std::runtime_error when a listed embedding file does not exist.
TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& graphPath) {
    auto fsModelsPath = std::filesystem::path(modelDir);
    if (fsModelsPath.is_relative()) {
        parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
    } else {
        parsedModelsPath = fsModelsPath.string();
    }
    ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice);
    // Bind by const reference: iterating by value would deep-copy every
    // protobuf message just to read two strings from it.
    for (const auto& voice : graphVoices) {
        if (!std::filesystem::exists(voice.path()))
            throw std::runtime_error{"Requested voice speaker embeddings file does not exist."};
        voices[voice.name()] = read_speaker_embedding(voice.path());
    }
}
};

Expand Down
2 changes: 1 addition & 1 deletion src/mediapipe_internal/mediapipegraphdefinition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ Status MediapipeGraphDefinition::initializeNodes() {
}
mediapipe::T2sCalculatorOptions nodeOptions;
config.node(i).node_options(0).UnpackTo(&nodeOptions);
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), mgconfig.getBasePath());
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), mgconfig.getBasePath());
ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
ttsServablesCleaningGuard.disableCleaning();
}
Expand Down