From a7ff1f5ce0d3ade7aad84fedce41753b5a1b941c Mon Sep 17 00:00:00 2001 From: Mykyta Makarov Date: Fri, 20 Nov 2020 01:09:59 +0100 Subject: [PATCH 1/3] Export script implementation --- README.md | 3 + config_onnx.json | 54 +++++++ export_onnx.py | 178 ++++++++++++++++++++++++ flowtron_onnx.py | 356 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 591 insertions(+) create mode 100644 config_onnx.json create mode 100644 export_onnx.py create mode 100644 flowtron_onnx.py diff --git a/README.md b/README.md index 0296684..c83655c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,9 @@ Dataset dependent layers can be [ignored] ## Inference demo 1. `python inference.py -c config.json -f models/flowtron_ljs.pt -w models/waveglow_256channels_v4.pt -t "It is well know that deep generative models have a deep latent space!" -i 0` +## Export to ONNX format +1. `python export_onnx.py -c config_onnx.json -f models/flowtron_libritts.pt -w models/waveglow_256channels_universal_v5.pt -i 83` + ## Related repos [WaveGlow](https://github.com/NVIDIA/WaveGlow) Faster than real time Flow-based Generative Network for Speech Synthesis diff --git a/config_onnx.json b/config_onnx.json new file mode 100644 index 0000000..37d190b --- /dev/null +++ b/config_onnx.json @@ -0,0 +1,54 @@ +{ + "train_config": { + "output_directory": "outdir", + "epochs": 10000000, + "learning_rate": 1e-4, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 5000, + "batch_size": 1, + "seed": 1234, + "checkpoint_path": "", + "ignore_layers": [], + "include_layers": ["speaker", "encoder", "embedding"], + "warmstart_checkpoint_path": "", + "with_tensorboard": true, + "fp16_run": false + }, + "data_config": { + "training_files": "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", + "validation_files": "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt", + "text_cleaners": ["flowtron_cleaners"], + "p_arpabet": 0.5, + "cmudict_path": "data/cmudict_dictionary", + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "max_wav_value": 32768.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + + "model_config": { + "n_speakers": 123, + "n_speaker_dim": 128, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 2, + "n_mel_channels": 80, + "n_attn_channels": 640, + "n_hidden": 1024, + "n_lstm_layers": 2, + "mel_encoder_n_hidden": 512, + "n_components": 0, + "mean_scale": 0.0, + "fixed_gaussian": true, + "dummy_speaker_embedding": false, + "use_gate_layer": true + } +} diff --git a/export_onnx.py b/export_onnx.py new file mode 100644 index 0000000..c4752d6 --- /dev/null +++ b/export_onnx.py @@ -0,0 +1,178 @@ +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import matplotlib +matplotlib.use("Agg") +import matplotlib.pylab as plt + +import os +import argparse +import json +import sys +import numpy as np +import torch + + +from flowtron_onnx import Flowtron, FlowtronTTS +from torch.utils.data import DataLoader +from data import Data +from train import update_params + +sys.path.insert(0, "tacotron2") +sys.path.insert(0, "tacotron2/waveglow") +from glow import WaveGlow +from scipy.io.wavfile import write +from copy import deepcopy + +import faulthandler +faulthandler.enable() + + +def export(flowtron_path, waveglow_path, output_dir, + speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run): + text = "It is well know that deep generative models have a deep latent space!" + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + # load waveglow + waveglow = torch.load(waveglow_path)['model'].cuda().eval() + waveglow.cuda() + for k in waveglow.convinv: + k.float() + waveglow.eval() + + # load flowtron + model = Flowtron(**model_config).cuda() + state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict'] + model.load_state_dict(state_dict, False) + model.eval() + print("Loaded checkpoint '{}')" .format(flowtron_path)) + + # Script loop parts of the flows + model.script_flows() + + ignore_keys = ['training_files', 'validation_files'] + trainset = Data( + data_config['training_files'], + **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) + speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() + text = trainset.get_text(text).cuda() + text_copy = deepcopy(text.cpu().numpy()) + speaker_vecs = speaker_vecs[None] + text = text[None] + + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + mels = model(residual, speaker_vecs, text) + + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(model, waveglow) + model_infer = torch.jit.trace( + model, [residual, speaker_vecs, text] + ) + torch.onnx.export( + model_infer, + [residual, speaker_vecs, text], + "./flowtron_waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["residual", "speaker_vecs", "text"], + output_names=["audio"], + dynamic_axes={ + "text": {1: "text_seq"}, + "audio": {1: "audio_seq"}, + }, + example_outputs=audio, + verbose=False, + ) + + if not no_test_run: + print("Running test:") + import onnxruntime as rt + sess_options = rt.SessionOptions() + sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_DISABLE_ALL + print("Loading model.") + flowtron_tts = rt.InferenceSession( + "./flowtron_waveglow.onnx", + providers=rt.get_available_providers(), + sess_options=sess_options + ) + print("Model loaded, running tts.") + audio = flowtron_tts.run( + None, + { + "residual": residual.cpu().contiguous().numpy(), + "speaker_vecs": speaker_vecs.cpu().contiguous().numpy(), + "text": text_copy.reshape([1, -1]) + } + ) + print("Finished successfuly, saving the results") + audio = audio[0] + audio = audio / np.abs(audio).max() + + write( + os.path.join( + output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( + speaker_id, sigma + ) + ), + data_config['sampling_rate'], audio + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, + help='JSON file for configuration') + parser.add_argument('-p', '--params', nargs='+', default=[]) + parser.add_argument('-f', '--flowtron_path', + 
help='Path to flowtron state dict', type=str) + parser.add_argument('-w', '--waveglow_path', + help='Path to waveglow state dict', type=str) + parser.add_argument('-i', '--id', help='Speaker id', type=int) + parser.add_argument('-n', '--n_frames', help='Number of frames', + default=400, type=int) + parser.add_argument('-o', "--output_dir", default="results/") + parser.add_argument("-s", "--sigma", default=0.5, type=float) + parser.add_argument("-g", "--gate", default=0.5, type=float) + parser.add_argument("--seed", default=1234, type=int) + parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') + args = parser.parse_args() + + # Parse configs. Globals nicer in this case + with open(args.config) as f: + data = f.read() + + global config + config = json.loads(data) + update_params(config, args.params) + + data_config = config["data_config"] + global model_config + model_config = config["model_config"] + + # Make directory if it doesn't exist + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + os.chmod(args.output_dir, 0o775) + + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + export(args.flowtron_path, args.waveglow_path, args.output_dir, + args.id, args.n_frames, args.sigma, args.gate, args.seed, args.no_test_run) diff --git a/flowtron_onnx.py b/flowtron_onnx.py new file mode 100644 index 0000000..4637fb7 --- /dev/null +++ b/flowtron_onnx.py @@ -0,0 +1,356 @@ +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import sys +# sys.path.insert(0, "tacotron2") +# sys.path.insert(0, "tacotron2/waveglow") +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from flowtron import ( + LinearNorm, + ConvNorm, + GaussianMixture, + MelEncoder, + DenseLayer, + Encoder, + Attention, +) + + +class AR_Back_Step(torch.nn.Module): + + def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, + n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, + add_gate): + super(AR_Back_Step, self).__init__() + self.ar_step = AR_Step(n_mel_channels, n_speaker_dim, n_text_dim, + n_mel_channels+n_speaker_dim, n_hidden, + n_attn_channels, n_lstm_layers, add_gate) + + def forward(self, residual, text): + residual, gates = self.ar_step( + torch.flip(residual, (0, )), text) + residual = torch.flip(residual, (0, )) + return residual, gates + + def trace_layers(self): + self.ar_step.trace_layers() + + +class AR_Step(torch.nn.Module): + __constants__ = ['gate_threshold', 'add_gate'] + + def __init__(self, n_mel_channels, n_speaker_dim, n_text_channels, + n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, + add_gate: bool = False): + super(AR_Step, self).__init__() + self.conv = torch.nn.Conv1d(n_hidden, 2*n_mel_channels, 1).cuda() + self.conv.weight.data = 0.0*self.conv.weight.data + self.conv.bias.data = 0.0*self.conv.bias.data + # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] + self.lstm = torch.nn.LSTM(n_hidden+n_attn_channels, n_hidden, n_lstm_layers).cuda() + self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() + + + self.attention_layer = Attention(n_hidden, n_speaker_dim, + n_text_channels, n_attn_channels,).cuda() + + self.dense_layer = DenseLayer(in_dim=n_hidden, + sizes=[n_hidden, n_hidden]).cuda() + self.add_gate: bool = add_gate + # if self.add_gate: + self.gate_threshold = 0.5 + self.gate_layer = LinearNorm( + n_hidden+n_attn_channels, 1, bias=True, w_init_gain='sigmoid' + ) + + def trace_layers(self): + self.lstm.flatten_parameters() + self.lstm = torch.jit.trace_module( + self.lstm, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1664], dtype=torch.float, device='cuda').normal_(), + (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_()) + ] + } + ) + self.attention_lstm.flatten_parameters() + self.attention_lstm = torch.jit.trace_module( + self.attention_lstm, + inputs={ + 'forward': [ + torch.zeros([1, 1, 80], dtype=torch.float, device='cuda').normal_(), + (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_()) + ] + } + ) + self.conv = torch.jit.trace_module( + self.conv, + inputs={'forward': [torch.zeros([1, 1024, 1], dtype=torch.float, device='cuda').normal_()]} + ) + self.attention_layer = torch.jit.trace_module( + self.attention_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_(), + torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_() + ] + }, + ) + self.dense_layer = torch.jit.trace_module( + self.dense_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_() + ] + }, + ) + self.gate_layer = torch.jit.trace_module( + self.gate_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1664], 
dtype=torch.float, device='cuda').normal_() + ] + }, + ) + + def forward(self, residual, text): + total_output = [] # seems 10FPS faster than pre-allocation + gate_total = [] + dummy = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) + (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) + + attention_hidden, (h, c) = self.attention_lstm(dummy, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), + torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + + log_s = decoder_output[:, :, :decoder_output.size(2)//2] + b = decoder_output[:, :, decoder_output.size(2)//2:] + output = (residual[0, :, :] - b)/torch.exp(log_s) + total_output.append(output) + i = torch.tensor(1, dtype=torch.long) + lim = torch.tensor(residual.size(0), dtype=torch.long) + gate_total.append( + torch.sigmoid(self.gate_layer(decoder_input)).view([1]) + if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) + ) + # more than one condition raises errors in onnx for some reason, so just returning gate layer instead + while i < lim: + attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text + ) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + + log_s = decoder_output[:, :, :decoder_output.size(2)//2] + b = decoder_output[:, :, decoder_output.size(2)//2:] + output = (residual[i, :, :] - b)/torch.exp(log_s) + gate_total.append( + torch.sigmoid(self.gate_layer(decoder_input)).view([1]) + if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) + ) + total_output.append(output) + i += 1 + total_output = torch.cat(total_output, 0) + return total_output, torch.cat(gate_total, 0) + + +class Flowtron(torch.nn.Module): + __constants__ = ['gate_threshold'] + + def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, + n_mel_channels, n_hidden, n_attn_channels, n_lstm_layers, + use_gate_layer, mel_encoder_n_hidden, n_components, + fixed_gaussian, mean_scale, dummy_speaker_embedding, + temperature=1, gate_threshold=0.5): + + super(Flowtron, self).__init__() + norm_fn = nn.InstanceNorm1d + self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) + self.embedding = torch.nn.Embedding(n_text, n_text_dim) + self.flows = torch.nn.ModuleList() + self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) + self.dummy_speaker_embedding = dummy_speaker_embedding + self.gate_threshold = gate_threshold + for i in range(n_flows): + add_gate = (i == (n_flows-1) and use_gate_layer) + if i % 2 == 0: + f = AR_Step(n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, n_attn_channels, + n_lstm_layers, + add_gate) + 
self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + else: + f = AR_Back_Step(n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + + @torch.jit.ignore + def script_flows(self): + for i, flow in enumerate(self.flows): + flow.trace_layers() + self.flows[i] = torch.jit.script(flow) + + def forward(self, *args): + residual, speaker_vecs, text = args + speaker_vecs = self.speaker_embedding(speaker_vecs) + text = self.embedding(text).permute(0, 2, 1) + text = self.encoder.infer(text) + text = text.permute(1, 0, 2) + encoder_outputs = torch.cat( + [ + text, + speaker_vecs.expand(text.size(0), -1, -1) + ], 2 + ) + residual = residual.permute(2, 0, 1) + for flow in reversed(self.flows): + residual, gates = flow(residual, encoder_outputs) + gate_trigger_id_tuple = torch.nonzero(gates > self.gate_threshold, as_tuple=True) + if gate_trigger_id_tuple[0].nelement() > 0: + residual = residual[:gate_trigger_id_tuple[0].item(), ...] + return residual.permute(1, 2, 0) + + @staticmethod + def set_temperature_and_gate(flow, temperature, gate_threshold): + flow = flow.ar_step if hasattr(flow, "ar_step") else flow + flow.attention_layer.temperature = temperature + if hasattr(flow, 'gate_layer'): + flow.gate_threshold = gate_threshold + + +class FlowtronTTS(torch.nn.Module): + + def __init__(self, flowtron, waveglow, *args, **kwargs): + super().__init__(*args, **kwargs) + self.flowtron = flowtron + self.waveglow = waveglow + + def trace_flowtron(self, args): + self.flowtron_traced = torch.jit.trace( + self.flowtron, args + ) + + @classmethod + def patch_waveglow(cls, waveglow): + waveglow.forward = cls.waveglow_infer_forward.__get__( + waveglow, type(waveglow) + ) + return waveglow + + def forward(self, *args): + residual, speaker_vecs, text = args + mels = self.flowtron(residual, speaker_vecs, text) + audio = self.waveglow(mels) + return audio + + def waveglow_infer_forward(self, spect, sigma=0.8): + """Waveglow infer function. + + Fixes ONNX unsupported operator errors with replacement + for supported ones. + """ + + spect = self.upsample(spect) + # trim conv artifacts. 
maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) + spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().reshape( + spect.size(0), spect.size(1), -1 + ).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.half, device='cuda' + ) + else: + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.float, device='cuda' + ) + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.half, + device='cuda' + ) + else: + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.float, + device='cuda' + ) + audio = torch.cat((sigma*z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) + return audio From 5733f734f08f073210d1ab7424e67dfbe13fe2f4 Mon Sep 17 00:00:00 2001 From: Mykyta Makarov Date: Fri, 27 Nov 2020 13:34:18 +0100 Subject: [PATCH 2/3] Fix for access exception --- export_onnx.py | 73 ++++++++++++++++++++++++++---------------------- flowtron_onnx.py | 66 +++++++++++++++++++------------------------ 2 files changed, 68 insertions(+), 71 deletions(-) diff --git a/export_onnx.py b/export_onnx.py index c4752d6..8c1ae3f 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -42,8 +42,8 @@ def export(flowtron_path, waveglow_path, output_dir, - speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run): - text = "It is well know that deep generative models have a deep latent space!" + speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run, no_export): + text = "Hello?" 
torch.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -68,45 +68,49 @@ def export(flowtron_path, waveglow_path, output_dir, trainset = Data( data_config['training_files'], **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) + print(trainset.speaker_ids) speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() text = trainset.get_text(text).cuda() text_copy = deepcopy(text.cpu().numpy()) speaker_vecs = speaker_vecs[None] text = text[None] - - with torch.no_grad(): - residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma - mels = model(residual, speaker_vecs, text) - - waveglow = FlowtronTTS.patch_waveglow(waveglow) - - audio = waveglow(mels, sigma=0.8) - - model = FlowtronTTS(model, waveglow) - model_infer = torch.jit.trace( - model, [residual, speaker_vecs, text] - ) - torch.onnx.export( - model_infer, - [residual, speaker_vecs, text], - "./flowtron_waveglow.onnx", - opset_version=11, - do_constant_folding=True, - input_names=["residual", "speaker_vecs", "text"], - output_names=["audio"], - dynamic_axes={ - "text": {1: "text_seq"}, - "audio": {1: "audio_seq"}, - }, - example_outputs=audio, - verbose=False, - ) + if not no_export: + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + mels = model(residual, speaker_vecs, text) + print(mels.shape) + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(model, waveglow) + model_infer = torch.jit.trace( + model, [residual, speaker_vecs, text] + ) + outp = model_infer(residual, speaker_vecs, text) + + torch.onnx.export( + model_infer, + [residual, speaker_vecs, text], + "./flowtron_waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["residual", "speaker_vecs", "text"], + output_names=["audio"], + dynamic_axes={ + "residual": {1: "res_ch", 2: "res_frames"}, + "text": {1: "text_seq"}, + "audio": {1: "audio_seq"}, + }, + example_outputs=outp, + verbose=False, + ) if not no_test_run: print("Running test:") import onnxruntime as rt sess_options = rt.SessionOptions() - sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_DISABLE_ALL + sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL print("Loading model.") flowtron_tts = rt.InferenceSession( "./flowtron_waveglow.onnx", @@ -123,9 +127,8 @@ def export(flowtron_path, waveglow_path, output_dir, } ) print("Finished successfuly, saving the results") - audio = audio[0] + audio = audio[0].reshape(-1) audio = audio / np.abs(audio).max() - write( os.path.join( output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( @@ -153,6 +156,7 @@ def export(flowtron_path, waveglow_path, output_dir, parser.add_argument("-g", "--gate", default=0.5, type=float) parser.add_argument("--seed", default=1234, type=int) parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') + parser.add_argument('--no-export', dest='no_export', action='store_true') args = parser.parse_args() # Parse configs. 
Globals nicer in this case @@ -175,4 +179,5 @@ def export(flowtron_path, waveglow_path, output_dir, torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False export(args.flowtron_path, args.waveglow_path, args.output_dir, - args.id, args.n_frames, args.sigma, args.gate, args.seed, args.no_test_run) + args.id, args.n_frames, args.sigma, args.gate, args.seed, + args.no_test_run, args.no_export) diff --git a/flowtron_onnx.py b/flowtron_onnx.py index 4637fb7..8193f59 100644 --- a/flowtron_onnx.py +++ b/flowtron_onnx.py @@ -44,10 +44,10 @@ def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, n_attn_channels, n_lstm_layers, add_gate) def forward(self, residual, text): - residual, gates = self.ar_step( + residual, gate = self.ar_step( torch.flip(residual, (0, )), text) residual = torch.flip(residual, (0, )) - return residual, gates + return residual, gate def trace_layers(self): self.ar_step.trace_layers() @@ -135,35 +135,14 @@ def trace_layers(self): ) def forward(self, residual, text): - total_output = [] # seems 10FPS faster than pre-allocation + total_output = [] gate_total = [] - dummy = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) + output = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) - - attention_hidden, (h, c) = self.attention_lstm(dummy, (h, c)) - attention_context, attention_weight = self.attention_layer( - attention_hidden, text, text) - attention_context = attention_context.permute(2, 0, 1) - decoder_input = torch.cat((attention_hidden, attention_context), -1) (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) - lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) - lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) - decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) - - log_s = decoder_output[:, :, :decoder_output.size(2)//2] - b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[0, :, :] - b)/torch.exp(log_s) - total_output.append(output) - i = torch.tensor(1, dtype=torch.long) - lim = torch.tensor(residual.size(0), dtype=torch.long) - gate_total.append( - torch.sigmoid(self.gate_layer(decoder_input)).view([1]) - if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) - ) - # more than one condition raises errors in onnx for some reason, so just returning gate layer instead - while i < lim: + for i in range(int(residual.size(0))): attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) attention_context, attention_weight = self.attention_layer( attention_hidden, text, text @@ -176,13 +155,13 @@ def forward(self, residual, text): log_s = decoder_output[:, :, :decoder_output.size(2)//2] b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[i, :, :] - b)/torch.exp(log_s) - gate_total.append( - torch.sigmoid(self.gate_layer(decoder_input)).view([1]) - if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) - ) - total_output.append(output) - i += 1 + output = (residual[i, :, :].unsqueeze(0) - b)/torch.exp(log_s) + gate_total += [ + torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) + if self.add_gate else + torch.tensor([0], dtype=torch.float, device=output.device) + ] + total_output += [output] total_output = torch.cat(total_output, 0) return total_output, 
torch.cat(gate_total, 0) @@ -197,7 +176,7 @@ def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, temperature=1, gate_threshold=0.5): super(Flowtron, self).__init__() - norm_fn = nn.InstanceNorm1d + norm_fn = InstanceNorm self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) self.embedding = torch.nn.Embedding(n_text, n_text_dim) self.flows = torch.nn.ModuleList() @@ -249,9 +228,10 @@ def forward(self, *args): residual = residual.permute(2, 0, 1) for flow in reversed(self.flows): residual, gates = flow(residual, encoder_outputs) - gate_trigger_id_tuple = torch.nonzero(gates > self.gate_threshold, as_tuple=True) - if gate_trigger_id_tuple[0].nelement() > 0: - residual = residual[:gate_trigger_id_tuple[0].item(), ...] + gate_trigger_id_tuple = torch.nonzero(gates.double() > self.gate_threshold) + if gate_trigger_id_tuple.nelement() > 0: + indices = torch.arange(gate_trigger_id_tuple[0][0], device=residual.device) + residual = residual.flip(0).index_select(0, indices).flip(0) return residual.permute(1, 2, 0) @staticmethod @@ -262,6 +242,18 @@ def set_temperature_and_gate(flow, temperature, gate_threshold): flow.gate_threshold = gate_threshold +class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): + def __init__(self, *args, **kwargs): + super(InstanceNorm, self).__init__(*args, **kwargs) + + def forward(self, x): + mn = x.mean(-1).detach().unsqueeze(-1) + sd = x.std(-1).detach().unsqueeze(-1) + + x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view(1, -1, 1) + return x + + class FlowtronTTS(torch.nn.Module): def __init__(self, flowtron, waveglow, *args, **kwargs): From 2ba36a80e7ae1cf977be2ca179954e8693587e9f Mon Sep 17 00:00:00 2001 From: eublefar Date: Mon, 25 Oct 2021 21:05:13 +0200 Subject: [PATCH 3/3] ONNX export and prototype for semi-incremental inference --- export_onnx.py | 600 +++++++++++++++++-------- flowtron_onnx.py | 1114 +++++++++++++++++++++++++++++++--------------- 2 files changed, 1183 insertions(+), 531 deletions(-) diff --git a/export_onnx.py b/export_onnx.py index 8c1ae3f..2bbca08 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -1,183 +1,417 @@ -############################################################################### -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -############################################################################### -import matplotlib -matplotlib.use("Agg") -import matplotlib.pylab as plt - -import os -import argparse -import json -import sys -import numpy as np -import torch - - -from flowtron_onnx import Flowtron, FlowtronTTS -from torch.utils.data import DataLoader -from data import Data -from train import update_params - -sys.path.insert(0, "tacotron2") -sys.path.insert(0, "tacotron2/waveglow") -from glow import WaveGlow -from scipy.io.wavfile import write -from copy import deepcopy - -import faulthandler -faulthandler.enable() - - -def export(flowtron_path, waveglow_path, output_dir, - speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run, no_export): - text = "Hello?" - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - - # load waveglow - waveglow = torch.load(waveglow_path)['model'].cuda().eval() - waveglow.cuda() - for k in waveglow.convinv: - k.float() - waveglow.eval() - - # load flowtron - model = Flowtron(**model_config).cuda() - state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict'] - model.load_state_dict(state_dict, False) - model.eval() - print("Loaded checkpoint '{}')" .format(flowtron_path)) - - # Script loop parts of the flows - model.script_flows() - - ignore_keys = ['training_files', 'validation_files'] - trainset = Data( - data_config['training_files'], - **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) - print(trainset.speaker_ids) - speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() - text = trainset.get_text(text).cuda() - text_copy = deepcopy(text.cpu().numpy()) - speaker_vecs = speaker_vecs[None] - text = text[None] - if not no_export: - with torch.no_grad(): - residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma - mels = model(residual, speaker_vecs, text) - print(mels.shape) - waveglow = FlowtronTTS.patch_waveglow(waveglow) - - audio = waveglow(mels, sigma=0.8) - - model = FlowtronTTS(model, waveglow) - model_infer = torch.jit.trace( - model, [residual, speaker_vecs, text] - ) - outp = model_infer(residual, speaker_vecs, text) - - torch.onnx.export( - model_infer, - [residual, speaker_vecs, text], - "./flowtron_waveglow.onnx", - opset_version=11, - do_constant_folding=True, - input_names=["residual", "speaker_vecs", "text"], - output_names=["audio"], - dynamic_axes={ - "residual": {1: "res_ch", 2: "res_frames"}, - "text": {1: "text_seq"}, - "audio": {1: "audio_seq"}, - }, - example_outputs=outp, - verbose=False, - ) - - if not no_test_run: - print("Running test:") - import onnxruntime as rt - sess_options = rt.SessionOptions() - sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL - print("Loading model.") - flowtron_tts = rt.InferenceSession( - "./flowtron_waveglow.onnx", - providers=rt.get_available_providers(), - sess_options=sess_options - ) - print("Model loaded, running tts.") - audio = flowtron_tts.run( - None, - { - "residual": residual.cpu().contiguous().numpy(), - "speaker_vecs": speaker_vecs.cpu().contiguous().numpy(), - "text": text_copy.reshape([1, -1]) - } - ) - print("Finished successfuly, saving the results") - audio = audio[0].reshape(-1) - audio = audio / np.abs(audio).max() - write( - os.path.join( - output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( - speaker_id, sigma - ) - ), - data_config['sampling_rate'], audio - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, - help='JSON file for 
configuration') - parser.add_argument('-p', '--params', nargs='+', default=[]) - parser.add_argument('-f', '--flowtron_path', - help='Path to flowtron state dict', type=str) - parser.add_argument('-w', '--waveglow_path', - help='Path to waveglow state dict', type=str) - parser.add_argument('-i', '--id', help='Speaker id', type=int) - parser.add_argument('-n', '--n_frames', help='Number of frames', - default=400, type=int) - parser.add_argument('-o', "--output_dir", default="results/") - parser.add_argument("-s", "--sigma", default=0.5, type=float) - parser.add_argument("-g", "--gate", default=0.5, type=float) - parser.add_argument("--seed", default=1234, type=int) - parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') - parser.add_argument('--no-export', dest='no_export', action='store_true') - args = parser.parse_args() - - # Parse configs. Globals nicer in this case - with open(args.config) as f: - data = f.read() - - global config - config = json.loads(data) - update_params(config, args.params) - - data_config = config["data_config"] - global model_config - model_config = config["model_config"] - - # Make directory if it doesn't exist - if not os.path.isdir(args.output_dir): - os.makedirs(args.output_dir) - os.chmod(args.output_dir, 0o775) - - torch.backends.cudnn.enabled = True - torch.backends.cudnn.benchmark = False - export(args.flowtron_path, args.waveglow_path, args.output_dir, - args.id, args.n_frames, args.sigma, args.gate, args.seed, - args.no_test_run, args.no_export) +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pylab as plt + +import os +import argparse +import json +import sys +import numpy as np +import torch + + +from flowtron_onnx import Flowtron, FlowtronTTS, FlowtronEncoder, SimpleTTSRunner +from torch.utils.data import DataLoader +from data import Data +from train import update_params + +sys.path.insert(0, "tacotron2") +sys.path.insert(0, "tacotron2/WaveGlow") +from glow import WaveGlow +from scipy.io.wavfile import write +from copy import deepcopy +import sounddevice as sd +from queue import Queue + +# import faulthandler +import time + +# faulthandler.enable() + + +def init_states(residual): + last_outputs = torch.zeros( + [1, residual.size(1), residual.size(2)], + device=residual.device, + dtype=torch.float, + ) + hidden_att = [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + ] + hidden_lstm = [ + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + ] + return last_outputs, hidden_att, hidden_lstm + + +def export( + flowtron_path, + waveglow_path, + output_dir, + speaker_id, + n_frames, + sigma, + gate_threshold, + seed, + no_test_run, + no_export, +): + text = """ + I am doing fine + """ + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + # load waveglow + waveglow = torch.load(waveglow_path)["model"].cuda().eval() + waveglow.cuda() + for k in waveglow.convinv: + k.float() + waveglow.eval() + + # load flowtron + model = Flowtron(**model_config).cuda() + state_dict = torch.load(flowtron_path, map_location="cpu")["model"].state_dict() + + model.load_state_dict(state_dict, False) + model.eval() + print("Loaded checkpoint '{}')".format(flowtron_path)) + + # Script loop parts of the flows + # model.script_flows() + + ignore_keys = ["training_files", "validation_files"] + trainset = Data( + data_config["training_files"], + **dict((k, v) for k, v in data_config.items() if k not in ignore_keys) + ) + print(trainset.speaker_ids) + speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() + text = trainset.get_text(text).cuda() + text_copy = deepcopy(text.cpu().numpy()) + speaker_vecs = speaker_vecs[None] + text = text[None] + if not no_export: + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + + encoder = FlowtronEncoder( + model.embedding, model.speaker_embedding, model.encoder + ) + + # mels = model(residual, speaker_vecs, text) + # print(mels.shape) + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + # audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(encoder, model, waveglow) + + text = text.reshape([1, -1]) + + enc_outps = encoder(speaker_vecs, text) + print("enc_outps.shape", enc_outps.shape) + torch.onnx.export( + encoder, + (speaker_vecs, text), + "./encoder.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["speaker_vecs", "text"], + output_names=["text_emb"], + dynamic_axes={"text": {1: "text_seq"}, "text_emb": {0: "text_seq"}}, + example_outputs=enc_outps, + verbose=False, + ) + + backward_flow = model.backward_flow.ar_step + residual = residual.permute(2, 0, 1) + residual_o, hidden_att, hidden_lstm = init_states(residual) + + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = backward_flow( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + 
hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + torch.onnx.export( + backward_flow, + ( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + "./backward_flow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=[ + "residual", + "text", + "last_output", + "hidden_att", + "hidden_att_c", + "hidden_lstm", + "hidden_lstm_c", + ], + output_names=[ + "output", + "gate", + "hidden_att_o", + "hidden_att_o_c", + "hidden_lstm_o", + "hidden_lstm_o_c", + ], + dynamic_axes={"text": {0: "text_seq"}}, + example_outputs=( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + verbose=False, + ) + + forward_flow = model.forward_flow + + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = forward_flow( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + torch.onnx.export( + forward_flow, + ( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + "./forward_flow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=[ + "residual", + "text", + "last_output", + "hidden_att", + "hidden_att_c", + "hidden_lstm", + "hidden_lstm_c", + ], + output_names=[ + "output", + "gate", + "hidden_att_o", + "hidden_att_o_c", + "hidden_lstm_o", + "hidden_lstm_o_c", + ], + dynamic_axes={"text": {0: "text_seq"}}, + example_outputs=( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + verbose=False, + ) + + residual = residual.permute(1, 2, 0) + mels = model(residual, speaker_vecs, text) + + audio = waveglow(mels, sigma=0.8) + + torch.onnx.export( + waveglow, + (mels), + "./waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["mels"], + output_names=["audio"], + dynamic_axes={"mels": {2: "mel_seq"}, "audio": {1: "audio_seq"}}, + example_outputs=audio, + verbose=False, + ) + + if not no_test_run: + print("Running test:") + import onnxruntime as rt + + sess_options = rt.SessionOptions() + sess_options.graph_optimization_level = ( + rt.GraphOptimizationLevel.ORT_DISABLE_ALL + ) + print("Loading model.") + + print(rt.get_available_providers()) + + encoder = rt.InferenceSession( + "./encoder.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + backward_flow = rt.InferenceSession( + "./backward_flow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + print([i.name for i in backward_flow.get_inputs()]) + forward_flow = rt.InferenceSession( + "./forward_flow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + waveglow = rt.InferenceSession( + "./waveglow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + print("Model loaded, running tts.") + model = SimpleTTSRunner(encoder, backward_flow, forward_flow, waveglow) + speaker_id = speaker_vecs.cpu().contiguous().numpy() + text = text_copy.reshape([1, -1]) + full_audio = [] + print(text.shape[1]) + input("Press enter to start generating:") + start = time.time() + + audio = model.run(speaker_id, text) + queue = Queue() + def callback(indata, outdata, frames, time, status): + if not queue.empty(): + arr = np.zeros((5120, 1)) + inp = queue.get(False) + arr[:inp.shape[0], 0] = inp + outdata[:] = arr + + stream = sd.Stream(channels=1, samplerate=22050, 
callback=callback, blocksize=5120).__enter__() + for i, audio_el in enumerate(audio): + # stream.write(audio_el) + if i==0: + audio_el[:1000] = 0 + queue.put(audio_el) + full_audio += audio_el.tolist() + + while not queue.empty(): + sd.sleep(int(5120/22.05)) + end = time.time() + process_time = end - start + audio_time = len(full_audio) / data_config["sampling_rate"] + print(f" > Processing time: {process_time}") + print(f" > Real-time factor: {process_time / audio_time}") + print("Finished successfuly, saving the results") + print(f"data_config['sampling_rate'] {data_config['sampling_rate']}") + write( + os.path.join( + output_dir, "sid{}_sigma{}_onnx_test.wav".format(speaker_id, sigma) + ), + data_config["sampling_rate"], + np.asarray(full_audio), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config", type=str, help="JSON file for configuration") + parser.add_argument("-p", "--params", nargs="+", default=[]) + parser.add_argument( + "-f", "--flowtron_path", help="Path to flowtron state dict", type=str + ) + parser.add_argument( + "-w", "--waveglow_path", help="Path to waveglow state dict", type=str + ) + parser.add_argument("-i", "--id", help="Speaker id", type=int) + parser.add_argument( + "-n", "--n_frames", help="Number of frames", default=400, type=int + ) + parser.add_argument("-o", "--output_dir", default="results/") + parser.add_argument("-s", "--sigma", default=0.5, type=float) + parser.add_argument("-g", "--gate", default=0.5, type=float) + parser.add_argument("--seed", default=1234, type=int) + parser.add_argument("--no-test-run", dest="no_test_run", action="store_true") + parser.add_argument("--no-export", dest="no_export", action="store_true") + args = parser.parse_args() + + # Parse configs. Globals nicer in this case + with open(args.config) as f: + data = f.read() + + global config + config = json.loads(data) + update_params(config, args.params) + + data_config = config["data_config"] + global model_config + model_config = config["model_config"] + + # Make directory if it doesn't exist + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + os.chmod(args.output_dir, 0o775) + + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + export( + args.flowtron_path, + args.waveglow_path, + args.output_dir, + args.id, + args.n_frames, + args.sigma, + args.gate, + args.seed, + args.no_test_run, + args.no_export, + ) diff --git a/flowtron_onnx.py b/flowtron_onnx.py index 8193f59..37a39c7 100644 --- a/flowtron_onnx.py +++ b/flowtron_onnx.py @@ -1,348 +1,766 @@ -############################################################################### -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -############################################################################### -import sys -# sys.path.insert(0, "tacotron2") -# sys.path.insert(0, "tacotron2/waveglow") -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -from flowtron import ( - LinearNorm, - ConvNorm, - GaussianMixture, - MelEncoder, - DenseLayer, - Encoder, - Attention, -) - - -class AR_Back_Step(torch.nn.Module): - - def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, - n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, - add_gate): - super(AR_Back_Step, self).__init__() - self.ar_step = AR_Step(n_mel_channels, n_speaker_dim, n_text_dim, - n_mel_channels+n_speaker_dim, n_hidden, - n_attn_channels, n_lstm_layers, add_gate) - - def forward(self, residual, text): - residual, gate = self.ar_step( - torch.flip(residual, (0, )), text) - residual = torch.flip(residual, (0, )) - return residual, gate - - def trace_layers(self): - self.ar_step.trace_layers() - - -class AR_Step(torch.nn.Module): - __constants__ = ['gate_threshold', 'add_gate'] - - def __init__(self, n_mel_channels, n_speaker_dim, n_text_channels, - n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, - add_gate: bool = False): - super(AR_Step, self).__init__() - self.conv = torch.nn.Conv1d(n_hidden, 2*n_mel_channels, 1).cuda() - self.conv.weight.data = 0.0*self.conv.weight.data - self.conv.bias.data = 0.0*self.conv.bias.data - # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] - self.lstm = torch.nn.LSTM(n_hidden+n_attn_channels, n_hidden, n_lstm_layers).cuda() - self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() - - - self.attention_layer = Attention(n_hidden, n_speaker_dim, - n_text_channels, n_attn_channels,).cuda() - - self.dense_layer = DenseLayer(in_dim=n_hidden, - sizes=[n_hidden, n_hidden]).cuda() - self.add_gate: bool = add_gate - # if self.add_gate: - self.gate_threshold = 0.5 - self.gate_layer = LinearNorm( - n_hidden+n_attn_channels, 1, bias=True, w_init_gain='sigmoid' - ) - - def trace_layers(self): - self.lstm.flatten_parameters() - self.lstm = torch.jit.trace_module( - self.lstm, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1664], dtype=torch.float, device='cuda').normal_(), - (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_()) - ] - } - ) - self.attention_lstm.flatten_parameters() - self.attention_lstm = torch.jit.trace_module( - self.attention_lstm, - inputs={ - 'forward': [ - torch.zeros([1, 1, 80], dtype=torch.float, device='cuda').normal_(), - (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_()) - ] - } - ) - self.conv = torch.jit.trace_module( - self.conv, - inputs={'forward': [torch.zeros([1, 1024, 1], dtype=torch.float, device='cuda').normal_()]} - ) - self.attention_layer = torch.jit.trace_module( - self.attention_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_(), - torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_() - ] - }, - ) - self.dense_layer = torch.jit.trace_module( - self.dense_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_() - ] - }, - ) - self.gate_layer = torch.jit.trace_module( - self.gate_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1664], dtype=torch.float, 
device='cuda').normal_() - ] - }, - ) - - def forward(self, residual, text): - total_output = [] - gate_total = [] - output = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) - (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) - (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), - torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) - for i in range(int(residual.size(0))): - attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) - attention_context, attention_weight = self.attention_layer( - attention_hidden, text, text - ) - attention_context = attention_context.permute(2, 0, 1) - decoder_input = torch.cat((attention_hidden, attention_context), -1) - lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) - lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) - decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) - - log_s = decoder_output[:, :, :decoder_output.size(2)//2] - b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[i, :, :].unsqueeze(0) - b)/torch.exp(log_s) - gate_total += [ - torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) - if self.add_gate else - torch.tensor([0], dtype=torch.float, device=output.device) - ] - total_output += [output] - total_output = torch.cat(total_output, 0) - return total_output, torch.cat(gate_total, 0) - - -class Flowtron(torch.nn.Module): - __constants__ = ['gate_threshold'] - - def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, - n_mel_channels, n_hidden, n_attn_channels, n_lstm_layers, - use_gate_layer, mel_encoder_n_hidden, n_components, - fixed_gaussian, mean_scale, dummy_speaker_embedding, - temperature=1, gate_threshold=0.5): - - super(Flowtron, self).__init__() - norm_fn = InstanceNorm - self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) - self.embedding = torch.nn.Embedding(n_text, n_text_dim) - self.flows = torch.nn.ModuleList() - self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) - self.dummy_speaker_embedding = dummy_speaker_embedding - self.gate_threshold = gate_threshold - for i in range(n_flows): - add_gate = (i == (n_flows-1) and use_gate_layer) - if i % 2 == 0: - f = AR_Step(n_mel_channels, - n_speaker_dim, - n_text_dim, - n_mel_channels + n_speaker_dim, - n_hidden, n_attn_channels, - n_lstm_layers, - add_gate) - self.set_temperature_and_gate(f, temperature, gate_threshold) - self.flows.append(f) - else: - f = AR_Back_Step(n_mel_channels, - n_speaker_dim, - n_text_dim, - n_mel_channels + n_speaker_dim, - n_hidden, - n_attn_channels, - n_lstm_layers, - add_gate) - self.set_temperature_and_gate(f, temperature, gate_threshold) - self.flows.append(f) - - @torch.jit.ignore - def script_flows(self): - for i, flow in enumerate(self.flows): - flow.trace_layers() - self.flows[i] = torch.jit.script(flow) - - def forward(self, *args): - residual, speaker_vecs, text = args - speaker_vecs = self.speaker_embedding(speaker_vecs) - text = self.embedding(text).permute(0, 2, 1) - text = self.encoder.infer(text) - text = text.permute(1, 0, 2) - encoder_outputs = torch.cat( - [ - text, - speaker_vecs.expand(text.size(0), -1, -1) - ], 2 - ) - residual = residual.permute(2, 0, 1) - for flow in reversed(self.flows): - residual, gates = flow(residual, encoder_outputs) - gate_trigger_id_tuple = torch.nonzero(gates.double() > self.gate_threshold) - if gate_trigger_id_tuple.nelement() > 0: - indices = 
torch.arange(gate_trigger_id_tuple[0][0], device=residual.device) - residual = residual.flip(0).index_select(0, indices).flip(0) - return residual.permute(1, 2, 0) - - @staticmethod - def set_temperature_and_gate(flow, temperature, gate_threshold): - flow = flow.ar_step if hasattr(flow, "ar_step") else flow - flow.attention_layer.temperature = temperature - if hasattr(flow, 'gate_layer'): - flow.gate_threshold = gate_threshold - - -class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): - def __init__(self, *args, **kwargs): - super(InstanceNorm, self).__init__(*args, **kwargs) - - def forward(self, x): - mn = x.mean(-1).detach().unsqueeze(-1) - sd = x.std(-1).detach().unsqueeze(-1) - - x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view(1, -1, 1) - return x - - -class FlowtronTTS(torch.nn.Module): - - def __init__(self, flowtron, waveglow, *args, **kwargs): - super().__init__(*args, **kwargs) - self.flowtron = flowtron - self.waveglow = waveglow - - def trace_flowtron(self, args): - self.flowtron_traced = torch.jit.trace( - self.flowtron, args - ) - - @classmethod - def patch_waveglow(cls, waveglow): - waveglow.forward = cls.waveglow_infer_forward.__get__( - waveglow, type(waveglow) - ) - return waveglow - - def forward(self, *args): - residual, speaker_vecs, text = args - mels = self.flowtron(residual, speaker_vecs, text) - audio = self.waveglow(mels) - return audio - - def waveglow_infer_forward(self, spect, sigma=0.8): - """Waveglow infer function. - - Fixes ONNX unsupported operator errors with replacement - for supported ones. - """ - - spect = self.upsample(spect) - # trim conv artifacts. maybe pad spec to kernel multiple - time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] - spect = spect[:, :, :-time_cutoff] - # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) - spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().reshape( - spect.size(0), spect.size(1), -1 - ).permute(0, 2, 1) - - if spect.type() == 'torch.cuda.HalfTensor': - audio = torch.randn( - spect.size(0), - self.n_remaining_channels, - spect.size(2), dtype=torch.half, device='cuda' - ) - else: - audio = torch.randn( - spect.size(0), - self.n_remaining_channels, - spect.size(2), dtype=torch.float, device='cuda' - ) - - audio = torch.autograd.Variable(sigma*audio) - - for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1)/2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - - s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = (audio_1 - b)/torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) - - audio = self.convinv[k](audio, reverse=True) - - if k % self.n_early_every == 0 and k > 0: - if spect.type() == 'torch.cuda.HalfTensor': - z = torch.randn( - spect.size(0), - self.n_early_size, - spect.size(2), - dtype=torch.half, - device='cuda' - ) - else: - z = torch.randn( - spect.size(0), - self.n_early_size, - spect.size(2), - dtype=torch.float, - device='cuda' - ) - audio = torch.cat((sigma*z, audio), 1) - - audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) - return audio +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### +import sys + +# sys.path.insert(0, "tacotron2") +# sys.path.insert(0, "tacotron2/waveglow") +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +import onnxruntime +import time + +from flowtron import ( + LinearNorm, + ConvNorm, + GaussianMixture, + MelEncoder, + DenseLayer, + Encoder, + Attention, +) + + +class AR_Back_Step(torch.nn.Module): + def __init__( + self, + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_in_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ): + super(AR_Back_Step, self).__init__() + self.ar_step = AR_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ) + + def forward(self, residual, text): + residual, gate = self.ar_step(torch.flip(residual, (0,)), text) + residual = torch.flip(residual, (0,)) + return residual, gate + + def trace_layers(self): + self.ar_step.trace_layers() + + +class AR_Step(torch.nn.Module): + __constants__ = ["gate_threshold", "add_gate"] + + def __init__( + self, + n_mel_channels, + n_speaker_dim, + n_text_channels, + n_in_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate: bool = False, + ): + super(AR_Step, self).__init__() + self.conv = torch.nn.Conv1d(n_hidden, 2 * n_mel_channels, 1).cuda() + self.conv.weight.data = 0.0 * self.conv.weight.data + self.conv.bias.data = 0.0 * self.conv.bias.data + # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] + self.lstm = torch.nn.LSTM( + n_hidden + n_attn_channels, n_hidden, n_lstm_layers + ).cuda() + self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() + + self.attention_layer = Attention( + n_hidden, n_speaker_dim, n_text_channels, n_attn_channels, + ).cuda() + + self.dense_layer = DenseLayer( + in_dim=n_hidden, sizes=[n_hidden, n_hidden] + ).cuda() + self.add_gate: bool = add_gate + # if self.add_gate: + self.gate_threshold = 0.5 + self.gate_layer = LinearNorm( + n_hidden + n_attn_channels, 1, bias=True, w_init_gain="sigmoid" + ) + + def trace_layers(self): + self.lstm.flatten_parameters() + self.lstm = torch.jit.trace_module( + self.lstm, + inputs={ + "forward": [ + torch.zeros( + [1, 1, 1664], dtype=torch.float, device="cpu" + ).normal_(), + ( + torch.zeros( + [2, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [2, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + ), + ] + }, + ) + self.attention_lstm.flatten_parameters() + self.attention_lstm = torch.jit.trace_module( + self.attention_lstm, + inputs={ + "forward": [ + torch.zeros([1, 1, 80], dtype=torch.float, device="cpu").normal_(), + ( + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + ), + ] + }, + ) + self.conv = torch.jit.trace_module( + self.conv, + inputs={ + "forward": [ + torch.zeros([1, 1024, 1], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + self.attention_layer = torch.jit.trace_module( + 
self.attention_layer, + inputs={ + "forward": [ + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [63, 1, 640], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [63, 1, 640], dtype=torch.float, device="cpu" + ).normal_(), + ] + }, + ) + self.dense_layer = torch.jit.trace_module( + self.dense_layer, + inputs={ + "forward": [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + self.gate_layer = torch.jit.trace_module( + self.gate_layer, + inputs={ + "forward": [ + torch.zeros([1, 1, 1664], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + + def forward( + self, + residual, + text, + last_output, + hidden_att_h, + hidden_att_c, + hidden_lstm_h, + hidden_lstm_c, + ): + output = last_output + (h, c) = (hidden_att_h, hidden_att_c) + (h1, c1) = (hidden_lstm_h, hidden_lstm_c) + + attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text + ) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + a = int(decoder_output.size(2)) // 2 + log_s = decoder_output[:, :, :a] + b = decoder_output[:, :, a:] + output = (residual[:, :].unsqueeze(0) - b) / torch.exp(log_s) + gate = ( + torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) + if self.add_gate + else torch.tensor([0], dtype=torch.float, device="cpu") + ) + return output, gate, h, c, h1, c1 + + +class FlowtronEncoder(torch.nn.Module): + def __init__(self, embedding, speaker_embedding, encoder): + super().__init__() + self.embedding = embedding + self.speaker_embedding = speaker_embedding + self.encoder = encoder + + def forward(self, speaker_vecs, text): + speaker_vecs = self.speaker_embedding(speaker_vecs) + text = self.embedding(text).permute(0, 2, 1) + text = self.encoder.infer(text) + text = text.permute(1, 0, 2) + encoder_outputs = torch.cat( + [text, speaker_vecs.expand(text.size(0), -1, -1)], 2 + ) + return encoder_outputs + + +class Flowtron(torch.nn.Module): + __constants__ = ["gate_threshold"] + + def __init__( + self, + n_speakers, + n_speaker_dim, + n_text, + n_text_dim, + n_flows, + n_mel_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + use_gate_layer, + mel_encoder_n_hidden, + n_components, + fixed_gaussian, + mean_scale, + dummy_speaker_embedding, + temperature=1, + gate_threshold=0.5, + ): + + super(Flowtron, self).__init__() + norm_fn = InstanceNorm + self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) + self.embedding = torch.nn.Embedding(n_text, n_text_dim) + self.flows = torch.nn.ModuleList() + self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) + self.dummy_speaker_embedding = dummy_speaker_embedding + self.gate_threshold = gate_threshold + for i in range(n_flows): + add_gate = i == (n_flows - 1) and use_gate_layer + if i % 2 == 0: + f = AR_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + else: + f = AR_Back_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + 
n_lstm_layers, + add_gate, + ) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + + @torch.jit.ignore + def script_flows(self): + for i, flow in enumerate(self.flows): + flow.trace_layers() + self.flows[i] = torch.jit.script(flow) + + def forward( + self, residual, encoder_outputs, last_outputs, hidden_atts, hidden_lstms, + ): + output1, gate1, hidden_att1, hidden_lstm1 = self.flows[1]( + residual, encoder_outputs, last_outputs[1], hidden_atts[1], hidden_lstms[1], + ) + output0, gate0, hidden_att0, hidden_lstm0 = self.flows[0]( + output1, encoder_outputs, last_outputs[0], hidden_atts[0], hidden_lstms[0], + ) + return ( + output0, + torch.cat([gate0, gate1]), + [output0, output1], + [hidden_att0, hidden_att1], + [hidden_lstm0, hidden_lstm1], + ) + + @staticmethod + def set_temperature_and_gate(flow, temperature, gate_threshold): + flow = flow.ar_step if hasattr(flow, "ar_step") else flow + flow.attention_layer.temperature = temperature + if hasattr(flow, "gate_layer"): + flow.gate_threshold = gate_threshold + + +class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): + def __init__(self, *args, **kwargs): + super(InstanceNorm, self).__init__(*args, **kwargs) + + def forward(self, x): + mn = x.mean(-1).detach().unsqueeze(-1) + sd = x.std(-1).detach().unsqueeze(-1) + + x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view( + 1, -1, 1 + ) + return x + + +class FlowtronTTS(torch.nn.Module): + def __init__(self, encoder, flowtron, waveglow, *args, **kwargs): + super().__init__(*args, **kwargs) + self.encoder = encoder + self.flowtron = flowtron + self.forward_flow = flowtron.flows[0] + self.backward_flow = flowtron.flows[1] + self.waveglow = waveglow + + def trace_flowtron(self, args): + self.flowtron_traced = torch.jit.trace(self.flowtron, args) + + @classmethod + def patch_waveglow(cls, waveglow): + waveglow.forward = cls.waveglow_infer_forward.__get__(waveglow, type(waveglow)) + return waveglow + + def forward(self, *args): + residual, speaker_vecs, text = args + enc_outps = self.encoder(speaker_vecs, text) + + residual = residual.permute(2, 0, 1) + + residual_outp = [] + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + for i in range(residual.shape[0] - 1, -1, -1): + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = self.backward_flow.ar_step( + residual[i], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + residual_outp = [residual_o] + residual_outp + if (gates > self.flowtron.gate_threshold).any(): + break + + residual = torch.cat(residual_outp, dim=0) + + residual_outp = [] + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + for i in range(residual.shape[0]): + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = self.forward_flow( + residual[i], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + residual_outp.append(residual_o) + if (gates > self.flowtron.gate_threshold).any(): + break + residual = torch.cat(residual_outp) + residual = residual.permute(1, 2, 0) + # audio = self.waveglow(residual) + return residual + + def init_states(self, residual): + last_outputs = torch.zeros( + [1, residual.size(1), residual.size(2)], device=residual.device + ) + hidden_att = [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([1, 1, 1024], dtype=torch.float, 
device="cuda"), + ] + hidden_lstm = [ + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + ] + return last_outputs, hidden_att, hidden_lstm + + def waveglow_infer_forward(self, spect, sigma=0.8): + """Waveglow infer function. + Fixes ONNX unsupported operator errors with replacement + for supported ones. + """ + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) + spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().reshape( + spect.size(0), spect.size(1), -1 + ).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.half, device='cuda' + ) + else: + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.float, device='cuda' + ) + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.half, + device='cuda' + ) + else: + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.float, + device='cuda' + ) + audio = torch.cat((sigma*z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) + return audio + + +class SimpleTTSRunner: + def __init__( + self, + encoder, + backward_flow, + forward_flow, + vocoder, + max_frames=500, + gate_threshold=0.5, + ): + self.encoder = encoder + self.backward_flow = backward_flow + self.forward_flow = forward_flow + self.vocoder = vocoder + self.max_frames = max_frames + self.gate_threshold = gate_threshold + + def run(self, speaker_id, text): + + enc_outps_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type( + [text.shape[1], 1, 640], np.float32, "cpu", 0 + ) + + io_binding = self.encoder.io_binding() + io_binding.bind_ortvalue_output("text_emb", enc_outps_ortvalue) + io_binding.bind_cpu_input("speaker_vecs", speaker_id) + io_binding.bind_cpu_input("text", text.reshape([1, -1])) + self.encoder.run_with_iobinding(io_binding) + # enc_outps = self.encoder.run( + # None, {"speaker_vecs": speaker_id, "text": text.reshape([1, -1])}, + # )[0] + + residual = np.random.normal(0, 0.8, size=[self.max_frames, 1, 80]).astype( + np.float32 + ) + + start = time.time() + residual = self.run_backward_flow(residual, enc_outps_ortvalue) + end = time.time() + print(f"First delay {end - start}") + + residual = self.run_forward_flow(residual, enc_outps_ortvalue, num_split=20) + last_audio = None + for residual in residual: + residual = np.transpose(residual, axes=(1, 2, 0)) + start = time.time() + audio = self.vocoder.run(None, {"mels": residual})[0] + audio = np.where((audio > (audio.mean() - audio.std())) | (audio< (audio.mean() + audio.std())), audio, audio.mean()) + tmp 
= audio + if last_audio is not None: + cumsum_vec = np.cumsum(np.concatenate([last_audio, audio], axis=1), axis=1) + ma_vec = (cumsum_vec[:, 5:] - cumsum_vec[:, :-5]) / 5 + audio = ma_vec[:, last_audio.shape[1]:] + last_audio = tmp + end = time.time() + process_time = end - start + audio_time = len(audio.reshape(-1)) / 22050 + print(f" > Real-time factor: {process_time / audio_time}") + audio = audio.reshape(-1) + # audio = audio / np.abs(audio).max() + yield audio + + def run_backward_flow(self, residual, enc_outps_ortvalue): + + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + hidden_att_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + hidden_att_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + residual_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + residual_o, "cpu", 0 + ) + + residual_outp = [residual_ortvalue] + + for i in range(residual.shape[0] - 1, -1, -1): + + io_binding = self.backward_flow.io_binding() + + io_binding.bind_cpu_input("residual", residual[i]) + + io_binding.bind_ortvalue_input("text", enc_outps_ortvalue) + io_binding.bind_ortvalue_input("last_output", residual_outp[0]) + + io_binding.bind_ortvalue_input("hidden_att", hidden_att_ortvalue) + io_binding.bind_ortvalue_input("hidden_att_c", hidden_att_c_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm", hidden_lstm_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm_c", hidden_lstm_c_ortvalue) + + io_binding.bind_output("output", "cpu") + io_binding.bind_output("gate", "cpu") + io_binding.bind_ortvalue_output("hidden_att_o", hidden_att_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_att_o_c", hidden_att_o_c_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o", hidden_lstm_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o_c", hidden_lstm_o_c_ortvalue) + + self.backward_flow.run_with_iobinding(io_binding) + + outp = io_binding.get_outputs() + gates = outp[1].numpy() + residual_outp = [outp[0]] + residual_outp + if (gates > self.gate_threshold).any(): + break + + # Switch input and output to use latest output as input + (hidden_att_ortvalue, hidden_att_o_ortvalue) = ( + hidden_att_o_ortvalue, + hidden_att_ortvalue, + ) + (hidden_att_c_ortvalue, hidden_att_o_c_ortvalue) = ( + hidden_att_o_c_ortvalue, + hidden_att_c_ortvalue, + ) + (hidden_lstm_ortvalue, hidden_lstm_o_ortvalue) = ( + hidden_lstm_o_ortvalue, + hidden_lstm_ortvalue, + ) + (hidden_lstm_c_ortvalue, hidden_lstm_o_c_ortvalue) = ( + hidden_lstm_o_c_ortvalue, + hidden_lstm_c_ortvalue, + ) + + residual = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + return residual + + def run_forward_flow(self, residual, enc_outps_ortvalue, num_split): + + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + hidden_att_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + 
) + hidden_att_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + hidden_att_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + residual_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + residual_o, "cpu", 0 + ) + + residual_outp = [residual_ortvalue] + last_output = residual_ortvalue + for i in range(residual.shape[0]): + + io_binding = self.forward_flow.io_binding() + + io_binding.bind_cpu_input("residual", residual[i]) + + io_binding.bind_ortvalue_input("text", enc_outps_ortvalue) + io_binding.bind_ortvalue_input("last_output", last_output) + + io_binding.bind_ortvalue_input("hidden_att", hidden_att_ortvalue) + io_binding.bind_ortvalue_input("hidden_att_c", hidden_att_c_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm", hidden_lstm_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm_c", hidden_lstm_c_ortvalue) + + io_binding.bind_output("output", "cpu") + io_binding.bind_output("gate", "cpu") + io_binding.bind_ortvalue_output("hidden_att_o", hidden_att_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_att_o_c", hidden_att_o_c_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o", hidden_lstm_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o_c", hidden_lstm_o_c_ortvalue) + + self.forward_flow.run_with_iobinding(io_binding) + + outp = io_binding.get_outputs() + gates = outp[1].numpy() + residual_outp.append(outp[0]) + last_output = outp[0] + if (gates > self.gate_threshold).any(): + break + + # Switch input and output to use latest output as input + (hidden_att_ortvalue, hidden_att_o_ortvalue) = ( + hidden_att_o_ortvalue, + hidden_att_ortvalue, + ) + (hidden_att_c_ortvalue, hidden_att_o_c_ortvalue) = ( + hidden_att_o_c_ortvalue, + hidden_att_c_ortvalue, + ) + (hidden_lstm_ortvalue, hidden_lstm_o_ortvalue) = ( + hidden_lstm_o_ortvalue, + hidden_lstm_ortvalue, + ) + (hidden_lstm_c_ortvalue, hidden_lstm_o_c_ortvalue) = ( + hidden_lstm_o_c_ortvalue, + hidden_lstm_c_ortvalue, + ) + if len(residual_outp) % num_split == 0 and i != 0: + + residual_o = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + yield residual_o + residual_outp = [] + if len(residual_outp) > 0: + residual_o = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + yield residual_o + + def init_states(self, residual): + last_outputs = np.zeros( + [1, residual.shape[1], residual.shape[2]], dtype=np.float32 + ) + hidden_att = [ + np.zeros([1, 1, 1024], dtype=np.float32), + np.zeros([1, 1, 1024], dtype=np.float32), + ] + hidden_lstm = [ + np.zeros([2, 1, 1024], dtype=np.float32), + np.zeros([2, 1, 1024], dtype=np.float32), + ] + return last_outputs, hidden_att, hidden_lstm
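+
+
+# Illustrative usage sketch for SimpleTTSRunner, kept as comments. The ONNX
+# file names, the token_ids sequence (e.g. obtained from Data.get_text) and
+# the int64 input shapes [1, 1] / [1, T] are assumptions for illustration,
+# not values defined by this module:
+#
+#   import numpy as np
+#   import onnxruntime as rt
+#
+#   encoder = rt.InferenceSession("flowtron_encoder.onnx")
+#   backward_flow = rt.InferenceSession("flowtron_backward_flow.onnx")
+#   forward_flow = rt.InferenceSession("flowtron_forward_flow.onnx")
+#   vocoder = rt.InferenceSession("waveglow.onnx")
+#
+#   runner = SimpleTTSRunner(encoder, backward_flow, forward_flow, vocoder)
+#   speaker_id = np.asarray([[83]], dtype=np.int64)   # assumed shape [1, 1]
+#   text = np.asarray([token_ids], dtype=np.int64)    # assumed shape [1, T]
+#
+#   # run() is a generator that yields 1-D audio chunks as the flows advance
+#   audio = np.concatenate([chunk for chunk in runner.run(speaker_id, text)])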