From a7ff1f5ce0d3ade7aad84fedce41753b5a1b941c Mon Sep 17 00:00:00 2001 From: Mykyta Makarov Date: Fri, 20 Nov 2020 01:09:59 +0100 Subject: [PATCH 1/3] Export script implementation --- README.md | 3 + config_onnx.json | 54 +++++++ export_onnx.py | 178 ++++++++++++++++++++++++ flowtron_onnx.py | 356 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 591 insertions(+) create mode 100644 config_onnx.json create mode 100644 export_onnx.py create mode 100644 flowtron_onnx.py diff --git a/README.md b/README.md index 0296684..c83655c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,9 @@ Dataset dependent layers can be [ignored] ## Inference demo 1. `python inference.py -c config.json -f models/flowtron_ljs.pt -w models/waveglow_256channels_v4.pt -t "It is well know that deep generative models have a deep latent space!" -i 0` +## Export to ONNX format +1. `python export_onnx.py -c config_onnx.json -f models/flowtron_libritts.pt -w models/waveglow_256channels_universal_v5.pt -i 83` + ## Related repos [WaveGlow](https://github.com/NVIDIA/WaveGlow) Faster than real time Flow-based Generative Network for Speech Synthesis diff --git a/config_onnx.json b/config_onnx.json new file mode 100644 index 0000000..37d190b --- /dev/null +++ b/config_onnx.json @@ -0,0 +1,54 @@ +{ + "train_config": { + "output_directory": "outdir", + "epochs": 10000000, + "learning_rate": 1e-4, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 5000, + "batch_size": 1, + "seed": 1234, + "checkpoint_path": "", + "ignore_layers": [], + "include_layers": ["speaker", "encoder", "embedding"], + "warmstart_checkpoint_path": "", + "with_tensorboard": true, + "fp16_run": false + }, + "data_config": { + "training_files": "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt", + "validation_files": "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt", + "text_cleaners": ["flowtron_cleaners"], + "p_arpabet": 0.5, + "cmudict_path": "data/cmudict_dictionary", + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "max_wav_value": 32768.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + + "model_config": { + "n_speakers": 123, + "n_speaker_dim": 128, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 2, + "n_mel_channels": 80, + "n_attn_channels": 640, + "n_hidden": 1024, + "n_lstm_layers": 2, + "mel_encoder_n_hidden": 512, + "n_components": 0, + "mean_scale": 0.0, + "fixed_gaussian": true, + "dummy_speaker_embedding": false, + "use_gate_layer": true + } +} diff --git a/export_onnx.py b/export_onnx.py new file mode 100644 index 0000000..c4752d6 --- /dev/null +++ b/export_onnx.py @@ -0,0 +1,178 @@ +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import matplotlib +matplotlib.use("Agg") +import matplotlib.pylab as plt + +import os +import argparse +import json +import sys +import numpy as np +import torch + + +from flowtron_onnx import Flowtron, FlowtronTTS +from torch.utils.data import DataLoader +from data import Data +from train import update_params + +sys.path.insert(0, "tacotron2") +sys.path.insert(0, "tacotron2/waveglow") +from glow import WaveGlow +from scipy.io.wavfile import write +from copy import deepcopy + +import faulthandler +faulthandler.enable() + + +def export(flowtron_path, waveglow_path, output_dir, + speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run): + text = "It is well know that deep generative models have a deep latent space!" + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + # load waveglow + waveglow = torch.load(waveglow_path)['model'].cuda().eval() + waveglow.cuda() + for k in waveglow.convinv: + k.float() + waveglow.eval() + + # load flowtron + model = Flowtron(**model_config).cuda() + state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict'] + model.load_state_dict(state_dict, False) + model.eval() + print("Loaded checkpoint '{}')" .format(flowtron_path)) + + # Script loop parts of the flows + model.script_flows() + + ignore_keys = ['training_files', 'validation_files'] + trainset = Data( + data_config['training_files'], + **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) + speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() + text = trainset.get_text(text).cuda() + text_copy = deepcopy(text.cpu().numpy()) + speaker_vecs = speaker_vecs[None] + text = text[None] + + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + mels = model(residual, speaker_vecs, text) + + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(model, waveglow) + model_infer = torch.jit.trace( + model, [residual, speaker_vecs, text] + ) + torch.onnx.export( + model_infer, + [residual, speaker_vecs, text], + "./flowtron_waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["residual", "speaker_vecs", "text"], + output_names=["audio"], + dynamic_axes={ + "text": {1: "text_seq"}, + "audio": {1: "audio_seq"}, + }, + example_outputs=audio, + verbose=False, + ) + + if not no_test_run: + print("Running test:") + import onnxruntime as rt + sess_options = rt.SessionOptions() + sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_DISABLE_ALL + print("Loading model.") + flowtron_tts = rt.InferenceSession( + "./flowtron_waveglow.onnx", + providers=rt.get_available_providers(), + sess_options=sess_options + ) + print("Model loaded, running tts.") + audio = flowtron_tts.run( + None, + { + "residual": residual.cpu().contiguous().numpy(), + "speaker_vecs": speaker_vecs.cpu().contiguous().numpy(), + "text": text_copy.reshape([1, -1]) + } + ) + print("Finished successfuly, saving the results") + audio = audio[0] + audio = audio / np.abs(audio).max() + + write( + os.path.join( + output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( + speaker_id, sigma + ) + ), + data_config['sampling_rate'], audio + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--config', type=str, + help='JSON file for configuration') + parser.add_argument('-p', '--params', nargs='+', default=[]) + parser.add_argument('-f', '--flowtron_path', + 
help='Path to flowtron state dict', type=str) + parser.add_argument('-w', '--waveglow_path', + help='Path to waveglow state dict', type=str) + parser.add_argument('-i', '--id', help='Speaker id', type=int) + parser.add_argument('-n', '--n_frames', help='Number of frames', + default=400, type=int) + parser.add_argument('-o', "--output_dir", default="results/") + parser.add_argument("-s", "--sigma", default=0.5, type=float) + parser.add_argument("-g", "--gate", default=0.5, type=float) + parser.add_argument("--seed", default=1234, type=int) + parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') + args = parser.parse_args() + + # Parse configs. Globals nicer in this case + with open(args.config) as f: + data = f.read() + + global config + config = json.loads(data) + update_params(config, args.params) + + data_config = config["data_config"] + global model_config + model_config = config["model_config"] + + # Make directory if it doesn't exist + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + os.chmod(args.output_dir, 0o775) + + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + export(args.flowtron_path, args.waveglow_path, args.output_dir, + args.id, args.n_frames, args.sigma, args.gate, args.seed, args.no_test_run) diff --git a/flowtron_onnx.py b/flowtron_onnx.py new file mode 100644 index 0000000..4637fb7 --- /dev/null +++ b/flowtron_onnx.py @@ -0,0 +1,356 @@ +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import sys +# sys.path.insert(0, "tacotron2") +# sys.path.insert(0, "tacotron2/waveglow") +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from flowtron import ( + LinearNorm, + ConvNorm, + GaussianMixture, + MelEncoder, + DenseLayer, + Encoder, + Attention, +) + + +class AR_Back_Step(torch.nn.Module): + + def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, + n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, + add_gate): + super(AR_Back_Step, self).__init__() + self.ar_step = AR_Step(n_mel_channels, n_speaker_dim, n_text_dim, + n_mel_channels+n_speaker_dim, n_hidden, + n_attn_channels, n_lstm_layers, add_gate) + + def forward(self, residual, text): + residual, gates = self.ar_step( + torch.flip(residual, (0, )), text) + residual = torch.flip(residual, (0, )) + return residual, gates + + def trace_layers(self): + self.ar_step.trace_layers() + + +class AR_Step(torch.nn.Module): + __constants__ = ['gate_threshold', 'add_gate'] + + def __init__(self, n_mel_channels, n_speaker_dim, n_text_channels, + n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, + add_gate: bool = False): + super(AR_Step, self).__init__() + self.conv = torch.nn.Conv1d(n_hidden, 2*n_mel_channels, 1).cuda() + self.conv.weight.data = 0.0*self.conv.weight.data + self.conv.bias.data = 0.0*self.conv.bias.data + # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] + self.lstm = torch.nn.LSTM(n_hidden+n_attn_channels, n_hidden, n_lstm_layers).cuda() + self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() + + + self.attention_layer = Attention(n_hidden, n_speaker_dim, + n_text_channels, n_attn_channels,).cuda() + + self.dense_layer = DenseLayer(in_dim=n_hidden, + sizes=[n_hidden, n_hidden]).cuda() + self.add_gate: bool = add_gate + # if self.add_gate: + self.gate_threshold = 0.5 + self.gate_layer = LinearNorm( + n_hidden+n_attn_channels, 1, bias=True, w_init_gain='sigmoid' + ) + + def trace_layers(self): + self.lstm.flatten_parameters() + self.lstm = torch.jit.trace_module( + self.lstm, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1664], dtype=torch.float, device='cuda').normal_(), + (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_()) + ] + } + ) + self.attention_lstm.flatten_parameters() + self.attention_lstm = torch.jit.trace_module( + self.attention_lstm, + inputs={ + 'forward': [ + torch.zeros([1, 1, 80], dtype=torch.float, device='cuda').normal_(), + (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_()) + ] + } + ) + self.conv = torch.jit.trace_module( + self.conv, + inputs={'forward': [torch.zeros([1, 1024, 1], dtype=torch.float, device='cuda').normal_()]} + ) + self.attention_layer = torch.jit.trace_module( + self.attention_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), + torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_(), + torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_() + ] + }, + ) + self.dense_layer = torch.jit.trace_module( + self.dense_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_() + ] + }, + ) + self.gate_layer = torch.jit.trace_module( + self.gate_layer, + inputs={ + 'forward': [ + torch.zeros([1, 1, 1664], 
dtype=torch.float, device='cuda').normal_() + ] + }, + ) + + def forward(self, residual, text): + total_output = [] # seems 10FPS faster than pre-allocation + gate_total = [] + dummy = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) + (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), + torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) + + attention_hidden, (h, c) = self.attention_lstm(dummy, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), + torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + + log_s = decoder_output[:, :, :decoder_output.size(2)//2] + b = decoder_output[:, :, decoder_output.size(2)//2:] + output = (residual[0, :, :] - b)/torch.exp(log_s) + total_output.append(output) + i = torch.tensor(1, dtype=torch.long) + lim = torch.tensor(residual.size(0), dtype=torch.long) + gate_total.append( + torch.sigmoid(self.gate_layer(decoder_input)).view([1]) + if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) + ) + # more than one condition raises errors in onnx for some reason, so just returning gate layer instead + while i < lim: + attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text + ) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + + log_s = decoder_output[:, :, :decoder_output.size(2)//2] + b = decoder_output[:, :, decoder_output.size(2)//2:] + output = (residual[i, :, :] - b)/torch.exp(log_s) + gate_total.append( + torch.sigmoid(self.gate_layer(decoder_input)).view([1]) + if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) + ) + total_output.append(output) + i += 1 + total_output = torch.cat(total_output, 0) + return total_output, torch.cat(gate_total, 0) + + +class Flowtron(torch.nn.Module): + __constants__ = ['gate_threshold'] + + def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, + n_mel_channels, n_hidden, n_attn_channels, n_lstm_layers, + use_gate_layer, mel_encoder_n_hidden, n_components, + fixed_gaussian, mean_scale, dummy_speaker_embedding, + temperature=1, gate_threshold=0.5): + + super(Flowtron, self).__init__() + norm_fn = nn.InstanceNorm1d + self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) + self.embedding = torch.nn.Embedding(n_text, n_text_dim) + self.flows = torch.nn.ModuleList() + self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) + self.dummy_speaker_embedding = dummy_speaker_embedding + self.gate_threshold = gate_threshold + for i in range(n_flows): + add_gate = (i == (n_flows-1) and use_gate_layer) + if i % 2 == 0: + f = AR_Step(n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, n_attn_channels, + n_lstm_layers, + add_gate) + 
self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + else: + f = AR_Back_Step(n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + + @torch.jit.ignore + def script_flows(self): + for i, flow in enumerate(self.flows): + flow.trace_layers() + self.flows[i] = torch.jit.script(flow) + + def forward(self, *args): + residual, speaker_vecs, text = args + speaker_vecs = self.speaker_embedding(speaker_vecs) + text = self.embedding(text).permute(0, 2, 1) + text = self.encoder.infer(text) + text = text.permute(1, 0, 2) + encoder_outputs = torch.cat( + [ + text, + speaker_vecs.expand(text.size(0), -1, -1) + ], 2 + ) + residual = residual.permute(2, 0, 1) + for flow in reversed(self.flows): + residual, gates = flow(residual, encoder_outputs) + gate_trigger_id_tuple = torch.nonzero(gates > self.gate_threshold, as_tuple=True) + if gate_trigger_id_tuple[0].nelement() > 0: + residual = residual[:gate_trigger_id_tuple[0].item(), ...] + return residual.permute(1, 2, 0) + + @staticmethod + def set_temperature_and_gate(flow, temperature, gate_threshold): + flow = flow.ar_step if hasattr(flow, "ar_step") else flow + flow.attention_layer.temperature = temperature + if hasattr(flow, 'gate_layer'): + flow.gate_threshold = gate_threshold + + +class FlowtronTTS(torch.nn.Module): + + def __init__(self, flowtron, waveglow, *args, **kwargs): + super().__init__(*args, **kwargs) + self.flowtron = flowtron + self.waveglow = waveglow + + def trace_flowtron(self, args): + self.flowtron_traced = torch.jit.trace( + self.flowtron, args + ) + + @classmethod + def patch_waveglow(cls, waveglow): + waveglow.forward = cls.waveglow_infer_forward.__get__( + waveglow, type(waveglow) + ) + return waveglow + + def forward(self, *args): + residual, speaker_vecs, text = args + mels = self.flowtron(residual, speaker_vecs, text) + audio = self.waveglow(mels) + return audio + + def waveglow_infer_forward(self, spect, sigma=0.8): + """Waveglow infer function. + + Fixes ONNX unsupported operator errors with replacement + for supported ones. + """ + + spect = self.upsample(spect) + # trim conv artifacts. 
maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) + spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().reshape( + spect.size(0), spect.size(1), -1 + ).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.half, device='cuda' + ) + else: + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.float, device='cuda' + ) + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.half, + device='cuda' + ) + else: + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.float, + device='cuda' + ) + audio = torch.cat((sigma*z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) + return audio From 5733f734f08f073210d1ab7424e67dfbe13fe2f4 Mon Sep 17 00:00:00 2001 From: Mykyta Makarov Date: Fri, 27 Nov 2020 13:34:18 +0100 Subject: [PATCH 2/3] Fix for access exception --- export_onnx.py | 73 ++++++++++++++++++++++++++---------------------- flowtron_onnx.py | 66 +++++++++++++++++++------------------------ 2 files changed, 68 insertions(+), 71 deletions(-) diff --git a/export_onnx.py b/export_onnx.py index c4752d6..8c1ae3f 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -42,8 +42,8 @@ def export(flowtron_path, waveglow_path, output_dir, - speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run): - text = "It is well know that deep generative models have a deep latent space!" + speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run, no_export): + text = "Hello?" 
torch.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -68,45 +68,49 @@ def export(flowtron_path, waveglow_path, output_dir, trainset = Data( data_config['training_files'], **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) + print(trainset.speaker_ids) speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() text = trainset.get_text(text).cuda() text_copy = deepcopy(text.cpu().numpy()) speaker_vecs = speaker_vecs[None] text = text[None] - - with torch.no_grad(): - residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma - mels = model(residual, speaker_vecs, text) - - waveglow = FlowtronTTS.patch_waveglow(waveglow) - - audio = waveglow(mels, sigma=0.8) - - model = FlowtronTTS(model, waveglow) - model_infer = torch.jit.trace( - model, [residual, speaker_vecs, text] - ) - torch.onnx.export( - model_infer, - [residual, speaker_vecs, text], - "./flowtron_waveglow.onnx", - opset_version=11, - do_constant_folding=True, - input_names=["residual", "speaker_vecs", "text"], - output_names=["audio"], - dynamic_axes={ - "text": {1: "text_seq"}, - "audio": {1: "audio_seq"}, - }, - example_outputs=audio, - verbose=False, - ) + if not no_export: + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + mels = model(residual, speaker_vecs, text) + print(mels.shape) + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(model, waveglow) + model_infer = torch.jit.trace( + model, [residual, speaker_vecs, text] + ) + outp = model_infer(residual, speaker_vecs, text) + + torch.onnx.export( + model_infer, + [residual, speaker_vecs, text], + "./flowtron_waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["residual", "speaker_vecs", "text"], + output_names=["audio"], + dynamic_axes={ + "residual": {1: "res_ch", 2: "res_frames"}, + "text": {1: "text_seq"}, + "audio": {1: "audio_seq"}, + }, + example_outputs=outp, + verbose=False, + ) if not no_test_run: print("Running test:") import onnxruntime as rt sess_options = rt.SessionOptions() - sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_DISABLE_ALL + sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL print("Loading model.") flowtron_tts = rt.InferenceSession( "./flowtron_waveglow.onnx", @@ -123,9 +127,8 @@ def export(flowtron_path, waveglow_path, output_dir, } ) print("Finished successfuly, saving the results") - audio = audio[0] + audio = audio[0].reshape(-1) audio = audio / np.abs(audio).max() - write( os.path.join( output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( @@ -153,6 +156,7 @@ def export(flowtron_path, waveglow_path, output_dir, parser.add_argument("-g", "--gate", default=0.5, type=float) parser.add_argument("--seed", default=1234, type=int) parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') + parser.add_argument('--no-export', dest='no_export', action='store_true') args = parser.parse_args() # Parse configs. 
Globals nicer in this case @@ -175,4 +179,5 @@ def export(flowtron_path, waveglow_path, output_dir, torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False export(args.flowtron_path, args.waveglow_path, args.output_dir, - args.id, args.n_frames, args.sigma, args.gate, args.seed, args.no_test_run) + args.id, args.n_frames, args.sigma, args.gate, args.seed, + args.no_test_run, args.no_export) diff --git a/flowtron_onnx.py b/flowtron_onnx.py index 4637fb7..8193f59 100644 --- a/flowtron_onnx.py +++ b/flowtron_onnx.py @@ -44,10 +44,10 @@ def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, n_attn_channels, n_lstm_layers, add_gate) def forward(self, residual, text): - residual, gates = self.ar_step( + residual, gate = self.ar_step( torch.flip(residual, (0, )), text) residual = torch.flip(residual, (0, )) - return residual, gates + return residual, gate def trace_layers(self): self.ar_step.trace_layers() @@ -135,35 +135,14 @@ def trace_layers(self): ) def forward(self, residual, text): - total_output = [] # seems 10FPS faster than pre-allocation + total_output = [] gate_total = [] - dummy = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) + output = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) - - attention_hidden, (h, c) = self.attention_lstm(dummy, (h, c)) - attention_context, attention_weight = self.attention_layer( - attention_hidden, text, text) - attention_context = attention_context.permute(2, 0, 1) - decoder_input = torch.cat((attention_hidden, attention_context), -1) (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) - lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) - lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) - decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) - - log_s = decoder_output[:, :, :decoder_output.size(2)//2] - b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[0, :, :] - b)/torch.exp(log_s) - total_output.append(output) - i = torch.tensor(1, dtype=torch.long) - lim = torch.tensor(residual.size(0), dtype=torch.long) - gate_total.append( - torch.sigmoid(self.gate_layer(decoder_input)).view([1]) - if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) - ) - # more than one condition raises errors in onnx for some reason, so just returning gate layer instead - while i < lim: + for i in range(int(residual.size(0))): attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) attention_context, attention_weight = self.attention_layer( attention_hidden, text, text @@ -176,13 +155,13 @@ def forward(self, residual, text): log_s = decoder_output[:, :, :decoder_output.size(2)//2] b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[i, :, :] - b)/torch.exp(log_s) - gate_total.append( - torch.sigmoid(self.gate_layer(decoder_input)).view([1]) - if self.add_gate else torch.tensor([0], dtype=torch.float, device=output.device) - ) - total_output.append(output) - i += 1 + output = (residual[i, :, :].unsqueeze(0) - b)/torch.exp(log_s) + gate_total += [ + torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) + if self.add_gate else + torch.tensor([0], dtype=torch.float, device=output.device) + ] + total_output += [output] total_output = torch.cat(total_output, 0) return total_output, 
torch.cat(gate_total, 0) @@ -197,7 +176,7 @@ def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, temperature=1, gate_threshold=0.5): super(Flowtron, self).__init__() - norm_fn = nn.InstanceNorm1d + norm_fn = InstanceNorm self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) self.embedding = torch.nn.Embedding(n_text, n_text_dim) self.flows = torch.nn.ModuleList() @@ -249,9 +228,10 @@ def forward(self, *args): residual = residual.permute(2, 0, 1) for flow in reversed(self.flows): residual, gates = flow(residual, encoder_outputs) - gate_trigger_id_tuple = torch.nonzero(gates > self.gate_threshold, as_tuple=True) - if gate_trigger_id_tuple[0].nelement() > 0: - residual = residual[:gate_trigger_id_tuple[0].item(), ...] + gate_trigger_id_tuple = torch.nonzero(gates.double() > self.gate_threshold) + if gate_trigger_id_tuple.nelement() > 0: + indices = torch.arange(gate_trigger_id_tuple[0][0], device=residual.device) + residual = residual.flip(0).index_select(0, indices).flip(0) return residual.permute(1, 2, 0) @staticmethod @@ -262,6 +242,18 @@ def set_temperature_and_gate(flow, temperature, gate_threshold): flow.gate_threshold = gate_threshold +class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): + def __init__(self, *args, **kwargs): + super(InstanceNorm, self).__init__(*args, **kwargs) + + def forward(self, x): + mn = x.mean(-1).detach().unsqueeze(-1) + sd = x.std(-1).detach().unsqueeze(-1) + + x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view(1, -1, 1) + return x + + class FlowtronTTS(torch.nn.Module): def __init__(self, flowtron, waveglow, *args, **kwargs): From 2ba36a80e7ae1cf977be2ca179954e8693587e9f Mon Sep 17 00:00:00 2001 From: eublefar Date: Mon, 25 Oct 2021 21:05:13 +0200 Subject: [PATCH 3/3] ONNX export and prototype for semi-incremental inference --- export_onnx.py | 600 +++++++++++++++++-------- flowtron_onnx.py | 1114 +++++++++++++++++++++++++++++++--------------- 2 files changed, 1183 insertions(+), 531 deletions(-) diff --git a/export_onnx.py b/export_onnx.py index 8c1ae3f..2bbca08 100644 --- a/export_onnx.py +++ b/export_onnx.py @@ -1,183 +1,417 @@ -############################################################################### -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -############################################################################### -import matplotlib -matplotlib.use("Agg") -import matplotlib.pylab as plt - -import os -import argparse -import json -import sys -import numpy as np -import torch - - -from flowtron_onnx import Flowtron, FlowtronTTS -from torch.utils.data import DataLoader -from data import Data -from train import update_params - -sys.path.insert(0, "tacotron2") -sys.path.insert(0, "tacotron2/waveglow") -from glow import WaveGlow -from scipy.io.wavfile import write -from copy import deepcopy - -import faulthandler -faulthandler.enable() - - -def export(flowtron_path, waveglow_path, output_dir, - speaker_id, n_frames, sigma, gate_threshold, seed, no_test_run, no_export): - text = "Hello?" - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - - # load waveglow - waveglow = torch.load(waveglow_path)['model'].cuda().eval() - waveglow.cuda() - for k in waveglow.convinv: - k.float() - waveglow.eval() - - # load flowtron - model = Flowtron(**model_config).cuda() - state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict'] - model.load_state_dict(state_dict, False) - model.eval() - print("Loaded checkpoint '{}')" .format(flowtron_path)) - - # Script loop parts of the flows - model.script_flows() - - ignore_keys = ['training_files', 'validation_files'] - trainset = Data( - data_config['training_files'], - **dict((k, v) for k, v in data_config.items() if k not in ignore_keys)) - print(trainset.speaker_ids) - speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() - text = trainset.get_text(text).cuda() - text_copy = deepcopy(text.cpu().numpy()) - speaker_vecs = speaker_vecs[None] - text = text[None] - if not no_export: - with torch.no_grad(): - residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma - mels = model(residual, speaker_vecs, text) - print(mels.shape) - waveglow = FlowtronTTS.patch_waveglow(waveglow) - - audio = waveglow(mels, sigma=0.8) - - model = FlowtronTTS(model, waveglow) - model_infer = torch.jit.trace( - model, [residual, speaker_vecs, text] - ) - outp = model_infer(residual, speaker_vecs, text) - - torch.onnx.export( - model_infer, - [residual, speaker_vecs, text], - "./flowtron_waveglow.onnx", - opset_version=11, - do_constant_folding=True, - input_names=["residual", "speaker_vecs", "text"], - output_names=["audio"], - dynamic_axes={ - "residual": {1: "res_ch", 2: "res_frames"}, - "text": {1: "text_seq"}, - "audio": {1: "audio_seq"}, - }, - example_outputs=outp, - verbose=False, - ) - - if not no_test_run: - print("Running test:") - import onnxruntime as rt - sess_options = rt.SessionOptions() - sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL - print("Loading model.") - flowtron_tts = rt.InferenceSession( - "./flowtron_waveglow.onnx", - providers=rt.get_available_providers(), - sess_options=sess_options - ) - print("Model loaded, running tts.") - audio = flowtron_tts.run( - None, - { - "residual": residual.cpu().contiguous().numpy(), - "speaker_vecs": speaker_vecs.cpu().contiguous().numpy(), - "text": text_copy.reshape([1, -1]) - } - ) - print("Finished successfuly, saving the results") - audio = audio[0].reshape(-1) - audio = audio / np.abs(audio).max() - write( - os.path.join( - output_dir, 'sid{}_sigma{}_onnx_test.wav'.format( - speaker_id, sigma - ) - ), - data_config['sampling_rate'], audio - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, - help='JSON file for 
configuration') - parser.add_argument('-p', '--params', nargs='+', default=[]) - parser.add_argument('-f', '--flowtron_path', - help='Path to flowtron state dict', type=str) - parser.add_argument('-w', '--waveglow_path', - help='Path to waveglow state dict', type=str) - parser.add_argument('-i', '--id', help='Speaker id', type=int) - parser.add_argument('-n', '--n_frames', help='Number of frames', - default=400, type=int) - parser.add_argument('-o', "--output_dir", default="results/") - parser.add_argument("-s", "--sigma", default=0.5, type=float) - parser.add_argument("-g", "--gate", default=0.5, type=float) - parser.add_argument("--seed", default=1234, type=int) - parser.add_argument('--no-test-run', dest='no_test_run', action='store_true') - parser.add_argument('--no-export', dest='no_export', action='store_true') - args = parser.parse_args() - - # Parse configs. Globals nicer in this case - with open(args.config) as f: - data = f.read() - - global config - config = json.loads(data) - update_params(config, args.params) - - data_config = config["data_config"] - global model_config - model_config = config["model_config"] - - # Make directory if it doesn't exist - if not os.path.isdir(args.output_dir): - os.makedirs(args.output_dir) - os.chmod(args.output_dir, 0o775) - - torch.backends.cudnn.enabled = True - torch.backends.cudnn.benchmark = False - export(args.flowtron_path, args.waveglow_path, args.output_dir, - args.id, args.n_frames, args.sigma, args.gate, args.seed, - args.no_test_run, args.no_export) +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +############################################################################### +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pylab as plt + +import os +import argparse +import json +import sys +import numpy as np +import torch + + +from flowtron_onnx import Flowtron, FlowtronTTS, FlowtronEncoder, SimpleTTSRunner +from torch.utils.data import DataLoader +from data import Data +from train import update_params + +sys.path.insert(0, "tacotron2") +sys.path.insert(0, "tacotron2/WaveGlow") +from glow import WaveGlow +from scipy.io.wavfile import write +from copy import deepcopy +import sounddevice as sd +from queue import Queue + +# import faulthandler +import time + +# faulthandler.enable() + + +def init_states(residual): + last_outputs = torch.zeros( + [1, residual.size(1), residual.size(2)], + device=residual.device, + dtype=torch.float, + ) + hidden_att = [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + ] + hidden_lstm = [ + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + ] + return last_outputs, hidden_att, hidden_lstm + + +def export( + flowtron_path, + waveglow_path, + output_dir, + speaker_id, + n_frames, + sigma, + gate_threshold, + seed, + no_test_run, + no_export, +): + text = """ + I am doing fine + """ + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + + # load waveglow + waveglow = torch.load(waveglow_path)["model"].cuda().eval() + waveglow.cuda() + for k in waveglow.convinv: + k.float() + waveglow.eval() + + # load flowtron + model = Flowtron(**model_config).cuda() + state_dict = torch.load(flowtron_path, map_location="cpu")["model"].state_dict() + + model.load_state_dict(state_dict, False) + model.eval() + print("Loaded checkpoint '{}')".format(flowtron_path)) + + # Script loop parts of the flows + # model.script_flows() + + ignore_keys = ["training_files", "validation_files"] + trainset = Data( + data_config["training_files"], + **dict((k, v) for k, v in data_config.items() if k not in ignore_keys) + ) + print(trainset.speaker_ids) + speaker_vecs = trainset.get_speaker_id(speaker_id).cuda() + text = trainset.get_text(text).cuda() + text_copy = deepcopy(text.cpu().numpy()) + speaker_vecs = speaker_vecs[None] + text = text[None] + if not no_export: + with torch.no_grad(): + residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma + + encoder = FlowtronEncoder( + model.embedding, model.speaker_embedding, model.encoder + ) + + # mels = model(residual, speaker_vecs, text) + # print(mels.shape) + waveglow = FlowtronTTS.patch_waveglow(waveglow) + + # audio = waveglow(mels, sigma=0.8) + + model = FlowtronTTS(encoder, model, waveglow) + + text = text.reshape([1, -1]) + + enc_outps = encoder(speaker_vecs, text) + print("enc_outps.shape", enc_outps.shape) + torch.onnx.export( + encoder, + (speaker_vecs, text), + "./encoder.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["speaker_vecs", "text"], + output_names=["text_emb"], + dynamic_axes={"text": {1: "text_seq"}, "text_emb": {0: "text_seq"}}, + example_outputs=enc_outps, + verbose=False, + ) + + backward_flow = model.backward_flow.ar_step + residual = residual.permute(2, 0, 1) + residual_o, hidden_att, hidden_lstm = init_states(residual) + + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = backward_flow( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + 
hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + torch.onnx.export( + backward_flow, + ( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + "./backward_flow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=[ + "residual", + "text", + "last_output", + "hidden_att", + "hidden_att_c", + "hidden_lstm", + "hidden_lstm_c", + ], + output_names=[ + "output", + "gate", + "hidden_att_o", + "hidden_att_o_c", + "hidden_lstm_o", + "hidden_lstm_o_c", + ], + dynamic_axes={"text": {0: "text_seq"}}, + example_outputs=( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + verbose=False, + ) + + forward_flow = model.forward_flow + + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = forward_flow( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + torch.onnx.export( + forward_flow, + ( + residual[0], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + "./forward_flow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=[ + "residual", + "text", + "last_output", + "hidden_att", + "hidden_att_c", + "hidden_lstm", + "hidden_lstm_c", + ], + output_names=[ + "output", + "gate", + "hidden_att_o", + "hidden_att_o_c", + "hidden_lstm_o", + "hidden_lstm_o_c", + ], + dynamic_axes={"text": {0: "text_seq"}}, + example_outputs=( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ), + verbose=False, + ) + + residual = residual.permute(1, 2, 0) + mels = model(residual, speaker_vecs, text) + + audio = waveglow(mels, sigma=0.8) + + torch.onnx.export( + waveglow, + (mels), + "./waveglow.onnx", + opset_version=11, + do_constant_folding=True, + input_names=["mels"], + output_names=["audio"], + dynamic_axes={"mels": {2: "mel_seq"}, "audio": {1: "audio_seq"}}, + example_outputs=audio, + verbose=False, + ) + + if not no_test_run: + print("Running test:") + import onnxruntime as rt + + sess_options = rt.SessionOptions() + sess_options.graph_optimization_level = ( + rt.GraphOptimizationLevel.ORT_DISABLE_ALL + ) + print("Loading model.") + + print(rt.get_available_providers()) + + encoder = rt.InferenceSession( + "./encoder.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + backward_flow = rt.InferenceSession( + "./backward_flow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + print([i.name for i in backward_flow.get_inputs()]) + forward_flow = rt.InferenceSession( + "./forward_flow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + waveglow = rt.InferenceSession( + "./waveglow.onnx", + providers=rt.get_available_providers()[:1], + sess_options=sess_options, + ) + print("Model loaded, running tts.") + model = SimpleTTSRunner(encoder, backward_flow, forward_flow, waveglow) + speaker_id = speaker_vecs.cpu().contiguous().numpy() + text = text_copy.reshape([1, -1]) + full_audio = [] + print(text.shape[1]) + input("Press enter to start generating:") + start = time.time() + + audio = model.run(speaker_id, text) + queue = Queue() + def callback(indata, outdata, frames, time, status): + if not queue.empty(): + arr = np.zeros((5120, 1)) + inp = queue.get(False) + arr[:inp.shape[0], 0] = inp + outdata[:] = arr + + stream = sd.Stream(channels=1, samplerate=22050, 
callback=callback, blocksize=5120).__enter__() + for i, audio_el in enumerate(audio): + # stream.write(audio_el) + if i==0: + audio_el[:1000] = 0 + queue.put(audio_el) + full_audio += audio_el.tolist() + + while not queue.empty(): + sd.sleep(int(5120/22.05)) + end = time.time() + process_time = end - start + audio_time = len(full_audio) / data_config["sampling_rate"] + print(f" > Processing time: {process_time}") + print(f" > Real-time factor: {process_time / audio_time}") + print("Finished successfuly, saving the results") + print(f"data_config['sampling_rate'] {data_config['sampling_rate']}") + write( + os.path.join( + output_dir, "sid{}_sigma{}_onnx_test.wav".format(speaker_id, sigma) + ), + data_config["sampling_rate"], + np.asarray(full_audio), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config", type=str, help="JSON file for configuration") + parser.add_argument("-p", "--params", nargs="+", default=[]) + parser.add_argument( + "-f", "--flowtron_path", help="Path to flowtron state dict", type=str + ) + parser.add_argument( + "-w", "--waveglow_path", help="Path to waveglow state dict", type=str + ) + parser.add_argument("-i", "--id", help="Speaker id", type=int) + parser.add_argument( + "-n", "--n_frames", help="Number of frames", default=400, type=int + ) + parser.add_argument("-o", "--output_dir", default="results/") + parser.add_argument("-s", "--sigma", default=0.5, type=float) + parser.add_argument("-g", "--gate", default=0.5, type=float) + parser.add_argument("--seed", default=1234, type=int) + parser.add_argument("--no-test-run", dest="no_test_run", action="store_true") + parser.add_argument("--no-export", dest="no_export", action="store_true") + args = parser.parse_args() + + # Parse configs. Globals nicer in this case + with open(args.config) as f: + data = f.read() + + global config + config = json.loads(data) + update_params(config, args.params) + + data_config = config["data_config"] + global model_config + model_config = config["model_config"] + + # Make directory if it doesn't exist + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + os.chmod(args.output_dir, 0o775) + + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + export( + args.flowtron_path, + args.waveglow_path, + args.output_dir, + args.id, + args.n_frames, + args.sigma, + args.gate, + args.seed, + args.no_test_run, + args.no_export, + ) diff --git a/flowtron_onnx.py b/flowtron_onnx.py index 8193f59..37a39c7 100644 --- a/flowtron_onnx.py +++ b/flowtron_onnx.py @@ -1,348 +1,766 @@ -############################################################################### -# -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -############################################################################### -import sys -# sys.path.insert(0, "tacotron2") -# sys.path.insert(0, "tacotron2/waveglow") -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -from flowtron import ( - LinearNorm, - ConvNorm, - GaussianMixture, - MelEncoder, - DenseLayer, - Encoder, - Attention, -) - - -class AR_Back_Step(torch.nn.Module): - - def __init__(self, n_mel_channels, n_speaker_dim, n_text_dim, - n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, - add_gate): - super(AR_Back_Step, self).__init__() - self.ar_step = AR_Step(n_mel_channels, n_speaker_dim, n_text_dim, - n_mel_channels+n_speaker_dim, n_hidden, - n_attn_channels, n_lstm_layers, add_gate) - - def forward(self, residual, text): - residual, gate = self.ar_step( - torch.flip(residual, (0, )), text) - residual = torch.flip(residual, (0, )) - return residual, gate - - def trace_layers(self): - self.ar_step.trace_layers() - - -class AR_Step(torch.nn.Module): - __constants__ = ['gate_threshold', 'add_gate'] - - def __init__(self, n_mel_channels, n_speaker_dim, n_text_channels, - n_in_channels, n_hidden, n_attn_channels, n_lstm_layers, - add_gate: bool = False): - super(AR_Step, self).__init__() - self.conv = torch.nn.Conv1d(n_hidden, 2*n_mel_channels, 1).cuda() - self.conv.weight.data = 0.0*self.conv.weight.data - self.conv.bias.data = 0.0*self.conv.bias.data - # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] - self.lstm = torch.nn.LSTM(n_hidden+n_attn_channels, n_hidden, n_lstm_layers).cuda() - self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() - - - self.attention_layer = Attention(n_hidden, n_speaker_dim, - n_text_channels, n_attn_channels,).cuda() - - self.dense_layer = DenseLayer(in_dim=n_hidden, - sizes=[n_hidden, n_hidden]).cuda() - self.add_gate: bool = add_gate - # if self.add_gate: - self.gate_threshold = 0.5 - self.gate_layer = LinearNorm( - n_hidden+n_attn_channels, 1, bias=True, w_init_gain='sigmoid' - ) - - def trace_layers(self): - self.lstm.flatten_parameters() - self.lstm = torch.jit.trace_module( - self.lstm, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1664], dtype=torch.float, device='cuda').normal_(), - (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda').normal_()) - ] - } - ) - self.attention_lstm.flatten_parameters() - self.attention_lstm = torch.jit.trace_module( - self.attention_lstm, - inputs={ - 'forward': [ - torch.zeros([1, 1, 80], dtype=torch.float, device='cuda').normal_(), - (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_()) - ] - } - ) - self.conv = torch.jit.trace_module( - self.conv, - inputs={'forward': [torch.zeros([1, 1024, 1], dtype=torch.float, device='cuda').normal_()]} - ) - self.attention_layer = torch.jit.trace_module( - self.attention_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_(), - torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_(), - torch.zeros([63, 1, 640], dtype=torch.float, device='cuda').normal_() - ] - }, - ) - self.dense_layer = torch.jit.trace_module( - self.dense_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda').normal_() - ] - }, - ) - self.gate_layer = torch.jit.trace_module( - self.gate_layer, - inputs={ - 'forward': [ - torch.zeros([1, 1, 1664], dtype=torch.float, 
device='cuda').normal_() - ] - }, - ) - - def forward(self, residual, text): - total_output = [] - gate_total = [] - output = torch.zeros([1, residual.size(1), residual.size(2)], device=residual.device) - (h, c) = (torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda'), - torch.zeros([1, 1, 1024], dtype=torch.float, device='cuda')) - (h1, c1) = (torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda'), - torch.zeros([2, 1, 1024], dtype=torch.float, device='cuda')) - for i in range(int(residual.size(0))): - attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) - attention_context, attention_weight = self.attention_layer( - attention_hidden, text, text - ) - attention_context = attention_context.permute(2, 0, 1) - decoder_input = torch.cat((attention_hidden, attention_context), -1) - lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) - lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) - decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) - - log_s = decoder_output[:, :, :decoder_output.size(2)//2] - b = decoder_output[:, :, decoder_output.size(2)//2:] - output = (residual[i, :, :].unsqueeze(0) - b)/torch.exp(log_s) - gate_total += [ - torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) - if self.add_gate else - torch.tensor([0], dtype=torch.float, device=output.device) - ] - total_output += [output] - total_output = torch.cat(total_output, 0) - return total_output, torch.cat(gate_total, 0) - - -class Flowtron(torch.nn.Module): - __constants__ = ['gate_threshold'] - - def __init__(self, n_speakers, n_speaker_dim, n_text, n_text_dim, n_flows, - n_mel_channels, n_hidden, n_attn_channels, n_lstm_layers, - use_gate_layer, mel_encoder_n_hidden, n_components, - fixed_gaussian, mean_scale, dummy_speaker_embedding, - temperature=1, gate_threshold=0.5): - - super(Flowtron, self).__init__() - norm_fn = InstanceNorm - self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) - self.embedding = torch.nn.Embedding(n_text, n_text_dim) - self.flows = torch.nn.ModuleList() - self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) - self.dummy_speaker_embedding = dummy_speaker_embedding - self.gate_threshold = gate_threshold - for i in range(n_flows): - add_gate = (i == (n_flows-1) and use_gate_layer) - if i % 2 == 0: - f = AR_Step(n_mel_channels, - n_speaker_dim, - n_text_dim, - n_mel_channels + n_speaker_dim, - n_hidden, n_attn_channels, - n_lstm_layers, - add_gate) - self.set_temperature_and_gate(f, temperature, gate_threshold) - self.flows.append(f) - else: - f = AR_Back_Step(n_mel_channels, - n_speaker_dim, - n_text_dim, - n_mel_channels + n_speaker_dim, - n_hidden, - n_attn_channels, - n_lstm_layers, - add_gate) - self.set_temperature_and_gate(f, temperature, gate_threshold) - self.flows.append(f) - - @torch.jit.ignore - def script_flows(self): - for i, flow in enumerate(self.flows): - flow.trace_layers() - self.flows[i] = torch.jit.script(flow) - - def forward(self, *args): - residual, speaker_vecs, text = args - speaker_vecs = self.speaker_embedding(speaker_vecs) - text = self.embedding(text).permute(0, 2, 1) - text = self.encoder.infer(text) - text = text.permute(1, 0, 2) - encoder_outputs = torch.cat( - [ - text, - speaker_vecs.expand(text.size(0), -1, -1) - ], 2 - ) - residual = residual.permute(2, 0, 1) - for flow in reversed(self.flows): - residual, gates = flow(residual, encoder_outputs) - gate_trigger_id_tuple = torch.nonzero(gates.double() > self.gate_threshold) - if gate_trigger_id_tuple.nelement() > 0: - indices = 
torch.arange(gate_trigger_id_tuple[0][0], device=residual.device) - residual = residual.flip(0).index_select(0, indices).flip(0) - return residual.permute(1, 2, 0) - - @staticmethod - def set_temperature_and_gate(flow, temperature, gate_threshold): - flow = flow.ar_step if hasattr(flow, "ar_step") else flow - flow.attention_layer.temperature = temperature - if hasattr(flow, 'gate_layer'): - flow.gate_threshold = gate_threshold - - -class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): - def __init__(self, *args, **kwargs): - super(InstanceNorm, self).__init__(*args, **kwargs) - - def forward(self, x): - mn = x.mean(-1).detach().unsqueeze(-1) - sd = x.std(-1).detach().unsqueeze(-1) - - x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view(1, -1, 1) - return x - - -class FlowtronTTS(torch.nn.Module): - - def __init__(self, flowtron, waveglow, *args, **kwargs): - super().__init__(*args, **kwargs) - self.flowtron = flowtron - self.waveglow = waveglow - - def trace_flowtron(self, args): - self.flowtron_traced = torch.jit.trace( - self.flowtron, args - ) - - @classmethod - def patch_waveglow(cls, waveglow): - waveglow.forward = cls.waveglow_infer_forward.__get__( - waveglow, type(waveglow) - ) - return waveglow - - def forward(self, *args): - residual, speaker_vecs, text = args - mels = self.flowtron(residual, speaker_vecs, text) - audio = self.waveglow(mels) - return audio - - def waveglow_infer_forward(self, spect, sigma=0.8): - """Waveglow infer function. - - Fixes ONNX unsupported operator errors with replacement - for supported ones. - """ - - spect = self.upsample(spect) - # trim conv artifacts. maybe pad spec to kernel multiple - time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] - spect = spect[:, :, :-time_cutoff] - # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) - spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().reshape( - spect.size(0), spect.size(1), -1 - ).permute(0, 2, 1) - - if spect.type() == 'torch.cuda.HalfTensor': - audio = torch.randn( - spect.size(0), - self.n_remaining_channels, - spect.size(2), dtype=torch.half, device='cuda' - ) - else: - audio = torch.randn( - spect.size(0), - self.n_remaining_channels, - spect.size(2), dtype=torch.float, device='cuda' - ) - - audio = torch.autograd.Variable(sigma*audio) - - for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1)/2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] - - output = self.WN[k]((audio_0, spect)) - - s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = (audio_1 - b)/torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) - - audio = self.convinv[k](audio, reverse=True) - - if k % self.n_early_every == 0 and k > 0: - if spect.type() == 'torch.cuda.HalfTensor': - z = torch.randn( - spect.size(0), - self.n_early_size, - spect.size(2), - dtype=torch.half, - device='cuda' - ) - else: - z = torch.randn( - spect.size(0), - self.n_early_size, - spect.size(2), - dtype=torch.float, - device='cuda' - ) - audio = torch.cat((sigma*z, audio), 1) - - audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) - return audio +############################################################################### +# +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +############################################################################### +import sys + +# sys.path.insert(0, "tacotron2") +# sys.path.insert(0, "tacotron2/waveglow") +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +import onnxruntime +import time + +from flowtron import ( + LinearNorm, + ConvNorm, + GaussianMixture, + MelEncoder, + DenseLayer, + Encoder, + Attention, +) + + +class AR_Back_Step(torch.nn.Module): + def __init__( + self, + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_in_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ): + super(AR_Back_Step, self).__init__() + self.ar_step = AR_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ) + + def forward(self, residual, text): + residual, gate = self.ar_step(torch.flip(residual, (0,)), text) + residual = torch.flip(residual, (0,)) + return residual, gate + + def trace_layers(self): + self.ar_step.trace_layers() + + +class AR_Step(torch.nn.Module): + __constants__ = ["gate_threshold", "add_gate"] + + def __init__( + self, + n_mel_channels, + n_speaker_dim, + n_text_channels, + n_in_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate: bool = False, + ): + super(AR_Step, self).__init__() + self.conv = torch.nn.Conv1d(n_hidden, 2 * n_mel_channels, 1).cuda() + self.conv.weight.data = 0.0 * self.conv.weight.data + self.conv.bias.data = 0.0 * self.conv.bias.data + # [1, 1, 1664] [2, 1, 1024] [2, 1, 1024] + self.lstm = torch.nn.LSTM( + n_hidden + n_attn_channels, n_hidden, n_lstm_layers + ).cuda() + self.attention_lstm = torch.nn.LSTM(n_mel_channels, n_hidden).cuda() + + self.attention_layer = Attention( + n_hidden, n_speaker_dim, n_text_channels, n_attn_channels, + ).cuda() + + self.dense_layer = DenseLayer( + in_dim=n_hidden, sizes=[n_hidden, n_hidden] + ).cuda() + self.add_gate: bool = add_gate + # if self.add_gate: + self.gate_threshold = 0.5 + self.gate_layer = LinearNorm( + n_hidden + n_attn_channels, 1, bias=True, w_init_gain="sigmoid" + ) + + def trace_layers(self): + self.lstm.flatten_parameters() + self.lstm = torch.jit.trace_module( + self.lstm, + inputs={ + "forward": [ + torch.zeros( + [1, 1, 1664], dtype=torch.float, device="cpu" + ).normal_(), + ( + torch.zeros( + [2, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [2, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + ), + ] + }, + ) + self.attention_lstm.flatten_parameters() + self.attention_lstm = torch.jit.trace_module( + self.attention_lstm, + inputs={ + "forward": [ + torch.zeros([1, 1, 80], dtype=torch.float, device="cpu").normal_(), + ( + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + ), + ] + }, + ) + self.conv = torch.jit.trace_module( + self.conv, + inputs={ + "forward": [ + torch.zeros([1, 1024, 1], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + self.attention_layer = torch.jit.trace_module( + 
self.attention_layer, + inputs={ + "forward": [ + torch.zeros( + [1, 1, 1024], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [63, 1, 640], dtype=torch.float, device="cpu" + ).normal_(), + torch.zeros( + [63, 1, 640], dtype=torch.float, device="cpu" + ).normal_(), + ] + }, + ) + self.dense_layer = torch.jit.trace_module( + self.dense_layer, + inputs={ + "forward": [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + self.gate_layer = torch.jit.trace_module( + self.gate_layer, + inputs={ + "forward": [ + torch.zeros([1, 1, 1664], dtype=torch.float, device="cpu").normal_() + ] + }, + ) + + def forward( + self, + residual, + text, + last_output, + hidden_att_h, + hidden_att_c, + hidden_lstm_h, + hidden_lstm_c, + ): + output = last_output + (h, c) = (hidden_att_h, hidden_att_c) + (h1, c1) = (hidden_lstm_h, hidden_lstm_c) + + attention_hidden, (h, c) = self.attention_lstm(output, (h, c)) + attention_context, attention_weight = self.attention_layer( + attention_hidden, text, text + ) + attention_context = attention_context.permute(2, 0, 1) + decoder_input = torch.cat((attention_hidden, attention_context), -1) + lstm_hidden, (h1, c1) = self.lstm(decoder_input, (h1, c1)) + lstm_hidden = self.dense_layer(lstm_hidden).permute(1, 2, 0) + decoder_output = self.conv(lstm_hidden).permute(2, 0, 1) + a = int(decoder_output.size(2)) // 2 + log_s = decoder_output[:, :, :a] + b = decoder_output[:, :, a:] + output = (residual[:, :].unsqueeze(0) - b) / torch.exp(log_s) + gate = ( + torch.sigmoid(self.gate_layer(decoder_input)).reshape([1]) + if self.add_gate + else torch.tensor([0], dtype=torch.float, device="cpu") + ) + return output, gate, h, c, h1, c1 + + +class FlowtronEncoder(torch.nn.Module): + def __init__(self, embedding, speaker_embedding, encoder): + super().__init__() + self.embedding = embedding + self.speaker_embedding = speaker_embedding + self.encoder = encoder + + def forward(self, speaker_vecs, text): + speaker_vecs = self.speaker_embedding(speaker_vecs) + text = self.embedding(text).permute(0, 2, 1) + text = self.encoder.infer(text) + text = text.permute(1, 0, 2) + encoder_outputs = torch.cat( + [text, speaker_vecs.expand(text.size(0), -1, -1)], 2 + ) + return encoder_outputs + + +class Flowtron(torch.nn.Module): + __constants__ = ["gate_threshold"] + + def __init__( + self, + n_speakers, + n_speaker_dim, + n_text, + n_text_dim, + n_flows, + n_mel_channels, + n_hidden, + n_attn_channels, + n_lstm_layers, + use_gate_layer, + mel_encoder_n_hidden, + n_components, + fixed_gaussian, + mean_scale, + dummy_speaker_embedding, + temperature=1, + gate_threshold=0.5, + ): + + super(Flowtron, self).__init__() + norm_fn = InstanceNorm + self.speaker_embedding = torch.nn.Embedding(n_speakers, n_speaker_dim) + self.embedding = torch.nn.Embedding(n_text, n_text_dim) + self.flows = torch.nn.ModuleList() + self.encoder = Encoder(norm_fn=norm_fn, encoder_embedding_dim=n_text_dim) + self.dummy_speaker_embedding = dummy_speaker_embedding + self.gate_threshold = gate_threshold + for i in range(n_flows): + add_gate = i == (n_flows - 1) and use_gate_layer + if i % 2 == 0: + f = AR_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + n_lstm_layers, + add_gate, + ) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + else: + f = AR_Back_Step( + n_mel_channels, + n_speaker_dim, + n_text_dim, + n_mel_channels + n_speaker_dim, + n_hidden, + n_attn_channels, + 
n_lstm_layers, + add_gate, + ) + self.set_temperature_and_gate(f, temperature, gate_threshold) + self.flows.append(f) + + @torch.jit.ignore + def script_flows(self): + for i, flow in enumerate(self.flows): + flow.trace_layers() + self.flows[i] = torch.jit.script(flow) + + def forward( + self, residual, encoder_outputs, last_outputs, hidden_atts, hidden_lstms, + ): + output1, gate1, hidden_att1, hidden_lstm1 = self.flows[1]( + residual, encoder_outputs, last_outputs[1], hidden_atts[1], hidden_lstms[1], + ) + output0, gate0, hidden_att0, hidden_lstm0 = self.flows[0]( + output1, encoder_outputs, last_outputs[0], hidden_atts[0], hidden_lstms[0], + ) + return ( + output0, + torch.cat([gate0, gate1]), + [output0, output1], + [hidden_att0, hidden_att1], + [hidden_lstm0, hidden_lstm1], + ) + + @staticmethod + def set_temperature_and_gate(flow, temperature, gate_threshold): + flow = flow.ar_step if hasattr(flow, "ar_step") else flow + flow.attention_layer.temperature = temperature + if hasattr(flow, "gate_layer"): + flow.gate_threshold = gate_threshold + + +class InstanceNorm(torch.nn.modules.instancenorm._InstanceNorm): + def __init__(self, *args, **kwargs): + super(InstanceNorm, self).__init__(*args, **kwargs) + + def forward(self, x): + mn = x.mean(-1).detach().unsqueeze(-1) + sd = x.std(-1).detach().unsqueeze(-1) + + x = ((x - mn) / (sd + 1e-8)) * self.weight.view(1, -1, 1) + self.bias.view( + 1, -1, 1 + ) + return x + + +class FlowtronTTS(torch.nn.Module): + def __init__(self, encoder, flowtron, waveglow, *args, **kwargs): + super().__init__(*args, **kwargs) + self.encoder = encoder + self.flowtron = flowtron + self.forward_flow = flowtron.flows[0] + self.backward_flow = flowtron.flows[1] + self.waveglow = waveglow + + def trace_flowtron(self, args): + self.flowtron_traced = torch.jit.trace(self.flowtron, args) + + @classmethod + def patch_waveglow(cls, waveglow): + waveglow.forward = cls.waveglow_infer_forward.__get__(waveglow, type(waveglow)) + return waveglow + + def forward(self, *args): + residual, speaker_vecs, text = args + enc_outps = self.encoder(speaker_vecs, text) + + residual = residual.permute(2, 0, 1) + + residual_outp = [] + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + for i in range(residual.shape[0] - 1, -1, -1): + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = self.backward_flow.ar_step( + residual[i], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + residual_outp = [residual_o] + residual_outp + if (gates > self.flowtron.gate_threshold).any(): + break + + residual = torch.cat(residual_outp, dim=0) + + residual_outp = [] + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + for i in range(residual.shape[0]): + ( + residual_o, + gates, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) = self.forward_flow( + residual[i], + enc_outps, + residual_o, + hidden_att[0], + hidden_att[1], + hidden_lstm[0], + hidden_lstm[1], + ) + residual_outp.append(residual_o) + if (gates > self.flowtron.gate_threshold).any(): + break + residual = torch.cat(residual_outp) + residual = residual.permute(1, 2, 0) + # audio = self.waveglow(residual) + return residual + + def init_states(self, residual): + last_outputs = torch.zeros( + [1, residual.size(1), residual.size(2)], device=residual.device + ) + hidden_att = [ + torch.zeros([1, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([1, 1, 1024], dtype=torch.float, 
device="cuda"), + ] + hidden_lstm = [ + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + torch.zeros([2, 1, 1024], dtype=torch.float, device="cuda"), + ] + return last_outputs, hidden_att, hidden_lstm + + def waveglow_infer_forward(self, spect, sigma=0.8): + """Waveglow infer function. + Fixes ONNX unsupported operator errors with replacement + for supported ones. + """ + + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + # Replacing unfold since it is compiled into a weird onnx representation (with slices and concat) + spect = spect.reshape(1, 80, -1, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().reshape( + spect.size(0), spect.size(1), -1 + ).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.half, device='cuda' + ) + else: + audio = torch.randn( + spect.size(0), + self.n_remaining_channels, + spect.size(2), dtype=torch.float, device='cuda' + ) + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.half, + device='cuda' + ) + else: + z = torch.randn( + spect.size(0), + self.n_early_size, + spect.size(2), + dtype=torch.float, + device='cuda' + ) + audio = torch.cat((sigma*z, audio), 1) + + audio = audio.permute(0, 2, 1).contiguous().reshape(audio.size(0), -1) + return audio + + +class SimpleTTSRunner: + def __init__( + self, + encoder, + backward_flow, + forward_flow, + vocoder, + max_frames=500, + gate_threshold=0.5, + ): + self.encoder = encoder + self.backward_flow = backward_flow + self.forward_flow = forward_flow + self.vocoder = vocoder + self.max_frames = max_frames + self.gate_threshold = gate_threshold + + def run(self, speaker_id, text): + + enc_outps_ortvalue = onnxruntime.OrtValue.ortvalue_from_shape_and_type( + [text.shape[1], 1, 640], np.float32, "cpu", 0 + ) + + io_binding = self.encoder.io_binding() + io_binding.bind_ortvalue_output("text_emb", enc_outps_ortvalue) + io_binding.bind_cpu_input("speaker_vecs", speaker_id) + io_binding.bind_cpu_input("text", text.reshape([1, -1])) + self.encoder.run_with_iobinding(io_binding) + # enc_outps = self.encoder.run( + # None, {"speaker_vecs": speaker_id, "text": text.reshape([1, -1])}, + # )[0] + + residual = np.random.normal(0, 0.8, size=[self.max_frames, 1, 80]).astype( + np.float32 + ) + + start = time.time() + residual = self.run_backward_flow(residual, enc_outps_ortvalue) + end = time.time() + print(f"First delay {end - start}") + + residual = self.run_forward_flow(residual, enc_outps_ortvalue, num_split=20) + last_audio = None + for residual in residual: + residual = np.transpose(residual, axes=(1, 2, 0)) + start = time.time() + audio = self.vocoder.run(None, {"mels": residual})[0] + audio = np.where((audio > (audio.mean() - audio.std())) | (audio< (audio.mean() + audio.std())), audio, audio.mean()) + tmp 
= audio + if last_audio is not None: + cumsum_vec = np.cumsum(np.concatenate([last_audio, audio], axis=1), axis=1) + ma_vec = (cumsum_vec[:, 5:] - cumsum_vec[:, :-5]) / 5 + audio = ma_vec[:, last_audio.shape[1]:] + last_audio = tmp + end = time.time() + process_time = end - start + audio_time = len(audio.reshape(-1)) / 22050 + print(f" > Real-time factor: {process_time / audio_time}") + audio = audio.reshape(-1) + # audio = audio / np.abs(audio).max() + yield audio + + def run_backward_flow(self, residual, enc_outps_ortvalue): + + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + hidden_att_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + hidden_att_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + residual_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + residual_o, "cpu", 0 + ) + + residual_outp = [residual_ortvalue] + + for i in range(residual.shape[0] - 1, -1, -1): + + io_binding = self.backward_flow.io_binding() + + io_binding.bind_cpu_input("residual", residual[i]) + + io_binding.bind_ortvalue_input("text", enc_outps_ortvalue) + io_binding.bind_ortvalue_input("last_output", residual_outp[0]) + + io_binding.bind_ortvalue_input("hidden_att", hidden_att_ortvalue) + io_binding.bind_ortvalue_input("hidden_att_c", hidden_att_c_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm", hidden_lstm_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm_c", hidden_lstm_c_ortvalue) + + io_binding.bind_output("output", "cpu") + io_binding.bind_output("gate", "cpu") + io_binding.bind_ortvalue_output("hidden_att_o", hidden_att_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_att_o_c", hidden_att_o_c_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o", hidden_lstm_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o_c", hidden_lstm_o_c_ortvalue) + + self.backward_flow.run_with_iobinding(io_binding) + + outp = io_binding.get_outputs() + gates = outp[1].numpy() + residual_outp = [outp[0]] + residual_outp + if (gates > self.gate_threshold).any(): + break + + # Switch input and output to use latest output as input + (hidden_att_ortvalue, hidden_att_o_ortvalue) = ( + hidden_att_o_ortvalue, + hidden_att_ortvalue, + ) + (hidden_att_c_ortvalue, hidden_att_o_c_ortvalue) = ( + hidden_att_o_c_ortvalue, + hidden_att_c_ortvalue, + ) + (hidden_lstm_ortvalue, hidden_lstm_o_ortvalue) = ( + hidden_lstm_o_ortvalue, + hidden_lstm_ortvalue, + ) + (hidden_lstm_c_ortvalue, hidden_lstm_o_c_ortvalue) = ( + hidden_lstm_o_c_ortvalue, + hidden_lstm_c_ortvalue, + ) + + residual = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + return residual + + def run_forward_flow(self, residual, enc_outps_ortvalue, num_split): + + residual_o, hidden_att, hidden_lstm = self.init_states(residual) + + hidden_att_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + 
) + hidden_att_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + hidden_att_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[0], "cpu", 0 + ) + hidden_att_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_att[1], "cpu", 0 + ) + hidden_lstm_o_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[0], "cpu", 0 + ) + hidden_lstm_o_c_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + hidden_lstm[1], "cpu", 0 + ) + + residual_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy( + residual_o, "cpu", 0 + ) + + residual_outp = [residual_ortvalue] + last_output = residual_ortvalue + for i in range(residual.shape[0]): + + io_binding = self.forward_flow.io_binding() + + io_binding.bind_cpu_input("residual", residual[i]) + + io_binding.bind_ortvalue_input("text", enc_outps_ortvalue) + io_binding.bind_ortvalue_input("last_output", last_output) + + io_binding.bind_ortvalue_input("hidden_att", hidden_att_ortvalue) + io_binding.bind_ortvalue_input("hidden_att_c", hidden_att_c_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm", hidden_lstm_ortvalue) + io_binding.bind_ortvalue_input("hidden_lstm_c", hidden_lstm_c_ortvalue) + + io_binding.bind_output("output", "cpu") + io_binding.bind_output("gate", "cpu") + io_binding.bind_ortvalue_output("hidden_att_o", hidden_att_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_att_o_c", hidden_att_o_c_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o", hidden_lstm_o_ortvalue) + io_binding.bind_ortvalue_output("hidden_lstm_o_c", hidden_lstm_o_c_ortvalue) + + self.forward_flow.run_with_iobinding(io_binding) + + outp = io_binding.get_outputs() + gates = outp[1].numpy() + residual_outp.append(outp[0]) + last_output = outp[0] + if (gates > self.gate_threshold).any(): + break + + # Switch input and output to use latest output as input + (hidden_att_ortvalue, hidden_att_o_ortvalue) = ( + hidden_att_o_ortvalue, + hidden_att_ortvalue, + ) + (hidden_att_c_ortvalue, hidden_att_o_c_ortvalue) = ( + hidden_att_o_c_ortvalue, + hidden_att_c_ortvalue, + ) + (hidden_lstm_ortvalue, hidden_lstm_o_ortvalue) = ( + hidden_lstm_o_ortvalue, + hidden_lstm_ortvalue, + ) + (hidden_lstm_c_ortvalue, hidden_lstm_o_c_ortvalue) = ( + hidden_lstm_o_c_ortvalue, + hidden_lstm_c_ortvalue, + ) + if len(residual_outp) % num_split == 0 and i != 0: + + residual_o = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + yield residual_o + residual_outp = [] + if len(residual_outp) > 0: + residual_o = np.concatenate( + [residual_ort.numpy() for residual_ort in residual_outp], axis=0 + ) + + yield residual_o + + def init_states(self, residual): + last_outputs = np.zeros( + [1, residual.shape[1], residual.shape[2]], dtype=np.float32 + ) + hidden_att = [ + np.zeros([1, 1, 1024], dtype=np.float32), + np.zeros([1, 1, 1024], dtype=np.float32), + ] + hidden_lstm = [ + np.zeros([2, 1, 1024], dtype=np.float32), + np.zeros([2, 1, 1024], dtype=np.float32), + ] + return last_outputs, hidden_att, hidden_lstm
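+
+
+# Illustrative usage sketch for SimpleTTSRunner, kept as comments. The ONNX
+# file names, the token_ids sequence (e.g. obtained from Data.get_text) and
+# the int64 input shapes [1, 1] / [1, T] are assumptions for illustration,
+# not values defined by this module:
+#
+#   import numpy as np
+#   import onnxruntime as rt
+#
+#   encoder = rt.InferenceSession("flowtron_encoder.onnx")
+#   backward_flow = rt.InferenceSession("flowtron_backward_flow.onnx")
+#   forward_flow = rt.InferenceSession("flowtron_forward_flow.onnx")
+#   vocoder = rt.InferenceSession("waveglow.onnx")
+#
+#   runner = SimpleTTSRunner(encoder, backward_flow, forward_flow, vocoder)
+#   speaker_id = np.asarray([[83]], dtype=np.int64)   # assumed shape [1, 1]
+#   text = np.asarray([token_ids], dtype=np.int64)    # assumed shape [1, T]
+#
+#   # run() is a generator that yields 1-D audio chunks as the flows advance
+#   audio = np.concatenate([chunk for chunk in runner.run(speaker_id, text)])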