add paddle tts vs espnet tts demos

9ccef7fa · 小湉湉 · 0c3c2183 · 9ccef7fa · 9ccef7fa · 9ccef7fa
5 changed file
--- a/docs/source/_static/custom.css
+++ b/docs/source/_static/custom.css
@@ -2,4 +2,4 @@
    max-width: 80%;
 }
 .table table{ background:#b9b9b9} 
-.table table td{ background:#FFF} 
+.table table td{ background:#FFF; } 
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
@@ -248,7 +248,8 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
        </tr>    
    </table>
    </div>
+    <br>
+    <br>
 TTS
 -------------------
@@ -633,10 +634,264 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
            </td>
        </tr>   
    </table>
    </div>
+    <br>
+    <br>
+Multi-Speaker TTS
+-------------------
+PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generated by FastSpeech2 + ParallelWaveGAN, we use AISHELL-3 Multi-Speaker TTS dataset.
+.. raw:: html
+    <div class="table">
+    <table border="2" cellspacing="1" cellpadding="1">
+        <tr>
+            <th align="center"> Text </th>
+            <th align="center"> Origin </th>
+            <th align="center"> Generated </th>
+        </tr>
+    <table>
+    <div>
+    <br>
+    <br>
+Duration control in FastSpeech2
+--------------------------------------
+In our FastSpeech2, we can control ``duration``, ``pitch`` and ``energy``, we provide the audio demos of duration control here. ``duration`` means the duration of phonemes, when we reduce duration, the speed of audios will increase, and when we incerase ``duration``, the speed of audios will reduce.
+The ``duration`` of different phonemes in a sentence can have different scale ratios (when you want to slow down one word and keep the other words' speed in a sentence). Here we use a fixed scale ratio for different phonemes to control the ``speed`` of audios.
+The duration control in FastSpeech2 can control the speed of audios will keep the pitch. (in some speech tool, increase the speed will increase the pitch, and vice versa.)
+.. raw:: html
+    <div class="table">
+    <table border="2" cellspacing="1" cellpadding="1">
+        <tr>
+            <th align="center"> Speed(0.8x) </th>
+            <th align="center"> Speed(1x) </th>
+            <th align="center"> Speed(1.2x) </th>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_002.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_002.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_002.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_003.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_003.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_003.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_004.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_004.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_004.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_005.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_005.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_005.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_007.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_007.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_007.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_008.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_008.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_008.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+             <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_009.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_009.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_009.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+    <table>
+    <div>
+    <br>
+    <br>
 Chinese TTS with/without text frontend
 --------------------------------------
@@ -650,9 +905,9 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
    <div class="table">
    <table border="2" cellspacing="1" cellpadding="1">
        <tr>
-            <th align="left"> Text</th>
+            <th align="center"> Text</th>
-            <th align="left"> With Text Frontend </th>
+            <th align="center"> With Text Frontend </th>
-            <th align="left"> Without Text Frontend </th>
+            <th align="center"> Without Text Frontend </th>
        </tr>
        <tr>
            <td>他只是一个纸老虎。</td>
@@ -846,6 +1101,8 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
        </tr>
    <table>
-    </div>  
+    </div>
+    <br>
+    <br> 
\ No newline at end of file
--- a/docs/source/tts/demo_2.rst
+++ b/docs/source/tts/demo_2.rst
@@ -5,3 +5,283 @@ This is an audio demo page to contrast PaddleSpeech TTS and Espnet TTS, We use t
 We use Espnet's released models here.
 FastSpeech2 + Parallel WaveGAN in CSMSC
+.. raw:: html
+    <div class="table">
+    <table border="2" cellspacing="1" cellpadding="1"> 
+        <tr>
+            <th align="center"> Text </th>
+            <th align="center"> Espent TTS </th>
+            <th align="center"> PaddleSpeech TTS </th>
+        </tr>
+        <tr>
+            <td>早上好，今天是2020/10/29，最低温度是-3°C。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>你好，我的编号是37249，很高兴为您服务。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/002.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/002.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>我们公司有37249个人。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/003.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/003.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>我出生于2005年10月8日。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/004.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/004.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>我们习惯在12:30吃中午饭。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/005.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/005.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>只要有超过3/4的人投票同意，你就会成为我们的新班长。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/006.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/006.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>我要买一只价值999.9元的手表。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/007.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/007.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>我的手机号是18544139121，欢迎来电。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/008.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/008.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>明天有62%的概率降雨。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/009.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/009.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>手表厂有五种好产品。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/010.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/010.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>跑马场有五百匹很勇敢的千里马。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/011.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/011.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>有一天，我看到了一栋楼，我顿感不妙，因为我看不清里面有没有人。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/012.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/012.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>史小姐拿着小雨伞去找她的老保姆了。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/013.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/013.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>不要相信这个老奶奶说的话，她一点儿也不好。</td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/014.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/014.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        </table>
+        </div>
--- a/docs/source/tts/test_sentence.txt
+++ b/docs/source/tts/test_sentence.txt
+001 早上好，今天是2020/10/29，最低温度是-3°C。
+002 你好，我的编号是37249，很高兴为您服务。
+003 我们公司有37249个人。
+004 我出生于2005年10月8日。
+005 我们习惯在12:30吃中午饭。
+006 只要有超过3/4的人投票同意，你就会成为我们的新班长。
+007 我要买一只价值999.9元的手表。
+008 我的手机号是18544139121，欢迎来电。
+009 明天有62%的概率降雨。
+010 手表厂有五种好产品。
+011 跑马场有五百匹很勇敢的千里马。
+012 有一天，我看到了一栋楼，我顿感不妙，因为我看不清里面有没有人。
+013 史小姐拿着小雨伞去找她的老保姆了。
+014 不要相信这个老奶奶说的话，她一点儿也不好。
\ No newline at end of file
--- a/parakeet/models/fastspeech2/fastspeech2.py
+++ b/parakeet/models/fastspeech2/fastspeech2.py
@@ -420,9 +420,18 @@ class FastSpeech2(nn.Layer):
        if is_inference:
            # (B, Tmax)
-            d_outs = self.duration_predictor.inference(hs, d_masks)
+            if ds is not None:
+                d_outs = ds
+            else:
+                d_outs = self.duration_predictor.inference(hs, d_masks)
+            if ps is not None:
+                p_outs = ps
+            if es is not None:
+                e_outs = es
            # use prediction in inference
            # (B, Tmax, 1)
            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
@@ -513,7 +522,7 @@ class FastSpeech2(nn.Layer):
        x = paddle.cast(text, 'int64')
        y = speech
        spemb = spembs
-        if durations:
+        if durations is not None:
            d = paddle.cast(durations, 'int64')
        p, e = pitch, energy
        # setup batch axis
@@ -531,9 +540,12 @@ class FastSpeech2(nn.Layer):
        if use_teacher_forcing:
            # use groundtruth of duration, pitch, and energy
-            ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0)
+            ds = d.unsqueeze(0) if d is not None else None
+            ps = p.unsqueeze(0) if p is not None else None
+            es = e.unsqueeze(0) if e is not None else None
+            # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
            # (1, L, odim)
-            _, outs, *_ = self._forward(
+            _, outs, d_outs, *_ = self._forward(
                xs,
                ilens,
                ys,
@@ -542,10 +554,11 @@ class FastSpeech2(nn.Layer):
                es=es,
                spembs=spembs,
                spk_id=spk_id,
-                tone_id=tone_id)
+                tone_id=tone_id,
+                is_inference=True)
        else:
            # (1, L, odim)
-            _, outs, *_ = self._forward(
+            _, outs, d_outs, *_ = self._forward(
                xs,
                ilens,
                ys,