diff --git a/PaddleSpeech/DeepVoice3/README_cn.md b/PaddleSpeech/DeepVoice3/README_cn.md
index 0828c9835747c795aa6398d7f4ad5dcea34aa674..a726a074e782289d2212808d118c637699d79ddf 100644
--- a/PaddleSpeech/DeepVoice3/README_cn.md
+++ b/PaddleSpeech/DeepVoice3/README_cn.md
@@ -10,7 +10,7 @@ A Paddle implementation of Deepvoice3, a convolutional-network-based text-to-speech (Text to Speech) model
 
 ### Install the paddlepaddle framework
 
-For faster training and better support, we recommend using the latest development version of paddle. Users can download the latest compiled development whl package, or compile Paddle from source.
+For faster training and better support, we recommend using the latest development version of Paddle. Users can either download the latest compiled development whl package or compile Paddle from source.
 
 1. Download the latest compiled development whl package. A suitable version can be chosen from the [**multi-version wheel list (dev)**](https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/install/Tables.html#whl-dev) page.
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
index 3e43232df3542c9cd12bd8228c0557a362fa4baf..0805135ff8a55163d6a5ea840d46cfa09b1139c2 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/conv.py
@@ -31,7 +31,7 @@ class Conv1D(dg.Layer):
 
     def __init__(self,
                  name_scope,
-                 in_cahnnels,
+                 in_channels,
                  num_filters,
                  filter_size=3,
                  dilation=1,
@@ -49,7 +49,7 @@ class Conv1D(dg.Layer):
         else:
             padding = (dilation * (filter_size - 1)) // 2
 
-        self.in_channels = in_cahnnels
+        self.in_channels = in_channels
         self.num_filters = num_filters
         self.filter_size = filter_size
         self.dilation = dilation
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
index bb9d62a31bfe44297fc53644d2d5cc4ea77ebc09..d6dc55db2a6574fa5327f6aa07141d19947ad9ad 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/data.py
@@ -294,7 +294,6 @@ def create_batch(batch):
     text_positions = np.array(
         [_pad(np.arange(1, len(x[0]) + 1), max_input_len) for x in batch],
         dtype=np.int64)
-    text_positions = np.expand_dims(text_positions, axis=-1)
 
     max_decoder_target_len = max_target_len // r // downsample_step
 
@@ -304,7 +303,6 @@ def create_batch(batch):
         np.expand_dims(
             np.arange(
                 s, e, dtype=np.int64), axis=0), (len(batch), 1))
-    frame_positions = np.expand_dims(frame_positions, axis=-1)
 
     # done flags
     done = np.array([
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
index 4dda07c1fe53df7a040b435e1ac0c7840ad5b840..ca6dbbb6105d4e127187632d36d6cad0a1216e4e 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/deepvoice3.py
@@ -591,10 +591,10 @@ class Decoder(dg.Layer):
                 of text inputs for each example.
             inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                 mel-spectrogram, which is used as decoder inputs when training.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
             speaker_embed: shape(batch_size, speaker_dim), speaker embedding,
                 only used for multispeaker model.
@@ -717,7 +717,7 @@ class Decoder(dg.Layer):
             values (Variable): shape(B, C_emb, 1, T_enc), the value
                 representation from an encoder, where C_emb means text
                 embedding size.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
@@ -789,7 +789,7 @@ class Decoder(dg.Layer):
 
         while True:
             frame_pos = fluid.layers.fill_constant(
-                shape=[B, 1, 1], value=t + 1, dtype="int64")
+                shape=[B, 1], value=t + 1, dtype="int64")
             w = self.query_position_rate
             if self.n_speakers > 1:
                 w = w * fluid.layers.reshape(
@@ -1222,7 +1222,7 @@ class DeepVoiceTTS(dg.Layer):
         Encode text sequence and decode with ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
             valid_lengths (Variable): shape(batch_size,), dtype: int64,
                 valid lengths for each example in text_sequences.
@@ -1231,10 +1231,10 @@
             speaker_indices (Variable, optional): Shape(Batch_size),
                 dtype: int64. Speaker index for each example. This arg is
                 not None only when the model is a multispeaker model.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
 
         Returns:
@@ -1295,12 +1295,12 @@ class DeepVoiceTTS(dg.Layer):
         Encode text sequence and decode without ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            speaker_indices (Variable, optional): Shape(Batch_size, 1),
+            speaker_indices (Variable, optional): Shape(Batch_size),
                 dtype: int64. Speaker index for each example. This arg is
                 not None only when the model is a multispeaker model.
@@ -1423,7 +1423,7 @@ class ConvS2S(dg.Layer):
         Encode text sequence and decode with ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
             valid_lengths (Variable): shape(batch_size,), dtype: int64,
                 valid lengths for each example in text_sequences.
@@ -1432,10 +1432,10 @@
             speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
                 dtype: float32. Speaker embeddings. This arg is not None
                 only when the model is a multispeaker model.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
-            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
+            frame_positions (Variable): Shape(B, T_dec // r), dtype:
                 int64. Positions indices for each decoder time steps.
 
         Returns:
@@ -1466,9 +1466,9 @@ class ConvS2S(dg.Layer):
         Encode text sequence and decode without ground truth mel spectrogram.
 
         Args:
-            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe
+            text_sequences (Variable): Shape(B, T_enc), dtype: int64. The
                 input text indices. T_enc means the timesteps of
                 text_sequences.
-            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
+            text_positions (Variable): Shape(B, T_enc), dtype: int64.
                 Positions indices for text inputs for the encoder, where
                 T_enc means the encoder timesteps.
             speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim),
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
index 9c9518ce8aa73ad12ec225dfd4fcdeb5f5d5995b..c947f5bd2315b73e57f72c4396465f8c61412e35 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/dry_run.py
@@ -48,7 +48,7 @@ def dry_run(model):
     mel_dim = hparams.num_mels
 
     x = np.random.randint(
-        low=0, high=n_vocab, size=(batch_size, enc_length, 1), dtype="int64")
+        low=0, high=n_vocab, size=(batch_size, enc_length), dtype="int64")
     input_lengths = np.arange(
         enc_length - batch_size + 1, enc_length + 1, dtype="int64")
     mel = np.random.randn(batch_size, mel_dim, 1, mel_length).astype("float32")
@@ -60,18 +60,16 @@ def dry_run(model):
             0, enc_length, dtype="int64"), (batch_size, 1))
     text_mask = text_positions > np.expand_dims(input_lengths, 1)
     text_positions[text_mask] = 0
-    text_positions = np.expand_dims(text_positions, axis=-1)
 
     frame_positions = np.tile(
         np.arange(
             1, decoder_length + 1, dtype="int64"), (batch_size, 1))
-    frame_positions = np.expand_dims(frame_positions, axis=-1)
 
     done = np.zeros(shape=(batch_size, 1, 1, decoder_length), dtype="float32")
     target_lengths = np.array([snd_sample_length] * batch_size).astype("int64")
 
     speaker_ids = np.random.randint(
-        low=0, high=n_speakers, size=(batch_size, 1),
+        low=0, high=n_speakers, size=(batch_size,),
         dtype="int64") if n_speakers > 1 else None
     ismultispeaker = speaker_ids is not None
diff --git a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
index b2921fdd089cf1909fda9dd307db3f18dedab9a6..d651c5edc380574773e3852919c45162590f73ad 100644
--- a/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
+++ b/PaddleSpeech/DeepVoice3/deepvoice3_paddle/modules.py
@@ -366,14 +366,14 @@ class PositionEmbedding(dg.Layer):
         self._dtype = dtype
 
     def set_weight(self, array):
-        assert self.embed._w.shape == list(array.shape), "shape does not match"
-        self.embed._w.value().get_tensor().set(
-            array, fluid.framework._current_expected_place())
+        assert self.embed.weight.shape == list(
+            array.shape), "shape does not match"
+        self.embed.weight.set_value(array)
 
     def forward(self, indices, speaker_position_rate=None):
         """
         Args:
-            indices (Variable): Shape (B, T, 1), dtype: int64, position
+            indices (Variable): Shape (B, T), dtype: int64, position
                 indices, where B means the batch size, T means the time
                 steps.
             speaker_position_rate (Variable | float, optional), position rate.
                 It can be a float point number or a Variable with
@@ -391,7 +391,7 @@
             weight = compute_position_embedding(rad)
             out = self._helper.create_variable_for_type_inference(self._dtype)
             self._helper.append_op(
-                type="lookup_table",
+                type="lookup_table_v2",
                 inputs={"Ids": indices,
                         "W": weight},
                 outputs={"Out": out},
@@ -417,7 +417,7 @@
             weight = compute_position_embedding(scaled_rad)
             out = self._helper.create_variable_for_type_inference(self._dtype)
             self._helper.append_op(
-                type="lookup_table",
+                type="lookup_table_v2",
                 inputs={"Ids": indices,
                         "W": weight},
                 outputs={"Out": out},
@@ -441,7 +441,7 @@
                     self._dtype)
                 sequence = indices[i]
                 self._helper.append_op(
-                    type="lookup_table",
+                    type="lookup_table_v2",
                     inputs={"Ids": sequence,
                             "W": weight},
                     outputs={"Out": out},
diff --git a/PaddleSpeech/DeepVoice3/eval_model.py b/PaddleSpeech/DeepVoice3/eval_model.py
index 3d800f3398a7bdc78b976fa5e05748fb7d3a79d5..196b8d0c194b80e71eb5b944e6e812678dbfc4e0 100644
--- a/PaddleSpeech/DeepVoice3/eval_model.py
+++ b/PaddleSpeech/DeepVoice3/eval_model.py
@@ -67,9 +67,9 @@ def tts(model, text, p=0., speaker_id=None):
     model.eval()
 
     sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
-    sequence = np.reshape(sequence, (1, -1, 1))
+    sequence = np.reshape(sequence, (1, -1))
     text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
-    text_positions = np.reshape(text_positions, (1, -1, 1))
+    text_positions = np.reshape(text_positions, (1, -1))
 
     sequence = dg.to_variable(sequence)
     text_positions = dg.to_variable(text_positions)
@@ -191,8 +191,8 @@ def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
 
         # Mel
         writer.add_image(
-            "(Eval) Predicted mel spectrogram text{}_{}".format(
-                idx, speaker_str),
+            "Eval_Predicted_mel_spectrogram_text{}_{}".format(idx,
+                                                              speaker_str),
             prepare_spec_image(mel),
             global_step,
             dataformats='HWC')
@@ -205,8 +205,8 @@ def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
 
         try:
             writer.add_audio(
-                "(Eval) Predicted audio signal {}_{}".format(idx,
-                                                             speaker_str),
+                "Eval_Predicted_audio_signal_{}_{}".format(idx,
+                                                           speaker_str),
                 signal,
                 global_step,
                 sample_rate=hparams.sample_rate)
@@ -273,7 +273,7 @@ def save_states(global_step,
         mel_output = mel_outputs[idx].numpy().squeeze().T
         mel_output = prepare_spec_image(audio._denormalize(mel_output))
         writer.add_image(
-            "Predicted mel spectrogram",
+            "Predicted_mel_spectrogram",
            mel_output,
             global_step,
             dataformats="HWC")
@@ -282,7 +282,7 @@ def save_states(global_step,
         linear_output = linear_outputs[idx].numpy().squeeze().T
         spectrogram = prepare_spec_image(audio._denormalize(linear_output))
         writer.add_image(
-            "Predicted linear spectrogram",
+            "Predicted_linear_spectrogram",
             spectrogram,
             global_step,
             dataformats="HWC")
@@ -293,7 +293,7 @@ def save_states(global_step,
             "step{:09d}_predicted.wav".format(global_step))
         try:
             writer.add_audio(
-                "Predicted audio signal",
+                "Predicted_audio_signal",
                 signal,
                 global_step,
                 sample_rate=hparams.sample_rate)
@@ -306,7 +306,7 @@ def save_states(global_step,
         mel_output = mel[idx].numpy().squeeze().T
         mel_output = prepare_spec_image(audio._denormalize(mel_output))
         writer.add_image(
-            "Target mel spectrogram",
+            "Target_mel_spectrogram",
             mel_output,
             global_step,
             dataformats="HWC")
@@ -315,7 +315,7 @@ def save_states(global_step,
         linear_output = y[idx].numpy().squeeze().T
         spectrogram = prepare_spec_image(audio._denormalize(linear_output))
         writer.add_image(
-            "Target linear spectrogram",
+            "Target_linear_spectrogram",
             spectrogram,
             global_step,
             dataformats="HWC")
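
Note (not part of the patch): a minimal numpy sketch of the input shapes the model expects after this change, mirroring the updated dry_run.py; all sizes below are placeholder values.

    import numpy as np

    batch_size, enc_length, decoder_length = 4, 17, 6  # placeholder sizes
    n_vocab, n_speakers = 100, 2                       # placeholder sizes

    # ids and positions are now rank-2; the trailing singleton dim is gone
    text_sequences = np.random.randint(
        0, n_vocab, size=(batch_size, enc_length), dtype="int64")  # (B, T_enc)
    text_positions = np.tile(
        np.arange(0, enc_length, dtype="int64"), (batch_size, 1))  # (B, T_enc)
    frame_positions = np.tile(
        np.arange(1, decoder_length + 1, dtype="int64"),
        (batch_size, 1))                                           # (B, T_dec // r)
    speaker_ids = np.random.randint(
        0, n_speakers, size=(batch_size,), dtype="int64")          # (B,)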
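
The lookup_table -> lookup_table_v2 switch is what lets the expand_dims calls be dropped: the v1 op expects ids carrying a trailing singleton dimension such as (B, T, 1), while v2 accepts plain (B, T) ids. A small numpy sketch of the v2-style gather, to make the shape contract concrete:

    import numpy as np

    weight = np.random.randn(100, 16).astype("float32")          # (vocab, embed_dim)
    ids = np.random.randint(0, 100, size=(4, 7), dtype="int64")  # (B, T), no trailing 1
    out = weight[ids]                # numpy analogue of the v2 embedding lookup
    assert out.shape == (4, 7, 16)   # (B, T, embed_dim)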
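
Likewise, set_weight in modules.py moves from the private ._w tensor to the public parameter API. A minimal sketch of the new path, assuming a Paddle 1.7-style dygraph build where layers expose .weight and parameters support set_value; the Embedding sizes are hypothetical:

    import numpy as np
    import paddle.fluid.dygraph as dg

    with dg.guard():
        embed = dg.Embedding(size=[100, 16], dtype="float32")
        array = np.random.randn(100, 16).astype("float32")
        # same shape check the patch keeps, then an in-place value update
        assert embed.weight.shape == list(array.shape), "shape does not match"
        embed.weight.set_value(array)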