diff --git a/README.md b/README.md
index aacf6c31d86d6f8d1b7561cea68bda827b54af04..a8511921b2b621d0b7894ef0a6b48c626ac1663b 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ Entries to the introduction, and the launch of training and synthesis for differe
## Pre-trained models and audio samples
-Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the url link to the pre-trained model, the dataset that the model is trained on and the total training steps, and several synthesized audio samples based on the pre-trained model.
+Parakeet also releases some well-trained parameters for the example models, which can be accessed in the following tables. Each column of these tables lists resources for one model, including the URL of the pre-trained model, the dataset the model is trained on, and synthesized audio samples based on the pre-trained model.
#### Vocoders
@@ -94,7 +94,7 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels,
- | LJSpeech, 3020 K |
+ LJSpeech |
LJSpeech |
@@ -127,8 +127,8 @@ We provide the model checkpoints of WaveFlow with 64 and 128 residual channels,
- | LJSpeech, 500 K |
- LJSpeech, 2450 K |
+ LJSpeech |
+ LJSpeech |
|
diff --git a/examples/waveflow/README.md b/examples/waveflow/README.md
index 34e69085e417231a0fec6ce50fc9cda1bbc591bd..16364f6d2cb85a37e5f5351209d3c2b8579131c4 100644
--- a/examples/waveflow/README.md
+++ b/examples/waveflow/README.md
@@ -13,8 +13,8 @@ PaddlePaddle dynamic graph implementation of [WaveFlow: A Compact Flow-based Mod
├── synthesis.py # script for speech synthesis
├── train.py # script for model training
├── utils.py # helper functions, e.g., for model checkpointing
-├── parakeet/models/waveflow/data.py # dataset and dataloader settings for LJSpeech
-├── parakeet/models/waveflow/waveflow.py # WaveFlow model high level APIs
+├── data.py # dataset and dataloader settings for LJSpeech
+├── waveflow.py # WaveFlow model high level APIs
└── parakeet/models/waveflow/waveflow_modules.py # WaveFlow model implementation
```
@@ -48,12 +48,12 @@ python -u train.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
--name=${ModelName} --batch_size=4 \
- --parallel=false --use_gpu=true
+ --use_gpu=true
```
#### Save and Load checkpoints
-Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default.
+Our model will save model parameters as checkpoints in `./runs/waveflow/${ModelName}/checkpoint/` every 10000 iterations by default, where `${ModelName}` is the name you choose for a single experiment; it can be anything you like.
The saved checkpoints use the format `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
There are three ways to load a checkpoint and resume training (for example, to load a 500000-iteration checkpoint):
@@ -68,7 +68,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u -m paddle.distributed.launch train.py \
--config=./configs/waveflow_ljspeech.yaml \
--root=./data/LJSpeech-1.1 \
- --name=${ModelName} --parallel=true --use_gpu=true
+ --name=${ModelName} --use_gpu=true
```
Use `export CUDA_VISIBLE_DEVICES=0,1,2,3` to make the GPUs you want to use visible. The `paddle.distributed.launch` module will then use these visible GPUs for data-parallel training in multiprocessing mode.
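For reference, the three resume options map directly onto the arguments of the reworked `io.load_parameters` (see the `parakeet/utils/io.py` hunks below). A minimal sketch of the precedence, assuming `waveflow` and `optimizer` are the objects built in `WaveFlow.build`; the directory path is illustrative:

```python
from parakeet.utils import io

# Precedence implemented by io.load_parameters:
# 1) --checkpoint=<path>  -> checkpoint_path wins and checkpoint_dir is ignored;
# 2) --iteration=<N>      -> load step-N from checkpoint_dir;
# 3) neither flag         -> load the latest checkpoint in checkpoint_dir,
#                            or return 0 if none has been saved yet.
iteration = io.load_parameters(
    model=waveflow,                                      # built beforehand
    optimizer=optimizer,                                 # optional; also restores .pdopt state
    checkpoint_dir="./runs/waveflow/my_run/checkpoint",  # illustrative path
    iteration=None,
    checkpoint_path=None)
```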
diff --git a/examples/waveflow/benchmark.py b/examples/waveflow/benchmark.py
index 058147143bc39d26e4053b0f373340f669322897..222e73272cd3c847ba07b034824536e81d045363 100644
--- a/examples/waveflow/benchmark.py
+++ b/examples/waveflow/benchmark.py
@@ -23,7 +23,7 @@ from paddle import fluid
import utils
from parakeet.utils import io
-from parakeet.models.waveflow import WaveFlow
+from waveflow import WaveFlow
def add_options_to_parser(parser):
diff --git a/parakeet/models/waveflow/data.py b/examples/waveflow/data.py
similarity index 100%
rename from parakeet/models/waveflow/data.py
rename to examples/waveflow/data.py
diff --git a/examples/waveflow/synthesis.py b/examples/waveflow/synthesis.py
index 5f3dd5aa29d8d6dc0a0982fe19432054bcdcd3e4..15c4d3b843165540c6f80f986b23e73ffeb4a59e 100644
--- a/examples/waveflow/synthesis.py
+++ b/examples/waveflow/synthesis.py
@@ -21,9 +21,9 @@ import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
-import utils
-from parakeet.models.waveflow import WaveFlow
from parakeet.utils import io
+import utils
+from waveflow import WaveFlow
def add_options_to_parser(parser):
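Since `data.py` and `waveflow.py` now sit beside the example scripts, `from waveflow import WaveFlow` resolves through the working directory rather than the installed package. When running from `examples/waveflow/` nothing extra is needed; the following is a hedged sketch of making the sibling modules importable from any other directory (purely illustrative, not part of this change):

```python
import os
import sys

# Put the directory containing this script (examples/waveflow/) on the
# module search path so waveflow.py and data.py import from any cwd.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from waveflow import WaveFlow  # resolves to examples/waveflow/waveflow.py
```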
diff --git a/examples/waveflow/train.py b/examples/waveflow/train.py
index 548c5da9d59dfd36cc530ced78967ef3b8ccdbc2..a033369fef4ca5850a31442c567035621d16dcb2 100644
--- a/examples/waveflow/train.py
+++ b/examples/waveflow/train.py
@@ -26,7 +26,7 @@ from tensorboardX import SummaryWriter
import utils
from parakeet.utils import io
-from parakeet.models.waveflow import WaveFlow
+from waveflow import WaveFlow
def add_options_to_parser(parser):
@@ -40,11 +40,6 @@ def add_options_to_parser(parser):
parser.add_argument(
'--root', type=str, help="root path of the LJSpeech dataset")
- parser.add_argument(
- '--parallel',
- type=utils.str2bool,
- default=True,
- help="option to use data parallel training")
parser.add_argument(
'--use_gpu',
type=utils.str2bool,
@@ -66,11 +61,11 @@ def add_options_to_parser(parser):
def train(config):
use_gpu = config.use_gpu
- parallel = config.parallel if use_gpu else False
# Get the rank of the current training process.
- rank = dg.parallel.Env().local_rank if parallel else 0
- nranks = dg.parallel.Env().nranks if parallel else 1
+ rank = dg.parallel.Env().local_rank
+ nranks = dg.parallel.Env().nranks
+ parallel = nranks > 1
if rank == 0:
# Print the whole config setting.
@@ -100,16 +95,7 @@ def train(config):
# Build model.
model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
- model.build()
-
- # Obtain the current iteration.
- if config.checkpoint is None:
- if config.iteration is None:
- iteration = io.load_latest_checkpoint(checkpoint_dir, rank)
- else:
- iteration = config.iteration
- else:
- iteration = int(config.checkpoint.split('/')[-1].split('-')[-1])
+ iteration = model.build()
while iteration < config.max_iterations:
# Run one single training step.
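The deleted `--parallel` flag is now inferred from the launch environment: `dg.parallel.Env()` reads the variables set by `paddle.distributed.launch`, so a plain single-process run sees one rank and data parallelism stays off. A minimal sketch of the detection (the helper name is illustrative):

```python
import paddle.fluid.dygraph as dg

def detect_parallel():
    # Env() is populated by paddle.distributed.launch; a plain
    # `python train.py` run reports local_rank == 0 and nranks == 1.
    env = dg.parallel.Env()
    # Parallel training engages exactly when the launcher spawned more
    # than one process, so no explicit --parallel flag is needed.
    return env.local_rank, env.nranks, env.nranks > 1
```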
diff --git a/parakeet/models/waveflow/waveflow.py b/examples/waveflow/waveflow.py
similarity index 93%
rename from parakeet/models/waveflow/waveflow.py
rename to examples/waveflow/waveflow.py
index faf2fb6dc9e39bc7b1c90db19792970ad5798601..700116b4f2bb33d764acb759aee68c8aa9827162 100644
--- a/parakeet/models/waveflow/waveflow.py
+++ b/examples/waveflow/waveflow.py
@@ -21,11 +21,11 @@ import paddle.fluid.dygraph as dg
from paddle import fluid
from scipy.io.wavfile import write
-import utils
from parakeet.utils import io
from parakeet.modules import weight_norm
-from .data import LJSpeech
-from .waveflow_modules import WaveFlowLoss, WaveFlowModule
+from parakeet.models.waveflow import WaveFlowLoss, WaveFlowModule
+from data import LJSpeech
+import utils
class WaveFlow():
@@ -93,13 +93,12 @@ class WaveFlow():
parameter_list=waveflow.parameters())
# Load parameters.
- io.load_parameters(
- self.checkpoint_dir,
- self.rank,
- waveflow,
- optimizer,
+ iteration = io.load_parameters(
+ model=waveflow,
+ optimizer=optimizer,
+ checkpoint_dir=self.checkpoint_dir,
iteration=config.iteration,
- file_path=config.checkpoint)
+ checkpoint_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
# Data parallelism.
@@ -113,13 +112,11 @@ class WaveFlow():
else:
# Load parameters.
- io.load_parameters(
- self.checkpoint_dir,
- self.rank,
- waveflow,
+ iteration = io.load_parameters(
+ model=waveflow,
+ checkpoint_dir=self.checkpoint_dir,
iteration=config.iteration,
- file_path=config.checkpoint,
- dtype=self.dtype)
+ checkpoint_path=config.checkpoint)
print("Rank {}: checkpoint loaded.".format(self.rank))
for layer in waveflow.sublayers():
@@ -128,6 +125,8 @@ class WaveFlow():
self.waveflow = waveflow
+ return iteration
+
def train_step(self, iteration):
"""Train the model for one step.
@@ -293,6 +292,5 @@ class WaveFlow():
Returns:
None
"""
- io.save_latest_parameters(self.checkpoint_dir, iteration,
- self.waveflow, self.optimizer)
- io.save_latest_checkpoint(self.checkpoint_dir, iteration)
+ io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
+ self.optimizer)
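Taken together, `build()` now threads the restored iteration back into the training loop, and saving collapses into a single `io.save_parameters` call. A condensed sketch of the resulting flow, assuming the names from `train.py`; the `save` method and the loop bookkeeping are illustrative stand-ins for the periodic-checkpoint logic:

```python
model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, tb)
iteration = model.build()  # 0 for a fresh run, otherwise the restored step

while iteration < config.max_iterations:
    model.train_step(iteration)  # run one single training step
    iteration += 1
    if iteration % 10000 == 0:   # default save interval per the README
        # Writes step-{iteration}.pdparams / .pdopt via io.save_parameters.
        model.save(iteration)
```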
diff --git a/parakeet/models/waveflow/__init__.py b/parakeet/models/waveflow/__init__.py
index 73a7914565cafbc2e1a6161c2aa762a74b77e034..b068b590389016015a813b5bb4d73d0736fe56bd 100644
--- a/parakeet/models/waveflow/__init__.py
+++ b/parakeet/models/waveflow/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from parakeet.models.waveflow.waveflow import WaveFlow
+from parakeet.models.waveflow.waveflow_modules import WaveFlowLoss, WaveFlowModule
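After this change the package namespace exposes the reusable building blocks rather than the example-level wrapper, so downstream code imports them as:

```python
# The high-level WaveFlow wrapper now lives in examples/waveflow/;
# the package itself exports only the core module and loss.
from parakeet.models.waveflow import WaveFlowLoss, WaveFlowModule
```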
diff --git a/parakeet/utils/io.py b/parakeet/utils/io.py
index 959dbfb9e1f9724fb2349c03b570c1b117969eeb..ac6d548f67399f8d1ac2d38c0fda3f3b33f4fe64 100644
--- a/parakeet/utils/io.py
+++ b/parakeet/utils/io.py
@@ -18,6 +18,7 @@ import time
import ruamel.yaml
import numpy as np
import paddle.fluid.dygraph as dg
+from paddle.fluid.framework import convert_np_dtype_to_dtype_ as convert_np_dtype
def is_main_process():
@@ -51,8 +52,6 @@ def _load_latest_checkpoint(checkpoint_dir):
Args:
checkpoint_dir (str): the directory where checkpoint is saved.
- rank (int, optional): the rank of the process in multi-process setting.
- Defaults to 0.
Returns:
int: the latest iteration number.
@@ -90,9 +89,8 @@ def load_parameters(model,
optimizer=None,
checkpoint_dir=None,
iteration=None,
- checkpoint_path=None,
- dtype="float32"):
- """Load a specific model checkpoint from disk.
+ checkpoint_path=None):
+ """Load a specific model checkpoint from disk.
Args:
model (obj): model to load parameters.
@@ -102,37 +100,36 @@ def load_parameters(model,
iteration (int, optional): if specified, load the specific checkpoint,
if not specified, load the latest one. Defaults to None.
checkpoint_path (str, optional): if specified, load the checkpoint
- stored in the checkpoint_path. Defaults to None.
- dtype (str, optional): precision of the model parameters.
- Defaults to float32.
+ stored in the checkpoint_path, in which case the argument
+ 'checkpoint_dir' will be ignored. Defaults to None.
Returns:
iteration (int): number of iterations that the loaded checkpoint has
been trained.
"""
-
- if iteration is not None and checkpoint_dir is None:
- raise ValueError(
- "When iteration is specified, checkpoint_dir should not be None")
-
if checkpoint_path is not None:
- # checkpoint is not None
iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
- else:
+ elif checkpoint_dir is not None:
if iteration is None:
iteration = _load_latest_checkpoint(checkpoint_dir)
+ if iteration == 0:
+ return iteration
checkpoint_path = os.path.join(checkpoint_dir,
"step-{}".format(iteration))
- if iteration == 0 and not os.path.exists(checkpoint_path):
- # if step-0 exist, it is also loaded
- return iteration
+ else:
+ raise ValueError(
+ "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
+ )
local_rank = dg.parallel.Env().local_rank
model_dict, optimizer_dict = dg.load_dygraph(checkpoint_path)
- # cast to desired data type
+ state_dict = model.state_dict()
+ # cast to desired data type, for mixed-precision training/inference.
for k, v in model_dict.items():
- model_dict[k] = v.astype(dtype)
+ if k in state_dict and convert_np_dtype(v.dtype) != state_dict[
+ k].dtype:
+ model_dict[k] = v.astype(state_dict[k].numpy().dtype)
model.set_dict(model_dict)
print("[checkpoint] Rank {}: loaded model from {}.pdparams".format(
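As a final note, the iteration number embedded in a checkpoint path follows the `step-<N>` naming convention, which `load_parameters` recovers with a plain string split. A self-contained illustration (the path is made up):

```python
import os

# Mirrors the parsing in io.load_parameters: the iteration is the suffix
# after the last '-' in the checkpoint file's base name.
checkpoint_path = "./runs/waveflow/my_run/checkpoint/step-500000"
iteration = int(os.path.basename(checkpoint_path).split("-")[-1])
assert iteration == 500000
```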