add transformed v1.8 model

4c7fefd4 · huangyuxin · 1db2567e · 4c7fefd4 · 4c7fefd4 · 4c7fefd4
115 changed file
--- a/examples/transv1.8to2.x/data/aishell/aishell.py
+++ b/examples/transv1.8to2.x/data/aishell/aishell.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Aishell mandarin dataset
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+import soundfile
+from data_utils.utility import download
+from data_utils.utility import unpack
+# Default cache root for downloaded speech datasets ('~' is expanded here).
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+# NOTE(review): URL_ROOT is assigned twice in a row; only the second value
+# (the magicdatatech.com host) is ever used, the openslr.org value is dead.
+URL_ROOT = 'http://www.openslr.org/resources/33'
+URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+# MD5 digest handed to download() together with DATA_URL.
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+# Command-line interface; the module docstring is reused as the description.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# Parsed at module level, so `args` is a global read by main().
+args = parser.parse_args()
+def create_manifest(data_dir, manifest_path_prefix):
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    for line in codecs.open(transcript_path, 'r', 'utf-8'):
+        line = line.strip()
+        if line == '':
+            continue
+        audio_id, text = line.split(' ', 1)
+        # remove withespace
+        text = ''.join(text.split())
+        transcript_dict[audio_id] = text
+    data_types = ['train', 'dev', 'test']
+    for type in data_types:
+        del json_lines[:]
+        audio_dir = os.path.join(data_dir, 'wav', type)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.join(subfolder, fname)
+                audio_id = fname[:-4]
+                # if no transcription for audio then skipped
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.' + type
+        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+            for line in json_lines:
+                fout.write(line + '\n')
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create manifest file."""
+    data_dir = os.path.join(target_dir, 'data_aishell')
+    if not os.path.exists(data_dir):
+        filepath = download(url, md5sum, target_dir)
+        unpack(filepath, target_dir)
+        # unpack all audio tar files
+        audio_dir = os.path.join(data_dir, 'wav')
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for ftar in filelist:
+                unpack(os.path.join(subfolder, ftar), subfolder, True)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    create_manifest(data_dir, manifest_path)
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+    prepare_dataset(
+        url=DATA_URL,
+        md5sum=MD5_DATA,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_prefix)
+if __name__ == '__main__':
+    main()
--- a/examples/transv1.8to2.x/data/librispeech/librispeech.py
+++ b/examples/transv1.8to2.x/data/librispeech/librispeech.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Librispeech ASR datasets.
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import distutils.util
+import io
+import json
+import os
+import soundfile
+from data_utils.utility import download
+from data_utils.utility import unpack
+# NOTE(review): URL_ROOT is assigned twice in a row; only the second value
+# (the magicdatatech.com host) is ever used, the openslr.org value is dead.
+URL_ROOT = "http://www.openslr.org/resources/12"
+URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
+# Archive URL for each LibriSpeech subset.
+URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
+URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
+URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
+URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
+URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
+URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
+URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
+# MD5 digest paired with each archive URL above.
+MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
+MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
+MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
+MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
+MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
+MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
+# Command-line interface; the module docstring is reused as the description.
+parser = argparse.ArgumentParser(description=__doc__)
+# The default keeps a literal '~'; main() expands it before use.
+parser.add_argument(
+    "--target_dir",
+    default='~/.cache/paddle/dataset/speech/libri',
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+# NOTE(review): the help text lists train-clean-100 as part of the minimal
+# download, but main() only fetches it when full_download is true.
+parser.add_argument(
+    "--full_download",
+    default="True",
+    type=distutils.util.strtobool,
+    help="Download all datasets for Librispeech."
+    " If False, only download a minimal requirement (test-clean, dev-clean"
+    " train-clean-100). (default: %(default)s)")
+# Parsed at module level, so `args` is a global read by main().
+args = parser.parse_args()
+def create_manifest(data_dir, manifest_path):
+    """Create a manifest json file summarizing the data set, with each line
+    containing the meta data (i.e. audio filepath, transcription text, audio
+    duration) of each audio file within the data set.
+    """
+    print("Creating manifest %s ..." % manifest_path)
+    json_lines = []
+    for subfolder, _, filelist in sorted(os.walk(data_dir)):
+        text_filelist = [
+            filename for filename in filelist if filename.endswith('trans.txt')
+        ]
+        if len(text_filelist) > 0:
+            text_filepath = os.path.join(subfolder, text_filelist[0])
+            for line in io.open(text_filepath, encoding="utf8"):
+                segments = line.strip().split()
+                text = ' '.join(segments[1:]).lower()
+                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
+                audio_data, samplerate = soundfile.read(audio_filepath)
+                duration = float(len(audio_data)) / samplerate
+                json_lines.append(
+                    json.dumps({
+                        'audio_filepath': audio_filepath,
+                        'duration': duration,
+                        'text': text
+                    }))
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
+        for line in json_lines:
+            out_file.write(line + '\n')
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create summmary manifest file.
+    """
+    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
+        # download
+        filepath = download(url, md5sum, target_dir)
+        # unpack
+        unpack(filepath, target_dir)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    # create manifest json file
+    create_manifest(target_dir, manifest_path)
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+    prepare_dataset(
+        url=URL_TEST_CLEAN,
+        md5sum=MD5_TEST_CLEAN,
+        target_dir=os.path.join(args.target_dir, "test-clean"),
+        manifest_path=args.manifest_prefix + ".test-clean")
+    prepare_dataset(
+        url=URL_DEV_CLEAN,
+        md5sum=MD5_DEV_CLEAN,
+        target_dir=os.path.join(args.target_dir, "dev-clean"),
+        manifest_path=args.manifest_prefix + ".dev-clean")
+    if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
+        prepare_dataset(
+            url=URL_TEST_OTHER,
+            md5sum=MD5_TEST_OTHER,
+            target_dir=os.path.join(args.target_dir, "test-other"),
+            manifest_path=args.manifest_prefix + ".test-other")
+        prepare_dataset(
+            url=URL_DEV_OTHER,
+            md5sum=MD5_DEV_OTHER,
+            target_dir=os.path.join(args.target_dir, "dev-other"),
+            manifest_path=args.manifest_prefix + ".dev-other")
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_360,
+            md5sum=MD5_TRAIN_CLEAN_360,
+            target_dir=os.path.join(args.target_dir, "train-clean-360"),
+            manifest_path=args.manifest_prefix + ".train-clean-360")
+        prepare_dataset(
+            url=URL_TRAIN_OTHER_500,
+            md5sum=MD5_TRAIN_OTHER_500,
+            target_dir=os.path.join(args.target_dir, "train-other-500"),
+            manifest_path=args.manifest_prefix + ".train-other-500")
+if __name__ == '__main__':
+    main()
--- a/examples/transv1.8to2.x/data/noise/chime3_background.py
+++ b/examples/transv1.8to2.x/data/noise/chime3_background.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare CHiME3 background data.
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import io
+import json
+import os
+import zipfile
+import soundfile
+import wget
+from paddle.v2.dataset.common import md5file
+# Default cache root for downloaded speech datasets ('~' is expanded here).
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+# Download location and the MD5 digest that main() passes to prepare_chime3().
+URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
+MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"
+# Command-line interface; the module docstring is reused as the description.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/chime3_background",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_filepath",
+    default="manifest.chime3.background",
+    type=str,
+    help="Filepath for output manifests. (default: %(default)s)")
+# Parsed at module level, so `args` is a global read by main().
+args = parser.parse_args()
+def download(url, md5sum, target_dir, filename=None):
+    """Download file from url to target_dir, and check md5sum."""
+    if filename is None:
+        filename = url.split("/")[-1]
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, filename)
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        wget.download(url, target_dir)
+        print("\nMD5 Chesksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+def unpack(filepath, target_dir):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    if filepath.endswith('.zip'):
+        zip = zipfile.ZipFile(filepath, 'r')
+        zip.extractall(target_dir)
+        zip.close()
+    elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
+        tar = zipfile.open(filepath)
+        tar.extractall(target_dir)
+        tar.close()
+    else:
+        raise ValueError("File format is not supported for unpacking.")
+def create_manifest(data_dir, manifest_path):
+    """Create a manifest json file summarizing the data set, with each line
+    containing the meta data (i.e. audio filepath, transcription text, audio
+    duration) of each audio file within the data set.
+    """
+    print("Creating manifest %s ..." % manifest_path)
+    json_lines = []
+    for subfolder, _, filelist in sorted(os.walk(data_dir)):
+        for filename in filelist:
+            if filename.endswith('.wav'):
+                filepath = os.path.join(data_dir, subfolder, filename)
+                audio_data, samplerate = soundfile.read(filepath)
+                duration = float(len(audio_data)) / samplerate
+                json_lines.append(
+                    json.dumps({
+                        'audio_filepath': filepath,
+                        'duration': duration,
+                        'text': ''
+                    }))
+    with io.open(manifest_path, mode='w', encoding='utf8') as out_file:
+        for line in json_lines:
+            out_file.write(line + '\n')
+def prepare_chime3(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create summmary manifest file."""
+    if not os.path.exists(os.path.join(target_dir, "CHiME3")):
+        # download
+        filepath = download(url, md5sum, target_dir,
+                            "myairbridge-AG0Y3DNBE5IWRRTV.zip")
+        # unpack
+        unpack(filepath, target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    # create manifest json file
+    create_manifest(target_dir, manifest_path)
+def main():
+    prepare_chime3(
+        url=URL,
+        md5sum=MD5,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_filepath)
+if __name__ == '__main__':
+    main()
--- a/examples/transv1.8to2.x/data/voxforge/run_data.sh
+++ b/examples/transv1.8to2.x/data/voxforge/run_data.sh
+#! /usr/bin/env bash
+# Download VoxForge and generate its manifests; exit 1 if voxforge.py fails.
+if ! PYTHONPATH=../../:$PYTHONPATH python voxforge.py \
+    --manifest_prefix='./manifest' \
+    --target_dir='./dataset/VoxForge' \
+    --is_merge_dialect=True \
+    --dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian'
+then
+    echo "Prepare VoxForge failed. Terminated."
+    exit 1
+fi
+echo "VoxForge Data preparation done."
+exit 0
--- a/examples/transv1.8to2.x/data/voxforge/voxforge.py
+++ b/examples/transv1.8to2.x/data/voxforge/voxforge.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxForge dataset
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import datetime
+import json
+import os
+import shutil
+import subprocess
+import soundfile
+from data_utils.utility import download_multi
+from data_utils.utility import getfile_insensitive
+from data_utils.utility import unpack
+DATA_HOME = './dataset'
+DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
+           'Audio/Main/16kHz_16bit'
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/VoxForge",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--dialects",
+    default=[
+        'american', 'british', 'australian', 'european', 'irish', 'canadian',
+        'indian'
+    ],
+    nargs='+',
+    type=str,
+    help="Dialect types. (default: %(default)s)")
+parser.add_argument(
+    "--is_merge_dialect",
+    default=True,
+    type=bool,
+    help="If set True, manifests of american dialect and canadian dialect will "
+    "be merged to american-canadian dialect; manifests of british "
+    "dialect, irish dialect and australian dialect will be merged to "
+    "commonwealth dialect. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+def download_and_unpack(target_dir, url):
+    wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
+    tgz_dir = os.path.join(target_dir, 'tgz')
+    exit_code = download_multi(url, tgz_dir, wget_args)
+    if exit_code != 0:
+        print('Download tgz audio files failed with exit code %d.' % exit_code)
+    else:
+        print('Download done, start unpacking ...')
+        audio_dir = os.path.join(target_dir, 'audio')
+        for root, dirs, files in os.walk(tgz_dir):
+            for file in files:
+                print(file)
+                if file.endswith('.tgz'):
+                    unpack(os.path.join(root, file), audio_dir)
+def select_dialects(target_dir, dialect_list):
+    """Classify audio files by dialect."""
+    dialect_root_dir = os.path.join(target_dir, 'dialect')
+    if os.path.exists(dialect_root_dir):
+        shutil.rmtree(dialect_root_dir)
+    os.mkdir(dialect_root_dir)
+    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
+    for dialect in dialect_list:
+        # filter files by dialect
+        command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \
+                   "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect)
+        p = subprocess.Popen(
+            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+        output, err = p.communicate()
+        dialect_dir = os.path.join(dialect_root_dir, dialect)
+        if os.path.exists(dialect_dir):
+            shutil.rmtree(dialect_dir)
+        os.mkdir(dialect_dir)
+        for path in output.splitlines():
+            src_dir = os.path.dirname(os.path.dirname(path))
+            link = os.path.basename(os.path.normpath(src_dir))
+            os.symlink(src_dir, os.path.join(dialect_dir, link))
+def generate_manifest(data_dir, manifest_path):
+    json_lines = []
+    for path in os.listdir(data_dir):
+        audio_link = os.path.join(data_dir, path)
+        assert os.path.islink(
+            audio_link), '%s should be symbolic link.' % audio_link
+        actual_audio_dir = os.path.abspath(os.readlink(audio_link))
+        audio_type = ''
+        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
+            audio_type = 'wav'
+        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
+            audio_type = 'flac'
+        else:
+            print('Unknown audio type, skipped processing %s.' %
+                  actual_audio_dir)
+            continue
+        etc_dir = os.path.join(actual_audio_dir, 'etc')
+        prompts_file = os.path.join(etc_dir, 'PROMPTS')
+        if not os.path.isfile(prompts_file):
+            print('PROMPTS file missing, skip processing %s.' %
+                  actual_audio_dir)
+            continue
+        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
+        if readme_file is None:
+            print('README file missing, skip processing %s.' % actual_audio_dir)
+            continue
+        for line in file(prompts_file):
+            u, trans = line.strip().split(None, 1)
+            u_parts = u.split('/')
+            # try to format the date time
+            try:
+                speaker, date, sfx = u_parts[-3].split('-')
+                obj = datetime.datetime.strptime(date, '%y.%m.%d')
+                formatted = obj.strftime('%Y%m%d')
+                u_parts[-3] = '-'.join([speaker, formatted, sfx])
+            except Exception as e:
+                pass
+            if len(u_parts) < 2:
+                u_parts = [audio_type] + u_parts
+            u_parts[-2] = audio_type
+            u_parts[-1] += '.' + audio_type
+            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))
+            if not os.path.isfile(u):
+                print('Audio file missing, skip processing %s.' % u)
+                continue
+            if os.stat(u).st_size == 0:
+                print('Empty audio file, skip processing %s.' % u)
+                continue
+            trans = trans.strip().replace('-', ' ')
+            if not trans.isupper() or \
+                not trans.strip().replace(' ', '').replace("'", "").isalpha():
+                print("Transcript not normalized properly, skip processing %s."
+                      % u)
+                continue
+            audio_data, samplerate = soundfile.read(u)
+            duration = float(len(audio_data)) / samplerate
+            json_lines.append(
+                json.dumps({
+                    'audio_filepath': u,
+                    'duration': duration,
+                    'text': trans.lower()
+                }))
+    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+        for line in json_lines:
+            fout.write(line + '\n')
+def merge_manifests(manifest_files, save_path):
+    lines = []
+    for manifest_file in manifest_files:
+        line = codecs.open(manifest_file, 'r', 'utf-8').readlines()
+        lines += line
+    with codecs.open(save_path, 'w', 'utf-8') as fout:
+        for line in lines:
+            fout.write(line)
+def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
+    download_and_unpack(target_dir, url)
+    select_dialects(target_dir, dialects)
+    american_canadian_manifests = []
+    commonwealth_manifests = []
+    for dialect in dialects:
+        dialect_dir = os.path.join(target_dir, 'dialect', dialect)
+        manifest_fpath = manifest_prefix + '.' + dialect
+        if dialect == 'american' or dialect == 'canadian':
+            american_canadian_manifests.append(manifest_fpath)
+        if dialect == 'australian' \
+                or dialect == 'british' \
+                or dialect == 'irish':
+            commonwealth_manifests.append(manifest_fpath)
+        generate_manifest(dialect_dir, manifest_fpath)
+    if is_merge:
+        if len(american_canadian_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.american-canadian'
+            merge_manifests(american_canadian_manifests, manifest_fpath)
+        if len(commonwealth_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.commonwealth'
+            merge_manifests(commonwealth_manifests, manifest_fpath)
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
+                    args.manifest_prefix, args.is_merge_dialect)
+if __name__ == '__main__':
+    main()
--- a/examples/transv1.8to2.x/data_utils/__init__.py
+++ b/examples/transv1.8to2.x/data_utils/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/data_utils/audio.py
+++ b/examples/transv1.8to2.x/data_utils/audio.py
--- a/examples/transv1.8to2.x/data_utils/augmentor/__init__.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/data_utils/augmentor/augmentation.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/augmentation.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the data augmentation pipeline."""
+import json
+import random
+from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
+from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
+from data_utils.augmentor.online_bayesian_normalization import \
+     OnlineBayesianNormalizationAugmentor
+from data_utils.augmentor.resample import ResampleAugmentor
+from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
+from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
+class AugmentationPipeline(object):
+    """Build a pre-processing pipeline with various augmentation models.Such a
+    data augmentation pipeline is oftern leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in the
+    real world, improving model's generalization ability.
+    The pipeline is built according the the augmentation configuration in json
+    string, e.g.
+    .. code-block::
+        [ {
+                "type": "noise",
+                "params": {"min_snr_dB": 10,
+                           "max_snr_dB": 20,
+                           "noise_manifest_path": "datasets/manifest.noise"},
+                "prob": 0.0
+            },
+            {
+                "type": "speed",
+                "params": {"min_speed_rate": 0.9,
+                           "max_speed_rate": 1.1},
+                "prob": 1.0
+            },
+            {
+                "type": "shift",
+                "params": {"min_shift_ms": -5,
+                           "max_shift_ms": 5},
+                "prob": 1.0
+            },
+            {
+                "type": "volume",
+                "params": {"min_gain_dBFS": -10,
+                           "max_gain_dBFS": 10},
+                "prob": 0.0
+            },
+            {
+                "type": "bayesian_normal",
+                "params": {"target_db": -20,
+                           "prior_db": -20,
+                           "prior_samples": 100},
+                "prob": 0.0
+            }
+        ]
+    This augmentation configuration inserts two augmentation models
+    into the pipeline, with one is VolumePerturbAugmentor and the other
+    SpeedPerturbAugmentor. "prob" indicates the probability of the current
+    augmentor to take effect. If "prob" is zero, the augmentor does not take
+    effect.
+    :param augmentation_config: Augmentation configuration in json string.
+    :type augmentation_config: str
+    :param random_seed: Random seed.
+    :type random_seed: int
+    :raises ValueError: If the augmentation json config is in incorrect format".
+    """
+    def __init__(self, augmentation_config, random_seed=0):
+        self._rng = random.Random(random_seed)
+        self._augmentors, self._rates = self._parse_pipeline_from(
+            augmentation_config)
+    def transform_audio(self, audio_segment):
+        """Run the pre-processing pipeline for data augmentation.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to process.
+        :type audio_segment: AudioSegmenet|SpeechSegment
+        """
+        for augmentor, rate in zip(self._augmentors, self._rates):
+            if self._rng.uniform(0., 1.) < rate:
+                augmentor.transform_audio(audio_segment)
+    def _parse_pipeline_from(self, config_json):
+        """Parse the config json to build an augmentation pipeline.
+        :param config_json: Augmentation configuration in json string.
+        :type config_json: str
+        :return: Tuple of (list of augmentors, list of their probabilities).
+        :rtype: tuple
+        :raises ValueError: If decoding the json, building an augmentor or
+                            reading a "prob" entry fails for any reason.
+        """
+        try:
+            configs = json.loads(config_json)
+            # All augmentors are built before any "prob" value is read.
+            augmentors = []
+            for config in configs:
+                augmentors.append(
+                    self._get_augmentor(config["type"], config["params"]))
+            rates = []
+            for config in configs:
+                rates.append(config["prob"])
+        except Exception as e:
+            message = "Failed to parse the augmentation config json: %s" % str(e)
+            raise ValueError(message)
+        return augmentors, rates
+    def _get_augmentor(self, augmentor_type, params):
+        """Build the augmentor registered under ``augmentor_type``.
+        :param augmentor_type: Type name taken from the "type" config entry.
+        :type augmentor_type: str
+        :param params: Keyword arguments forwarded to the augmentor
+                       constructor, after the shared random generator.
+        :type params: dict
+        :return: The constructed augmentor.
+        :raises ValueError: If the type name is not recognized.
+        """
+        rng = self._rng
+        if augmentor_type == "volume":
+            return VolumePerturbAugmentor(rng, **params)
+        if augmentor_type == "shift":
+            return ShiftPerturbAugmentor(rng, **params)
+        if augmentor_type == "speed":
+            return SpeedPerturbAugmentor(rng, **params)
+        if augmentor_type == "resample":
+            return ResampleAugmentor(rng, **params)
+        if augmentor_type == "bayesian_normal":
+            return OnlineBayesianNormalizationAugmentor(rng, **params)
+        if augmentor_type == "noise":
+            return NoisePerturbAugmentor(rng, **params)
+        if augmentor_type == "impulse":
+            return ImpulseResponseAugmentor(rng, **params)
+        raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
--- a/examples/transv1.8to2.x/data_utils/augmentor/base.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/base.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the abstract base class for augmentation models."""
+from abc import ABCMeta
+from abc import abstractmethod
+class AugmentorBase(metaclass=ABCMeta):
+    """Abstract base class for augmentation model (augmentor) class.
+    All augmentor classes should inherit from this class, and implement the
+    following abstract methods.
+    """
+    # ABCMeta is passed through the ``metaclass`` keyword because Python 3
+    # ignores a class-level ``__metaclass__`` attribute, which would leave the
+    # ``@abstractmethod`` markers below unenforced.
+    @abstractmethod
+    def __init__(self):
+        pass
+    @abstractmethod
+    def transform_audio(self, audio_segment):
+        """Adds various effects to the input audio segment. Such effects
+        will augment the training data to make the model invariant to certain
+        types of perturbations in the real world, improving model's
+        generalization ability.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        pass
--- a/examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the impulse response augmentation model."""
+from data_utils.audio import AudioSegment
+from data_utils.augmentor.base import AugmentorBase
+from data_utils.utility import read_manifest
+class ImpulseResponseAugmentor(AugmentorBase):
+    """Augmentor that convolves audio with a randomly picked impulse response.
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param impulse_manifest_path: Manifest path for impulse audio data.
+    :type impulse_manifest_path: str
+    """
+    def __init__(self, rng, impulse_manifest_path):
+        # The manifest is read once here and reused by every transform call.
+        self._impulse_manifest = read_manifest(impulse_manifest_path)
+        self._rng = rng
+    def transform_audio(self, audio_segment):
+        """Convolve the segment with one impulse response from the manifest.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        picked = self._rng.sample(self._impulse_manifest, 1)
+        impulse_path = picked[0]['audio_filepath']
+        impulse_segment = AudioSegment.from_file(impulse_path)
+        audio_segment.convolve(impulse_segment, allow_resample=True)
--- a/examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the noise perturb augmentation model."""
+from data_utils.audio import AudioSegment
+from data_utils.augmentor.base import AugmentorBase
+from data_utils.utility import read_manifest
+class NoisePerturbAugmentor(AugmentorBase):
+    """Augmentor that mixes a random slice of background noise into audio.
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_snr_dB: Minimal signal noise ratio, in decibels.
+    :type min_snr_dB: float
+    :param max_snr_dB: Maximal signal noise ratio, in decibels.
+    :type max_snr_dB: float
+    :param noise_manifest_path: Manifest path for noise audio data.
+    :type noise_manifest_path: str
+    """
+    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
+        self._rng = rng
+        self._min_snr_dB = min_snr_dB
+        self._max_snr_dB = max_snr_dB
+        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+    def transform_audio(self, audio_segment):
+        """Add background noise audio.
+        A noise entry is sampled from the manifest, a window as long as the
+        input segment is cut from it at a random offset, and the window is
+        mixed in at a signal noise ratio drawn from the configured range.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        :raises RuntimeError: If the sampled noise is shorter than the segment.
+        """
+        noise_entry = self._rng.sample(self._noise_manifest, 1)[0]
+        noise_duration = noise_entry['duration']
+        clip_duration = audio_segment.duration
+        if noise_duration < clip_duration:
+            raise RuntimeError("The duration of sampled noise audio is smaller "
+                               "than the audio segment to add effects to.")
+        # Random offset such that the window still fits inside the noise file.
+        start = self._rng.uniform(0, noise_duration - clip_duration)
+        end = start + clip_duration
+        noise_segment = AudioSegment.slice_from_file(
+            noise_entry['audio_filepath'], start=start, end=end)
+        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
+        audio_segment.add_noise(
+            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
--- a/examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contain the online bayesian normalization augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+class OnlineBayesianNormalizationAugmentor(AugmentorBase):
+    """Augmentor that applies online bayesian normalization to audio.
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param target_db: Target RMS value in decibels.
+    :type target_db: float
+    :param prior_db: Prior RMS estimate in decibels.
+    :type prior_db: float
+    :param prior_samples: Prior strength in number of samples.
+    :type prior_samples: int
+    :param startup_delay: Default 0.0s. If provided, this function will
+                          accrue statistics for the first startup_delay
+                          seconds before applying online normalization.
+    :type startup_delay: float
+    """
+    def __init__(self,
+                 rng,
+                 target_db,
+                 prior_db,
+                 prior_samples,
+                 startup_delay=0.0):
+        # The generator is stored but not used by transform_audio below.
+        self._rng = rng
+        self._target_db = target_db
+        self._prior_db = prior_db
+        self._prior_samples = prior_samples
+        self._startup_delay = startup_delay
+    def transform_audio(self, audio_segment):
+        """Normalize the input audio using the online Bayesian approach.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        normalize = audio_segment.normalize_online_bayesian
+        normalize(self._target_db, self._prior_db, self._prior_samples,
+                  self._startup_delay)
--- a/examples/transv1.8to2.x/data_utils/augmentor/resample.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/resample.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contain the resample augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+class ResampleAugmentor(AugmentorBase):
+    """Augmentor that resamples audio to a fixed sample rate.
+    See more info here:
+    https://ccrma.stanford.edu/~jos/resample/index.html
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param new_sample_rate: New sample rate in Hz.
+    :type new_sample_rate: int
+    """
+    def __init__(self, rng, new_sample_rate):
+        # The generator is stored but not used by transform_audio below.
+        self._rng = rng
+        self._new_sample_rate = new_sample_rate
+    def transform_audio(self, audio_segment):
+        """Resample the input audio to the configured sample rate.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        target_rate = self._new_sample_rate
+        audio_segment.resample(target_rate)
--- a/examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the shift perturb augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+class ShiftPerturbAugmentor(AugmentorBase):
+    """Augmentor that shifts audio by a random amount of time.
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_shift_ms: Minimal shift in milliseconds.
+    :type min_shift_ms: float
+    :param max_shift_ms: Maximal shift in milliseconds.
+    :type max_shift_ms: float
+    """
+    def __init__(self, rng, min_shift_ms, max_shift_ms):
+        self._rng = rng
+        self._min_shift_ms = min_shift_ms
+        self._max_shift_ms = max_shift_ms
+    def transform_audio(self, audio_segment):
+        """Shift audio by an amount drawn uniformly from the configured range.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        low, high = self._min_shift_ms, self._max_shift_ms
+        audio_segment.shift(self._rng.uniform(low, high))
--- a/examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speed perturbation augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+class SpeedPerturbAugmentor(AugmentorBase):
+    """Augmentor that changes the playback speed of audio at random.
+    See reference paper here:
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_speed_rate: Lower bound of new speed rate to sample and should
+                           not be smaller than 0.9.
+    :type min_speed_rate: float
+    :param max_speed_rate: Upper bound of new speed rate to sample and should
+                           not be larger than 1.1.
+    :type max_speed_rate: float
+    :raises ValueError: If either bound lies outside [0.9, 1.1].
+    """
+    def __init__(self, rng, min_speed_rate, max_speed_rate):
+        # The lower bound is validated first, then the upper bound.
+        if min_speed_rate < 0.9:
+            too_slow = "Sampling speed below 0.9 can cause unnatural effects"
+            raise ValueError(too_slow)
+        if max_speed_rate > 1.1:
+            too_fast = "Sampling speed above 1.1 can cause unnatural effects"
+            raise ValueError(too_fast)
+        self._rng = rng
+        self._min_speed_rate = min_speed_rate
+        self._max_speed_rate = max_speed_rate
+    def transform_audio(self, audio_segment):
+        """Sample a new speed rate from the configured range and change the
+        speed of the given audio clip accordingly.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        low, high = self._min_speed_rate, self._max_speed_rate
+        rate = self._rng.uniform(low, high)
+        audio_segment.change_speed(rate)
--- a/examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py
+++ b/examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the volume perturb augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+class VolumePerturbAugmentor(AugmentorBase):
+    """Augmentor that applies a random gain to audio.
+    This is used for multi-loudness training of PCEN. See
+    https://arxiv.org/pdf/1607.05666v1.pdf
+    for more details.
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_gain_dBFS: Minimal gain in dBFS.
+    :type min_gain_dBFS: float
+    :param max_gain_dBFS: Maximal gain in dBFS.
+    :type max_gain_dBFS: float
+    """
+    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
+        self._rng = rng
+        self._min_gain_dBFS = min_gain_dBFS
+        self._max_gain_dBFS = max_gain_dBFS
+    def transform_audio(self, audio_segment):
+        """Change audio loudness by a gain drawn uniformly from the
+        configured range.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        low, high = self._min_gain_dBFS, self._max_gain_dBFS
+        audio_segment.gain_db(self._rng.uniform(low, high))
--- a/examples/transv1.8to2.x/data_utils/data.py
+++ b/examples/transv1.8to2.x/data_utils/data.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains data generator for organizing various audio data preprocessing
+pipeline and offering data reader interface of PaddlePaddle requirements.
+"""
+import random
+import tarfile
+from threading import local
+import numpy as np
+import paddle.fluid as fluid
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
+from data_utils.normalizer import FeatureNormalizer
+from data_utils.speech import SpeechSegment
+from data_utils.utility import read_manifest
+class DataGenerator(object):
+    """
+    DataGenerator provides basic audio data preprocessing pipeline, and offers
+    data reader interfaces of PaddlePaddle requirements.
+    :param vocab_filepath: Vocabulary filepath for indexing tokenized
+                           transcripts.
+    :type vocab_filepath: str
+    :param mean_std_filepath: File containing the pre-computed mean and stddev.
+    :type mean_std_filepath: None|str
+    :param augmentation_config: Augmentation configuration in json string.
+                                Details see AugmentationPipeline.__doc__.
+    :type augmentation_config: str
+    :param max_duration: Audio with duration (in seconds) greater than
+                         this will be discarded.
+    :type max_duration: float
+    :param min_duration: Audio with duration (in seconds) smaller than
+                         this will be discarded.
+    :type min_duration: float
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param max_freq: Used when specgram_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned.
+    :type max_freq: None|float
+    :param specgram_type: Specgram feature type. Options: 'linear'.
+    :type specgram_type: str
+    :param use_dB_normalization: Whether to normalize the audio to -20 dB
+                                before extracting the features.
+    :type use_dB_normalization: bool
+    :param random_seed: Random seed.
+    :type random_seed: int
+    :param keep_transcription_text: If set to True, transcription text will
+                                    be passed forward directly without
+                                    converting to index sequence.
+    :type keep_transcription_text: bool
+    :param place: The place to run the program.
+    :type place: CPUPlace or CUDAPlace
+    :param is_training: If set to True, generate text data for training,
+                        otherwise,  generate text data for infer.
+    :type is_training: bool
+    """
+    def __init__(self,
+                 vocab_filepath,
+                 mean_std_filepath,
+                 augmentation_config='{}',
+                 max_duration=float('inf'),
+                 min_duration=0.0,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 specgram_type='linear',
+                 use_dB_normalization=True,
+                 random_seed=0,
+                 keep_transcription_text=False,
+                 place=fluid.CPUPlace(),
+                 is_training=True):
+        """Store the settings and build the normalizer, augmentation pipeline
+        and speech featurizer. The arguments are described in the class
+        docstring.
+        """
+        # plain settings
+        self._max_duration = max_duration
+        self._min_duration = min_duration
+        self._keep_transcription_text = keep_transcription_text
+        self._is_training = is_training
+        self._place = place
+        self._epoch = 0
+        # processing components, built in the same order as before
+        self._normalizer = FeatureNormalizer(mean_std_filepath)
+        self._augmentation_pipeline = AugmentationPipeline(
+            augmentation_config=augmentation_config, random_seed=random_seed)
+        featurizer_kwargs = dict(
+            vocab_filepath=vocab_filepath,
+            specgram_type=specgram_type,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            max_freq=max_freq,
+            use_dB_normalization=use_dB_normalization)
+        self._speech_featurizer = SpeechFeaturizer(**featurizer_kwargs)
+        self._rng = random.Random(random_seed)
+        # for caching tar files info; the two dicts are only set for the
+        # thread running this constructor
+        tar_cache = local()
+        tar_cache.tar2info = {}
+        tar_cache.tar2object = {}
+        self._local_data = tar_cache
+    def process_utterance(self, audio_file, transcript):
+        """Load, augment, featurize and normalize one utterance.
+        :param audio_file: Filepath or file object of audio file. A string
+                           starting with 'tar:' is resolved to a member of a
+                           tar archive first.
+        :type audio_file: str | file
+        :param transcript: Transcription text.
+        :type transcript: str
+        :return: Tuple of audio feature tensor and data of transcription part,
+                 where transcription part could be token ids or text.
+        :rtype: tuple of (2darray, list)
+        """
+        source = audio_file
+        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
+            source = self._subfile_from_tar(audio_file)
+        segment = SpeechSegment.from_file(source, transcript)
+        self._augmentation_pipeline.transform_audio(segment)
+        featurized = self._speech_featurizer.featurize(
+            segment, self._keep_transcription_text)
+        specgram, transcript_part = featurized
+        return self._normalizer.apply(specgram), transcript_part
+    def batch_reader_creator(self,
+                             manifest_path,
+                             batch_size,
+                             padding_to=-1,
+                             flatten=False,
+                             sortagrad=False,
+                             shuffle_method="batch_shuffle"):
+        """
+        Batch data reader creator for audio data. Return a callable generator
+        function to produce batches of data.
+        Audio features within one batch will be padded with zeros to have the
+        same shape, or a user-defined shape.
+        :param manifest_path: Filepath of manifest for audio files.
+        :type manifest_path: str
+        :param batch_size: Number of instances in a batch.
+        :type batch_size: int
+        :param padding_to:  If set -1, the maximum shape in the batch
+                            will be used as the target shape for padding.
+                            Otherwise, `padding_to` will be the target shape.
+        :type padding_to: int
+        :param flatten: If set True, audio features will be flatten to 1darray.
+        :type flatten: bool
+        :param sortagrad: If set True, sort the instances by audio duration
+                          in the first epoch for speed up training.
+        :type sortagrad: bool
+        :param shuffle_method: Shuffle method. Options:
+                                '' or None: no shuffle.
+                                'instance_shuffle': instance-wise shuffle.
+                                'batch_shuffle': similarly-sized instances are
+                                                 put into batches, and then
+                                                 batch-wise shuffle the batches.
+                                                 For more details, please see
+                                                 ``_batch_shuffle.__doc__``.
+                                'batch_shuffle_clipped': 'batch_shuffle' with
+                                                         head shift and tail
+                                                         clipping. For more
+                                                         details, please see
+                                                         ``_batch_shuffle``.
+                              If sortagrad is True, shuffle is disabled
+                              for the first epoch.
+        :type shuffle_method: None|str
+        :return: Batch reader function, producing batches of data when called.
+        :rtype: callable
+        :raises ValueError: Raised while the reader is iterated if
+                            `shuffle_method` is not one of the options above.
+        """
+        def batch_reader():
+            # read manifest
+            manifest = read_manifest(
+                manifest_path=manifest_path,
+                max_duration=self._max_duration,
+                min_duration=self._min_duration)
+            # sort (by duration) or shuffle the manifest
+            if self._epoch == 0 and sortagrad:
+                manifest.sort(key=lambda x: x["duration"])
+            elif shuffle_method == "batch_shuffle":
+                manifest = self._batch_shuffle(
+                    manifest, batch_size, clipped=False)
+            elif shuffle_method == "batch_shuffle_clipped":
+                manifest = self._batch_shuffle(
+                    manifest, batch_size, clipped=True)
+            elif shuffle_method == "instance_shuffle":
+                self._rng.shuffle(manifest)
+            elif shuffle_method not in ('', None):
+                raise ValueError("Unknown shuffle method %s." %
+                                 shuffle_method)
+            # '' and None both mean "no shuffle", as documented above; the
+            # empty string used to be rejected as an unknown method.
+            # prepare batches
+            batch = []
+            instance_reader = self._instance_reader_creator(manifest)
+            for instance in instance_reader():
+                batch.append(instance)
+                if len(batch) == batch_size:
+                    yield self._padding_batch(batch, padding_to, flatten)
+                    batch = []
+            # emit the trailing, incomplete batch
+            if len(batch) >= 1:
+                yield self._padding_batch(batch, padding_to, flatten)
+            # the epoch counter only advances once the reader is exhausted
+            self._epoch += 1
+        return batch_reader
+    @property
+    def feeding(self):
+        """Return the mapping from feed name to position in a data instance.
+        :return: Data feeding dict.
+        :rtype: dict
+        """
+        return {"audio_spectrogram": 0, "transcript_text": 1}
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size reported by the speech featurizer.
+        :return: Vocabulary size.
+        :rtype: int
+        """
+        featurizer = self._speech_featurizer
+        return featurizer.vocab_size
+    @property
+    def vocab_list(self):
+        """Return the vocabulary list reported by the speech featurizer.
+        :return: Vocabulary in list.
+        :rtype: list
+        """
+        featurizer = self._speech_featurizer
+        return featurizer.vocab_list
+    def _parse_tar(self, file):
+        """Open a tar file and index its members by name.
+        The archive is returned open so that members can be extracted from
+        it later.
+        :param file: Path of the tar file.
+        :return: Tuple of (tarfile object, dict mapping member name to its
+                 tarinfo).
+        """
+        archive = tarfile.open(file)
+        members = {info.name: info for info in archive.getmembers()}
+        return archive, members
+    def _subfile_from_tar(self, file):
+        """Return a file object for one member of a tar archive.
+        ``file`` has the form ``tar:<tar path>#<member name>``. The opened
+        archive and its member index are cached in thread-local storage, so
+        each archive is parsed once per thread.
+        """
+        tarpath, filename = file.split(':', 1)[1].split('#', 1)
+        cache = self._local_data
+        # Threads other than the one that built this object start with an
+        # empty thread-local namespace, so create the dicts on demand.
+        if 'tar2info' not in vars(cache):
+            cache.tar2info = {}
+        if 'tar2object' not in vars(cache):
+            cache.tar2object = {}
+        if tarpath not in cache.tar2info:
+            tar_object, tar_infos = self._parse_tar(tarpath)
+            cache.tar2info[tarpath] = tar_infos
+            cache.tar2object[tarpath] = tar_object
+        archive = cache.tar2object[tarpath]
+        return archive.extractfile(cache.tar2info[tarpath][filename])
+    def _instance_reader_creator(self, manifest):
+        """
+        Instance reader creator. Return a callable that, when called, lazily
+        yields one processed instance per manifest entry.
+        Instance: the result of ``process_utterance`` for the entry's
+        "audio_filepath" and "text" fields.
+        """
+        def reader():
+            for entry in manifest:
+                yield self.process_utterance(entry["audio_filepath"],
+                                             entry["text"])
+        return reader
+    def _padding_batch(self, batch, padding_to=-1, flatten=False):
+        """
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
+        If ``padding_to`` is -1, the maximum shape in the batch will be used
+        as the target shape for padding. Otherwise, `padding_to` will be the
+        target shape (only refers to the second axis).
+        If `flatten` is True, features will be flatten to 1darray.
+        :param batch: List of (audio feature 2darray, transcript part) tuples.
+        :type batch: list
+        :return: Tuple of (padded audios as float32 array, texts, audio
+                 lengths as int64 array of shape [batch, 1], masks as float32
+                 array). In training mode the texts are packed into a LoD
+                 tensor; otherwise they are returned as a plain list.
+        :rtype: tuple
+        :raises ValueError: If `padding_to` is smaller than the longest
+                            instance in the batch.
+        """
+        # get target shape
+        max_length = max(audio.shape[1] for audio, _ in batch)
+        if padding_to != -1:
+            if padding_to < max_length:
+                raise ValueError("If padding_to is not -1, it should be larger "
+                                 "than any instance's shape in the batch")
+            max_length = padding_to
+        # The padded mask width depends only on the target length, so it is
+        # computed once for the whole batch.
+        # NOTE(review): the divisors 2 and 3 and the repeat count 32 below
+        # presumably mirror the strides and channel count of the model's
+        # convolution stack -- confirm against the model definition.
+        mask_max_len = (max_length - 1) // 3 + 1
+        # padding
+        padded_audios = []
+        texts, text_lens = [], []
+        audio_lens = []
+        masks = []
+        for audio, text in batch:
+            padded_audio = np.zeros([audio.shape[0], max_length])
+            padded_audio[:, :audio.shape[1]] = audio
+            if flatten:
+                padded_audio = padded_audio.flatten()
+            padded_audios.append(padded_audio)
+            if self._is_training:
+                # token ids of all instances are concatenated; the per-instance
+                # lengths are kept in text_lens
+                texts += text
+            else:
+                texts.append(text)
+            text_lens.append(len(text))
+            audio_lens.append(audio.shape[1])
+            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
+            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
+            # ones over the valid columns, zeros over the padded ones
+            mask = np.zeros((32, mask_shape0, mask_max_len))
+            mask[:, :, :mask_shape1] = 1.0
+            masks.append(mask)
+        padded_audios = np.array(padded_audios).astype('float32')
+        if self._is_training:
+            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
+            texts = fluid.create_lod_tensor(
+                texts, recursive_seq_lens=[text_lens], place=self._place)
+        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
+        masks = np.array(masks).astype('float32')
+        return padded_audios, texts, audio_lens, masks
+    def _batch_shuffle(self, manifest, batch_size, clipped=False):
+        """Put similarly-sized instances into minibatches for better efficiency
+        and make a batch-wise shuffle.
+        1. Sort the audio clips by duration (the input list is sorted
+           in place).
+        2. Generate a random number `k`, k in [0, batch_size).
+        3. Randomly shift `k` instances in order to create different batches
+           for different epochs. Create minibatches.
+        4. Shuffle the minibatches.
+        :param manifest: Manifest contents. List of dict.
+        :type manifest: list
+        :param batch_size: Batch size. This size is also used for generate
+                           a random number for batch shuffle.
+        :type batch_size: int
+        :param clipped: Whether to clip the heading (small shift) and trailing
+                        (incomplete batch) instances.
+        :type clipped: bool
+        :return: Batch shuffled manifest. Unless ``clipped`` is set, every
+                 instance of the input appears exactly once.
+        :rtype: list
+        """
+        manifest.sort(key=lambda x: x["duration"])
+        # NOTE(review): assumes self._rng.randint includes both endpoints
+        # (as random.Random does), giving k in [0, batch_size) -- confirm.
+        shift_len = self._rng.randint(0, batch_size - 1)
+        # zip over batch_size references to one iterator groups consecutive
+        # instances into full batches and drops the incomplete tail.
+        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
+        self._rng.shuffle(batch_manifest)
+        batch_manifest = [item for batch in batch_manifest for item in batch]
+        if not clipped:
+            # Append the incomplete trailing batch, then the skipped head.
+            # Slicing from the first unused index is safe when nothing is
+            # left over; ``manifest[-res_len:]`` with res_len == 0 would be
+            # the whole manifest and duplicate every instance.
+            consumed_end = shift_len + len(batch_manifest)
+            batch_manifest.extend(manifest[consumed_end:])
+            batch_manifest.extend(manifest[0:shift_len])
+        return batch_manifest
--- a/examples/transv1.8to2.x/data_utils/featurizer/__init__.py
+++ b/examples/transv1.8to2.x/data_utils/featurizer/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py
+++ b/examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the audio featurizer class."""
+import numpy as np
+from python_speech_features import delta
+from python_speech_features import mfcc
+class AudioFeaturizer(object):
+    """Audio featurizer, for extracting features from audio contents of
+    AudioSegment or SpeechSegment.
+    Currently, it supports feature types of linear spectrogram and mfcc.
+    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
+    :type specgram_type: str
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param max_freq: When specgram_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned; when specgram_type is 'mfcc', max_freq is the
+                     highest band edge of mel filters.
+    :type max_freq: None|float
+    :param target_sample_rate: Audio are resampled (if upsampling or
+                               downsampling is allowed) to this before
+                               extracting spectrogram features.
+    :type target_sample_rate: float
+    :param use_dB_normalization: Whether to normalize the audio to a certain
+                                 decibels before extracting the features.
+    :type use_dB_normalization: bool
+    :param target_dB: Target audio decibels for normalization.
+    :type target_dB: float
+    """
+    def __init__(self,
+                 specgram_type='linear',
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20):
+        self._specgram_type = specgram_type
+        self._stride_ms = stride_ms
+        self._window_ms = window_ms
+        self._max_freq = max_freq
+        self._target_sample_rate = target_sample_rate
+        self._use_dB_normalization = use_dB_normalization
+        self._target_dB = target_dB
+    def featurize(self,
+                  audio_segment,
+                  allow_downsampling=True,
+                  allow_upsampling=True):
+        """Extract audio features from AudioSegment or SpeechSegment.
+        ``resample`` and ``normalize`` are called on the passed segment
+        itself, so the caller's object may be changed by this call.
+        :param audio_segment: Audio/speech segment to extract features from.
+        :type audio_segment: AudioSegment|SpeechSegment
+        :param allow_downsampling: Whether to allow audio downsampling before
+                                   featurizing.
+        :type allow_downsampling: bool
+        :param allow_upsampling: Whether to allow audio upsampling before
+                                 featurizing.
+        :type allow_upsampling: bool
+        :return: Spectrogram audio feature in 2darray.
+        :rtype: ndarray
+        :raises ValueError: If audio sample rate is not supported.
+        """
+        # upsampling or downsampling
+        if ((audio_segment.sample_rate > self._target_sample_rate and
+             allow_downsampling) or
+            (audio_segment.sample_rate < self._target_sample_rate and
+             allow_upsampling)):
+            audio_segment.resample(self._target_sample_rate)
+        # Still mismatched means the required resampling was not allowed.
+        if audio_segment.sample_rate != self._target_sample_rate:
+            raise ValueError("Audio sample rate is not supported. "
+                             "Turn allow_downsampling or allow up_sampling on.")
+        # decibel normalization
+        if self._use_dB_normalization:
+            audio_segment.normalize(target_db=self._target_dB)
+        # extract spectrogram
+        return self._compute_specgram(audio_segment.samples,
+                                      audio_segment.sample_rate)
+    def _compute_specgram(self, samples, sample_rate):
+        """Dispatch to the extractor selected by ``specgram_type``.
+        :raises ValueError: If ``specgram_type`` is neither 'linear' nor
+                            'mfcc'.
+        """
+        if self._specgram_type == 'linear':
+            return self._compute_linear_specgram(
+                samples, sample_rate, self._stride_ms, self._window_ms,
+                self._max_freq)
+        elif self._specgram_type == 'mfcc':
+            return self._compute_mfcc(samples, sample_rate, self._stride_ms,
+                                      self._window_ms, self._max_freq)
+        else:
+            raise ValueError("Unknown specgram_type %s. "
+                             "Supported values: linear, mfcc." %
+                             self._specgram_type)
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Compute the linear spectrogram from FFT energy.
+        :return: Natural log of the spectrogram (plus ``eps``), restricted to
+                 the FFT bins whose frequency is at most ``max_freq``; one
+                 column per frame.
+        :rtype: ndarray
+        :raises ValueError: If ``max_freq`` exceeds half of the sample rate,
+                            if the stride is larger than the window, or if
+                            the audio is shorter than one window.
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        # Convert the millisecond settings into sample counts.
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        # Index one past the last bin whose frequency is <= max_freq.
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        # eps keeps the log finite for bins with zero energy.
+        return np.log(specgram[:ind, :] + eps)
+    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
+        """Compute the spectrogram for samples from a real signal.
+        :param samples: 1-D ndarray of audio samples.
+        :param window_size: Frame length in samples.
+        :param stride_size: Hop between consecutive frames in samples.
+        :param sample_rate: Sample rate used for scaling and for the
+                            frequency axis.
+        :return: Tuple of (scaled squared-magnitude spectrogram with one
+                 column per frame, frequency of every FFT bin).
+        :rtype: tuple
+        :raises ValueError: If there are fewer samples than one window.
+        """
+        if len(samples) < window_size:
+            raise ValueError("Audio is too short: at least %d samples (one "
+                             "window) are required, but got %d." %
+                             (window_size, len(samples)))
+        # extract strided windows; trailing samples that do not fill a
+        # whole frame are dropped
+        truncate_size = (len(samples) - window_size) % stride_size
+        samples = samples[:len(samples) - truncate_size]
+        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
+        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
+        windows = np.lib.stride_tricks.as_strided(
+            samples, shape=nshape, strides=nstrides)
+        # Sanity-check the strided view against a plain slice. A second
+        # column only exists when the audio holds at least two frames, so
+        # single-frame audio must skip the check instead of failing on it.
+        if nshape[1] > 1:
+            assert np.all(
+                windows[:, 1] ==
+                samples[stride_size:(stride_size + window_size)])
+        # window weighting, squared Fast Fourier Transform (fft), scaling
+        weighting = np.hanning(window_size)[:, None]
+        fft = np.fft.rfft(windows * weighting, axis=0)
+        fft = np.absolute(fft)
+        fft = fft**2
+        scale = np.sum(weighting**2) * sample_rate
+        # The first and last bins are scaled once; all other bins are
+        # additionally doubled.
+        fft[1:-1, :] *= (2.0 / scale)
+        fft[(0, -1), :] /= scale
+        # prepare fft frequency list
+        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
+        return fft, freqs
+    def _compute_mfcc(self,
+                      samples,
+                      sample_rate,
+                      stride_ms=10.0,
+                      window_ms=20.0,
+                      max_freq=None):
+        """Compute mfcc from samples.
+        :return: The mfcc features, their deltas and their delta-deltas,
+                 each transposed and then stacked along the first axis.
+        :rtype: ndarray
+        :raises ValueError: If ``max_freq`` exceeds half of the sample rate
+                            or the stride is larger than the window.
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        # compute the 13 cepstral coefficients, and the first one is replaced
+        # by log(frame energy)
+        mfcc_feat = mfcc(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            highfreq=max_freq)
+        # Deltas
+        d_mfcc_feat = delta(mfcc_feat, 2)
+        # Deltas-Deltas
+        dd_mfcc_feat = delta(d_mfcc_feat, 2)
+        # transpose
+        mfcc_feat = np.transpose(mfcc_feat)
+        d_mfcc_feat = np.transpose(d_mfcc_feat)
+        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
+        # concat above three features
+        concat_mfcc_feat = np.concatenate(
+            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
+        return concat_mfcc_feat
--- a/examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py
+++ b/examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speech featurizer class."""
+from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+from data_utils.featurizer.text_featurizer import TextFeaturizer
+class SpeechFeaturizer(object):
+    """Featurizer for SpeechSegment objects, covering both the audio and the
+    transcript.
+    The audio part is delegated to an AudioFeaturizer (linear spectrogram or
+    mfcc). The transcript part is delegated to a TextFeaturizer, which does
+    char-level tokenizing and converts the tokens into a list of indices
+    that follow the order of the given vocabulary file.
+    :param vocab_filepath: Filepath to load vocabulary for token indices
+                           conversion.
+    :type vocab_filepath: str
+    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
+    :type specgram_type: str
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param max_freq: When specgram_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned; when specgram_type is 'mfcc', max_freq is the
+                     highest band edge of mel filters.
+    :type max_freq: None|float
+    :param target_sample_rate: Speech are resampled (if upsampling or
+                               downsampling is allowed) to this before
+                               extracting spectrogram features.
+    :type target_sample_rate: float
+    :param use_dB_normalization: Whether to normalize the audio to a certain
+                                 decibels before extracting the features.
+    :type use_dB_normalization: bool
+    :param target_dB: Target audio decibels for normalization.
+    :type target_dB: float
+    """
+    def __init__(self,
+                 vocab_filepath,
+                 specgram_type='linear',
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20):
+        # Every audio-related option is forwarded unchanged.
+        audio_options = dict(
+            specgram_type=specgram_type,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            max_freq=max_freq,
+            target_sample_rate=target_sample_rate,
+            use_dB_normalization=use_dB_normalization,
+            target_dB=target_dB)
+        self._audio_featurizer = AudioFeaturizer(**audio_options)
+        self._text_featurizer = TextFeaturizer(vocab_filepath)
+    def featurize(self, speech_segment, keep_transcription_text):
+        """Extract features for speech segment.
+        1. For audio parts, extract the audio features.
+        2. For transcript parts, keep the original text or convert text string
+           to a list of token indices in char-level.
+        :param speech_segment: Speech segment to extract features from.
+        :type speech_segment: SpeechSegment
+        :param keep_transcription_text: If true, the transcript is returned
+                                        as is and is not tokenized.
+        :type keep_transcription_text: bool
+        :return: A tuple of 1) the audio feature, 2) the original transcript
+                 or the list of char-level token indices.
+        :rtype: tuple
+        """
+        features = self._audio_featurizer.featurize(speech_segment)
+        if not keep_transcription_text:
+            token_ids = self._text_featurizer.featurize(
+                speech_segment.transcript)
+            return features, token_ids
+        return features, speech_segment.transcript
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size reported by the text featurizer.
+        :return: Vocabulary size.
+        :rtype: int
+        """
+        text_featurizer = self._text_featurizer
+        return text_featurizer.vocab_size
+    @property
+    def vocab_list(self):
+        """Return the vocabulary list held by the text featurizer.
+        :return: Vocabulary in list.
+        :rtype: list
+        """
+        text_featurizer = self._text_featurizer
+        return text_featurizer.vocab_list
--- a/examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py
+++ b/examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the text featurizer class."""
+import codecs
+class TextFeaturizer(object):
+    """Text featurizer, for processing or extracting features from text.
+    Currently, it only supports char-level tokenizing and conversion into
+    a list of token indices. Note that the token indexing order follows the
+    given vocabulary file.
+    :param vocab_filepath: Filepath to load vocabulary for token indices
+                           conversion.
+    :type vocab_filepath: str
+    """
+    def __init__(self, vocab_filepath):
+        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
+            vocab_filepath)
+    def featurize(self, text):
+        """Convert text string to a list of token indices in char-level. Note
+        that the token indexing order follows the given vocabulary file.
+        :param text: Text to process.
+        :type text: str
+        :return: List of char-level token indices.
+        :rtype: list
+        :raises KeyError: If a character of the text is not in the
+                          vocabulary.
+        """
+        tokens = self._char_tokenize(text)
+        return [self._vocab_dict[token] for token in tokens]
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size.
+        :return: Vocabulary size.
+        :rtype: int
+        """
+        return len(self._vocab_list)
+    @property
+    def vocab_list(self):
+        """Return the vocabulary in list.
+        :return: Vocabulary in list.
+        :rtype: list
+        """
+        return self._vocab_list
+    def _char_tokenize(self, text):
+        """Character tokenizer: strip surrounding whitespace, then split the
+        text into single characters."""
+        return list(text.strip())
+    def _load_vocabulary_from_file(self, vocab_filepath):
+        """Load vocabulary from file, one token per line.
+        :param vocab_filepath: Path of the UTF-8 encoded vocabulary file.
+        :type vocab_filepath: str
+        :return: Tuple of (token -> index dict, list of tokens in file
+                 order). If a token occurs more than once, its last index
+                 wins in the dict.
+        :rtype: tuple
+        """
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
+            vocab_lines = vocab_file.readlines()
+        # Remove only the line terminator. Blindly cutting the last character
+        # would truncate the final token of a file without a trailing
+        # newline, and stripping all whitespace would destroy a token that
+        # is itself a space.
+        vocab_list = [line.rstrip('\r\n') for line in vocab_lines]
+        vocab_dict = {token: index for index, token in enumerate(vocab_list)}
+        return vocab_dict, vocab_list
--- a/examples/transv1.8to2.x/data_utils/normalizer.py
+++ b/examples/transv1.8to2.x/data_utils/normalizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains feature normalizers."""
+import random
+import numpy as np
+from data_utils.audio import AudioSegment
+from data_utils.utility import read_manifest
+class FeatureNormalizer(object):
+    """Feature normalizer. Normalize features to be of zero mean and unit
+    stddev.
+    if mean_std_filepath is provided (not None), the normalizer will directly
+    initialize from the file. Otherwise, both manifest_path and featurize_func
+    should be given for on-the-fly mean and stddev computing.
+    :param mean_std_filepath: File containing the pre-computed mean and stddev.
+    :type mean_std_filepath: None|str
+    :param manifest_path: Manifest of instances for computing mean and stddev.
+    :type manifest_path: None|str
+    :param featurize_func: Function to extract features. It should be callable
+                           with ``featurize_func(audio_segment)``.
+    :type featurize_func: None|callable
+    :param num_samples: Number of random samples for computing mean and
+                        stddev. If the manifest holds fewer instances, all of
+                        them are used.
+    :type num_samples: int
+    :param random_seed: Random seed for sampling instances.
+    :type random_seed: int
+    :raises ValueError: If both mean_std_filepath and manifest_path
+                        (or both mean_std_filepath and featurize_func) are None.
+    """
+    def __init__(self,
+                 mean_std_filepath,
+                 manifest_path=None,
+                 featurize_func=None,
+                 num_samples=500,
+                 random_seed=0):
+        if not mean_std_filepath:
+            if not (manifest_path and featurize_func):
+                raise ValueError("If mean_std_filepath is None, meanifest_path "
+                                 "and featurize_func should not be None.")
+            self._rng = random.Random(random_seed)
+            self._compute_mean_std(manifest_path, featurize_func, num_samples)
+        else:
+            self._read_mean_std_from_file(mean_std_filepath)
+    def apply(self, features, eps=1e-14):
+        """Normalize features to be of zero mean and unit stddev.
+        :param features: Input features to be normalized.
+        :type features: ndarray
+        :param eps: Added to stddev to provide numerical stability.
+        :type eps: float
+        :return: Normalized features.
+        :rtype: ndarray
+        """
+        return (features - self._mean) / (self._std + eps)
+    def write_to_file(self, filepath):
+        """Write the mean and stddev to the file with ``np.savez`` (which
+        appends ``.npz`` to a filename that lacks it).
+        :param filepath: File to write mean and stddev.
+        :type filepath: str
+        """
+        np.savez(filepath, mean=self._mean, std=self._std)
+    def _read_mean_std_from_file(self, filepath):
+        """Load mean and std from an ``.npz`` file holding the arrays
+        ``mean`` and ``std``."""
+        # The archive object keeps the file open until it is closed; both
+        # arrays are read inside the block, so it can be released right away.
+        with np.load(filepath) as npzfile:
+            self._mean = npzfile["mean"]
+            self._std = npzfile["std"]
+    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
+        """Compute mean and std from randomly sampled instances.
+        The features of all sampled instances are stacked horizontally and
+        the statistics are taken along that axis, giving one mean and one
+        stddev per feature row (stored as column vectors).
+        """
+        manifest = read_manifest(manifest_path)
+        # Sampling more items than the manifest holds raises, so small
+        # manifests simply contribute every instance.
+        num_samples = min(num_samples, len(manifest))
+        sampled_manifest = self._rng.sample(manifest, num_samples)
+        features = [
+            featurize_func(AudioSegment.from_file(instance["audio_filepath"]))
+            for instance in sampled_manifest
+        ]
+        features = np.hstack(features)
+        self._mean = np.mean(features, axis=1).reshape([-1, 1])
+        self._std = np.std(features, axis=1).reshape([-1, 1])
--- a/examples/transv1.8to2.x/data_utils/speech.py
+++ b/examples/transv1.8to2.x/data_utils/speech.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speech segment class."""
+import numpy as np
+from data_utils.audio import AudioSegment
+class SpeechSegment(AudioSegment):
+    """Speech segment abstraction, a subclass of AudioSegment,
+    with an additional transcript.
+    :param samples: Audio samples [num_samples x num_channels].
+    :type samples: ndarray.float32
+    :param sample_rate: Audio sample rate.
+    :type sample_rate: int
+    :param transcript: Transcript text for the speech.
+    :type transcript: str
+    :raises TypeError: If the sample data type is not float or int.
+    """
+    def __init__(self, samples, sample_rate, transcript):
+        AudioSegment.__init__(self, samples, sample_rate)
+        self._transcript = transcript
+    def __eq__(self, other):
+        """Return whether two objects are equal: the AudioSegment comparison
+        must hold and the transcripts must match.
+        """
+        if not AudioSegment.__eq__(self, other):
+            return False
+        if self._transcript != other._transcript:
+            return False
+        return True
+    def __ne__(self, other):
+        """Return whether two objects are unequal."""
+        return not self.__eq__(other)
+    @classmethod
+    def from_file(cls, filepath, transcript):
+        """Create speech segment from audio file and corresponding transcript.
+        :param filepath: Filepath or file object to audio file.
+        :type filepath: str|file
+        :param transcript: Transcript text for the speech.
+        :type transcript: str
+        :return: Speech segment instance.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.from_file(filepath)
+        return cls(audio.samples, audio.sample_rate, transcript)
+    @classmethod
+    def from_bytes(cls, bytes, transcript):
+        """Create speech segment from a byte string and corresponding
+        transcript.
+        :param bytes: Byte string containing audio samples.
+        :type bytes: str
+        :param transcript: Transcript text for the speech.
+        :type transcript: str
+        :return: Speech segment instance.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.from_bytes(bytes)
+        return cls(audio.samples, audio.sample_rate, transcript)
+    @classmethod
+    def concatenate(cls, *segments):
+        """Concatenate an arbitrary number of speech segments together, both
+        audio and transcript will be concatenated.
+        :param *segments: Input speech segments to be concatenated.
+        :type *segments: tuple of SpeechSegment
+        :return: Speech segment instance.
+        :rtype: SpeechSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample_rate of any two segments does not match.
+        :raises TypeError: If any segment is not SpeechSegment instance.
+        """
+        if len(segments) == 0:
+            raise ValueError("No speech segments are given to concatenate.")
+        # Check the types before reading any private attribute, so that a
+        # foreign object produces the documented TypeError rather than an
+        # AttributeError on ``_sample_rate``.
+        for seg in segments:
+            if type(seg) is not cls:
+                raise TypeError("Only speech segments of the same type "
+                                "instance can be concatenated.")
+        sample_rate = segments[0]._sample_rate
+        for seg in segments:
+            if sample_rate != seg._sample_rate:
+                raise ValueError("Can't concatenate segments with "
+                                 "different sample rates")
+        transcripts = "".join(seg._transcript for seg in segments)
+        samples = np.concatenate([seg.samples for seg in segments])
+        return cls(samples, sample_rate, transcripts)
+    @classmethod
+    def slice_from_file(cls, filepath, transcript, start=None, end=None):
+        """Loads a small section of an speech without having to load
+        the entire file into the memory which can be incredibly wasteful.
+        :param filepath: Filepath or file object to audio file.
+        :type filepath: str|file
+        :param transcript: Transcript text attached to the returned segment.
+        :type transcript: str
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: SpeechSegment instance of the specified slice of the input
+                 speech file.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.slice_from_file(filepath, start, end)
+        return cls(audio.samples, audio.sample_rate, transcript)
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
+        """Creates a silent speech segment of the given duration and
+        sample rate, transcript will be an empty string.
+        :param duration: Length of silence in seconds.
+        :type duration: float
+        :param sample_rate: Sample rate.
+        :type sample_rate: float
+        :return: Silence of the given duration.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.make_silence(duration, sample_rate)
+        return cls(audio.samples, audio.sample_rate, "")
+    @property
+    def transcript(self):
+        """Return the transcript text.
+        :return: Transcript text for the speech.
+        :rtype: str
+        """
+        return self._transcript
--- a/examples/transv1.8to2.x/data_utils/utility.py
+++ b/examples/transv1.8to2.x/data_utils/utility.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains data helper functions."""
+import codecs
+import json
+import os
+import tarfile
+from paddle.dataset.common import md5file
+def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
+    """Load and parse manifest file.
+    Every line of the file is parsed as one JSON object. Instances with
+    durations outside [min_duration, max_duration] will be filtered out.
+    :param manifest_path: Manifest file to load and parse.
+    :type manifest_path: str
+    :param max_duration: Maximal duration in seconds for instance filter.
+    :type max_duration: float
+    :param min_duration: Minimal duration in seconds for instance filter.
+    :type min_duration: float
+    :return: Manifest parsing results. List of dict.
+    :rtype: list
+    :raises IOError: If failed to parse the manifest.
+    """
+    manifest = []
+    # The context manager closes the file even when a line fails to parse.
+    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
+        for json_line in manifest_file:
+            try:
+                json_data = json.loads(json_line)
+            except Exception as e:
+                raise IOError("Error reading manifest: %s" % str(e))
+            if min_duration <= json_data["duration"] <= max_duration:
+                manifest.append(json_data)
+    return manifest
+def getfile_insensitive(path):
+    """Resolve ``path`` to an existing file, ignoring the case of the file name.
+
+    Only the final path component is matched case-insensitively; the
+    directory part is used as given (``'.'`` when empty).
+
+    :param path: Path whose file name may differ in letter case.
+    :type path: str
+    :return: Path of the first matching regular file in the directory
+             listing, or None when nothing matches.
+    :rtype: str|None
+    """
+    parent, name = os.path.split(path)
+    parent = parent or '.'
+    wanted = name.lower()
+    candidates = (os.path.join(parent, entry) for entry in os.listdir(parent)
+                  if entry.lower() == wanted)
+    return next((cand for cand in candidates if os.path.isfile(cand)), None)
+def download_multi(url, target_dir, extra_args):
+    """Download multiple files from url to target_dir.
+
+    Runs ``wget -c <url> <extra_args> -P <target_dir>`` through the shell,
+    creating ``target_dir`` first when it does not exist.
+
+    :param url: URL handed to wget.
+    :type url: str
+    :param target_dir: Directory wget saves into.
+    :type target_dir: str
+    :param extra_args: Extra command-line arguments inserted verbatim.
+    :type extra_args: str
+    :return: The value returned by ``os.system`` for the wget command.
+    :rtype: int
+    """
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    # NOTE(review): the command is assembled as a shell string, so every
+    # argument must come from a trusted source.
+    command = ' '.join(["wget -c", url, extra_args, "-P", target_dir])
+    return os.system(command)
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum.
+
+    The download is skipped when a file with the expected name and MD5
+    already exists in ``target_dir``.
+
+    :param url: URL of the file; its last path component is the file name.
+    :type url: str
+    :param md5sum: Expected MD5 hex digest of the file.
+    :type md5sum: str
+    :param target_dir: Directory to save the file into (created if missing).
+    :type target_dir: str
+    :return: Path of the verified local file.
+    :rtype: str
+    :raises RuntimeError: If wget produced no file or the MD5 does not match.
+    """
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if os.path.exists(filepath) and md5file(filepath) == md5sum:
+        print("File exists, skip downloading. (%s)" % filepath)
+        return filepath
+    print("Downloading %s ..." % url)
+    # NOTE(review): the command is a shell string; url and target_dir must be
+    # trusted values.
+    os.system("wget -c " + url + " -P " + target_dir)
+    # A failed download can leave no file at all; report that explicitly
+    # instead of letting md5file fail on a missing path.
+    if not os.path.exists(filepath):
+        raise RuntimeError("Download failed: %s" % url)
+    print("\nMD5 Checksum %s ..." % filepath)
+    if md5file(filepath) != md5sum:
+        raise RuntimeError("MD5 checksum failed.")
+    return filepath
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir.
+
+    :param filepath: Path of the tar archive to extract.
+    :type filepath: str
+    :param target_dir: Directory the archive members are extracted into.
+    :type target_dir: str
+    :param rm_tar: Remove the archive after extraction; only the literal
+                   ``True`` triggers the removal.
+    :type rm_tar: bool
+    """
+    print("Unpacking %s ..." % filepath)
+    # The context manager closes the archive even if extraction fails.
+    # NOTE(review): extractall trusts member paths; only use with archives
+    # from a trusted source.
+    with tarfile.open(filepath) as tar:
+        tar.extractall(target_dir)
+    if rm_tar is True:
+        os.remove(filepath)
+class XmapEndSignal():
+    """Empty marker class with no attributes or behavior.
+
+    NOTE(review): presumably used as an end-of-stream sentinel by an xmap
+    style reader elsewhere in the project -- confirm against callers.
+    """
+    pass
--- a/examples/transv1.8to2.x/deepspeech/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from typing import List
+from typing import Tuple
+from typing import Union
+import paddle
+from paddle import nn
+from paddle.fluid import core
+from paddle.nn import functional as F
+from deepspeech.utils.log import Log
+# TODO(Hui Zhang): remove fluid import
+logger = Log(__name__).getlog()
+########### hack logging #############
+# Make `logger.warn` an alias of `logger.warning`; the registration code in
+# this module calls `logger.warn`.
+logger.warn = logger.warning
+########### hack paddle #############
+# Short dtype aliases attached to the paddle module. Each one is stored as a
+# dtype-name string, not as a paddle dtype object.
+paddle.half = 'float16'
+paddle.float = 'float32'
+paddle.double = 'float64'
+paddle.short = 'int16'
+paddle.int = 'int32'
+paddle.long = 'int64'
+# NOTE(review): these two assignments are unconditional, so any existing
+# `paddle.uint16` attribute is replaced by the string -- confirm intended.
+paddle.uint16 = 'uint16'
+paddle.cdouble = 'complex128'
+def convert_dtype_to_string(tensor_dtype):
+    """
+    Map a Paddle ``VarType`` value to the matching ``paddle.<dtype>`` attribute.
+    Args:
+        tensor_dtype(core.VarDesc.VarType): the tensor data type to convert.
+    Returns:
+        The attribute of the ``paddle`` module associated with that type,
+        e.g. ``paddle.float32`` for ``VarType.FP32``.
+    Raises:
+        ValueError: if ``tensor_dtype`` matches none of the handled types.
+    """
+    # (VarType member name, paddle attribute name), scanned in this order.
+    # Both names are resolved lazily inside the loop, so a member that the
+    # installed paddle lacks only matters if the scan actually reaches it.
+    type_pairs = (
+        ('FP32', 'float32'),
+        ('FP64', 'float64'),
+        ('FP16', 'float16'),
+        ('INT32', 'int32'),
+        ('INT16', 'int16'),
+        ('INT64', 'int64'),
+        ('BOOL', 'bool'),
+        # since there is still no support for bfloat16 in NumPy,
+        # uint16 is used for casting bfloat16
+        ('BF16', 'uint16'),
+        ('UINT8', 'uint8'),
+        ('INT8', 'int8'),
+        ('COMPLEX64', 'complex64'),
+        ('COMPLEX128', 'complex128'),
+    )
+    var_type = core.VarDesc.VarType
+    for var_name, paddle_name in type_pairs:
+        if tensor_dtype == getattr(var_type, var_name):
+            return getattr(paddle, paddle_name)
+    raise ValueError("Not supported tensor dtype %s" % tensor_dtype)
+# Expose selected paddle.nn.functional ops directly on the paddle module when
+# the installed paddle does not already provide an attribute of that name.
+for _op_name in ('softmax', 'log_softmax', 'sigmoid', 'log_sigmoid', 'relu'):
+    if not hasattr(paddle, _op_name):
+        logger.warn("register user %s to paddle, remove this when fixed!" %
+                    _op_name)
+        setattr(paddle, _op_name, getattr(paddle.nn.functional, _op_name))
+# Do not leave the loop variable behind in the package namespace.
+del _op_name
+def cat(xs, dim=0):
+    """Concatenate the tensors in ``xs`` along axis ``dim`` with ``paddle.concat``."""
+    joined = paddle.concat(xs, axis=dim)
+    return joined
+# Installed only when paddle has no `cat` attribute of its own.
+if not hasattr(paddle, 'cat'):
+    logger.warn(
+        "override cat of paddle if exists or register, remove this when fixed!")
+    setattr(paddle, 'cat', cat)
+########### hack paddle.Tensor #############
+def item(x: paddle.Tensor):
+    """Return the value of ``x`` obtained through ``x.numpy().item()``."""
+    as_array = x.numpy()
+    return as_array.item()
+# Installed only when paddle.Tensor has no `item` attribute.
+if not hasattr(paddle.Tensor, 'item'):
+    logger.warn(
+        "override item of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    setattr(paddle.Tensor, 'item', item)
+def func_long(x: paddle.Tensor):
+    """Cast ``x`` to the dtype stored in ``paddle.long`` via ``paddle.cast``."""
+    target_dtype = paddle.long
+    return paddle.cast(x, target_dtype)
+# Installed as `paddle.Tensor.long` only when that attribute is missing.
+if not hasattr(paddle.Tensor, 'long'):
+    logger.warn(
+        "override long of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    setattr(paddle.Tensor, 'long', func_long)
+# Use `paddle.numel` as the `numel` method of paddle.Tensor when the class
+# does not already define one.
+if not hasattr(paddle.Tensor, 'numel'):
+    logger.warn(
+        "override numel of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    setattr(paddle.Tensor, 'numel', paddle.numel)
+def new_full(x: paddle.Tensor,
+             size: Union[List[int], Tuple[int], paddle.Tensor],
+             fill_value: Union[float, int, bool, paddle.Tensor],
+             dtype=None):
+    """Create a tensor of shape ``size`` filled with ``fill_value``.
+
+    Args:
+        x: tensor whose dtype is used when ``dtype`` is not given.
+        size: shape of the new tensor, passed to ``paddle.full``.
+        fill_value: value every element is set to.
+        dtype: dtype of the new tensor; defaults to ``x.dtype`` when None.
+    Returns:
+        The tensor returned by ``paddle.full``.
+    """
+    # Honor an explicit dtype; previously the argument was silently ignored
+    # and x.dtype was always used.
+    if dtype is None:
+        dtype = x.dtype
+    return paddle.full(size, fill_value, dtype=dtype)
+# Installed only when paddle.Tensor has no `new_full` attribute.
+if not hasattr(paddle.Tensor, 'new_full'):
+    logger.warn(
+        "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    paddle.Tensor.new_full = new_full
+def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
+    """Compare ``xs`` with ``ys`` through ``xs.equal``.
+
+    A bool ``xs`` is first cast to ``paddle.int``; ``ys`` is converted with
+    ``paddle.to_tensor`` using the (possibly cast) dtype and the place of
+    ``xs``.
+    """
+    if convert_dtype_to_string(xs.dtype) == paddle.bool:
+        xs = xs.astype(paddle.int)
+    other = paddle.to_tensor(
+        ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place)
+    return xs.equal(other)
+# Installed on the Tensor class and on the paddle module, each only when the
+# attribute is missing.
+if not hasattr(paddle.Tensor, 'eq'):
+    logger.warn(
+        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    setattr(paddle.Tensor, 'eq', eq)
+if not hasattr(paddle, 'eq'):
+    logger.warn(
+        "override eq of paddle if exists or register, remove this when fixed!")
+    setattr(paddle, 'eq', eq)
+def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
+    """Return ``xs`` itself; this stand-in performs no operation."""
+    return xs
+# Installed only when paddle.Tensor has no `contiguous` attribute.
+if not hasattr(paddle.Tensor, 'contiguous'):
+    logger.warn(
+        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
+    )
+    setattr(paddle.Tensor, 'contiguous', contiguous)
+def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
+    """Return ``paddle.shape(xs)``, or one entry of it.
+
+    With no extra argument the whole shape tensor is returned; with one
+    argument that index of the shape is returned. More than one argument
+    trips the assertion.
+    """
+    assert len(args) <= 1
+    shape = paddle.shape(xs)
+    return shape[args[0]] if args else shape
+# `to_static` does not process the `size` property, and some `paddle` APIs may
+# depend on it, so this override is installed unconditionally.
+logger.warn(
+    "override size of paddle.Tensor "
+    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
+)
+setattr(paddle.Tensor, 'size', size)
+def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
+    """Reshape ``xs`` to the dimensions given as positional arguments."""
+    new_shape = args
+    return xs.reshape(new_shape)
+# Installed only when paddle.Tensor has no `view` attribute.
+if not hasattr(paddle.Tensor, 'view'):
+    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'view', view)
+def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
+    """Reshape ``xs`` to the shape reported by ``ys.size()``."""
+    target_shape = ys.size()
+    return xs.reshape(target_shape)
+# Installed only when paddle.Tensor has no `view_as` attribute.
+if not hasattr(paddle.Tensor, 'view_as'):
+    logger.warn(
+        "register user view_as to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'view_as', view_as)
+def is_broadcastable(shp1, shp2):
+    """Tell whether two shapes are compatible when aligned from the right.
+
+    Dimensions are paired starting at the trailing end; a pair is compatible
+    when the sizes are equal or either one is 1. Leading dimensions that
+    exist in only one shape are not inspected.
+    """
+    return all(a == 1 or b == 1 or a == b
+               for a, b in zip(shp1[::-1], shp2[::-1]))
+def masked_fill(xs: paddle.Tensor,
+                mask: paddle.Tensor,
+                value: Union[float, int]):
+    """Return a tensor equal to ``xs`` with ``value`` where ``mask`` is set.
+
+    ``mask`` is broadcast to the common shape of ``xs`` and ``mask`` and then
+    used as the condition of ``paddle.where``; ``xs`` is not modified.
+    """
+    assert is_broadcastable(xs.shape, mask.shape) is True
+    target_shape = paddle.broadcast_shape(xs.shape, mask.shape)
+    expanded_mask = mask.broadcast_to(target_shape)
+    fill = paddle.ones_like(xs) * value
+    return paddle.where(expanded_mask, fill, xs)
+# Installed only when paddle.Tensor has no `masked_fill` attribute.
+if not hasattr(paddle.Tensor, 'masked_fill'):
+    logger.warn(
+        "register user masked_fill to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'masked_fill', masked_fill)
+def masked_fill_(xs: paddle.Tensor,
+                 mask: paddle.Tensor,
+                 value: Union[float, int]) -> paddle.Tensor:
+    """Write ``value`` into ``xs`` where ``mask`` is set and return ``xs``.
+
+    The filled result is computed with ``paddle.where`` and then copied back
+    into ``xs`` with ``paddle.assign`` (detached).
+    """
+    assert is_broadcastable(xs.shape, mask.shape) is True
+    target_shape = paddle.broadcast_shape(xs.shape, mask.shape)
+    expanded_mask = mask.broadcast_to(target_shape)
+    fill = paddle.ones_like(xs) * value
+    filled = paddle.where(expanded_mask, fill, xs)
+    paddle.assign(filled.detach(), output=xs)
+    return xs
+# Installed only when paddle.Tensor has no `masked_fill_` attribute.
+if not hasattr(paddle.Tensor, 'masked_fill_'):
+    logger.warn(
+        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'masked_fill_', masked_fill_)
+def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
+    """Overwrite every element of ``xs`` with ``value`` and return ``xs``.
+
+    A ``paddle.full_like`` tensor is built and copied into ``xs`` with
+    ``paddle.assign`` (detached).
+    """
+    constant = paddle.full_like(xs, value)
+    paddle.assign(constant.detach(), output=xs)
+    return xs
+# Installed only when paddle.Tensor has no `fill_` attribute.
+if not hasattr(paddle.Tensor, 'fill_'):
+    logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'fill_', fill_)
+def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
+    """Tile ``xs`` with ``paddle.tile``, using the positional arguments as
+    the repeat counts."""
+    repeat_times = size
+    return paddle.tile(xs, repeat_times)
+# Installed only when paddle.Tensor has no `repeat` attribute.
+if not hasattr(paddle.Tensor, 'repeat'):
+    logger.warn(
+        "register user repeat to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'repeat', repeat)
+# Attach selected paddle.nn.functional ops as paddle.Tensor methods when the
+# class does not already have an attribute of that name.
+for _op_name in ('softmax', 'sigmoid', 'relu'):
+    if not hasattr(paddle.Tensor, _op_name):
+        logger.warn(
+            "register user %s to paddle.Tensor, remove this when fixed!" %
+            _op_name)
+        setattr(paddle.Tensor, _op_name,
+                getattr(paddle.nn.functional, _op_name))
+# Do not leave the loop variable behind in the package namespace.
+del _op_name
+def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
+    """Cast ``x`` to the dtype of ``other``."""
+    target_dtype = other.dtype
+    return x.astype(target_dtype)
+# Installed only when paddle.Tensor has no `type_as` attribute.
+if not hasattr(paddle.Tensor, 'type_as'):
+    logger.warn(
+        "register user type_as to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.type_as = type_as
+def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
+    """Convert ``x`` according to a single positional target.
+
+    * ``str`` target: treated as a dtype, ``x`` is cast to it.
+    * ``paddle.Tensor`` target: ``x`` is cast to that tensor's dtype.
+    * anything else (treated as a device): ``x`` is returned unchanged.
+
+    Exactly one positional argument is required; keyword arguments are
+    accepted but not used.
+    """
+    assert len(args) == 1
+    target = args[0]
+    if isinstance(target, str):  # dtype
+        return x.astype(target)
+    if isinstance(target, paddle.Tensor):  # Tensor
+        return x.astype(target.dtype)
+    # Device
+    return x
+# Installed only when paddle.Tensor has no `to` attribute.
+if not hasattr(paddle.Tensor, 'to'):
+    logger.warn("register user to to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.to = to
+def func_float(x: paddle.Tensor) -> paddle.Tensor:
+    """Cast ``x`` to the dtype stored in ``paddle.float``."""
+    target_dtype = paddle.float
+    return x.astype(target_dtype)
+# Installed as `paddle.Tensor.float` only when that attribute is missing.
+if not hasattr(paddle.Tensor, 'float'):
+    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.float = func_float
+def func_int(x: paddle.Tensor) -> paddle.Tensor:
+    """Cast ``x`` to the dtype stored in ``paddle.int``."""
+    target_dtype = paddle.int
+    return x.astype(target_dtype)
+# Installed as `paddle.Tensor.int` only when that attribute is missing.
+if not hasattr(paddle.Tensor, 'int'):
+    logger.warn("register user int to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.int = func_int
+def tolist(x: paddle.Tensor) -> List[Any]:
+    """Return the contents of ``x`` via ``x.numpy().tolist()``."""
+    as_array = x.numpy()
+    return as_array.tolist()
+# Installed only when paddle.Tensor has no `tolist` attribute.
+if not hasattr(paddle.Tensor, 'tolist'):
+    logger.warn(
+        "register user tolist to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.tolist = tolist
+########### hack paddle.nn #############
+class GLU(nn.Layer):
+    """Gated Linear Units (GLU) Layer
+
+    Layer wrapper around ``F.glu`` that applies it along a fixed axis.
+    """
+    def __init__(self, dim: int=-1):
+        """Store the axis ``dim`` that ``forward`` passes to ``F.glu``."""
+        super().__init__()
+        self.dim = dim
+    def forward(self, xs):
+        """Apply ``F.glu`` to ``xs`` along ``self.dim``."""
+        gated = F.glu(xs, axis=self.dim)
+        return gated
+# Installed only when paddle.nn has no `GLU` attribute.
+if not hasattr(paddle.nn, 'GLU'):
+    logger.warn("register user GLU to paddle.nn, remove this when fixed!")
+    paddle.nn.GLU = GLU
--- a/examples/transv1.8to2.x/deepspeech/decoders/README.MD
+++ b/examples/transv1.8to2.x/deepspeech/decoders/README.MD
+# Reference
+* [Sequence Modeling With CTC](https://distill.pub/2017/ctc/)
+* [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/pdf/1408.2873.pdf)
\ No newline at end of file
--- a/examples/transv1.8to2.x/deepspeech/decoders/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/decoders/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/deepspeech/decoders/decoders_deprecated.py
+++ b/examples/transv1.8to2.x/deepspeech/decoders/decoders_deprecated.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains various CTC decoders."""
+import multiprocessing
+from itertools import groupby
+from math import log
+import numpy as np
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """CTC greedy (best path) decoder.
+    Path consisting of the most probable tokens are further post-processed to
+    remove consecutive repetitions and all blanks. The blank token is the
+    last index, i.e. ``len(vocabulary)``.
+    :param probs_seq: 2-D list of probabilities over the vocabulary for each
+                      character. Each element is a list of float probabilities
+                      for one character.
+    :type probs_seq: list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :return: Decoding result string; empty for an empty ``probs_seq``.
+    :rtype: str
+    :raises ValueError: If a row of ``probs_seq`` does not have
+                        ``len(vocabulary) + 1`` entries.
+    """
+    # dimension verification
+    for probs in probs_seq:
+        if not len(probs) == len(vocabulary) + 1:
+            raise ValueError("probs_seq dimension mismatched with vocabulary")
+    # nothing to decode; argmax over axis 1 would fail on an empty array
+    if len(probs_seq) == 0:
+        return ''
+    # argmax to get the best index for each time step
+    max_index_list = np.array(probs_seq).argmax(axis=1).tolist()
+    # remove consecutive duplicate indexes
+    index_list = [index for index, _ in groupby(max_index_list)]
+    # remove blank indexes and convert the remaining indexes to a string
+    blank_index = len(vocabulary)
+    return ''.join(
+        vocabulary[index] for index in index_list if index != blank_index)
+def ctc_beam_search_decoder(probs_seq,
+                            beam_size,
+                            vocabulary,
+                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
+                            ext_scoring_func=None,
+                            nproc=False):
+    """CTC Beam search decoder.
+    It utilizes beam search to approximately select top best decoding
+    labels and returning results in the descending order.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
+    redesigned. Two important modifications: 1) in the iterative computation
+    of probabilities, the assignment operation is changed to accumulation for
+    one prefix may comes from different paths; 2) the if condition "if l^+ not
+    in A_prev then" after probabilities' computation is deprecated for it is
+    hard to understand and seems unnecessary.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Upper bound on the number of highest-probability
+                         characters kept per time step when pruning,
+                         default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                            partially decoded sentence, e.g. word count
+                            or language model.
+    :type ext_scoring_func: callable
+    :param nproc: Whether the decoder used in multiprocesses. When True the
+                  module-level ``ext_nproc_scorer`` replaces
+                  ``ext_scoring_func``.
+    :type nproc: bool
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability. A prefix that
+             is empty or has zero probability yields ``(-inf, '')``.
+    :rtype: list
+    :raises ValueError: If a row of ``probs_seq`` does not have
+                        ``len(vocabulary) + 1`` entries.
+    """
+    # dimension check
+    for prob_list in probs_seq:
+        if not len(prob_list) == len(vocabulary) + 1:
+            raise ValueError("The shape of prob_seq does not match with the "
+                             "shape of the vocabulary.")
+    # blank_id assign: the blank is the last index, after all vocabulary items
+    blank_id = len(vocabulary)
+    # If the decoder called in the multiprocesses, then use the global scorer
+    # instantiated in ctc_beam_search_decoder_batch().
+    if nproc is True:
+        global ext_nproc_scorer
+        ext_scoring_func = ext_nproc_scorer
+    # initialize
+    # prefix_set_prev: the set containing selected prefixes
+    # probs_b_prev: prefixes' probability ending with blank in previous step
+    # probs_nb_prev: prefixes' probability ending with non-blank in previous step
+    # The tab character is the root prefix; it is stripped from every result
+    # at the end (seq[1:]).
+    prefix_set_prev = {'\t': 1.0}
+    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}
+    # extend prefix in loop
+    for time_step in range(len(probs_seq)):
+        # prefix_set_next: the set containing candidate prefixes
+        # probs_b_cur: prefixes' probability ending with blank in current step
+        # probs_nb_cur: prefixes' probability ending with non-blank in current step
+        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}
+        prob_idx = list(enumerate(probs_seq[time_step]))
+        cutoff_len = len(prob_idx)
+        # If pruning is enabled
+        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
+            # Keep the most probable characters until their cumulative
+            # probability reaches cutoff_prob, capped at cutoff_top_n.
+            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
+            cutoff_len, cum_prob = 0, 0.0
+            for i in range(len(prob_idx)):
+                cum_prob += prob_idx[i][1]
+                cutoff_len += 1
+                if cum_prob >= cutoff_prob:
+                    break
+            cutoff_len = min(cutoff_len, cutoff_top_n)
+            prob_idx = prob_idx[0:cutoff_len]
+        for l in prefix_set_prev:
+            if l not in prefix_set_next:
+                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0
+            # extend prefix by traversing prob_idx
+            for index in range(cutoff_len):
+                c, prob_c = prob_idx[index][0], prob_idx[index][1]
+                if c == blank_id:
+                    # a blank keeps the prefix unchanged
+                    probs_b_cur[l] += prob_c * (
+                        probs_b_prev[l] + probs_nb_prev[l])
+                else:
+                    last_char = l[-1]
+                    new_char = vocabulary[c]
+                    l_plus = l + new_char
+                    if l_plus not in prefix_set_next:
+                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0
+                    if new_char == last_char:
+                        # A repeated character extends the prefix only after
+                        # a blank; otherwise it collapses into the prefix.
+                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
+                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
+                    elif new_char == ' ':
+                        # A space ends a word: apply the external score to
+                        # the prefix without the leading root character.
+                        if (ext_scoring_func is None) or (len(l) == 1):
+                            score = 1.0
+                        else:
+                            prefix = l[1:]
+                            score = ext_scoring_func(prefix)
+                        probs_nb_cur[l_plus] += score * prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    else:
+                        probs_nb_cur[l_plus] += prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    # add l_plus into prefix_set_next
+                    prefix_set_next[l_plus] = probs_nb_cur[
+                        l_plus] + probs_b_cur[l_plus]
+            # add l into prefix_set_next
+            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
+        # update probs
+        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur
+        # store top beam_size prefixes
+        prefix_set_prev = sorted(
+            prefix_set_next.items(), key=lambda asd: asd[1], reverse=True)
+        if beam_size < len(prefix_set_prev):
+            prefix_set_prev = prefix_set_prev[:beam_size]
+        prefix_set_prev = dict(prefix_set_prev)
+    beam_result = []
+    for seq, prob in prefix_set_prev.items():
+        if prob > 0.0 and len(seq) > 1:
+            # drop the leading root character
+            result = seq[1:]
+            # score last word by external scorer
+            if (ext_scoring_func is not None) and (result[-1] != ' '):
+                prob = prob * ext_scoring_func(result)
+            log_prob = log(prob)
+            beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
+    # output top beam_size decoding results
+    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
+    return beam_result
+def ctc_beam_search_decoder_batch(probs_split,
+                                  beam_size,
+                                  vocabulary,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None):
+    """CTC beam search decoder using multiple processes.
+    Each element of ``probs_split`` is decoded by ctc_beam_search_decoder()
+    in a worker of a ``multiprocessing.Pool``.
+    :param probs_split: 3-D list with each element as an instance of 2-D list
+                        of probabilities used by ctc_beam_search_decoder().
+    :type probs_split: 3-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, forwarded to
+                         ctc_beam_search_decoder(), default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                            partially decoded sentence, e.g. word count
+                            or language model.
+    :type ext_scoring_func: callable
+    :return: One decoding result per element of ``probs_split``, each a list
+             of tuples of log probability and sentence in descending order
+             of the probability.
+    :rtype: list
+    :raises ValueError: If ``num_processes`` is not positive.
+    """
+    if not num_processes > 0:
+        raise ValueError("Number of processes must be positive!")
+    # The scorer is published through a module-level global rather than
+    # passed as a task argument; workers read it because nproc=True below.
+    global ext_nproc_scorer
+    ext_nproc_scorer = ext_scoring_func
+    worker_pool = multiprocessing.Pool(processes=num_processes)
+    # Task arguments: the scorer slot is None and nproc is True.
+    pending = [
+        worker_pool.apply_async(ctc_beam_search_decoder,
+                                (probs_list, beam_size, vocabulary,
+                                 cutoff_prob, cutoff_top_n, None, True))
+        for probs_list in probs_split
+    ]
+    worker_pool.close()
+    worker_pool.join()
+    return [job.get() for job in pending]
--- a/examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py
+++ b/examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""External Scorer for Beam Search Decoder."""
+import os
+import kenlm
+import numpy as np
+class Scorer(object):
+    """External scorer to evaluate a prefix or whole sentence in
+       beam search decoding, including the score from n-gram language
+       model and word count.
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: str
+    :raises IOError: If ``model_path`` is not an existing file.
+    """
+    def __init__(self, alpha, beta, model_path):
+        self._alpha = alpha
+        self._beta = beta
+        # Fail early with a clear message before handing the path to kenlm.
+        if not os.path.isfile(model_path):
+            raise IOError("Invalid language model path: %s" % model_path)
+        self._language_model = kenlm.LanguageModel(model_path)
+    # n-gram language model scoring
+    def _language_model_score(self, sentence):
+        """Return the probability of the last word of ``sentence``.
+        Takes the first field of the last ``full_scores`` entry as a log10
+        value and converts it back with ``10 ** value``.
+        """
+        #log10 prob of last word
+        log_cond_prob = list(
+            self._language_model.full_scores(sentence, eos=False))[-1][0]
+        return np.power(10, log_cond_prob)
+    # word insertion term
+    def _word_count(self, sentence):
+        """Return the number of single-space separated tokens in the
+        stripped ``sentence``."""
+        words = sentence.strip().split(' ')
+        return len(words)
+    # reset alpha and beta
+    def reset_params(self, alpha, beta):
+        """Replace the language model weight and the word count weight."""
+        self._alpha = alpha
+        self._beta = beta
+    # execute evaluation
+    def __call__(self, sentence, log=False):
+        """Evaluation function, gathering all the different scores
+        and return the final one.
+        :param sentence: The input sentence for evaluation
+        :type sentence: str
+        :param log: Whether return the score in log representation.
+        :type log: bool
+        :return: Evaluation score, in the decimal or log.
+        :rtype: float
+        """
+        lm = self._language_model_score(sentence)
+        word_cnt = self._word_count(sentence)
+        # Only the literal False selects the decimal form: lm^alpha * cnt^beta.
+        if log is False:
+            score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta)
+        else:
+            score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt)
+        return score
--- a/examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py
+++ b/examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper for various CTC decoders in SWIG."""
+import swig_decoders
+class Scorer(swig_decoders.Scorer):
+    """Thin Python subclass of ``swig_decoders.Scorer``.
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: str
+    :param vocabulary: Vocabulary list, forwarded unchanged to the SWIG
+                       scorer.
+    :type vocabulary: list
+    """
+    def __init__(self, alpha, beta, model_path, vocabulary):
+        # All arguments are handed to the SWIG base class as they are.
+        super(Scorer, self).__init__(alpha, beta, model_path, vocabulary)
+def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
+    """Call the SWIG CTC best path decoder.
+    :param probs_seq: Probability distributions over each time step, with
+                      each row holding normalized probabilities over
+                      vocabulary and blank. Must provide ``tolist()``; it is
+                      converted to a nested list before the call.
+    :type probs_seq: 2-D array
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param blank_id: Index of the blank token, forwarded to the decoder.
+    :type blank_id: int
+    :return: Decoding result string.
+    :rtype: str
+    """
+    probs_as_list = probs_seq.tolist()
+    return swig_decoders.ctc_greedy_decoder(probs_as_list, vocabulary,
+                                            blank_id)
+def ctc_beam_search_decoder(probs_seq,
+                            vocabulary,
+                            beam_size,
+                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
+                            ext_scoring_func=None,
+                            blank_id=0):
+    """Call the SWIG CTC beam search decoder and decode its byte results.
+    :param probs_seq: Probability distributions over each time step, with
+                      each row holding normalized probabilities over
+                      vocabulary and blank. Must provide ``tolist()``.
+    :type probs_seq: 2-D array
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :param blank_id: Index of the blank token, forwarded to the decoder,
+                     default 0.
+    :type blank_id: int
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    raw_results = swig_decoders.ctc_beam_search_decoder(
+        probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
+        ext_scoring_func, blank_id)
+    # The second field of each result is UTF-8 encoded bytes; convert it
+    # to str.
+    decoded_results = []
+    for res in raw_results:
+        decoded_results.append((res[0], res[1].decode('utf-8')))
+    return decoded_results
+def ctc_beam_search_decoder_batch(probs_split,
+                                  vocabulary,
+                                  beam_size,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None,
+                                  blank_id=0):
+    """Wrapper for the batched CTC beam search decoder.
+    :param probs_split: Iterable whose elements are 2-D probability matrices
+                        of the kind used by ctc_beam_search_decoder(); every
+                        element must provide ``tolist()``.
+    :type probs_split: iterable of 2-D array-like
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in vocabulary pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :param blank_id: Forwarded unchanged to the swig decoder; presumably the
+                     index of the CTC blank symbol, default 0.
+    :type blank_id: int
+    :return: One list per entry returned by the swig decoder, each holding
+             (log probability, sentence) tuples.
+    :rtype: list
+    """
+    # The swig binding is handed plain nested lists.
+    batch_as_lists = []
+    for utterance_probs in probs_split:
+        batch_as_lists.append(utterance_probs.tolist())
+    raw_batch = swig_decoders.ctc_beam_search_decoder_batch(
+        batch_as_lists, vocabulary, beam_size, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func, blank_id)
+    # Re-pack every result as a plain 2-tuple of its first two fields.
+    decoded_batch = []
+    for utterance_results in raw_batch:
+        decoded_batch.append(
+            [(item[0], item[1]) for item in utterance_results])
+    return decoded_batch
--- a/examples/transv1.8to2.x/deepspeech/frontend/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/deepspeech/frontend/audio.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/audio.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the data augmentation pipeline."""
+import json
+from collections.abc import Sequence
+from inspect import signature
+import numpy as np
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+# Only the pipeline class is exported by ``from ... import *``.
+__all__ = ["AugmentationPipeline"]
+# Module-level logger, named after this module.
+logger = Log(__name__).getlog()
+# Maps the short "type" names used in augmentation configs to
+# "module.path:ClassName" import targets (handed to dynamic_import by
+# AugmentationPipeline._get_augmentor).
+import_alias = dict(
+    # VolumePerturbAugmentor lives in volume_perturb, not impulse_response.
+    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
+    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
+    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
+    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
+    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
+    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
+    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
+    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
+class AugmentationPipeline():
+    """Build a pre-processing pipeline with various augmentation models. Such a
+    data augmentation pipeline is often leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in the
+    real world, improving model's generalization ability.
+    The pipeline is built according to the augmentation configuration in json
+    string, e.g.
+    .. code-block::
+        [ {
+                "type": "noise",
+                "params": {"min_snr_dB": 10,
+                           "max_snr_dB": 20,
+                           "noise_manifest_path": "datasets/manifest.noise"},
+                "prob": 0.0
+            },
+            {
+                "type": "speed",
+                "params": {"min_speed_rate": 0.9,
+                           "max_speed_rate": 1.1},
+                "prob": 1.0
+            },
+            {
+                "type": "shift",
+                "params": {"min_shift_ms": -5,
+                           "max_shift_ms": 5},
+                "prob": 1.0
+            },
+            {
+                "type": "volume",
+                "params": {"min_gain_dBFS": -10,
+                           "max_gain_dBFS": 10},
+                "prob": 0.0
+            },
+            {
+                "type": "bayesian_normal",
+                "params": {"target_db": -20,
+                           "prior_db": -20,
+                           "prior_samples": 100},
+                "prob": 0.0
+            }
+        ]
+    This augmentation configuration inserts five augmentation models into
+    the pipeline (noise, speed, shift, volume and bayesian_normal), which are
+    visited in the listed order. "prob" indicates the probability of the
+    current augmentor to take effect. If "prob" is zero, the augmentor does
+    not take effect.
+    Every entry must provide the keys "type", "params" and "prob". Entries
+    whose "type" is in ``SPEC_TYPES`` are treated as feature (spectrogram)
+    augmentors, all others as audio augmentors.
+    Params:
+        augmentation_config(str): Augmentation configuration in json string.
+            An empty value yields an empty pipeline.
+        random_seed(int): Random seed.
+    Raises:
+        ValueError: If the augmentation json config is in incorrect format.
+    """
+    # Config "type" names that operate on features rather than raw audio.
+    SPEC_TYPES = {'specaug'}
+    def __init__(self, augmentation_config: str, random_seed: int=0):
+        self._rng = np.random.RandomState(random_seed)
+        self.conf = {'mode': 'sequential', 'process': []}
+        if augmentation_config:
+            process = json.loads(augmentation_config)
+            # Without this check a json object would be spliced in key by
+            # key and only fail later with an unrelated TypeError.
+            if not isinstance(process, list):
+                raise ValueError(
+                    "Augmentation config must be a json list of augmentor "
+                    "entries, got %s." % type(process).__name__)
+            self.conf['process'] += process
+        # Each view gets its own augmentor instances, all sharing self._rng.
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
+        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
+            'feature')
+    def __call__(self, xs, uttid_list=None, **kwargs):
+        """Run every configured augmentor over the input sample(s).
+        Args:
+            xs: A single sample or a ``Sequence`` of samples. A non-Sequence
+                input is wrapped in a list and the single result is unwrapped
+                again on return.
+            uttid_list: Optional utterance id(s). A single ``str`` is repeated
+                for every sample. Ids are passed as second positional
+                argument, and only to augmentors whose signature declares an
+                ``uttid`` parameter.
+            **kwargs: Forwarded to each augmentor, restricted to the parameter
+                names that augmentor's signature declares.
+        Returns:
+            The augmented samples (a list for batch input, otherwise a single
+            sample).
+        Raises:
+            NotImplementedError: If ``conf["mode"]`` is not "sequential".
+        """
+        is_batch = isinstance(xs, Sequence)
+        if not is_batch:
+            xs = [xs]
+        if isinstance(uttid_list, str):
+            uttid_list = [uttid_list for _ in range(len(xs))]
+        mode = self.conf.get("mode", "sequential")
+        if mode != "sequential":
+            raise NotImplementedError("Not supporting mode={}".format(mode))
+        for idx, (func, rate) in enumerate(zip(self._augmentors, self._rates)):
+            # One random draw per augmentor decides whether it is skipped.
+            if self._rng.uniform(0., 1.) >= rate:
+                continue
+            # Derive only the args which the func has
+            try:
+                param = signature(func).parameters
+            except ValueError:
+                # Some callables, e.g. built-in functions, expose no signature
+                param = {}
+            _kwargs = {k: v for k, v in kwargs.items() if k in param}
+            try:
+                if uttid_list is not None and "uttid" in param:
+                    xs = [
+                        func(x, u, **_kwargs) for x, u in zip(xs, uttid_list)
+                    ]
+                else:
+                    xs = [func(x, **_kwargs) for x in xs]
+            except Exception:
+                logger.fatal("Catch a exception from {}th func: {}".format(
+                    idx, func))
+                raise
+        return xs if is_batch else xs[0]
+    def transform_audio(self, audio_segment):
+        """Run the pre-processing pipeline for data augmentation.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to process.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
+            if self._rng.uniform(0., 1.) < rate:
+                augmentor.transform_audio(audio_segment)
+    def transform_feature(self, spec_segment):
+        """spectrogram augmentation.
+        Args:
+            spec_segment (np.ndarray): audio feature, (D, T).
+        Returns:
+            The feature after every selected feature augmentor was applied.
+        """
+        for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
+            if self._rng.uniform(0., 1.) < rate:
+                spec_segment = augmentor.transform_feature(spec_segment)
+        return spec_segment
+    def _parse_pipeline_from(self, aug_type='all'):
+        """Parse the config json to build an augmentation pipeline.
+        Args:
+            aug_type (str): 'audio', 'feature' or 'all'; selects which config
+                entries are instantiated.
+        Returns:
+            Tuple ``(augmentors, rates)`` of two lists in config order.
+        """
+        assert aug_type in ('audio', 'feature', 'all'), aug_type
+        aug_confs = []
+        for config in self.conf['process']:
+            is_feature = config["type"] in self.SPEC_TYPES
+            if aug_type == 'all':
+                selected = True
+            elif aug_type == 'feature':
+                selected = is_feature
+            else:
+                selected = not is_feature
+            if selected:
+                aug_confs.append(config)
+        augmentors = [
+            self._get_augmentor(config["type"], config["params"])
+            for config in aug_confs
+        ]
+        rates = [config["prob"] for config in aug_confs]
+        return augmentors, rates
+    def _get_augmentor(self, augmentor_type, params):
+        """Return an augmentation model by the type name, and pass in params.
+        Raises:
+            ValueError: If the augmentor class cannot be constructed from
+                ``params``; the original error is chained as the cause.
+        """
+        class_obj = dynamic_import(augmentor_type, import_alias)
+        assert issubclass(class_obj, AugmentorBase)
+        try:
+            obj = class_obj(self._rng, **params)
+        except Exception as err:
+            # The type was resolved above, so the failure is in construction
+            # (typically bad params); say so and keep the root cause.
+            raise ValueError(
+                "Failed to construct augmentor [%s] with params %r: %s" %
+                (augmentor_type, params, err)) from err
+        return obj
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the abstract base class for augmentation models."""
+from abc import ABCMeta
+from abc import abstractmethod
+class AugmentorBase():
+    """Abstract base class for augmentation model (augmentor) class.
+    All augmentor classes should inherit from this class, and implement the
+    following abstract methods.
+    """
+    # NOTE(review): ``__metaclass__`` is the Python 2 spelling. Under Python 3
+    # it is an ordinary class attribute, so ABCMeta is not applied and the
+    # ``@abstractmethod`` markers below are not enforced at instantiation.
+    # Switching to ``metaclass=ABCMeta`` would make any subclass that leaves
+    # one of these methods un-overridden impossible to instantiate - audit
+    # the subclasses before changing it.
+    __metaclass__ = ABCMeta
+    @abstractmethod
+    def __init__(self):
+        pass
+    @abstractmethod
+    def __call__(self, xs):
+        # Default implementation only signals that the subclass must override.
+        raise NotImplementedError("AugmentorBase: Not impl __call__")
+    @abstractmethod
+    def transform_audio(self, audio_segment):
+        """Adds various effects to the input audio segment. Such effects
+        will augment the training data to make the model invariant to certain
+        types of perturbations in the real world, improving model's
+        generalization ability.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        raise NotImplementedError("AugmentorBase: Not impl transform_audio")
+    @abstractmethod
+    def transform_feature(self, spec_segment):
+        """Adds various effects to the input audio feature segment. Such effects
+        will augment the training data to make the model invariant to certain
+        types of time_mask or freq_mask in the real world, improving model's
+        generalization ability.
+        Args:
+            spec_segment (Spectrogram): Spectrogram segment to add effects to.
+        """
+        raise NotImplementedError("AugmentorBase: Not impl transform_feature")
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the impulse response augmentation model."""
+from deepspeech.frontend.audio import AudioSegment
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.frontend.utility import read_manifest
+class ImpulseResponseAugmentor(AugmentorBase):
+    """Augmentor that convolves audio with a randomly picked impulse response.
+    :param rng: Random generator object; it must support the call
+                ``choice(seq, size, replace=...)``.
+    :type rng: numpy.random.RandomState (presumably - confirm with callers)
+    :param impulse_manifest_path: Manifest path for impulse audio data.
+    :type impulse_manifest_path: str
+    """
+    def __init__(self, rng, impulse_manifest_path):
+        self._impulse_manifest = read_manifest(impulse_manifest_path)
+        self._rng = rng
+    def __call__(self, x, uttid=None, train=True):
+        """Augment ``x`` in place when ``train`` is truthy and return it.
+        ``uttid`` is accepted but not used here.
+        """
+        if train:
+            self.transform_audio(x)
+        return x
+    def transform_audio(self, audio_segment):
+        """Add impulse response effect.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        # Draw a single manifest entry and load the audio file it points to.
+        picked = self._rng.choice(self._impulse_manifest, 1, replace=False)
+        impulse_path = picked[0]['audio_filepath']
+        impulse_segment = AudioSegment.from_file(impulse_path)
+        audio_segment.convolve(impulse_segment, allow_resample=True)
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/speed_perturb.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .audio_featurizer import AudioFeaturizer  #noqa: F401
+from .speech_featurizer import SpeechFeaturizer
+from .text_featurizer import TextFeaturizer
--- a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/normalizer.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/normalizer.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/speech.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/speech.py
--- a/examples/transv1.8to2.x/deepspeech/frontend/utility.py
+++ b/examples/transv1.8to2.x/deepspeech/frontend/utility.py
--- a/examples/transv1.8to2.x/deepspeech/io/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/io/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/deepspeech/io/batchfy.py
+++ b/examples/transv1.8to2.x/deepspeech/io/batchfy.py
--- a/examples/transv1.8to2.x/deepspeech/io/collator.py
+++ b/examples/transv1.8to2.x/deepspeech/io/collator.py
--- a/examples/transv1.8to2.x/deepspeech/io/collator_st.py
+++ b/examples/transv1.8to2.x/deepspeech/io/collator_st.py
--- a/examples/transv1.8to2.x/deepspeech/io/converter.py
+++ b/examples/transv1.8to2.x/deepspeech/io/converter.py
--- a/examples/transv1.8to2.x/deepspeech/io/dataloader.py
+++ b/examples/transv1.8to2.x/deepspeech/io/dataloader.py
--- a/examples/transv1.8to2.x/deepspeech/io/dataset.py
+++ b/examples/transv1.8to2.x/deepspeech/io/dataset.py
--- a/examples/transv1.8to2.x/deepspeech/io/reader.py
+++ b/examples/transv1.8to2.x/deepspeech/io/reader.py
--- a/examples/transv1.8to2.x/deepspeech/io/sampler.py
+++ b/examples/transv1.8to2.x/deepspeech/io/sampler.py
--- a/examples/transv1.8to2.x/deepspeech/io/utility.py
+++ b/examples/transv1.8to2.x/deepspeech/io/utility.py
--- a/examples/transv1.8to2.x/deepspeech/models/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/models/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py
--- a/examples/transv1.8to2.x/deepspeech/models/ds2/conv.py
+++ b/examples/transv1.8to2.x/deepspeech/models/ds2/conv.py
--- a/examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py
+++ b/examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py
--- a/examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py
+++ b/examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py
--- a/examples/transv1.8to2.x/deepspeech/modules/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/__init__.py
--- a/examples/transv1.8to2.x/deepspeech/modules/activation.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/activation.py
--- a/examples/transv1.8to2.x/deepspeech/modules/cmvn.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/cmvn.py
--- a/examples/transv1.8to2.x/deepspeech/modules/crf.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/crf.py
--- a/examples/transv1.8to2.x/deepspeech/modules/ctc.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/ctc.py
--- a/examples/transv1.8to2.x/deepspeech/modules/decoder.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/decoder.py
--- a/examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py
--- a/examples/transv1.8to2.x/deepspeech/modules/encoder.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/encoder.py
--- a/examples/transv1.8to2.x/deepspeech/modules/loss.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/loss.py
--- a/examples/transv1.8to2.x/deepspeech/modules/mask.py
+++ b/examples/transv1.8to2.x/deepspeech/modules/mask.py
--- a/examples/transv1.8to2.x/deepspeech/utils/__init__.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/__init__.py
--- a/examples/transv1.8to2.x/deepspeech/utils/bleu_score.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/bleu_score.py
--- a/examples/transv1.8to2.x/deepspeech/utils/checkpoint.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/checkpoint.py
--- a/examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py
--- a/examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py
--- a/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
--- a/examples/transv1.8to2.x/deepspeech/utils/layer_tools.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/layer_tools.py
--- a/examples/transv1.8to2.x/deepspeech/utils/log.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/log.py
--- a/examples/transv1.8to2.x/deepspeech/utils/mp_tools.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/mp_tools.py
--- a/examples/transv1.8to2.x/deepspeech/utils/socket_server.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/socket_server.py
--- a/examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py
--- a/examples/transv1.8to2.x/deepspeech/utils/text_grid.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/text_grid.py
--- a/examples/transv1.8to2.x/deepspeech/utils/utility.py
+++ b/examples/transv1.8to2.x/deepspeech/utils/utility.py
--- a/examples/transv1.8to2.x/example/aishell/run_data.sh
+++ b/examples/transv1.8to2.x/example/aishell/run_data.sh
--- a/examples/transv1.8to2.x/example/aishell/run_infer_golden.sh
+++ b/examples/transv1.8to2.x/example/aishell/run_infer_golden.sh
--- a/examples/transv1.8to2.x/example/aishell/run_test_golden.sh
+++ b/examples/transv1.8to2.x/example/aishell/run_test_golden.sh
--- a/examples/transv1.8to2.x/example/baidu_en8k/run_data.sh
+++ b/examples/transv1.8to2.x/example/baidu_en8k/run_data.sh
--- a/examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh
+++ b/examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh
--- a/examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh
+++ b/examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh
--- a/examples/transv1.8to2.x/example/librispeech/run_data.sh
+++ b/examples/transv1.8to2.x/example/librispeech/run_data.sh
--- a/examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh
+++ b/examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh
--- a/examples/transv1.8to2.x/example/librispeech/run_test_golden.sh
+++ b/examples/transv1.8to2.x/example/librispeech/run_test_golden.sh
--- a/examples/transv1.8to2.x/infer2x.py
+++ b/examples/transv1.8to2.x/infer2x.py
--- a/examples/transv1.8to2.x/model_utils/__init__.py
+++ b/examples/transv1.8to2.x/model_utils/__init__.py
--- a/examples/transv1.8to2.x/model_utils/model_check.py
+++ b/examples/transv1.8to2.x/model_utils/model_check.py
--- a/examples/transv1.8to2.x/models/aishell/download_model.sh
+++ b/examples/transv1.8to2.x/models/aishell/download_model.sh
--- a/examples/transv1.8to2.x/models/baidu_en8k/download_model.sh
+++ b/examples/transv1.8to2.x/models/baidu_en8k/download_model.sh
--- a/examples/transv1.8to2.x/models/librispeech/download_model.sh
+++ b/examples/transv1.8to2.x/models/librispeech/download_model.sh
--- a/examples/transv1.8to2.x/models/lm/download_lm_ch.sh
+++ b/examples/transv1.8to2.x/models/lm/download_lm_ch.sh
--- a/examples/transv1.8to2.x/models/lm/download_lm_en.sh
+++ b/examples/transv1.8to2.x/models/lm/download_lm_en.sh
--- a/examples/transv1.8to2.x/test2x.py
+++ b/examples/transv1.8to2.x/test2x.py
--- a/examples/transv1.8to2.x/tools/_init_paths.py
+++ b/examples/transv1.8to2.x/tools/_init_paths.py
--- a/examples/transv1.8to2.x/tools/build_vocab.py
+++ b/examples/transv1.8to2.x/tools/build_vocab.py
--- a/examples/transv1.8to2.x/tools/compute_mean_std.py
+++ b/examples/transv1.8to2.x/tools/compute_mean_std.py
--- a/examples/transv1.8to2.x/utils/__init__.py
+++ b/examples/transv1.8to2.x/utils/__init__.py
--- a/examples/transv1.8to2.x/utils/error_rate.py
+++ b/examples/transv1.8to2.x/utils/error_rate.py
--- a/examples/transv1.8to2.x/utils/tests/test_error_rate.py
+++ b/examples/transv1.8to2.x/utils/tests/test_error_rate.py
--- a/examples/transv1.8to2.x/utils/utility.py
+++ b/examples/transv1.8to2.x/utils/utility.py
--- a/examples/transv1.8to2.x/utils/utility.sh
+++ b/examples/transv1.8to2.x/utils/utility.sh