Unverified commit 6750770e authored by: H Hui Zhang, committed by: GitHub

Merge pull request #1012 from zh794390558/datapipe

[asr] independent dataloader
# ASR
* s0 for deepspeech2
* s1 for u2/transformer/conformer
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
......
......@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
......@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
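The statistics step above (`compute_mean_std.py`) writes per-dimension feature mean/std to `data/mean_std.json` for CMVN. A rough sketch of what that amounts to — the field names here are assumptions, not the utility's actual schema:

```python
import json

import numpy as np


def compute_mean_std(feature_list, output_path):
    """feature_list: iterable of [num_frames, feat_dim] arrays."""
    frames = np.concatenate(list(feature_list), axis=0)  # stack all frames
    stats = {
        "mean": frames.mean(axis=0).tolist(),  # per-dimension mean
        "std": frames.std(axis=0).tolist(),    # per-dimension stddev
    }
    with open(output_path, "w") as fout:
        json.dump(stats, fout)


# toy usage: two utterances of 80-dim fbank-like features
compute_mean_std([np.random.randn(100, 80), np.random.randn(120, 80)],
                 "mean_std.json")
```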
......@@ -19,3 +19,13 @@ You need to set `decoding.decoding_chunk_size=16` when decoding.
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
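The `time_warp`/`freq_mask`/`time_mask` processors above implement SpecAugment. A minimal numpy sketch of the two masking steps — an illustration of the `F`, `T`, and `n_mask` parameters, not the library's code:

```python
import numpy as np


def freq_mask(spec, F=30, n_mask=2, replace_with_zero=False):
    """spec: [frames, n_mels]; mask up to F consecutive mel bins, n_mask times."""
    spec = spec.copy()
    fill = 0.0 if replace_with_zero else spec.mean()
    for _ in range(n_mask):
        f = np.random.randint(0, F + 1)
        f0 = np.random.randint(0, max(1, spec.shape[1] - f))
        spec[:, f0:f0 + f] = fill
    return spec


def time_mask(spec, T=40, n_mask=2, replace_with_zero=False):
    """Mask up to T consecutive frames, n_mask times."""
    spec = spec.copy()
    fill = 0.0 if replace_with_zero else spec.mean()
    for _ in range(n_mask):
        t = np.random.randint(0, T + 1)
        t0 = np.random.randint(0, max(1, spec.shape[0] - t))
        spec[t0:t0 + t, :] = fill
    return spec


aug = time_mask(freq_mask(np.random.randn(300, 80)))
```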
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5
max_input_len: 20.0 # second
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1e-6
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
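Two different `ctc_weight`s appear above: `model_conf.ctc_weight: 0.3` mixes the training losses of the hybrid CTC/attention model, while `decoding.ctc_weight: 0.5` re-weights CTC scores during attention rescoring. The training-side combination reduces to:

```python
def hybrid_loss(loss_ctc, loss_att, ctc_weight=0.3):
    # hybrid CTC/attention objective, as set by model_conf.ctc_weight above
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att
```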
......@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -23,8 +23,6 @@ fi
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=8000 \
--use_dB_normalization=False \
--num_samples=-1 \
......@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
......
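With the `utt2spk` field added above, each manifest line is a self-contained JSON object per utterance; a representative line (all values illustrative) looks like:

```python
import json

# one manifest entry, as emitted by create_manifest above (values illustrative)
print(json.dumps({
    "utt": "utt_0001",
    "utt2spk": "speaker_01",          # parent directory name of the audio file
    "feat": "/path/to/speaker_01/utt_0001.wav",
    "feat_shape": (4.32, ),           # duration in seconds
    "text": "the transcription",
}))
```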
......@@ -22,6 +22,7 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
......@@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
......@@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
print("Creating manifest %s ..." % manifest_path)
json_lines = []
total_sec = 0.0
total_text = 0.0
total_char = 0.0
total_num = 0
for subfolder, _, filelist in sorted(os.walk(data_dir)):
......@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
text_filepath = os.path.join(subfolder, text_filelist[0])
for line in io.open(text_filepath, encoding="utf8"):
segments = line.strip().split()
nchars = len(segments[1:])
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.abspath(
os.path.join(subfolder, segments[0] + '.flac'))
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), # second
'text': text,
}))
total_sec += duration
total_text += len(text)
total_char += nchars
total_num += 1
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
......@@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
print(f"{subset}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_char} char", file=f)
print(f"{total_char / total_sec} char/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
......
......@@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(audio_filepath))[0]
utt2spk = '-'.join(utt.split('-')[:2])
json_lines.append(
json.dumps({
'utt':
os.path.splitext(os.path.basename(audio_filepath))[0],
'feat':
audio_filepath,
'utt': utt,
'utt2spk': utt2spk,
'feat': audio_filepath,
'feat_shape': (duration, ), #second
'text':
text
'text': text,
}))
total_sec += duration
......
......@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
transcription_str = " ".join(trancription.split())
json_lines.append(
json.dumps(
{
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': " ".join(translation.split()),
'text1': " ".join(trancription.split())
'text': [translation_str, transcription_str],
},
ensure_ascii=False))
......
......@@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
assert os.path.exists(audio_path) and os.path.exists(text_path)
audio_id = os.path.basename(audio_path)[:-4]
spk = audio_id.split('_')[0]
word_text, syllable_text, phone_text = read_trn(text_path)
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
......@@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # charactor
......
......@@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': utt_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': str(audio_path),
'feat_shape': (duration, ), # second
'text': word_text, # word
'phone': phone_text,
'spk': spk,
'gender': gender,
},
ensure_ascii=False))
......
......@@ -24,6 +24,7 @@ import json
import os
import soundfile
from pathlib import Path
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
......@@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = phn_dict[audio_id]
gender_spk = str(Path(audio_path).parent.stem)
spk = gender_spk[1:]
gender = gender_spk[0]
utt_id = '_'.join([spk, gender, audio_id])
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': spk,
'utt2gender': gender,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
......
......@@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
audio_data, samplerate = soundfile.read(u)
duration = float(len(audio_data)) / samplerate
utt = os.path.splitext(os.path.basename(u))[0]
json_lines.append(
json.dumps({
'utt': os.path.splitext(os.path.basename(u))[0],
'utt': utt,
'utt2spk': speaker,
'feat': u,
'feat_shape': (duration, ), #second
'text': trans.lower()
......
# ASR
* s0 is for deepspeech2 offline
* s1 is for transformer/conformer/U2
* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
## Data
| Data Subset | Duration in Seconds |
......
......@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=True \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |
\ No newline at end of file
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -15,7 +15,7 @@ collator:
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -38,7 +38,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -8,6 +8,11 @@ nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
stride_ms=10
window_ms=25
sample_rate=16000
feat_dim=80
source ${MAIN_ROOT}/utils/parse_options.sh
......@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
exit 1
fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${sub} data/manifest.${sub}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
for sub in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${sub}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
for sub in dev-clean dev-other; do
cat data/manifest.${sub}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
for sub in test-clean test-other; do
cat data/manifest.${sub}.raw >> data/manifest.test.raw
done
fi
......@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--spectrum_type="fbank" \
--feat_dim=80 \
--feat_dim=${feat_dim} \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--sample_rate=${sample_rate} \
--stride_ms=${stride_ms} \
--window_ms=${window_ms} \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -85,16 +90,15 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
for sub in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
--manifest_path="data/manifest.${sub}.raw" \
--output_path="data/manifest.${sub}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
......@@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
}&
done
wait
for sub in train dev; do
mv data/manifest.${sub} data/manifest.${sub}.fmt
done
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for sub in train dev; do
remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
done
fi
echo "LibriSpeech Data preparation done."
......
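The `remove_longshortdata.py` call above drops utterances whose frame or character counts exceed the given limits. A minimal sketch of that filter, under the assumption that `feat_shape[0]` holds the duration in seconds (so frames ≈ duration × 1000 / stride_ms):

```python
import json


def keep(entry, maxframes=3000, maxchars=400, stride_ms=10):
    frames = entry["feat_shape"][0] * 1000.0 / stride_ms  # seconds -> frames
    return frames <= maxframes and len(entry["text"]) <= maxchars


with open("data/manifest.train.fmt") as fin, \
        open("data/manifest.train", "w") as fout:
    for line in fin:
        if keep(json.loads(line)):
            fout.write(line)
```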
......@@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
......
......@@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.npz" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
# TED En -> Zh
* t0 for u2 speech translation
* st0 - conformer/transformer speech translation
......@@ -9,7 +9,7 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
data_dir=./TED_EnZh
data_dir=./TED-En-Zh
source ${MAIN_ROOT}/utils/parse_options.sh
......@@ -21,7 +21,7 @@ mkdir -p data
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
......@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
--feat_type "raw" \
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
......
......@@ -22,7 +22,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
# thchs30
* a0 for mfa alignment
* align0 - mfa alignment
# TIMIT
* s1 u2 model with phone unit
ASR model with phone unit
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
\ No newline at end of file
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -14,7 +14,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: "word"
mean_std_filepath: ""
augmentation_config: ""
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
......@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
* s0 for deepspeech2
* s1 for U2
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
......@@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=20.0 \
--stride_ms=10 \
--window_ms=20 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
......@@ -63,7 +63,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type ${unit_type} \
--vocab_path="data/vocab.txt" \
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
......@@ -15,7 +15,7 @@ collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
......@@ -11,11 +11,11 @@ data:
max_output_input_ratio: 10.0
collator:
mean_std_filepath: ""
mean_std_filepath: data/mean_std.json
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_200'
augmentation_config: conf/augmentation.json
augmentation_config: conf/preprocess.yaml
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
......@@ -37,7 +37,7 @@ collator:
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
......
......@@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--stride_ms=10 \
--window_ms=25 \
--use_dB_normalization=False \
--num_workers=2 \
--output_path="data/mean_std.json"
......@@ -69,7 +69,6 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
......
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech)
A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition
## Description
### Creation
All the data is collected from YouTube and podcasts. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data.
### Categories
In summary, WenetSpeech groups all data into 3 categories, as the following table shows:
| Set | Hours | Confidence | Usage |
|------------|-------|-------------|---------------------------------------|
| High Label | 10005 | >=0.95 | Supervised Training |
| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training |
| Unlabel | 9952 | / | Unsupervised training or Pre-training |
| In Total | 22435 | / | All above |
### High Label Data
We classify the high-label data into 10 groups according to domain, speaking style, and scenario.
| Domain | Youtube | Podcast | Total |
|-------------|---------|---------|--------|
| audiobook | 0 | 250.9 | 250.9 |
| commentary | 112.6 | 135.7 | 248.3 |
| documentary | 386.7 | 90.5 | 477.2 |
| drama | 4338.2 | 0 | 4338.2 |
| interview | 324.2 | 614 | 938.2 |
| news | 0 | 868 | 868 |
| reading | 0 | 1110.2 | 1110.2 |
| talk | 204 | 90.7 | 294.7 |
| variety | 603.3 | 224.5 | 827.8 |
| others | 144 | 507.5 | 651.5 |
| Total | 6113 | 3892 | 10005 |
As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales.
| Training Subsets | Confidence | Hours |
|------------------|-------------|-------|
| L | [0.95, 1.0] | 10005 |
| M | 1.0 | 1000 |
| S | 1.0 | 100 |
### Evaluation Sets
| Evaluation Sets | Hours | Source | Description |
|-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training |
| TEST\_NET | 23 | Internet | Matched test set |
| TEST\_MEETING | 15 | Real meeting | Mismatched test set: far-field, conversational, spontaneous meeting speech |
\ No newline at end of file
## Pack Model
Pack the model into a tar.gz archive, e.g.
```bash
./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeech.pdparams
```
Show the contents of model.tar.gz:
```
tar tf model.tar.gz
```
# WenetSpeech
## Conformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | |
## Conformer Pretrain Model
Pretrained model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
\ No newline at end of file
# network architecture
model:
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
cnn_module_norm: layer_norm
activation_type: swish
pos_enc_layer_type: rel_pos
selfattention_layer_type: rel_selfattn
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.1 # second
max_input_len: 12.0 # second
min_output_len: 1.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
training:
n_epoch: 240
accum_grad: 16
global_grad_clip: 5.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-6
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 5000
lr_decay: 1.0
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
\ No newline at end of file
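A small sketch of the `decoding_chunk_size` convention documented above (<0 decodes the whole utterance at once; >0 slices it into fixed-size chunks; 0 is reserved for training):

```python
def chunk_ranges(num_frames, decoding_chunk_size):
    if decoding_chunk_size < 0:          # full-utterance (non-streaming) decode
        return [(0, num_frames)]
    assert decoding_chunk_size > 0, "0 is only valid for training"
    return [(start, min(start + decoding_chunk_size, num_frames))
            for start in range(0, num_frames, decoding_chunk_size)]


print(chunk_ranges(100, 16))  # [(0, 16), (16, 32), ..., (96, 100)]
```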
process:
# extract kaldi fbank from PCM
- type: fbank_kaldi
fs: 16000
n_mels: 80
n_shift: 160
win_length: 400
dither: true
- type: cmvn_json
cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment
- type: time_warp
max_time_warp: 5
inplace: true
mode: PIL
- type: freq_mask
F: 30
n_mask: 2
inplace: true
replace_with_zero: false
- type: time_mask
T: 40
n_mask: 2
inplace: true
replace_with_zero: false
#!/bin/bash
# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
stage=-1
stop_stage=100
# Use your own data path. You need to download the WenetSpeech dataset by yourself.
wenetspeech_data_dir=./wenetspeech
# Make sure you have 1.2 TB of free space for ${shards_dir}
shards_dir=./wenetspeech_shards
# WenetSpeech training set
set=L
train_set=train_`echo $set | tr 'A-Z' 'a-z'`
dev_set=dev
test_sets="test_net test_meeting"
cmvn=true
cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
set -u
set -o pipefail
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
# download data
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Data preparation"
local/wenetspeech_data_prep.sh \
--train-subset $set \
$wenetspeech_data_dir \
data || exit 1;
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi
for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
fi
dict=data/dict/lang_char.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
fi
echo "Aishell data preparation done."
exit 0
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
def get_args():
parser = argparse.ArgumentParser(description="""
This script is used to process raw json dataset of WenetSpeech,
where the long wav is split into segments and
data in wenet format is generated.
""")
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
parser.add_argument('output_dir', help="""Output dir for prepared data""")
args = parser.parse_args()
return args
def meta_analysis(input_json, output_dir):
input_dir = os.path.dirname(input_json)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
try:
with open(input_json, 'r') as injson:
json_data = json.load(injson)
except Exception:
sys.exit(f'Failed to load input json file: {input_json}')
else:
if json_data['audios'] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \
open(f'{output_dir}/segments', 'w') as segments, \
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
for long_audio in json_data['audios']:
try:
long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path']))
aid = long_audio['aid']
segments_lists = long_audio['segments']
duration = long_audio['duration']
assert (os.path.exists(long_audio_path))
except AssertionError:
print(f'''Warning: {aid} something is wrong,
maybe AssertionError, skipped''')
continue
except Exception:
print(f'''Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped''')
continue
else:
wavscp.write(f'{aid}\t{long_audio_path}\n')
reco2dur.write(f'{aid}\t{duration}\n')
for segment_file in segments_lists:
try:
sid = segment_file['sid']
start_time = segment_file['begin_time']
end_time = segment_file['end_time']
dur = end_time - start_time
text = segment_file['text']
segment_subsets = segment_file["subsets"]
except Exception:
print(f'''Warning: {segment_file} something
is wrong, skipped''')
continue
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
)
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
def main():
args = get_args()
meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__':
main()
\ No newline at end of file
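For reference, a minimal input that exercises `meta_analysis` above (assuming the function is in scope), inferred from the fields the script reads; the empty audio file exists only to satisfy the existence check, and all values are illustrative:

```python
import json
import os

os.makedirs("demo", exist_ok=True)
open("demo/a.opus", "w").close()  # placeholder so the path check passes

meta = {"audios": [{
    "aid": "A0001",
    "path": "a.opus",
    "duration": 3.2,
    "segments": [{
        "sid": "A0001_S0001",
        "begin_time": 0.0,
        "end_time": 3.2,
        "text": "你好",
        "subsets": ["L"],
    }],
}]}
with open("demo/WenetSpeech.json", "w") as fout:
    json.dump(meta, fout)

meta_analysis("demo/WenetSpeech.json", "demo/out")  # writes wav.scp, text, segments, ...
```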
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# process_opus.py: segmentation and downsampling of opus audio
# usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os
def read_file(wav_scp, segments):
wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
utt_list = []
seg_path_list = []
start_time_list = []
end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
utt_list.append(arr[0])
seg_path_list.append(wav_scp_dict[arr[1]])
start_time_list.append(float(arr[2]))
end_time_list.append(float(arr[3]))
return utt_list, seg_path_list, start_time_list, end_time_list
# TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \
.replace("audio", 'audio_seg')
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
if i % step == 0:
print("seg wav finished: {}%".format(int(i / step)))
def main():
wav_scp = sys.argv[1]
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \
= read_file(wav_scp, segments)
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list)
if __name__ == '__main__':
main()
\ No newline at end of file
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# streaming decoding only supports batch_size=1
batch_size=1
else
batch_size=64
fi
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0
#!/usr/bin/env bash
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Seasalt AI, Inc (Author: Guoguo Chen)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
set -o pipefail
stage=1
prefix=
train_subset=L
. ./tools/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1
input=$2
output=$3
field=1
if [ $# -eq 4 ]; then
field=$4
fi
cat $input | perl -se '
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
while(<>) {
@A = split;
@A > 0 || die "Invalid file line $_";
@A >= $field || die "Invalid file line $_";
if ($seen{$A[$field-1]}) {
print $_;
}
}' -- -idlist="$idlist" -field="$field" > $output ||\
(echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
}
subset_data_dir () {
utt_list=$1
src_dir=$2
dest_dir=$3
mkdir -p $dest_dir || exit 1;
# wav.scp text segments utt2dur
filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
(echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
(echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
(echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
(echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
rm -f $dest_dir/reco
}
if [ $# -ne 2 ]; then
echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
echo ""
echo "This script takes the WenetSpeech source directory, and prepares the"
echo "WeNet format data directory."
echo " --prefix <prefix> # Prefix for output data directory."
echo " --stage <stage> # Processing stage."
echo " --train-subset <L|M|S|W> # Train subset to be created."
exit 1
fi
wenetspeech_dir=$1
data_dir=$2
declare -A subsets
subsets=(
[L]="train_l"
[M]="train_m"
[S]="train_s"
[W]="train_w"
[DEV]="dev"
[TEST_NET]="test_net"
[TEST_MEETING]="test_meeting")
prefix=${prefix:+${prefix}_}
corpus_dir=$data_dir/${prefix}corpus/
if [ $stage -le 1 ]; then
echo "$0: Extract meta into $corpus_dir"
# Sanity check.
[ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
[ ! -d $wenetspeech_dir/audio ] &&\
echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
[ ! -d $corpus_dir ] && mkdir -p $corpus_dir
# Files to be created:
# wav.scp text segments utt2dur
python3 local/extract_meta.py \
$wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
fi
if [ $stage -le 2 ]; then
echo "$0: Split data to train, dev, test_net, and test_meeting"
[ ! -f $corpus_dir/utt2subsets ] &&\
echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
for label in $train_subset DEV TEST_NET TEST_MEETING; do
if [ ! ${subsets[$label]+set} ]; then
echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
fi
subset=${subsets[$label]}
[ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
cat $corpus_dir/utt2subsets | \
awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
> $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
$corpus_dir $data_dir/${prefix}$subset || exit 1;
done
fi
echo "$0: Done"
\ No newline at end of file
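`filter_by_id` above keeps only the lines of a Kaldi-style table whose key column appears in an id list; an equivalent Python sketch of the same logic:

```python
def filter_by_id(idlist_path, input_path, output_path, field=1):
    """Keep lines of input whose 1-based column `field` is in the id list."""
    with open(idlist_path) as fin:
        seen = {line.split()[0] for line in fin if line.strip()}
    with open(input_path) as fin, open(output_path, "w") as fout:
        for line in fin:
            cols = line.split()
            if len(cols) >= field and cols[field - 1] in seen:
                fout.write(line)
```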
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
#!/bin/bash
. path.sh || exit 1;
set -e
gpus=0,1,2,3,4,5,6,7
stage=0
stop_stage=100
conf_path=conf/conformer.yaml
average_checkpoint=true
avg_num=10
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
audio_file="data/tmp.wav"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
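`avg.sh best exp/${ckpt}/checkpoints ${avg_num}` above averages the parameters of the best `avg_num` checkpoints before testing. A minimal paddle sketch of the idea — a simplification under assumed checkpoint paths, not the actual utility:

```python
import paddle


def average_checkpoints(ckpt_paths, out_path):
    avg = None
    for path in ckpt_paths:
        state = paddle.load(path)  # parameter name -> tensor
        if avg is None:
            avg = {k: v.astype("float32") for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype("float32")
    for k in avg:
        avg[k] /= float(len(ckpt_paths))
    paddle.save(avg, out_path)


# e.g. average_checkpoints(["exp/conformer/checkpoints/10.pdparams", ...],
#                          "exp/conformer/checkpoints/avg_10.pdparams")
```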
../../../utils
\ No newline at end of file
......@@ -27,7 +27,9 @@ from paddle import distributed as dist
from paddle.io import DataLoader
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
......@@ -249,92 +251,103 @@ class U2Trainer(Trainer):
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
if self.train:
# train/valid dataset, return token ids
self.train_loader = BatchDataLoader(
json_file=config.data.train_manifest,
train_mode=True,
sortagrad=False,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=self.args.nprocs,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=config.collator.num_workers,
subsampling_factor=1,
num_encs=1)
self.valid_loader = BatchDataLoader(
json_file=config.data.dev_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev,
num_workers=config.collator.num_workers, )
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples; this yields fewer examples but no mismatch with training,
# and allows a large batch size to save time, so filter test egs now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config),
num_workers=config.collator.num_workers, )
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config),
num_workers=config.collator.num_workers, )
logger.info("Setup train/valid/test/align Dataloader!")
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=self.args.nprocs,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=config.collator.num_workers,
subsampling_factor=1,
num_encs=1)
logger.info("Setup train/valid Dataloader!")
else:
# test dataset, return raw text
self.test_loader = BatchDataLoader(
json_file=config.data.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.decoding.batch_size,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=1,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=1,
subsampling_factor=1,
num_encs=1)
self.align_loader = BatchDataLoader(
json_file=config.data.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=config.decoding.batch_size,
maxlen_in=float('inf'),
maxlen_out=float('inf'),
minibatches=0,
mini_batch_size=1,
batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.collator.
augmentation_config, # aug will be off when train_mode=False
n_iter_processes=1,
subsampling_factor=1,
num_encs=1)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
if self.train:
model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size
else:
model_conf.input_dim = self.test_loader.feat_dim
model_conf.output_dim = self.test_loader.vocab_size
model = U2Model.from_config(model_conf)
......@@ -343,6 +356,11 @@ class U2Trainer(Trainer):
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
self.model = model
logger.info("Setup model!")
if not self.train:
return
train_config = config.training
optim_type = train_config.optim
......@@ -383,10 +401,9 @@ class U2Trainer(Trainer):
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
logger.info("Setup optimizer/lr_scheduler!")
class U2Tester(U2Trainer):
......@@ -421,14 +438,19 @@ class U2Tester(U2Trainer):
def __init__(self, config, args):
super().__init__(config, args)
self.text_feature = TextFeaturizer(
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def ordid2token(self, texts, texts_len):
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(''.join([chr(i) for i in ids]))
trans.append(text_feature.defeaturize(ids.numpy().tolist()))
return trans
def compute_metrics(self,
......@@ -444,12 +466,11 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
text_feature = self.test_loader.collate_fn.text_feature
target_transcripts = self.ordid2token(texts, texts_len)
target_transcripts = self.id2token(texts, texts_len, self.text_feature)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
text_feature=text_feature,
text_feature=self.text_feature,
decoding_method=cfg.decoding_method,
lang_model_path=cfg.lang_model_path,
beam_alpha=cfg.alpha,
......@@ -499,7 +520,7 @@ class U2Tester(U2Trainer):
self.model.eval()
logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
stride_ms = self.test_loader.collate_fn.stride_ms
stride_ms = self.config.collator.stride_ms
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
......@@ -558,8 +579,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
self.align_loader.collate_fn.stride_ms,
self.align_loader.collate_fn.vocab_list,
self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
......@@ -573,7 +593,7 @@ class U2Tester(U2Trainer):
infer_model = U2InferModel.from_pretrained(self.test_loader,
self.config.model.clone(),
self.args.checkpoint_path)
feat_dim = self.test_loader.collate_fn.feature_size
feat_dim = self.test_loader.feat_dim
input_spec = [
paddle.static.InputSpec(shape=[1, None, feat_dim],
dtype='float32'), # audio, [B,T,D]
......
......@@ -392,6 +392,7 @@ class U2Tester(U2Trainer):
unit_type=self.config.collator.unit_type,
vocab_filepath=self.config.collator.vocab_filepath,
spm_model_prefix=self.config.collator.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
""" ord() id to chr() chr """
......@@ -529,8 +530,7 @@ class U2Tester(U2Trainer):
def align(self):
ctc_utils.ctc_align(self.config, self.model, self.align_loader,
self.config.decoding.batch_size,
self.align_loader.collate_fn.stride_ms,
self.align_loader.collate_fn.vocab_list,
self.config.collator.stride_ms, self.vocab_list,
self.args.result_file)
def load_inferspec(self):
......
......@@ -24,6 +24,8 @@ import soundfile
import soxbindings as sox
from scipy import signal
from .utility import convert_samples_from_float32
from .utility import convert_samples_to_float32
from .utility import subfile_from_tar
......@@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
return convert_samples_to_float32(samples)
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
......@@ -708,20 +702,4 @@ class AudioSegment():
This is for writing an audio file.
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
return convert_samples_from_float32(samples, dtype)
......@@ -92,7 +92,9 @@ class TextFeaturizer():
tokens = self.tokenize(text)
ids = []
for token in tokens:
token = token if token in self.vocab_dict else self.unk
if token not in self.vocab_dict:
logger.debug(f"Text Token: {token} -> {self.unk}")
token = self.unk
ids.append(self.vocab_dict[token])
return ids
......
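Since unknown tokens are now mapped to the unk token with only a debug log, a quick round trip through the featurizer is a handy sanity check. A minimal sketch, assuming a char-unit vocab file (the path is a placeholder):

    from paddlespeech.s2t.frontend.featurizer import TextFeaturizer

    text_feature = TextFeaturizer(
        unit_type='char', vocab_filepath='data/vocab.txt')
    ids = text_feature.featurize("some text")  # OOV chars -> unk (DEBUG log)
    text = text_feature.defeaturize(ids)       # inverse used by id2token()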
......@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__ = [
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
"EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32",
"convert_samples_from_float32"
]
IGNORE_ID = -1
......@@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str):
else:
raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1]
def convert_samples_to_float32(samples):
"""Convert sample type to float32.
Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
PCM16 -> PCM32
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
def convert_samples_from_float32(samples, dtype):
"""Convert sample type from float32 to dtype.
Audio sample type is usually integer or floating-point. For integer
type, float32 will be rescaled from [-1, 1] to the maximum range
supported by the integer type.
PCM32 -> PCM16
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
elif samples.dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
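The two helpers are exact inverses up to integer quantization and clipping; a small round-trip sketch:

    import numpy as np

    pcm16 = np.array([0, 16384, -32768], dtype='int16')
    f32 = convert_samples_to_float32(pcm16)            # [0.0, 0.5, -1.0]
    back = convert_samples_from_float32(f32, 'int16')  # [0, 16384, -32768]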
......@@ -199,8 +199,8 @@ class SpeechCollatorBase():
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
text = item['text']
audio = item['input'][0]['feat']
text = item['output'][0]['text']
audio, text = self.process_utterance(audio, text)
audios.append(audio) # [T, D]
......@@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator):
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
translation = item['text']
transcription = item['text1']
audio = item['input'][0]['feat']
translation = item['output'][0]['text']
transcription = item['output'][1]['text']
audio, translation, transcription = self.process_utterance(
audio, translation, transcription)
......
......@@ -103,7 +103,7 @@ class ManifestDataset(Dataset):
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio)
self._manifest.sort(key=lambda x: x["feat_shape"][0])
self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
def __len__(self):
return len(self._manifest)
......@@ -188,34 +188,16 @@ class AudioDataset(Dataset):
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:
assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark',
'.scp')
data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms))
path_suffix = os.path.splitext(data[0]['feat'].split(':')[0])[-1]
assert path_suffix not in ('.ark', '.scp')
# seconds to frames
data = list(
map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms),
data))
self.input_dim = data[0]['feat_shape'][1]
self.output_dim = data[0]['token_shape'][1]
# with open(data_file, 'r') as f:
# for line in f:
# arr = line.strip().split('\t')
# if len(arr) != 7:
# continue
# key = arr[0].split(':')[1]
# tokenid = arr[5].split(':')[1]
# output_dim = int(arr[6].split(':')[1].split(',')[1])
# if raw_wav:
# wav_path = ':'.join(arr[1].split(':')[1:])
# duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
# data.append((key, wav_path, duration, tokenid))
# else:
# feat_ark = ':'.join(arr[1].split(':')[1:])
# feat_info = arr[2].split(':')[1].split(',')
# feat_dim = int(feat_info[1].strip())
# num_frames = int(feat_info[0].strip())
# data.append((key, feat_ark, num_frames, tokenid))
# self.input_dim = feat_dim
# self.output_dim = output_dim
valid_data = []
for i in range(len(data)):
length = data[i]['feat_shape'][0]
......@@ -223,17 +205,17 @@ class AudioDataset(Dataset):
# remove utterances that are too long or too short, for both input and output
# to prevent from out of memory
if length > max_length or length < min_length:
# logging.warn('ignore utterance {} feature {}'.format(
# data[i][0], length))
pass
elif token_length > token_max_length or token_length < token_min_length:
pass
else:
valid_data.append(data[i])
logger.info(f"raw dataset len: {len(data)}")
data = valid_data
num_data = len(data)
logger.info(f"dataset len after filter: {num_data}")
self.minibatch = []
num_data = len(data)
# Dynamic batch size
if batch_type == 'dynamic':
assert (max_frames_in_batch > 0)
......@@ -258,7 +240,9 @@ class AudioDataset(Dataset):
cur = end
def __len__(self):
"""number of example(batch)"""
return len(self.minibatch)
def __getitem__(self, idx):
"""batch example of idx"""
return self.minibatch[idx]
......@@ -18,8 +18,10 @@ import kaldiio
import numpy as np
import soundfile
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
from .utility import feat_type
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.log import Log
# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation
__all__ = ["LoadInputsAndTargets"]
......@@ -322,20 +324,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
return 'scp'
elif suffix == 'npy':
return 'npy'
elif suffix == 'npz':
return 'npz'
elif suffix in ['wav', 'flac']:
# PCM16
return 'sound'
else:
raise ValueError(f"Not support filetype: {suffix}")
return feat_type(filepath)
class SoundHDF5File():
......
......@@ -17,7 +17,7 @@ import numpy as np
from paddlespeech.s2t.utils.log import Log
__all__ = ["pad_list", "pad_sequence"]
__all__ = ["pad_list", "pad_sequence", "feat_type"]
logger = Log(__name__).getlog()
......@@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray],
out_tensor[:length, i, ...] = tensor
return out_tensor
def feat_type(filepath):
suffix = filepath.split(":")[0].split('.')[-1].lower()
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':
return 'scp'
elif suffix == 'npy':
return 'npy'
elif suffix == 'npz':
return 'npz'
elif suffix in ['wav', 'flac']:
# PCM16
return 'sound'
else:
raise ValueError(f"Not support filetype: {suffix}")
......@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file']:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
......@@ -934,8 +934,8 @@ class U2Model(U2DecodeModel):
DeepSpeech2Model: The model built from pretrained result.
"""
with UpdateConfig(config):
config.input_dim = dataloader.collate_fn.feature_size
config.output_dim = dataloader.collate_fn.vocab_size
config.input_dim = dataloader.feat_dim
config.output_dim = dataloader.vocab_size
model = cls.from_config(config)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2019 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -13,6 +13,7 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import io
import json
import h5py
import kaldiio
......@@ -157,3 +158,40 @@ class UtteranceCMVN():
x = np.divide(x, std)
return x
class GlobalCMVN():
"Apply Global CMVN"
def __init__(self,
cmvn_path,
norm_means=True,
norm_vars=True,
std_floor=1.0e-20):
self.cmvn_path = cmvn_path
self.norm_means = norm_means
self.norm_vars = norm_vars
self.std_floor = std_floor
with open(cmvn_path) as f:
cmvn_stats = json.load(f)
self.count = cmvn_stats['frame_num']
self.mean = np.array(cmvn_stats['mean_stat']) / self.count
self.square_sums = np.array(cmvn_stats['var_stat'])
self.var = self.square_sums / self.count - self.mean**2
self.std = np.maximum(np.sqrt(self.var), self.std_floor)
def __repr__(self):
return f"""{self.__class__.__name__}(
cmvn_path={self.cmvn_path},
norm_means={self.norm_means},
norm_vars={self.norm_vars},)"""
def __call__(self, x, uttid=None):
# x: [Time, Dim]
if self.norm_means:
x = np.subtract(x, self.mean)
if self.norm_vars:
x = np.divide(x, self.std)
return x
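A minimal usage sketch of the new transform, assuming the JSON file holds the frame_num / mean_stat / var_stat fields read by the constructor (the path is a placeholder):

    import numpy as np

    cmvn = GlobalCMVN("data/mean_std.json")
    feat = np.random.uniform(-1, 1, (100, 80)).astype('float32')  # [Time, Dim]
    feat = cmvn(feat)  # subtract global mean, divide by floored std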
......@@ -16,6 +16,7 @@ import librosa
import numpy
import scipy
import soundfile
import soxbindings as sox
from paddlespeech.s2t.io.reader import SoundHDF5File
......@@ -82,7 +83,6 @@ class SpeedPerturbation():
def __call__(self, x, uttid=None, train=True):
if not train:
return x
x = x.astype(numpy.float32)
if self.accept_uttid:
ratio = self.utt2ratio[uttid]
......@@ -108,6 +108,110 @@ class SpeedPerturbation():
return y
class SpeedPerturbationSox():
"""SpeedPerturbationSox
The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
and sox-speed simply resamples the input,
i.e. both pitch and tempo are changed.
To speed up or slow down the sound of a file,
use `speed` to modify both the pitch and the duration of the file.
Raising the speed shortens the duration.
The default factor is 1.0, which makes no change to the audio;
2.0 doubles the speed, so the duration is halved and the pitch is an octave higher.
"Why use speed option instead of tempo -s in SoX for speed perturbation"
https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8
tempo option:
sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9
speed option:
sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9
If we use the speed option as above, the pitch of the audio will also be changed;
the tempo option does not change the pitch.
"""
def __init__(
self,
lower=0.9,
upper=1.1,
utt2ratio=None,
keep_length=True,
sr=16000,
seed=None, ):
self.sr = sr
self.keep_length = keep_length
self.state = numpy.random.RandomState(seed)
if utt2ratio is not None:
self.utt2ratio = {}
# Use the scheduled ratio for each utterances
self.utt2ratio_file = utt2ratio
self.lower = None
self.upper = None
self.accept_uttid = True
with open(utt2ratio, "r") as f:
for line in f:
utt, ratio = line.rstrip().split(None, 1)
ratio = float(ratio)
self.utt2ratio[utt] = ratio
else:
self.utt2ratio = None
self.accept_uttid = False
# The ratio is given at runtime randomly
self.lower = lower
self.upper = upper
def __repr__(self):
if self.utt2ratio is None:
return f"""{self.__class__.__name__}(
lower={self.lower},
upper={self.upper},
keep_length={self.keep_length},
sample_rate={self.sr})"""
else:
return f"""{self.__class__.__name__}(
utt2ratio={self.utt2ratio_file},
sample_rate={self.sr})"""
def __call__(self, x, uttid=None, train=True):
if not train:
return x
x = x.astype(numpy.float32)
if self.accept_uttid:
ratio = self.utt2ratio[uttid]
else:
ratio = self.state.uniform(self.lower, self.upper)
tfm = sox.Transformer()
tfm.set_globals(multithread=False)
tfm.speed(ratio)
y = tfm.build_array(input_array=x, sample_rate_in=self.sr)
if self.keep_length:
diff = abs(len(x) - len(y))
if len(y) > len(x):
# Truncate to the original length
y = y[diff // 2:-((diff + 1) // 2)]
elif len(y) < len(x):
# Assume the time-axis is the first: (Time, Channel)
pad_width = [(diff // 2, (diff + 1) // 2)] + [
(0, 0) for _ in range(y.ndim - 1)
]
y = numpy.pad(
y, pad_width=pad_width, constant_values=0, mode="constant")
if y.ndim == 2 and x.ndim == 1:
# (T, C) -> (T)
y = y.squeeze(1)
return y
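# A hedged train-time usage sketch of the class above (random ratio path):
#
#   sp = SpeedPerturbationSox(lower=0.9, upper=1.1, keep_length=True, sr=16000)
#   x = numpy.random.uniform(-1, 1, 16000).astype(numpy.float32)  # 1 s of audio
#   y = sp(x, train=True)  # ratio drawn from [0.9, 1.1]; len(y) == len(x)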
class BandpassPerturbation():
"""BandpassPerturbation
......
......@@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
:returns numpy.ndarray: time warped spectrogram (time, freq)
"""
window = max_time_warp
if window == 0:
return x
if mode == "PIL":
t = x.shape[0]
if t - window <= window:
......
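With the new guard, max_time_warp=0 is a safe no-op instead of raising on an empty random range; a small sketch:

    import numpy as np

    spec = np.random.randn(200, 80).astype('float32')      # (time, freq)
    warped = time_warp(spec, max_time_warp=5, mode="PIL")   # shape preserved
    same = time_warp(spec, max_time_warp=0)                 # early return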
......@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
import librosa
import numpy as np
from python_speech_features import logfbank
def stft(x,
......@@ -304,3 +305,94 @@ class IStft():
win_length=self.win_length,
window=self.window,
center=self.center, )
class LogMelSpectrogramKaldi():
def __init__(
self,
fs=16000,
n_mels=80,
n_fft=512, # fft point
n_shift=160, # unit:sample, 10ms
win_length=400, # unit:sample, 25ms
window="povey",
fmin=20,
fmax=None,
eps=1e-10,
dither=False):
self.fs = fs
self.n_mels = n_mels
self.n_fft = n_fft
if n_shift > win_length:
raise ValueError("Stride size must not be greater than "
"window size.")
self.n_shift = n_shift / fs # unit: second
self.win_length = win_length / fs # unit: second
self.window = window
self.fmin = fmin
if fmax is None:
fmax_ = self.fs / 2
elif fmax > int(self.fs / 2):
raise ValueError("fmax must not be greater than half of "
"sample rate.")
else:
fmax_ = fmax
self.fmax = fmax_
self.eps = eps
self.remove_dc_offset = True
self.preemph = 0.97
self.dither = dither
def __repr__(self):
return (
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format(
name=self.__class__.__name__,
fs=self.fs,
n_mels=self.n_mels,
n_fft=self.n_fft,
n_shift=self.n_shift,
preemph=self.preemph,
win_length=self.win_length,
window=self.window,
fmin=self.fmin,
fmax=self.fmax,
eps=self.eps,
dither=self.dither, ))
def __call__(self, x):
"""
Args:
x (np.ndarray): shape (Ti,)
Raises:
ValueError: not support (Ti, C)
Returns:
np.ndarray: (T, D)
"""
if x.ndim != 1:
raise ValueError("Not support x: [Time, Channel]")
if x.dtype in np.sctypes['float']:
# PCM32 -> PCM16
bits = np.iinfo(np.int16).bits
x = x * 2**(bits - 1)
# logfbank need PCM16 input
y = logfbank(
signal=x,
samplerate=self.fs,
winlen=self.win_length, # unit: second
winstep=self.n_shift, # unit: second
nfilt=self.n_mels,
nfft=self.n_fft,
lowfreq=self.fmin,
highfreq=self.fmax,
dither=self.dither,
remove_dc_offset=self.remove_dc_offset,
preemph=self.preemph,
wintype=self.window)
return y
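A minimal usage sketch of the extractor above, with dither disabled for determinism:

    import numpy as np

    fbank = LogMelSpectrogramKaldi(
        fs=16000, n_mels=80, n_shift=160, win_length=400, dither=False)
    wav = np.random.uniform(-1, 1, 16000).astype('float32')  # rescaled to PCM16 internally
    feat = fbank(wav)  # (T, 80) log-fbank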
......@@ -45,7 +45,8 @@ import_alias = dict(
stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram",
wpe="paddlespeech.s2t.transform.wpe:WPE",
channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector",
)
fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi",
cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN")
class Transformation():
......
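With fbank_kaldi and cmvn_json registered, a preprocess pipeline can be declared in a YAML file (a top-level "process" list of such types) and applied as a callable. A hedged sketch, assuming Transformation accepts a YAML path and a train flag as in ESPnet (the path is a placeholder):

    import numpy as np
    from paddlespeech.s2t.transform.transformation import Transformation

    transform = Transformation("conf/preprocess.yaml")
    wav = np.random.uniform(-1, 1, 16000).astype('float32')
    feat = transform(wav, train=True)  # e.g. fbank -> global CMVN -> SpecAugment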
......@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', float, 10.0, "stride length in ms.")
add_arg('window_ms', float, 20.0, "stride length in ms.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
......@@ -61,8 +61,8 @@ def main():
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
window_ms=args.window_ms,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
......
......@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
......@@ -62,27 +62,76 @@ def main():
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
line_json['token'] = tokens
line_json['token_id'] = tokenids
line_json['token_shape'] = (len(tokenids), vocab_size)
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
if args.feat_type == 'raw':
feat_shape.append(feat_dim)
line_json['filetype'] = 'sound'
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
fout.write(json.dumps(line_json) + '\n')
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"Examples number: {count}")
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
......
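For concreteness, one formatted manifest line might look like this (values are illustrative; a "sound" input stores [seconds, feat_dim] as its shape):

    {"utt": "111-2222-333", "utt2spk": "111-2222",
     "input": [{"name": "input1", "shape": [3.52, 161],
                "feat": "/path/to/111-2222-333.wav", "filetype": "sound"}],
     "output": [{"name": "target1", "shape": [4, 5002], "text": "abcd",
                 "token": "a b c d", "tokenid": "5 8 10 12"}]}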
......@@ -20,13 +20,13 @@ import json
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
......@@ -79,9 +79,11 @@ def main():
line_json['token1'] = tokens
line_json['token_id1'] = tokenids
line_json['token_shape1'] = (len(tokenids), vocab_size)
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
if args.feat_type == 'raw':
filetype = feat_type(line_json['feat'])
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
......
#!/usr/bin/env bash
# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
[ -f ./path.sh ] && . ./path.sh
results=""
# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
# exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"'
lm=""
dict=""
etc=""
outfile="model"
preprocess_conf=""
help_message=$(cat <<EOF
Usage: $0 --lm <lm> --dict <dict> <train_conf> <dec_conf> <cmvn> <e2e>, for example:
<lm>: exp/train_rnnlm/rnnlm.model.best
<dict>: data/lang_char
<train_conf>: conf/train.yaml
<dec_conf>: conf/decode.yaml
<cmvn>: data/tr_it/cmvn.ark
<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best
EOF
)
. utils/parse_options.sh
if [ $# != 4 ]; then
echo "${help_message}"
exit 1
fi
tr_conf=$1
dec_conf=$2
cmvn=$3
e2e=$4
echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
echo " - model link: (put the model link manually.)"
# configs
if [ -e ${tr_conf} ]; then
tar cfh ${outfile}.tar ${tr_conf}
echo -n " - training config file: \`"
echo ${tr_conf} | sed -e "s/$/\`/"
else
echo "missing ${tr_conf}"
exit 1
fi
if [ -e ${dec_conf} ]; then
tar rfh ${outfile}.tar ${dec_conf}
echo -n " - decoding config file: \`"
echo ${dec_conf} | sed -e "s/$/\`/"
else
echo "missing ${dec_conf}"
exit 1
fi
# NOTE(kan-bayashi): preprocess conf is optional
if [ -n "${preprocess_conf}" ]; then
tar rfh ${outfile}.tar ${preprocess_conf}
echo -n " - preprocess config file: \`"
echo ${preprocess_conf} | sed -e "s/$/\`/"
fi
# cmvn
if [ -e ${cmvn} ]; then
tar rfh ${outfile}.tar ${cmvn}
echo -n " - cmvn file: \`"
echo ${cmvn} | sed -e "s/$/\`/"
else
echo "missing ${cmvn}"
exit 1
fi
# e2e
if [ -e ${e2e} ]; then
tar rfh ${outfile}.tar ${e2e}
echo -n " - e2e file: \`"
echo ${e2e} | sed -e "s/$/\`/"
e2e_conf=$(dirname ${e2e})/model.json
if [ ! -e ${e2e_conf} ]; then
echo missing ${e2e_conf}
#exit 1
else
echo -n " - e2e JSON file: \`"
echo ${e2e_conf} | sed -e "s/$/\`/"
tar rfh ${outfile}.tar ${e2e_conf}
fi
else
echo "missing ${e2e}"
exit 1
fi
# lm
if [ -n "${lm}" ]; then
if [ -e ${lm} ]; then
tar rfh ${outfile}.tar ${lm}
echo -n " - lm file: \`"
echo ${lm} | sed -e "s/$/\`/"
lm_conf=$(dirname ${lm})/model.json
if [ ! -e ${lm_conf} ]; then
echo missing ${lm_conf}
exit 1
else
echo -n " - lm JSON file: \`"
echo ${lm_conf} | sed -e "s/$/\`/"
tar rfh ${outfile}.tar ${lm_conf}
fi
else
echo "missing ${lm}"
exit 1
fi
fi
# dict
if [ -n "${dict}" ]; then
if [ -e ${dict} ]; then
tar rfh ${outfile}.tar ${dict}
echo -n " - dict file: \`"
echo ${dict} | sed -e "s/$/\`/"
else
echo "missing ${dict}"
exit 1
fi
fi
# etc
for x in ${etc}; do
if [ -e ${x} ]; then
tar rfh ${outfile}.tar ${x}
echo -n " - etc file: \`"
echo ${x} | sed -e "s/$/\`/"
else
echo "missing ${x}"
exit 1
fi
done
# finally compress the tar file
gzip -f ${outfile}.tar
# results
if [ -n "${results}" ]; then
echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
echo "\`\`\`"
fi
for x in ${results}; do
if [ -e ${x} ]; then
echo "${x}"
grep -e Avg -e SPKR -m 2 ${x}
else
echo "missing ${x}"
exit 1
fi
done
if [ -n "${results}" ]; then
echo "\`\`\`"
fi
exit 0
#!/usr/bin/env python3
"""remove longshort data from manifest"""
import argparse
import logging
import jsonlines
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
# manifest after format
# jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
def get_parser():
parser = argparse.ArgumentParser(
description="remove longshort data from format manifest",
formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
parser.add_argument(
"--verbose", "-V", default=0, type=int, help="Verbose option")
parser.add_argument(
"--iaxis",
default=0,
type=int,
help="multi inputs index, 0 is the first")
parser.add_argument(
"--oaxis",
default=0,
type=int,
help="multi outputs index, 0 is the first")
parser.add_argument("--maxframes", default=2000, type=int, help="maxframes")
parser.add_argument("--minframes", default=10, type=int, help="minframes")
parser.add_argument("--maxchars", default=200, type=int, help="max tokens")
parser.add_argument("--minchars", default=0, type=int, help="min tokens")
parser.add_argument(
"--stride_ms", default=10, type=int, help="stride in ms unit.")
parser.add_argument(
"rspecifier",
type=str,
help="jsonl format manifest. e.g. manifest.jsonl")
parser.add_argument(
"wspecifier_or_wxfilename",
type=str,
help="Write specifier. e.g. manifest.jsonl")
return parser
def filter_input(args, line):
tmp = line['input'][args.iaxis]
if args.sound:
# seconds to frames
nframe = tmp['shape'][0] * 1000 / args.stride_ms
else:
nframe = tmp['shape'][0]
if nframe < args.minframes or nframe > args.maxframes:
return True
else:
return False
def filter_output(args, line):
nchars = len(line['output'][args.oaxis]['text'])
if nchars < args.minchars or nchars > args.maxchars:
return True
else:
return False
def main():
args = get_parser().parse_args()
logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
if args.verbose > 0:
logging.basicConfig(level=logging.INFO, format=logfmt)
else:
logging.basicConfig(level=logging.WARN, format=logfmt)
logging.info(get_commandline_args())
with jsonlines.open(args.rspecifier, 'r') as reader:
lines = list(reader)
logging.info(f"Example: {len(lines)}")
feat = lines[0]['input'][args.iaxis]['feat']
args.sound = False
if feat.split(':')[0].split('.')[-1] not in ('ark', 'scp'):
args.sound = True
count = 0
filter = 0
with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
for line in lines:
if filter_input(args, line) or filter_output(args, line):
filter += 1
continue
writer.write(line)
count += 1
logging.info(f"Example after filter: {count}\{filter}")
if __name__ == '__main__':
main()
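The two predicates can be exercised directly on one manifest line; a minimal sketch with hypothetical thresholds:

    import argparse

    args = argparse.Namespace(
        iaxis=0, oaxis=0, sound=True, stride_ms=10,
        maxframes=2000, minframes=10, maxchars=200, minchars=0)
    line = {"input": [{"shape": [3.5, 83], "feat": "x.wav"}],
            "output": [{"shape": [12, 5002], "text": "a b c de"}]}
    drop = filter_input(args, line) or filter_output(args, line)  # False -> keep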
#!/usr/bin/env bash
mindepth=0
maxdepth=1
. utils/parse_options.sh
if [ $# -gt 1 ]; then
echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
echo ""
echo "Show the system environments and the evaluation results in Markdown format."
echo 'The default of <exp> is "exp/".'
exit 1
fi
[ -f ./path.sh ] && . ./path.sh
set -euo pipefail
if [ $# -eq 1 ]; then
exp=$1
else
exp=exp
fi
cat << EOF
<!-- Generated by $0 -->
# RESULTS
## Environments
- date: \`$(LC_ALL=C date)\`
EOF
python3 << EOF
import sys, paddle
pyversion = sys.version.replace('\n', ' ')
print(f"""- python version: \`{pyversion}\`
- paddle version: \`paddle {paddle.__version__}\`""")
EOF
cat << EOF
- Git hash: \`$(git rev-parse HEAD)\`
- Commit date: \`$(git log -1 --format='%cd')\`
EOF
while IFS= read -r expdir; do
if ls ${expdir}/decode_*/result.txt &> /dev/null; then
# 1. Show the result table
cat << EOF
## $(basename ${expdir})
### CER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
grep -e Avg ${expdir}/decode_*/result.txt \
| sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
| sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
echo
# 2. Show the result table for WER
if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
cat << EOF
### WER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
EOF
grep -e Avg ${expdir}/decode_*/result.wrd.txt \
| sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
| sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
echo
fi
fi
done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)