From 44743622d4b894ba5fe7440d35007c87b3258db1 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 5 Nov 2021 09:50:19 +0000
Subject: [PATCH] filter example; cmvn stride and window int; libri/s1 conf

---
 examples/aishell/s0/local/data.sh             |   4 +-
 examples/aishell/s1/local/data.sh             |   4 +-
 examples/callcenter/s1/local/data.sh          |   4 +-
 examples/dataset/librispeech/librispeech.py   |  10 +-
 examples/librispeech/s0/local/data.sh         |   4 +-
 .../librispeech/s1/conf/chunk_conformer.yaml  |   4 +-
 .../s1/conf/chunk_transformer.yaml            |   4 +-
 examples/librispeech/s1/conf/conformer.yaml   |   4 +-
 examples/librispeech/s1/conf/preprocess.yaml  |  29 +++++
 examples/librispeech/s1/conf/transformer.yaml |   4 +-
 examples/librispeech/s1/local/data.sh         |  45 +++++---
 examples/ted_en_zh/t0/local/data.sh           |   4 +-
 examples/timit/s1/local/data.sh               |   4 +-
 examples/tiny/s0/local/data.sh                |   4 +-
 examples/tiny/s1/local/data.sh                |   4 +-
 utils/compute_mean_std.py                     |   8 +-
 utils/format_data.py                          |   2 +-
 utils/remove_longshortdata.py                 | 102 ++++++++++++++++++
 18 files changed, 195 insertions(+), 49 deletions(-)
 create mode 100644 examples/librispeech/s1/conf/preprocess.yaml
 create mode 100755 utils/remove_longshortdata.py

diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index d0a63dca..23f04f2a 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --manifest_path="data/manifest.train.raw" \
     --spectrum_type="linear" \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
     --sample_rate=16000 \
     --use_dB_normalization=True \
     --num_samples=2000 \
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index 8124d1bb..76e28075 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --sample_rate=16000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh
index 65e6e5fc..c40c752a 100755
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --sample_rate=8000 \
     --use_dB_normalization=False \
     --num_samples=-1 \
diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py
index 0d535e13..69f0db59 100644
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
     total_sec = 0.0
-    total_text = 0.0
+    total_char = 0.0
     total_num = 0
 
     for subfolder, _, filelist in sorted(os.walk(data_dir)):
@@ -89,7 +89,7 @@ def create_manifest(data_dir, manifest_path):
             text_filepath = os.path.join(subfolder, text_filelist[0])
             for line in io.open(text_filepath, encoding="utf8"):
                 segments = line.strip().split()
-                n_token = len(segments[1:])
+                nchars = len(segments[1:])
                 text = ' '.join(segments[1:]).lower()
 
                 audio_filepath = os.path.abspath(
@@ -110,7 +110,7 @@ def create_manifest(data_dir, manifest_path):
                     }))
 
                 total_sec += duration
-                total_text += n_token
+                total_char += nchars
                 total_num += 1
 
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@@ -125,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
         print(f"{subset}:", file=f)
         print(f"{total_num} utts", file=f)
         print(f"{total_sec / (60*60)} h", file=f)
-        print(f"{total_text} text", file=f)
-        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_char} char", file=f)
+        print(f"{total_char / total_sec} char/sec", file=f)
         print(f"{total_sec / total_num} sec/utt", file=f)
 
 
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
index 78a4ffc4..0f276cec 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
     --use_dB_normalization=True \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml
index 4d0e6ceb..2bfb0fb6 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ collator:
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 16
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
     cmvn_file_type: "json"
     # encoder related
     encoder: conformer
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index c7b53f95..fe533777 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -15,7 +15,7 @@ collator:
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
     cmvn_file_type: "json"
     # encoder related
     encoder: transformer
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index 3bc942dc..c844baaa 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 16
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
     cmvn_file_type: "json"
     # encoder related
     encoder: conformer
diff --git a/examples/librispeech/s1/conf/preprocess.yaml b/examples/librispeech/s1/conf/preprocess.yaml
new file mode 100644
index 00000000..dd4cfd27
--- /dev/null
+++ b/examples/librispeech/s1/conf/preprocess.yaml
@@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes together are known as SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+
+
+
+
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index 3cc17004..5a158f3e 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -15,7 +15,7 @@ collator:
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
   batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   spectrum_type: fbank #linear, mfcc, fbank
@@ -38,7 +38,7 @@ collator:
 
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
     cmvn_file_type: "json"
     # encoder related
     encoder: transformer
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
index b15ddce5..35f4e635 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -8,6 +8,11 @@ nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 
+stride_ms=10
+window_ms=25
+sample_rate=16000
+feat_dim=80
+
 source ${MAIN_ROOT}/utils/parse_options.sh
 
 
@@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         exit 1
     fi
 
-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
+    for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        mv data/manifest.${sub} data/manifest.${sub}.raw
     done
 
     rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
+    for sub in train-clean-100 train-clean-360 train-other-500; do
+        cat data/manifest.${sub}.raw >> data/manifest.train.raw
     done
 
-    for set in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+    for sub in dev-clean dev-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.dev.raw
     done
 
-    for set in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
+    for sub in test-clean test-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.test.raw
     done
 fi
 
@@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
     --spectrum_type="fbank" \
-    --feat_dim=80 \
+    --feat_dim=${feat_dim} \
     --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --sample_rate=${sample_rate} \
+    --stride_ms=${stride_ms} \
+    --window_ms=${window_ms} \
     --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
@@ -85,15 +90,15 @@ fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
+    for sub in train dev test dev-clean dev-other test-clean test-other; do
     {
         python3 ${MAIN_ROOT}/utils/format_data.py \
         --cmvn_path "data/mean_std.json" \
         --unit_type "spm" \
         --spm_model_prefix ${bpeprefix} \
         --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
+        --manifest_path="data/manifest.${sub}.raw" \
+        --output_path="data/manifest.${sub}"
 
         if [ $? -ne 0 ]; then
             echo "Formt mnaifest failed. Terminated."
@@ -102,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     }&
     done
     wait
+
+    for sub in train dev; do
+        mv data/manifest.${sub} data/manifest.${sub}.fmt
+    done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    for sub in train dev; do
+        remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
+    done
 fi
 
 echo "LibriSpeech Data preparation done."
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
index 23e5a9c7..ce58f539 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh
index 66be39e2..e588e48d 100755
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
@@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --use_dB_normalization=False \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
index bcf9e6d1..f1fb8cb1 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
     --use_dB_normalization=False \
     --num_workers=2 \
     --output_path="data/mean_std.json"
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
index 3d7f19ab..87539d5e 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
     --use_dB_normalization=False \
     --num_workers=2 \
     --output_path="data/mean_std.json"
diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py
index 296d272a..e47554dc 100755
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -33,8 +33,8 @@ add_arg('spectrum_type',    str,
         choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim',    int, 13, "Audio feature dim.")
 add_arg('delta_delta', bool,  False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0,  "stride length in ms.")
-add_arg('window_ms', float, 20.0,  "stride length in ms.")
+add_arg('stride_ms', int, 10,  "stride length in ms.")
+add_arg('window_ms', int, 20,  "window length in ms.")
 add_arg('sample_rate',  int, 16000,  "target sample rate.")
 add_arg('use_dB_normalization', bool, True, "do dB normalization.")
 add_arg('target_dB',   int, -20,  "target dB.")
@@ -61,8 +61,8 @@ def main():
         spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
-        stride_ms=args.stride_ms,
-        window_ms=args.window_ms,
+        stride_ms=float(args.stride_ms),
+        window_ms=float(args.window_ms),
         n_fft=None,
         max_freq=None,
         target_sample_rate=args.sample_rate,
diff --git a/utils/format_data.py b/utils/format_data.py
index 49dcbee8..f9b5e6aa 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -122,7 +122,7 @@ def main():
             fout.write(json.dumps(output_json) + '\n')
             count += 1
 
-    print(f"Examples number: {count}")
+    print(f"{args.manifest_paths} Examples number: {count}")
     fout.close()
 
 
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
new file mode 100755
index 00000000..dcc05b23
--- /dev/null
+++ b/utils/remove_longshortdata.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""remove longshort data from manifest"""
+import logging
+import argparse
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format
+# each jsonlines record looks like this
+# {
+#   "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+#   "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+#   "utt2spk": "111-2222",
+#   "utt": "111-2222-333"
+# }
+
+
+def get_parser():
+    """Build the command-line parser for the manifest length filter.
+
+    Returns:
+        argparse.ArgumentParser: parser with the filter thresholds and the
+        input/output manifest paths.
+    """
+    parser = argparse.ArgumentParser(
+        description="remove longshort data from format manifest",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    add = parser.add_argument
+    add("--verbose", "-V", type=int, default=0, help="Verbose option")
+    add("--iaxis", type=int, default=0,
+        help="multi inputs index, 0 is the first")
+    add("--oaxis", type=int, default=0,
+        help="multi outputs index, 0 is the first")
+    # Bounds applied to the input length.
+    add("--maxframes", type=int, default=2000, help="maxframes")
+    add("--minframes", type=int, default=10, help="minframes")
+    # Bounds applied to the length of the output text.
+    add("--maxchars", type=int, default=200, help="max tokens")
+    add("--minchars", type=int, default=0, help="min tokens")
+    add("--stride_ms", type=int, default=10, help="stride in ms unit.")
+    # Positional arguments: source manifest first, destination second.
+    add("rspecifier", type=str,
+        help="jsonl format manifest. e.g. manifest.jsonl")
+    add("wspecifier_or_wxfilename", type=str,
+        help="Write specifier. e.g. manifest.jsonl")
+    return parser
+
+
+def filter_input(args, line):
+    """Return True when the input length is outside [minframes, maxframes].
+
+    When args.sound is set, shape[0] is converted from seconds to frames
+    (shape[0] * 1000 / stride_ms); otherwise it is used as the frame count.
+    """
+    length = line['input'][args.iaxis]['shape'][0]
+    nframe = length * 1000 / args.stride_ms if args.sound else length
+    # True marks the utterance for removal.
+    too_short = nframe < args.minframes
+    too_long = nframe > args.maxframes
+    return too_short or too_long
+
+
+def filter_output(args, line):
+    """Return True when len(text) is outside [minchars, maxchars]."""
+    # Select the output stream with oaxis; iaxis indexes line['input'] only.
+    nchars = len(line['output'][args.oaxis]['text'])
+    # True marks the utterance for removal.
+    return nchars < args.minchars or nchars > args.maxchars
+    
+
+def main():
+    """Copy a jsonlines manifest, dropping too long/short utterances."""
+    args = get_parser().parse_args()
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    level = logging.INFO if args.verbose > 0 else logging.WARN
+    logging.basicConfig(level=level, format=logfmt)
+    logging.info(get_commandline_args())
+
+    with jsonlines.open(args.rspecifier, 'r') as reader:
+        lines = list(reader)
+    logging.info(f"Example: {len(lines)}")
+    # An empty manifest has no first entry to probe; nothing is filtered then.
+    feat = lines[0]['input'][args.iaxis]['feat'] if lines else ''
+    # Drop a trailing ":offset" (e.g. "xxx.ark:123") before testing the suffix.
+    ext = feat.split('.')[-1].split(':', 1)[0]
+    # Anything that is not a kaldi ark/scp entry is treated as raw audio.
+    args.sound = ext not in ('ark', 'scp')
+
+    count, dropped = 0, 0
+    # Keep only entries that pass both the input and the output length check.
+    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+        for line in lines:
+            if filter_input(args, line) or filter_output(args, line):
+                dropped += 1
+                continue
+            writer.write(line)
+            count += 1
+    logging.info(f"Example after filter: {count}\\{dropped}")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
-- 
GitLab