fix libri ds2 scripts; add ngram and spm doc

c5d85a93 · Hui Zhang · 9f907b9b · c5d85a93 · c5d85a93 · c5d85a93
4 changed files
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -10,9 +10,9 @@ data:
  min_input_len: 0.0
  max_input_len: 27.0 # second
  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
+  max_output_len: .inf
+  min_output_input_ratio: 0.00
+  max_output_input_ratio: .inf
  specgram_type: linear
  target_sample_rate: 16000
  max_freq: None
@@ -21,7 +21,7 @@ data:
  window_ms: 20.0
  delta_delta: False
  dither: 1.0
-  use_dB_normalization: True
+  use_dB_normalization: True 
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
@@ -41,7 +41,7 @@ training:
  lr: 1e-3
  lr_decay: 0.83
  weight_decay: 1e-06
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
  log_interval: 100

 decoding:

--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -17,12 +17,12 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/librispeech" \
    --full_download="True"
-    
+
    if [ $? -ne 0 ]; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi
-    
+
    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        mv data/manifest.${set} data/manifest.${set}.raw
    done
@@ -48,7 +48,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --count_threshold=0 \
    --vocab_path="data/vocab.txt" \
    --manifest_paths="data/manifest.train.raw"
-    
+
    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
@@ -61,16 +61,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
-    --num_samples=-1 \
+    --num_samples=2000 \
    --specgram_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
    --stride_ms=10.0 \
    --window_ms=20.0 \
-    --use_dB_normalization=False \
+    --use_dB_normalization=True \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
-    
+
    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1

--- a/examples/ngram_lm/README.md
+++ b/examples/ngram_lm/README.md
+# Ngram LM
+
+Train a Chinese character n-gram LM with [kenlm](https://github.com/kpu/kenlm).
+
+```
+bash run.sh
+```
--- a/examples/spm/README.md
+++ b/examples/spm/README.md
-# SPM demo
+# [SentencePiece Model](https://github.com/google/sentencepiece)
+
+Train a `spm` model for an English tokenizer.

 ```
 bash run.sh