PaddlePaddle / DeepSpeech
commit 4f54e362 (parent 7cddfd27)
Author: Hui Zhang
Date:   Nov 23, 2021

    vocab into data/lang_char

Showing 34 changed files with 85 additions and 61 deletions (+85 −61).
 examples/aishell/asr0/conf/deepspeech2.yaml              +1 −1
 examples/aishell/asr0/conf/deepspeech2_online.yaml       +1 −1
 examples/aishell/asr0/local/data.sh                      +6 −3
 examples/aishell/asr1/conf/chunk_conformer.yaml          +1 −1
 examples/aishell/asr1/conf/conformer.yaml                +1 −1
 examples/aishell/asr1/conf/transformer.yaml              +1 −1
 examples/aishell/asr1/local/data.sh                      +5 −3
 examples/callcenter/asr1/conf/chunk_conformer.yaml       +1 −1
 examples/callcenter/asr1/conf/conformer.yaml             +1 −1
 examples/callcenter/asr1/local/data.sh                   +4 −2
 examples/librispeech/asr0/conf/deepspeech2.yaml          +1 −1
 examples/librispeech/asr0/conf/deepspeech2_online.yaml   +1 −1
 examples/librispeech/asr0/local/data.sh                  +4 −2
 examples/librispeech/asr1/conf/chunk_conformer.yaml      +2 −2
 examples/librispeech/asr1/conf/conformer.yaml            +2 −2
 examples/librispeech/asr1/conf/transformer.yaml          +2 −2
 examples/librispeech/asr1/local/data.sh                  +5 −3
 examples/ted_en_zh/st0/conf/transformer.yaml             +2 −2
 examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml  +2 −2
 examples/ted_en_zh/st0/local/data.sh                     +7 −5
 examples/ted_en_zh/st1/conf/transformer.yaml             +2 −2
 examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml    +1 −1
 examples/ted_en_zh/st1/local/data.sh                     +5 −3
 examples/timit/asr1/conf/transformer.yaml                +1 −1
 examples/timit/asr1/local/data.sh                        +7 −3
 examples/tiny/asr0/conf/deepspeech2.yaml                 +1 −1
 examples/tiny/asr0/conf/deepspeech2_online.yaml          +1 −1
 examples/tiny/asr0/local/data.sh                         +5 −3
 examples/tiny/asr1/conf/chunk_confermer.yaml             +1 −1
 examples/tiny/asr1/conf/chunk_transformer.yaml           +1 −1
 examples/tiny/asr1/conf/conformer.yaml                   +1 −1
 examples/tiny/asr1/conf/transformer.yaml                 +1 −1
 examples/tiny/asr1/local/data.sh                         +7 −4
 examples/wenetspeech/asr1/conf/conformer.yaml            +1 −1
examples/aishell/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ collator:
   batch_size: 64 # one gpu
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ collator:
   batch_size: 64 # one gpu
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/aishell/asr0/local/data.sh
@@ -3,9 +3,12 @@
 stage=-1
 stop_stage=100
-source ${MAIN_ROOT}/utils/parse_options.sh
+dict_dir=data/lang_char
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -52,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type="char" \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"

     if [ $? -ne 0 ]; then
@@ -68,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type "char" \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${dataset}.raw" \
     --output_path="data/manifest.${dataset}"
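Every local/data.sh in this commit follows the same pattern as the script above: default dict_dir to data/lang_char, let parse_options.sh override it from the command line, then create the directory before writing vocab.txt into it. Below is a minimal, self-contained sketch of that flow; the while loop is a simplified stand-in for ${MAIN_ROOT}/utils/parse_options.sh, and the --dict_dir flag handling is an assumption for illustration only.

```shell
#!/bin/bash
# Default dictionary directory, matching the commit.
dict_dir=data/lang_char

# Simplified stand-in for ${MAIN_ROOT}/utils/parse_options.sh:
# map an optional --dict_dir <path> flag onto the variable above.
while [ $# -gt 0 ]; do
    case "$1" in
        --dict_dir) dict_dir="$2"; shift 2 ;;
        *) shift ;;
    esac
done

# The scripts create both the data dir and the new dictionary dir.
mkdir -p data
mkdir -p "${dict_dir}"

echo "vocab path: ${dict_dir}/vocab.txt"
```

Run without arguments this prints `vocab path: data/lang_char/vocab.txt`; passing `--dict_dir some/other/dir` relocates the dictionary artifacts without editing the script.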
examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml

examples/aishell/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml

examples/aishell/asr1/conf/transformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml

examples/aishell/asr1/local/data.sh
@@ -2,10 +2,12 @@
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char

-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -53,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type="char" \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths "data/manifest.train.raw"

     if [ $? -ne 0 ]; then
@@ -69,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type "char" \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${dataset}.raw" \
     --output_path="data/manifest.${dataset}"
examples/callcenter/asr1/conf/chunk_conformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml

examples/callcenter/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml

examples/callcenter/asr1/local/data.sh
@@ -2,10 +2,12 @@
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char

 source ${MAIN_ROOT}/utils/parse_options.sh

 mkdir -p data
+mkdir -p ${dict_dir}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     for dataset in train dev test; do
@@ -41,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type="char" \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths "data/manifest.train.raw"

     if [ $? -ne 0 ]; then
@@ -57,7 +59,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type "char" \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${dataset}.raw" \
     --output_path="data/manifest.${dataset}"
examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ collator:
   batch_size: 20
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ collator:
   batch_size: 15
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/librispeech/asr0/local/data.sh
@@ -4,10 +4,12 @@ stage=-1
 stop_stage=100
 unit_type=char
+dict_dir=data/lang_char

 source ${MAIN_ROOT}/utils/parse_options.sh

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -67,7 +69,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type ${unit_type} \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths="data/manifest.train.raw"

     if [ $? -ne 0 ]; then
@@ -83,7 +85,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type ${unit_type} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${set}.raw" \
     --output_path="data/manifest.${set}"
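Both hunks above sit inside the stage-gated blocks that all of these data.sh scripts share: a numbered block runs only when its number falls inside [stage, stop_stage]. A small self-contained sketch of that gating logic (the stage values 1 and 2 below are illustrative, not taken from any one recipe):

```shell
#!/bin/bash
# Same gate as `if [ ${stage} -le N ] && [ ${stop_stage} -ge N ]; then`
# in the scripts above: run only the stages inside the requested window.
stage=1
stop_stage=2

ran=""
for s in 0 1 2 3; do
    if [ ${stage} -le ${s} ] && [ ${stop_stage} -ge ${s} ]; then
        ran="${ran} ${s}"
    fi
done

echo "stages run:${ran}"
```

With stage=1 and stop_stage=2 this prints `stages run: 1 2`, which is why passing `--stage`/`--stop_stage` through parse_options.sh lets a user rerun just the vocabulary or formatting step.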
examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 100.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
   batch_size: 16

examples/librispeech/asr1/conf/conformer.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 100.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
   batch_size: 16

examples/librispeech/asr1/conf/transformer.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 100.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_5000'
+  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
   batch_size: 32

examples/librispeech/asr1/local/data.sh
@@ -2,11 +2,12 @@
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char

 # bpemode (unigram or bpe)
 nbpe=5000
 bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 stride_ms=10
 window_ms=25
@@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -79,7 +81,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --spm_vocab_size=${nbpe} \
     --spm_mode ${bpemode} \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths="data/manifest.train.raw"

     if [ $? -ne 0 ]; then
@@ -96,7 +98,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --cmvn_path "data/mean_std.json" \
     --unit_type "spm" \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${sub}.raw" \
     --output_path="data/manifest.${sub}"
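In the BPE-based recipes the SentencePiece model prefix moves with the same variable, so the prefix string is derived from dict_dir rather than hard-coded. Reproducing the derivation with the librispeech asr1 values above:

```shell
#!/bin/bash
# Dictionary directory introduced by this commit.
dict_dir=data/lang_char

# bpemode (unigram or bpe) and vocab size, values from the script above.
nbpe=5000
bpemode=unigram

# Prefix now lives under ${dict_dir} instead of directly under data/.
bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
echo "${bpeprefix}"
```

This prints `data/lang_char/bpe_unigram_5000`, matching the updated spm_model_prefix in the three YAML files above, so the trained SentencePiece `.model`/`.vocab` files land under data/lang_char alongside vocab.txt.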
examples/ted_en_zh/st0/conf/transformer.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 20.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: data/bpe_unigram_8000
+  spm_model_prefix: data/lang_char/bpe_unigram_8000
   mean_std_filepath: ""
   # augmentation_config: conf/augmentation.json
   batch_size: 10

examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 20.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: data/bpe_unigram_8000
+  spm_model_prefix: data/lang_char/bpe_unigram_8000
   mean_std_filepath: ""
   # augmentation_config: conf/augmentation.json
   batch_size: 10

examples/ted_en_zh/st0/local/data.sh
@@ -4,19 +4,22 @@ set -e
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char
+
 # bpemode (unigram or bpe)
 nbpe=8000
 bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 data_dir=./TED-En-Zh

-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
 mkdir -p data
+mkdir -p ${dict_dir}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@@ -73,11 +76,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --spm_vocab_size=${nbpe} \
     --spm_mode ${bpemode} \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --text_keys 'text' 'text1' \
     --manifest_paths="data/manifest.train.raw"

     if [ $? -ne 0 ]; then
         echo "Build vocabulary failed. Terminated."
         exit 1
@@ -92,7 +94,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --cmvn_path "data/mean_std.json" \
     --unit_type "spm" \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${set}.raw" \
     --output_path="data/manifest.${set}"
examples/ted_en_zh/st1/conf/transformer.yaml
@@ -11,9 +11,9 @@ data:
   max_output_input_ratio: 20.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
-  spm_model_prefix: data/bpe_unigram_8000
+  spm_model_prefix: data/lang_char/bpe_unigram_8000
   mean_std_filepath: ""
   # augmentation_config: conf/augmentation.json
   batch_size: 10

examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -11,7 +11,7 @@ data:
   max_output_input_ratio: 20.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
   mean_std_filepath: ""

examples/ted_en_zh/st1/local/data.sh
@@ -4,11 +4,12 @@ set -e
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char

 # bpemode (unigram or bpe)
 nbpe=8000
 bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 data_dir=./TED_EnZh
@@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
 mkdir -p data
+mkdir -p ${dict_dir}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@@ -73,7 +75,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --spm_vocab_size=${nbpe} \
     --spm_mode ${bpemode} \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --text_keys 'text' 'text1' \
     --manifest_paths="data/manifest.train.raw"
@@ -93,7 +95,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --cmvn_path "data/mean_std.json" \
     --unit_type "spm" \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${set}.raw" \
     --output_path="data/manifest.${set}"
examples/timit/asr1/conf/transformer.yaml
@@ -11,7 +11,7 @@ data:
   max_output_input_ratio: 1000.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: "word"
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml

examples/timit/asr1/local/data.sh
@@ -3,15 +3,19 @@
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char
 unit_type=word
 TIMIT_path=

-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     # download data, generate manifests
     python3 ${TARGET_DIR}/timit/timit_kaldi_standard_split.py \
@@ -52,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type ${unit_type} \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths="data/manifest.train.raw"

     if [ $? -ne 0 ]; then
@@ -68,7 +72,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type ${unit_type} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.${set}.raw" \
     --output_path="data/manifest.${set}"
examples/tiny/asr0/conf/deepspeech2.yaml
@@ -14,7 +14,7 @@ data:
 collator:
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/tiny/asr0/conf/deepspeech2_online.yaml
@@ -14,7 +14,7 @@ data:
 collator:
   mean_std_filepath: data/mean_std.json
   unit_type: char
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:

examples/tiny/asr0/local/data.sh
@@ -4,10 +4,12 @@ stage=-1
 stop_stage=100
 unit_type=char
+dict_dir=data/lang_char

-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -51,7 +53,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/build_vocab.py \
     --unit_type ${unit_type} \
     --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths="data/manifest.tiny.raw"

     if [ $? -ne 0 ]; then
@@ -65,7 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${MAIN_ROOT}/utils/format_data.py \
     --cmvn_path "data/mean_std.json" \
     --unit_type ${unit_type} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.tiny.raw" \
     --output_path="data/manifest.tiny"
examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
   mean_std_filepath: ""
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml

examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
   mean_std_filepath: ""
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml

examples/tiny/asr1/conf/conformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
   mean_std_filepath: ""
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml

examples/tiny/asr1/conf/transformer.yaml
@@ -12,7 +12,7 @@ data:
 collator:
   mean_std_filepath: data/mean_std.json
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_200'
   augmentation_config: conf/preprocess.yaml

examples/tiny/asr1/local/data.sh
@@ -3,14 +3,17 @@
 stage=-1
 stop_stage=100
+dict_dir=data/lang_char
+
 # bpemode (unigram or bpe)
 nbpe=200
 bpemode=unigram
-bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"

-source ${MAIN_ROOT}/utils/parse_options.sh
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

 mkdir -p data
+mkdir -p ${dict_dir}
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
@@ -57,7 +60,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     --spm_vocab_size=${nbpe} \
     --spm_mode ${bpemode} \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_paths="data/manifest.tiny.raw"

     if [ $? -ne 0 ]; then
@@ -72,7 +75,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --cmvn_path "data/mean_std.json" \
     --unit_type "spm" \
     --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
+    --vocab_path="${dict_dir}/vocab.txt" \
     --manifest_path="data/manifest.tiny.raw" \
     --output_path="data/manifest.tiny"
examples/wenetspeech/asr1/conf/conformer.yaml
@@ -51,7 +51,7 @@ data:
   max_output_input_ratio: 10.0
 collator:
-  vocab_filepath: data/vocab.txt
+  vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'char'
   spm_model_prefix: ''
   augmentation_config: conf/preprocess.yaml