#!/bin/bash
#
# LibriSpeech data preparation.
#
# Stages (selected with the stage / stop_stage variables below):
#   -1  download the corpus and generate the raw manifests
#    0  compute mean and stddev for the normalizer
#    1  build the vocabulary
#    2  format the manifests with token ids and vocab size
#
# Required environment:
#   MAIN_ROOT  root directory that contains utils/ and dataset/

# Default values. They are assigned before parse_options.sh is sourced;
# presumably that script lets command-line options override them
# (confirm against utils/parse_options.sh).
stage=-1
stop_stage=100

unit_type=char
dict_dir=data/lang_char

# Abort early when MAIN_ROOT is missing: otherwise the paths below would
# silently resolve to /utils/parse_options.sh and /dataset.
: "${MAIN_ROOT:?MAIN_ROOT is not set}"

source "${MAIN_ROOT}/utils/parse_options.sh"

mkdir -p data
mkdir -p "${dict_dir}"
TARGET_DIR="${MAIN_ROOT}/dataset"
mkdir -p "${TARGET_DIR}"

# Stage -1: download LibriSpeech, generate one manifest per subset, then
# merge the per-subset manifests into train/dev/test raw manifests.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    # download data, generate manifests
    if ! python3 "${TARGET_DIR}/librispeech/librispeech.py" \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/librispeech" \
        --full_download="True"; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi

    # Rename every generated manifest to *.raw. A missing manifest is
    # treated as fatal so the merged manifests below are never incomplete.
    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        if ! mv "data/manifest.${set}" "data/manifest.${set}.raw"; then
            echo "Rename data/manifest.${set} failed. Terminated."
            exit 1
        fi
    done

    # Start from a clean slate so reruns do not keep stale merged manifests.
    rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw

    # Concatenate the subsets in a fixed order into the merged manifests.
    if ! cat data/manifest.train-clean-100.raw \
             data/manifest.train-clean-360.raw \
             data/manifest.train-other-500.raw > data/manifest.train.raw; then
        echo "Merge data/manifest.train.raw failed. Terminated."
        exit 1
    fi

    if ! cat data/manifest.dev-clean.raw \
             data/manifest.dev-other.raw > data/manifest.dev.raw; then
        echo "Merge data/manifest.dev.raw failed. Terminated."
        exit 1
    fi

    if ! cat data/manifest.test-clean.raw \
             data/manifest.test-other.raw > data/manifest.test.raw; then
        echo "Merge data/manifest.test.raw failed. Terminated."
        exit 1
    fi
fi

# Stage 0: compute the normalizer statistics from the merged training
# manifest and write them to data/mean_std.json.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # compute mean and stddev for normalizer
    # Use one worker per available processor; fall back to a single worker
    # when nproc is missing or fails, instead of passing an empty value.
    num_workers=$(nproc 2>/dev/null) || num_workers=1
    if ! python3 "${MAIN_ROOT}/utils/compute_mean_std.py" \
        --manifest_path="data/manifest.train.raw" \
        --num_samples=2000 \
        --spectrum_type="linear" \
        --delta_delta=false \
        --sample_rate=16000 \
        --stride_ms=10 \
        --window_ms=20 \
        --use_dB_normalization=True \
        --num_workers="${num_workers}" \
        --output_path="data/mean_std.json"; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

# Stage 1: build the vocabulary from the merged training manifest and
# write it to ${dict_dir}/vocab.txt.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # build vocabulary
    if ! python3 "${MAIN_ROOT}/utils/build_vocab.py" \
        --unit_type "${unit_type}" \
        --count_threshold=0 \
        --vocab_path="${dict_dir}/vocab.txt" \
        --manifest_paths="data/manifest.train.raw"; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi
#######################################
# Wait for every background job given and report whether all succeeded.
# Arguments: PIDs of background jobs started by this shell
# Returns:   0 if every job exited with status 0 (or no PID was given),
#            1 if at least one job exited non-zero
#######################################
wait_for_jobs() {
    local pid status=0
    for pid in "$@"; do
        # Keep waiting for the remaining jobs even after one has failed,
        # so that no job is left running unattended.
        wait "${pid}" || status=1
    done
    return "${status}"
}

# Stage 2: format every raw manifest in parallel, one background job per
# manifest.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # format manifest with tokenids, vocab size
    format_pids=()
    for set in train dev test dev-clean dev-other test-clean test-other; do
    {
        if ! python3 "${MAIN_ROOT}/utils/format_data.py" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "${unit_type}" \
            --vocab_path="${dict_dir}/vocab.txt" \
            --manifest_path="data/manifest.${set}.raw" \
            --output_path="data/manifest.${set}"; then
            echo "Format manifest.${set} failed. Terminated."
            # This only ends the background job; the failure reaches the
            # main script through wait_for_jobs below.
            exit 1
        fi
    } &
    format_pids+=("$!")
    done

    # A bare `wait` returns 0 regardless of job results, so each job is
    # waited for by PID and any failure aborts the script.
    if ! wait_for_jobs "${format_pids[@]}"; then
        exit 1
    fi
fi

# All requested stages completed: report success and leave with status 0.
printf '%s\n' "LibriSpeech Data preparation done."
exit 0