Commit f2120bb5 authored by Yibing Liu

adapt to the new structure

manifest*
mean_std.npz
thirdparty/
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
@@ -19,8 +19,6 @@ import json
 import codecs
 from paddle.v2.dataset.common import md5file
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
 URL_ROOT = "http://www.openslr.org/resources/12"
 URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
 URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     "--target_dir",
-    default=DATA_HOME + "/Libri",
+    default='~/.cache/paddle/dataset/speech/libri',
     type=str,
     help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@@ -60,8 +58,7 @@ args = parser.parse_args()
 def download(url, md5sum, target_dir):
-    """
-    Download file from url to target_dir, and check md5sum.
+    """Download file from url to target_dir, and check md5sum.
     """
     if not os.path.exists(target_dir): os.makedirs(target_dir)
     filepath = os.path.join(target_dir, url.split("/")[-1])
@@ -77,8 +74,7 @@ def download(url, md5sum, target_dir):
 def unpack(filepath, target_dir):
-    """
-    Unpack the file to the target_dir.
+    """Unpack the file to the target_dir.
     """
     print("Unpacking %s ..." % filepath)
     tar = tarfile.open(filepath)
@@ -87,8 +83,7 @@ def unpack(filepath, target_dir):
 def create_manifest(data_dir, manifest_path):
-    """
-    Create a manifest json file summarizing the data set, with each line
+    """Create a manifest json file summarizing the data set, with each line
     containing the meta data (i.e. audio filepath, transcription text, audio
     duration) of each audio file within the data set.
     """
@@ -119,8 +114,7 @@ def create_manifest(data_dir, manifest_path):
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """
-    Download, unpack and create summmary manifest file.
+    """Download, unpack and create summmary manifest file.
     """
     if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
         # download
@@ -135,6 +129,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
 def main():
+    args.target_dir = os.path.expanduser(args.target_dir)
+
     prepare_dataset(
         url=URL_TEST_CLEAN,
         md5sum=MD5_TEST_CLEAN,
@@ -145,12 +141,12 @@ def main():
         md5sum=MD5_DEV_CLEAN,
         target_dir=os.path.join(args.target_dir, "dev-clean"),
         manifest_path=args.manifest_prefix + ".dev-clean")
-    prepare_dataset(
-        url=URL_TRAIN_CLEAN_100,
-        md5sum=MD5_TRAIN_CLEAN_100,
-        target_dir=os.path.join(args.target_dir, "train-clean-100"),
-        manifest_path=args.manifest_prefix + ".train-clean-100")
     if args.full_download:
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_100,
+            md5sum=MD5_TRAIN_CLEAN_100,
+            target_dir=os.path.join(args.target_dir, "train-clean-100"),
+            manifest_path=args.manifest_prefix + ".train-clean-100")
         prepare_dataset(
             url=URL_TEST_OTHER,
             md5sum=MD5_TEST_OTHER,
......
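
Note: the two changes above work together. The argparse default now keeps a literal '~', which the shell does not expand when the flag is passed as a quoted string, so main() expands it explicitly before any path is joined. A minimal, self-contained sketch of the pattern:

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument(
    "--target_dir",
    default='~/.cache/paddle/dataset/speech/libri',
    type=str)
args = parser.parse_args()

# Expand '~' once, up front, so every later os.path.join sees a real path.
args.target_dir = os.path.expanduser(args.target_dir)
print(args.target_dir)
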
@@ -7,8 +7,8 @@
 #include <map>
 #include <utility>
-#include "fst/fstlib.h"
 #include "ThreadPool.h"
+#include "fst/fstlib.h"
 #include "decoder_utils.h"
 #include "path_trie.h"
......
 #ifndef PATH_TRIE_H
 #define PATH_TRIE_H
 #pragma once
-#include <fst/fstlib.h>
 #include <algorithm>
 #include <limits>
 #include <memory>
 #include <utility>
 #include <vector>
+#include <fst/fstlib.h>
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
......
@@ -11,7 +11,7 @@ import wave
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.utils import read_manifest
 from utils.utility import add_arguments, print_arguments
@@ -46,7 +46,7 @@ add_arg('vocab_path', str,
         'data/librispeech/eng_vocab.txt',
         "Filepath of vocabulary.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path', str,
......
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
 # download data, generate manifests
 python data/librispeech/librispeech.py \
 --manifest_prefix='data/librispeech/manifest' \
---full_download='True' \
---target_dir=$HOME'/.cache/paddle/dataset/speech/Libri'
+--target_dir='~/.cache/paddle/dataset/speech/Libri' \
+--full_download='True'
 if [ $? -ne 0 ]; then
     echo "Prepare LibriSpeech failed. Terminated."
     exit 1
 fi
-#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
+cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
+# build vocabulary
+python tools/build_vocab.py \
+--count_threshold=0 \
+--vocab_path='data/librispeech/vocab.txt' \
+--manifest_paths='data/librispeech/manifest.train'
+if [ $? -ne 0 ]; then
+    echo "Build vocabulary failed. Terminated."
+    exit 1
+fi
 # compute mean and stddev for normalizer
@@ -30,3 +42,4 @@ fi
 echo "LibriSpeech Data preparation done."
+exit 0
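
Note: the new build-vocabulary step reads the combined training manifest produced by the cat/shuf line above. Per create_manifest in librispeech.py, each manifest line is a JSON object carrying the audio filepath, transcription text, and audio duration; the exact key names in this sketch are assumptions for illustration, not taken from the diff:

import json

# One hypothetical manifest line (key names assumed).
line = ('{"audio_filepath": "dev-clean/84/121123/84-121123-0000.flac", '
        '"duration": 6.95, "text": "go do you hear"}')
sample = json.loads(line)
print(sample["text"], sample["duration"])
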
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# infer
 CUDA_VISIBLE_DEVICES=0 \
 python -u infer.py \
 --num_samples=10 \
 --trainer_count=1 \
 --beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -18,11 +27,19 @@ python -u infer.py \
 --use_gru=False \
 --use_gpu=True \
 --share_rnn_weights=True \
---infer_manifest='data/librispeech/manifest.dev-clean' \
+--infer_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in inference!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# download language model
+pushd models/lm > /dev/null
+sh download_lm_en.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+popd > /dev/null
+# evaluate model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u test.py \
 --batch_size=128 \
 --trainer_count=8 \
 --beam_size=500 \
---num_proc_bsearch=12 \
---num_proc_data=12 \
+--num_proc_bsearch=8 \
+--num_proc_data=4 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
 --rnn_layer_size=2048 \
@@ -20,9 +30,17 @@ python -u test.py \
 --share_rnn_weights=True \
 --test_manifest='data/librispeech/manifest.test-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --decoding_method='ctc_beam_search' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# train model
+# if you wish to resume from an exists model, uncomment --init_model_path
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u train.py \
---batch_size=256 \
+--batch_size=512 \
 --trainer_count=8 \
---num_passes=200 \
+--num_passes=50 \
 --num_proc_data=12 \
 --num_conv_layers=2 \
 --num_rnn_layers=3 \
@@ -23,8 +25,16 @@ python -u train.py \
 --train_manifest='data/librispeech/manifest.train' \
 --dev_manifest='data/librispeech/manifest.dev' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---output_model_dir='./checkpoints' \
+--vocab_path='data/librispeech/vocab.txt' \
+--output_model_dir='./checkpoints/libri' \
 --augment_conf_path='conf/augmentation.config' \
 --specgram_type='linear' \
 --shuffle_method='batch_shuffle_clipped'
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+exit 0
 #! /usr/bin/bash
-pushd ../..
+pushd ../.. > /dev/null
+# grid-search for hyper-parameters in language model
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
 python -u tools/tune.py \
 --num_samples=100 \
@@ -23,8 +24,16 @@ python -u tools/tune.py \
 --share_rnn_weights=True \
 --tune_manifest='data/librispeech/manifest.dev-clean' \
 --mean_std_path='data/librispeech/mean_std.npz' \
---vocab_path='data/librispeech/eng_vocab.txt' \
---model_path='checkpoints/params.latest.tar.gz' \
---lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
+--vocab_path='data/librispeech/vocab.txt' \
+--model_path='checkpoints/libri/params.latest.tar.gz' \
+--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
 --error_rate_type='wer' \
 --specgram_type='linear'
+if [ $? -ne 0 ]; then
+    echo "Failed in tuning!"
+    exit 1
+fi
+exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# start demo client
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_client.py \
--host_ip='localhost' \
--host_port=8086
if [ $? -ne 0 ]; then
echo "Failed in starting demo client!"
exit 1
fi
exit 0
#! /usr/bin/bash
# TODO: replace the model with a mandarin model
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# start demo server
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_server.py \
--host_ip='localhost' \
--host_port=8086 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--speech_save_dir='demo_cache' \
--warmup_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in starting demo server!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# prepare folder
if [ ! -e data/tiny ]; then
mkdir data/tiny
fi
# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/tiny/manifest' \
--target_dir='~/.cache/paddle/dataset/speech/libri' \
--full_download='False'
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
head -n 64 data/tiny/manifest.dev-clean > data/tiny/manifest.tiny
# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/tiny/vocab.txt' \
--manifest_paths='data/tiny/manifest.tiny'
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/tiny/manifest.tiny' \
--num_samples=64 \
--specgram_type='linear' \
--output_path='data/tiny/mean_std.npz'
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
echo "Tiny data preparation done."
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/tiny/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=16 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/params.pass-19.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# download well-trained model
pushd models/librispeech > /dev/null
sh download_model.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=8 \
--num_proc_data=4 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# train model
# if you wish to resume from an exists model, uncomment --init_model_path
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -u train.py \
--batch_size=16 \
--trainer_count=4 \
--num_passes=20 \
--num_proc_data=1 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=1e-5 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/tiny/manifest.tiny' \
--dev_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--output_model_dir='./checkpoints/tiny' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
if [ $? -ne 0 ]; then
echo "Fail to do inference!"
exit 1
fi
exit 0
#! /usr/bin/bash
pushd ../.. > /dev/null
# grid-search for hyper-parameters in language model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/tiny/manifest.tiny' \
--mean_std_path='data/tiny/mean_std.npz' \
--vocab_path='data/tiny/vocab.txt' \
--model_path='checkpoints/params.pass-9.tar.gz' \
--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0
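
Note: both tune scripts grid-search the language-model weight (alpha) and the word-insertion weight (beta) for the CTC beam-search decoder. Assuming the *_from/*_to flags are endpoints of an evenly spaced grid, the search space can be enumerated like this:

import numpy as np

# Assumed: linear spacing between the endpoints passed to tools/tune.py.
alphas = np.linspace(0.1, 0.36, 14)   # --alpha_from, --alpha_to, --num_alphas
betas = np.linspace(0.05, 1.0, 20)    # --beta_from, --beta_to, --num_betas
grid = [(a, b) for a in alphas for b in betas]
print(len(grid))  # 280 (alpha, beta) pairs, each decoded and scored by WER
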
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -35,13 +35,13 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('decoding_method', str,
......
echo "Downloading language model ..."
mkdir data
LM=common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data
echo "Checking md5sum ..."
md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`
if [ $MD5 != $md5_tmp ]; then
echo "Fail to download the language model!"
exit 1
fi
@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq,
             prob = prob * ext_scoring_func(result)
             log_prob = log(prob)
             beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
     ## output top beam_size decoding results
     beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
......
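
Note: the two added lines handle beam candidates that yield no scored result. Appending (float('-inf'), '') keeps beam_result aligned with the beam width, and the existing sort pushes the empty placeholders to the bottom. A toy illustration of why -inf is the right sentinel here:

from math import log

beam_result = [(log(0.2), 'hello'), (float('-inf'), ''), (log(0.5), 'hallo')]
# Same sort as in ctc_beam_search_decoder: best log-probability first.
beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
print(beam_result)  # the (-inf, '') entry sorts last
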
@@ -8,10 +8,10 @@ import os
 import time
 import gzip
 import paddle.v2 as paddle
-from models.swig_decoders_wrapper import Scorer
-from models.swig_decoders_wrapper import ctc_greedy_decoder
-from models.swig_decoders_wrapper import ctc_beam_search_decoder_batch
-from models.network import deep_speech_v2_network
+from decoders.swig_wrapper import Scorer
+from decoders.swig_wrapper import ctc_greedy_decoder
+from decoders.swig_wrapper import ctc_beam_search_decoder_batch
+from model_utils.network import deep_speech_v2_network
 class DeepSpeech2Model(object):
......
@@ -4,7 +4,7 @@ from __future__ import division
 from __future__ import print_function
 import unittest
-from models import decoder
+from model_utils import decoder
 class TestDecoders(unittest.TestCase):
......
#! /usr/bin/bash
source ../../utils/utility.sh
# TODO: add urls
URL='to-be-added'
MD5=5b4af224b26c1dc4dd972b7d32f2f52a
TARGET=./librispeech_model.tar.gz
echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET
exit 0
#! /usr/bin/bash
source ../../utils/utility.sh
URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=./common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0
@@ -7,7 +7,7 @@ import argparse
 import functools
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer, cer
 from utils.utility import add_arguments, print_arguments
@@ -36,14 +36,14 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('decoding_method', str,
         'ctc_beam_search',
......
@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
 add_arg('vocab_path', str,
-        'datasets/vocab/zh_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath to write the vocabulary.")
 add_arg('manifest_paths', str,
         None,
@@ -34,7 +34,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
-    manifest_jsons = utils.read_manifest(manifest_path)
+    manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
         for char in line_json['text']:
             counter.update(char)
......
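
Note: count_manifest tallies per-character frequencies across all transcriptions, and characters above count_threshold become the vocabulary. A self-contained sketch of that logic (read_manifest is stubbed with literal dicts; the alphabetical ordering is purely for illustration):

from collections import Counter

manifest_jsons = [{"text": "hello"}, {"text": "world"}]  # stand-in for read_manifest()

counter = Counter()
for line_json in manifest_jsons:
    for char in line_json['text']:
        counter.update(char)

count_threshold = 0
vocab = sorted(ch for ch, cnt in counter.items() if cnt > count_threshold)
print(vocab)  # ['d', 'e', 'h', 'l', 'o', 'r', 'w']
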
@@ -20,10 +20,10 @@ add_arg('specgram_type', str,
         "Audio feature type. Options: linear, mfcc.",
         choices=['linear', 'mfcc'])
 add_arg('manifest_path', str,
-        'datasets/manifest.train',
+        'data/librispeech/manifest.train',
         "Filepath of manifest to compute normalizer's mean and stddev.")
 add_arg('output_path', str,
-        'mean_std.npz',
+        'data/librispeech/mean_std.npz',
         "Filepath of write mean and stddev to (.npz).")
 # yapf: disable
 args = parser.parse_args()
......
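
Note: the normalizer statistics written here are a per-frequency-bin mean and standard deviation over a sample of spectrograms. A minimal sketch of the idea, assuming (bins, frames) features and the array names mean/std inside the .npz (both are assumptions, not taken from the diff):

import numpy as np

# 64 fake linear spectrograms, each (161 freq bins, 300 frames).
features = [np.random.rand(161, 300) for _ in range(64)]
stacked = np.concatenate(features, axis=1)  # pool frames across samples

mean = np.mean(stacked, axis=1)
std = np.std(stacked, axis=1)
np.savez('data/librispeech/mean_std.npz', mean=mean, std=std)
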
@@ -9,7 +9,7 @@ import functools
 import paddle.v2 as paddle
 import _init_paths
 from data_utils.data import DataGenerator
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from utils.error_rate import wer
 from utils.utility import add_arguments, print_arguments
@@ -41,13 +41,13 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('lang_model_path', str,
-        'lm/data/common_crawl_00.prune01111.trie.klm',
+        'models/lm/common_crawl_00.prune01111.trie.klm',
         "Filepath for language model.")
 add_arg('model_path', str,
-        './checkpoints/params.latest.tar.gz',
+        './checkpoints/libri/params.latest.tar.gz',
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('error_rate_type', str,
......
@@ -6,7 +6,7 @@ from __future__ import print_function
 import argparse
 import functools
 import paddle.v2 as paddle
-from models.model import DeepSpeech2Model
+from model_utils.model import DeepSpeech2Model
 from data_utils.data import DataGenerator
 from utils.utility import add_arguments, print_arguments
@@ -41,14 +41,14 @@ add_arg('mean_std_path', str,
         'data/librispeech/mean_std.npz',
         "Filepath of normalizer's mean & std.")
 add_arg('vocab_path', str,
-        'data/librispeech/eng_vocab.txt',
+        'data/librispeech/vocab.txt',
         "Filepath of vocabulary.")
 add_arg('init_model_path', str,
         None,
         "If None, the training starts from scratch, "
         "otherwise, it resumes from the pre-trained model.")
 add_arg('output_model_dir', str,
-        "./checkpoints",
+        "./checkpoints/libri",
         "Directory for saving checkpoints.")
 add_arg('augment_conf_path',str,
         'conf/augmentation.config',
......
download() {
URL=$1
MD5=$2
TARGET=$3
if [ -e $TARGET ]; then
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
if [ $MD5 == $md5_result ]; then
echo "$TARGET already exists, download skipped."
return 0
fi
fi
wget -c $URL -P `dirname "$TARGET"`
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
# md5 sums are strings, so compare with !=; the integer test -ne fails here
if [ $MD5 != $md5_result ]; then
echo "Fail to download $TARGET!"
return 1
fi
}
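
Note: download() skips the fetch when the target already carries the expected md5 and re-verifies the checksum afterwards. For reference, the same check-then-download logic as a small Python 3 sketch (URL and paths are placeholders, not from this commit):

import hashlib
import os
import urllib.request

def md5_of(path):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

def download(url, md5sum, target):
    # Skip the fetch when an intact copy is already on disk.
    if os.path.exists(target) and md5_of(target) == md5sum:
        print("%s already exists, download skipped." % target)
        return
    urllib.request.urlretrieve(url, target)
    if md5_of(target) != md5sum:
        raise IOError("Fail to download %s!" % target)
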