提交 c4bc8228 编写于 作者: Y Yibing Liu

adapt to the new structure

...@@ -17,7 +17,7 @@ addons: ...@@ -17,7 +17,7 @@ addons:
- python-pip - python-pip
- python2.7-dev - python2.7-dev
before_install: before_install:
- pip install -U virtualenv pre-commit pip - sudo pip install -U virtualenv pre-commit pip
- docker pull paddlepaddle/paddle:latest - docker pull paddlepaddle/paddle:latest
script: script:
- .travis/precommit.sh - .travis/precommit.sh
......
manifest*
mean_std.npz
thirdparty/
此差异已折叠。
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
...@@ -19,8 +19,6 @@ import json ...@@ -19,8 +19,6 @@ import json
import codecs import codecs
from paddle.v2.dataset.common import md5file from paddle.v2.dataset.common import md5file
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = "http://www.openslr.org/resources/12" URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
...@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" ...@@ -41,7 +39,7 @@ MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
"--target_dir", "--target_dir",
default=DATA_HOME + "/Libri", default='~/.cache/paddle/dataset/speech/libri',
type=str, type=str,
help="Directory to save the dataset. (default: %(default)s)") help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument( parser.add_argument(
...@@ -60,8 +58,7 @@ args = parser.parse_args() ...@@ -60,8 +58,7 @@ args = parser.parse_args()
def download(url, md5sum, target_dir): def download(url, md5sum, target_dir):
""" """Download file from url to target_dir, and check md5sum.
Download file from url to target_dir, and check md5sum.
""" """
if not os.path.exists(target_dir): os.makedirs(target_dir) if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1]) filepath = os.path.join(target_dir, url.split("/")[-1])
...@@ -77,8 +74,7 @@ def download(url, md5sum, target_dir): ...@@ -77,8 +74,7 @@ def download(url, md5sum, target_dir):
def unpack(filepath, target_dir): def unpack(filepath, target_dir):
""" """Unpack the file to the target_dir.
Unpack the file to the target_dir.
""" """
print("Unpacking %s ..." % filepath) print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath) tar = tarfile.open(filepath)
...@@ -87,8 +83,7 @@ def unpack(filepath, target_dir): ...@@ -87,8 +83,7 @@ def unpack(filepath, target_dir):
def create_manifest(data_dir, manifest_path): def create_manifest(data_dir, manifest_path):
""" """Create a manifest json file summarizing the data set, with each line
Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set. duration) of each audio file within the data set.
""" """
...@@ -119,8 +114,7 @@ def create_manifest(data_dir, manifest_path): ...@@ -119,8 +114,7 @@ def create_manifest(data_dir, manifest_path):
def prepare_dataset(url, md5sum, target_dir, manifest_path): def prepare_dataset(url, md5sum, target_dir, manifest_path):
""" """Download, unpack and create summmary manifest file.
Download, unpack and create summmary manifest file.
""" """
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download # download
...@@ -135,6 +129,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path): ...@@ -135,6 +129,8 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
def main(): def main():
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset( prepare_dataset(
url=URL_TEST_CLEAN, url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN, md5sum=MD5_TEST_CLEAN,
...@@ -145,12 +141,12 @@ def main(): ...@@ -145,12 +141,12 @@ def main():
md5sum=MD5_DEV_CLEAN, md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"), target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean") manifest_path=args.manifest_prefix + ".dev-clean")
if args.full_download:
prepare_dataset( prepare_dataset(
url=URL_TRAIN_CLEAN_100, url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100, md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"), target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100") manifest_path=args.manifest_prefix + ".train-clean-100")
if args.full_download:
prepare_dataset( prepare_dataset(
url=URL_TEST_OTHER, url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER, md5sum=MD5_TEST_OTHER,
......
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
#include <map> #include <map>
#include <utility> #include <utility>
#include "fst/fstlib.h"
#include "ThreadPool.h" #include "ThreadPool.h"
#include "fst/fstlib.h"
#include "decoder_utils.h" #include "decoder_utils.h"
#include "path_trie.h" #include "path_trie.h"
......
#ifndef PATH_TRIE_H #ifndef PATH_TRIE_H
#define PATH_TRIE_H #define PATH_TRIE_H
#pragma once #pragma once
#include <fst/fstlib.h>
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <fst/fstlib.h>
using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>; using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
......
...@@ -11,7 +11,7 @@ import wave ...@@ -11,7 +11,7 @@ import wave
import paddle.v2 as paddle import paddle.v2 as paddle
import _init_paths import _init_paths
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model from model_utils.model import DeepSpeech2Model
from data_utils.utils import read_manifest from data_utils.utils import read_manifest
from utils.utility import add_arguments, print_arguments from utils.utility import add_arguments, print_arguments
...@@ -46,7 +46,7 @@ add_arg('vocab_path', str, ...@@ -46,7 +46,7 @@ add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt', 'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.") "Filepath of vocabulary.")
add_arg('model_path', str, add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz', './checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, " "If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.") "otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path', str, add_arg('lang_model_path', str,
......
#! /usr/bin/bash #! /usr/bin/bash
pushd ../.. pushd ../.. > /dev/null
# download data, generate manifests # download data, generate manifests
python data/librispeech/librispeech.py \ python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \ --manifest_prefix='data/librispeech/manifest' \
--full_download='True' \ --target_dir='~/.cache/paddle/dataset/speech/Libri' \
--target_dir=$HOME'/.cache/paddle/dataset/speech/Libri' --full_download='True'
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated." echo "Prepare LibriSpeech failed. Terminated."
exit 1 exit 1
fi fi
#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train
# build vocabulary
python tools/build_vocab.py \
--count_threshold=0 \
--vocab_path='data/librispeech/vocab.txt' \
--manifest_paths='data/librispeech/manifest.train'
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
# compute mean and stddev for normalizer # compute mean and stddev for normalizer
...@@ -30,3 +42,4 @@ fi ...@@ -30,3 +42,4 @@ fi
echo "LibriSpeech Data preparation done." echo "LibriSpeech Data preparation done."
exit 0
#! /usr/bin/bash #! /usr/bin/bash
pushd ../.. pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# infer
CUDA_VISIBLE_DEVICES=0 \ CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \ python -u infer.py \
--num_samples=10 \ --num_samples=10 \
--trainer_count=1 \ --trainer_count=1 \
--beam_size=500 \ --beam_size=500 \
--num_proc_bsearch=12 \ --num_proc_bsearch=8 \
--num_proc_data=12 \
--num_conv_layers=2 \ --num_conv_layers=2 \
--num_rnn_layers=3 \ --num_rnn_layers=3 \
--rnn_layer_size=2048 \ --rnn_layer_size=2048 \
...@@ -18,11 +27,19 @@ python -u infer.py \ ...@@ -18,11 +27,19 @@ python -u infer.py \
--use_gru=False \ --use_gru=False \
--use_gpu=True \ --use_gpu=True \
--share_rnn_weights=True \ --share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \ --infer_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \ --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \ --vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \ --model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \ --decoding_method='ctc_beam_search' \
--error_rate_type='wer' \ --error_rate_type='wer' \
--specgram_type='linear' --specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in inference!"
exit 1
fi
exit 0
#! /usr/bin/bash

# Run inference with the downloaded LibriSpeech model and language model.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# download well-trained model (abort with status 1 when the helper fails)
pushd models/librispeech > /dev/null
sh download_model.sh || exit 1
popd > /dev/null

# infer, with only CUDA device 0 visible to the process
if ! CUDA_VISIBLE_DEVICES=0 python -u infer.py \
        --num_samples=10 \
        --trainer_count=1 \
        --beam_size=500 \
        --num_proc_bsearch=8 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=2.15 \
        --beta=0.35 \
        --cutoff_prob=1.0 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --infer_manifest='data/librispeech/manifest.test-clean' \
        --mean_std_path='models/librispeech/mean_std.npz' \
        --vocab_path='models/librispeech/vocab.txt' \
        --model_path='models/librispeech/params.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --error_rate_type='wer' \
        --specgram_type='linear'; then
    echo "Failed in inference!"
    exit 1
fi

exit 0
#! /usr/bin/bash #! /usr/bin/bash
pushd ../.. pushd ../.. > /dev/null
# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
exit 1
fi
popd > /dev/null
# evaluate model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \ python -u test.py \
--batch_size=128 \ --batch_size=128 \
--trainer_count=8 \ --trainer_count=8 \
--beam_size=500 \ --beam_size=500 \
--num_proc_bsearch=12 \ --num_proc_bsearch=8 \
--num_proc_data=12 \ --num_proc_data=4 \
--num_conv_layers=2 \ --num_conv_layers=2 \
--num_rnn_layers=3 \ --num_rnn_layers=3 \
--rnn_layer_size=2048 \ --rnn_layer_size=2048 \
...@@ -20,9 +30,17 @@ python -u test.py \ ...@@ -20,9 +30,17 @@ python -u test.py \
--share_rnn_weights=True \ --share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \ --test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \ --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \ --vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \ --model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \ --decoding_method='ctc_beam_search' \
--error_rate_type='wer' \ --error_rate_type='wer' \
--specgram_type='linear' --specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0
#! /usr/bin/bash

# Evaluate the downloaded model on a test manifest.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# download well-trained model (abort with status 1 when the helper fails)
pushd models/librispeech > /dev/null
sh download_model.sh || exit 1
popd > /dev/null

# evaluate model, with CUDA devices 0-7 visible to the process
if ! CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -u test.py \
        --batch_size=128 \
        --trainer_count=8 \
        --beam_size=500 \
        --num_proc_bsearch=8 \
        --num_proc_data=4 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=0.36 \
        --beta=0.25 \
        --cutoff_prob=0.99 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --test_manifest='data/tiny/manifest.test-clean' \
        --mean_std_path='models/librispeech/mean_std.npz' \
        --vocab_path='models/librispeech/vocab.txt' \
        --model_path='models/librispeech/params.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --error_rate_type='wer' \
        --specgram_type='linear'; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
#! /usr/bin/bash #! /usr/bin/bash
pushd ../.. pushd ../.. > /dev/null
# train model
# if you wish to resume from an existing model, uncomment --init_model_path
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u train.py \ python -u train.py \
--batch_size=256 \ --batch_size=512 \
--trainer_count=8 \ --trainer_count=8 \
--num_passes=200 \ --num_passes=50 \
--num_proc_data=12 \ --num_proc_data=12 \
--num_conv_layers=2 \ --num_conv_layers=2 \
--num_rnn_layers=3 \ --num_rnn_layers=3 \
...@@ -23,8 +25,16 @@ python -u train.py \ ...@@ -23,8 +25,16 @@ python -u train.py \
--train_manifest='data/librispeech/manifest.train' \ --train_manifest='data/librispeech/manifest.train' \
--dev_manifest='data/librispeech/manifest.dev' \ --dev_manifest='data/librispeech/manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \ --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \ --vocab_path='data/librispeech/vocab.txt' \
--output_model_dir='./checkpoints' \ --output_model_dir='./checkpoints/libri' \
--augment_conf_path='conf/augmentation.config' \ --augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \ --specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped' --shuffle_method='batch_shuffle_clipped'
if [ $? -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0
#! /usr/bin/bash #! /usr/bin/bash
pushd ../.. pushd ../.. > /dev/null
# grid-search for hyper-parameters in language model
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \ python -u tools/tune.py \
--num_samples=100 \ --num_samples=100 \
...@@ -23,8 +24,16 @@ python -u tools/tune.py \ ...@@ -23,8 +24,16 @@ python -u tools/tune.py \
--share_rnn_weights=True \ --share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \ --tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \ --mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \ --vocab_path='data/librispeech/vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \ --model_path='checkpoints/libri/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \ --error_rate_type='wer' \
--specgram_type='linear' --specgram_type='linear'
if [ $? -ne 0 ]; then
echo "Failed in tuning!"
exit 1
fi
exit 0
#! /usr/bin/bash

# Launch the demo client and report whether it exited successfully.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# start demo client
# The last option must NOT end with a line-continuation backslash: otherwise
# the `if` line below is appended to the python command line, the status
# check never runs, and the dangling `fi` is a shell syntax error.
CUDA_VISIBLE_DEVICES=0 \
python -u deploy/demo_client.py \
    --host_ip='localhost' \
    --host_port=8086

if [ $? -ne 0 ]; then
    echo "Failed in starting demo client!"
    exit 1
fi

exit 0
#! /usr/bin/bash
# TODO: replace the model with a mandarin model

# Launch the demo server with the downloaded model and language model.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# download well-trained model (abort with status 1 when the helper fails)
pushd models/librispeech > /dev/null
sh download_model.sh || exit 1
popd > /dev/null

# start demo server, with only CUDA device 0 visible to the process
if ! CUDA_VISIBLE_DEVICES=0 python -u deploy/demo_server.py \
        --host_ip='localhost' \
        --host_port=8086 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=0.36 \
        --beta=0.25 \
        --cutoff_prob=0.99 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --speech_save_dir='demo_cache' \
        --warmup_manifest='data/tiny/manifest.test-clean' \
        --mean_std_path='models/librispeech/mean_std.npz' \
        --vocab_path='models/librispeech/vocab.txt' \
        --model_path='models/librispeech/params.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --specgram_type='linear'; then
    echo "Failed in starting demo server!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Prepare the "tiny" data set: download data, write manifests, build the
# vocabulary and compute normalizer statistics.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# prepare folder
if [ ! -e data/tiny ]; then
    mkdir data/tiny
fi

# download data, generate manifests
# ('~' is passed literally because of the single quotes; the python script is
# expected to expand it -- NOTE(review): confirm against librispeech.py)
python data/librispeech/librispeech.py \
    --manifest_prefix='data/tiny/manifest' \
    --target_dir='~/.cache/paddle/dataset/speech/libri' \
    --full_download='False'

if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi

# keep only the first 64 utterances of dev-clean as the tiny set
head -n 64 data/tiny/manifest.dev-clean > data/tiny/manifest.tiny

# build vocabulary
# Use the dev-clean manifest generated above; 'data/tiny/manifest.dev' is
# never produced by this script, so pointing at it makes this step fail.
python tools/build_vocab.py \
    --count_threshold=0 \
    --vocab_path='data/tiny/vocab.txt' \
    --manifest_paths='data/tiny/manifest.dev-clean'

if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
    exit 1
fi

# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
    --manifest_path='data/tiny/manifest.tiny' \
    --num_samples=64 \
    --specgram_type='linear' \
    --output_path='data/tiny/mean_std.npz'

if [ $? -ne 0 ]; then
    echo "Compute mean and stddev failed. Terminated."
    exit 1
fi

echo "Tiny data preparation done."
exit 0
#! /usr/bin/bash

# Run inference on the tiny manifest with a locally trained checkpoint.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# infer, with only CUDA device 0 visible to the process
if ! CUDA_VISIBLE_DEVICES=0 python -u infer.py \
        --num_samples=10 \
        --trainer_count=1 \
        --beam_size=500 \
        --num_proc_bsearch=8 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=0.36 \
        --beta=0.25 \
        --cutoff_prob=0.99 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --infer_manifest='data/tiny/manifest.tiny' \
        --mean_std_path='data/tiny/mean_std.npz' \
        --vocab_path='data/tiny/vocab.txt' \
        --model_path='checkpoints/tiny/params.pass-19.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --error_rate_type='wer' \
        --specgram_type='linear'; then
    echo "Failed in inference!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Run inference on the tiny test-clean manifest with the downloaded model.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# download well-trained model (abort with status 1 when the helper fails)
pushd models/librispeech > /dev/null
sh download_model.sh || exit 1
popd > /dev/null

# infer, with only CUDA device 0 visible to the process
if ! CUDA_VISIBLE_DEVICES=0 python -u infer.py \
        --num_samples=10 \
        --trainer_count=1 \
        --beam_size=500 \
        --num_proc_bsearch=8 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=0.36 \
        --beta=0.25 \
        --cutoff_prob=0.99 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --infer_manifest='data/tiny/manifest.test-clean' \
        --mean_std_path='models/librispeech/mean_std.npz' \
        --vocab_path='models/librispeech/vocab.txt' \
        --model_path='models/librispeech/params.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --error_rate_type='wer' \
        --specgram_type='linear'; then
    echo "Failed in inference!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Evaluate a locally trained checkpoint on the tiny manifest.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model
pushd models/lm > /dev/null
sh download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi
popd > /dev/null

# evaluate model
# The checkpoint is read from checkpoints/tiny/, matching the path used by the
# tiny inference script.
# NOTE(review): assumes tiny training writes its checkpoints under
# ./checkpoints/tiny -- confirm against the training script.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u test.py \
    --batch_size=16 \
    --trainer_count=8 \
    --beam_size=500 \
    --num_proc_bsearch=8 \
    --num_proc_data=4 \
    --num_conv_layers=2 \
    --num_rnn_layers=3 \
    --rnn_layer_size=2048 \
    --alpha=0.36 \
    --beta=0.25 \
    --cutoff_prob=0.99 \
    --use_gru=False \
    --use_gpu=True \
    --share_rnn_weights=True \
    --test_manifest='data/tiny/manifest.tiny' \
    --mean_std_path='data/tiny/mean_std.npz' \
    --vocab_path='data/tiny/vocab.txt' \
    --model_path='checkpoints/tiny/params.pass-19.tar.gz' \
    --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
    --decoding_method='ctc_beam_search' \
    --error_rate_type='wer' \
    --specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Evaluate the downloaded model on the tiny test-clean manifest.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# download language model (abort with status 1 when the helper fails)
pushd models/lm > /dev/null
sh download_lm_en.sh || exit 1
popd > /dev/null

# download well-trained model (abort with status 1 when the helper fails)
pushd models/librispeech > /dev/null
sh download_model.sh || exit 1
popd > /dev/null

# evaluate model, with CUDA devices 0-7 visible to the process
if ! CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -u test.py \
        --batch_size=128 \
        --trainer_count=8 \
        --beam_size=500 \
        --num_proc_bsearch=8 \
        --num_proc_data=4 \
        --num_conv_layers=2 \
        --num_rnn_layers=3 \
        --rnn_layer_size=2048 \
        --alpha=0.36 \
        --beta=0.25 \
        --cutoff_prob=0.99 \
        --use_gru=False \
        --use_gpu=True \
        --share_rnn_weights=True \
        --test_manifest='data/tiny/manifest.test-clean' \
        --mean_std_path='models/librispeech/mean_std.npz' \
        --vocab_path='models/librispeech/vocab.txt' \
        --model_path='models/librispeech/params.tar.gz' \
        --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
        --decoding_method='ctc_beam_search' \
        --error_rate_type='wer' \
        --specgram_type='linear'; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Train a model on the tiny manifest and store checkpoints in
# ./checkpoints/tiny.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# train model
# To resume from an existing model, pass --init_model_path.
# NOTE(review): no commented-out --init_model_path line is present here;
# confirm the flag name against train.py before relying on it.
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -u train.py \
    --batch_size=16 \
    --trainer_count=4 \
    --num_passes=20 \
    --num_proc_data=1 \
    --num_conv_layers=2 \
    --num_rnn_layers=3 \
    --rnn_layer_size=2048 \
    --num_iter_print=100 \
    --learning_rate=1e-5 \
    --max_duration=27.0 \
    --min_duration=0.0 \
    --use_sortagrad=True \
    --use_gru=False \
    --use_gpu=True \
    --is_local=True \
    --share_rnn_weights=True \
    --train_manifest='data/tiny/manifest.tiny' \
    --dev_manifest='data/tiny/manifest.tiny' \
    --mean_std_path='data/tiny/mean_std.npz' \
    --vocab_path='data/tiny/vocab.txt' \
    --output_model_dir='./checkpoints/tiny' \
    --augment_conf_path='conf/augmentation.config' \
    --specgram_type='linear' \
    --shuffle_method='batch_shuffle_clipped'

if [ $? -ne 0 ]; then
    # This script trains; the message previously claimed an inference failure.
    echo "Failed in training!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Grid-search the language-model weights (alpha/beta) on the tiny manifest.
# All relative paths below are resolved from two directories up.
pushd ../.. > /dev/null

# grid-search for hyper-parameters in language model
# The checkpoint is read from checkpoints/tiny/ rather than checkpoints/.
# NOTE(review): assumes tiny training writes its checkpoints under
# ./checkpoints/tiny -- confirm against the training script.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
    --num_samples=100 \
    --trainer_count=8 \
    --beam_size=500 \
    --num_proc_bsearch=12 \
    --num_conv_layers=2 \
    --num_rnn_layers=3 \
    --rnn_layer_size=2048 \
    --num_alphas=14 \
    --num_betas=20 \
    --alpha_from=0.1 \
    --alpha_to=0.36 \
    --beta_from=0.05 \
    --beta_to=1.0 \
    --cutoff_prob=0.99 \
    --use_gru=False \
    --use_gpu=True \
    --share_rnn_weights=True \
    --tune_manifest='data/tiny/manifest.tiny' \
    --mean_std_path='data/tiny/mean_std.npz' \
    --vocab_path='data/tiny/vocab.txt' \
    --model_path='checkpoints/tiny/params.pass-9.tar.gz' \
    --lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \
    --error_rate_type='wer' \
    --specgram_type='linear'

if [ $? -ne 0 ]; then
    echo "Failed in tuning!"
    exit 1
fi

exit 0
...@@ -7,7 +7,7 @@ import argparse ...@@ -7,7 +7,7 @@ import argparse
import functools import functools
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer, cer from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments from utils.utility import add_arguments, print_arguments
...@@ -35,13 +35,13 @@ add_arg('mean_std_path', str, ...@@ -35,13 +35,13 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz', 'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.") "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str, add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt', 'data/librispeech/vocab.txt',
"Filepath of vocabulary.") "Filepath of vocabulary.")
add_arg('lang_model_path', str, add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm', 'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.") "Filepath for language model.")
add_arg('model_path', str, add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz', './checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, " "If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.") "otherwise, it resumes from the pre-trained model.")
add_arg('decoding_method', str, add_arg('decoding_method', str,
......
# Download the English language model into ./data and verify its MD5 checksum.
echo "Downloading language model ..."

# -p: do not fail when ./data already exists (e.g. when re-running).
mkdir -p data

LM=common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"

# -c resumes a partial download; -P sets the target directory.
wget -c http://paddlepaddle.bj.bcebos.com/model_zoo/speech/$LM -P ./data

echo "Checking md5sum ..."
md5_tmp=`md5sum ./data/$LM | awk -F[' '] '{print $1}'`

# Both operands are quoted: when the download failed md5_tmp is empty, and an
# unquoted comparison would be a malformed test that skips the failure branch.
if [ "$MD5" != "$md5_tmp" ]; then
    echo "Fail to download the language model!"
    exit 1
fi
...@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq, ...@@ -180,6 +180,8 @@ def ctc_beam_search_decoder(probs_seq,
prob = prob * ext_scoring_func(result) prob = prob * ext_scoring_func(result)
log_prob = log(prob) log_prob = log(prob)
beam_result.append((log_prob, result)) beam_result.append((log_prob, result))
else:
beam_result.append((float('-inf'), ''))
## output top beam_size decoding results ## output top beam_size decoding results
beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True) beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
......
...@@ -8,10 +8,10 @@ import os ...@@ -8,10 +8,10 @@ import os
import time import time
import gzip import gzip
import paddle.v2 as paddle import paddle.v2 as paddle
from models.swig_decoders_wrapper import Scorer from decoders.swig_wrapper import Scorer
from models.swig_decoders_wrapper import ctc_greedy_decoder from decoders.swig_wrapper import ctc_greedy_decoder
from models.swig_decoders_wrapper import ctc_beam_search_decoder_batch from decoders.swig_wrapper import ctc_beam_search_decoder_batch
from models.network import deep_speech_v2_network from model_utils.network import deep_speech_v2_network
class DeepSpeech2Model(object): class DeepSpeech2Model(object):
......
...@@ -4,7 +4,7 @@ from __future__ import division ...@@ -4,7 +4,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import unittest import unittest
from models import decoder from model_utils import decoder
class TestDecoders(unittest.TestCase): class TestDecoders(unittest.TestCase):
......
#! /usr/bin/bash

# Download the LibriSpeech model archive and unpack it in the current
# directory.
# `download` is not defined in this script; presumably it is provided by the
# sourced utility.sh -- verify.
source ../../utils/utility.sh

# TODO: add urls
URL='to-be-added'
MD5=5b4af224b26c1dc4dd972b7d32f2f52a
TARGET=./librispeech_model.tar.gz

echo "Download LibriSpeech model ..."
download "$URL" "$MD5" "$TARGET"
if [ $? -ne 0 ]; then
    echo "Fail to download LibriSpeech model!"
    exit 1
fi

# Check the extraction as well, so a corrupt or truncated archive does not
# end in a successful exit status.
tar -zxvf "$TARGET"
if [ $? -ne 0 ]; then
    echo "Fail to unpack LibriSpeech model!"
    exit 1
fi

exit 0
#! /usr/bin/bash

# Download the English language model next to this script.
# `download` is not defined here; presumably it comes from the sourced
# utility.sh -- verify.
source ../../utils/utility.sh

URL=http://paddlepaddle.bj.bcebos.com/model_zoo/speech/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=./common_crawl_00.prune01111.trie.klm

echo "Download language model ..."
if ! download $URL $MD5 $TARGET; then
    echo "Fail to download the language model!"
    exit 1
fi

exit 0
...@@ -7,7 +7,7 @@ import argparse ...@@ -7,7 +7,7 @@ import argparse
import functools import functools
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer, cer from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments from utils.utility import add_arguments, print_arguments
...@@ -36,14 +36,14 @@ add_arg('mean_std_path', str, ...@@ -36,14 +36,14 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz', 'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.") "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str, add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt', 'data/librispeech/vocab.txt',
"Filepath of vocabulary.") "Filepath of vocabulary.")
add_arg('model_path', str, add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz', './checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, " "If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.") "otherwise, it resumes from the pre-trained model.")
add_arg('lang_model_path', str, add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm', 'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.") "Filepath for language model.")
add_arg('decoding_method', str, add_arg('decoding_method', str,
'ctc_beam_search', 'ctc_beam_search',
......
...@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) ...@@ -21,7 +21,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable # yapf: disable
add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
add_arg('vocab_path', str, add_arg('vocab_path', str,
'datasets/vocab/zh_vocab.txt', 'data/librispeech/vocab.txt',
"Filepath to write the vocabulary.") "Filepath to write the vocabulary.")
add_arg('manifest_paths', str, add_arg('manifest_paths', str,
None, None,
...@@ -34,7 +34,7 @@ args = parser.parse_args() ...@@ -34,7 +34,7 @@ args = parser.parse_args()
def count_manifest(counter, manifest_path): def count_manifest(counter, manifest_path):
manifest_jsons = utils.read_manifest(manifest_path) manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons: for line_json in manifest_jsons:
for char in line_json['text']: for char in line_json['text']:
counter.update(char) counter.update(char)
......
...@@ -20,10 +20,10 @@ add_arg('specgram_type', str, ...@@ -20,10 +20,10 @@ add_arg('specgram_type', str,
"Audio feature type. Options: linear, mfcc.", "Audio feature type. Options: linear, mfcc.",
choices=['linear', 'mfcc']) choices=['linear', 'mfcc'])
add_arg('manifest_path', str, add_arg('manifest_path', str,
'datasets/manifest.train', 'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.") "Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str, add_arg('output_path', str,
'mean_std.npz', 'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).") "Filepath of write mean and stddev to (.npz).")
# yapf: disable # yapf: disable
args = parser.parse_args() args = parser.parse_args()
......
...@@ -9,7 +9,7 @@ import functools ...@@ -9,7 +9,7 @@ import functools
import paddle.v2 as paddle import paddle.v2 as paddle
import _init_paths import _init_paths
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from models.model import DeepSpeech2Model from model_utils.model import DeepSpeech2Model
from utils.error_rate import wer from utils.error_rate import wer
from utils.utility import add_arguments, print_arguments from utils.utility import add_arguments, print_arguments
...@@ -41,13 +41,13 @@ add_arg('mean_std_path', str, ...@@ -41,13 +41,13 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz', 'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.") "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str, add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt', 'data/librispeech/vocab.txt',
"Filepath of vocabulary.") "Filepath of vocabulary.")
add_arg('lang_model_path', str, add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm', 'models/lm/common_crawl_00.prune01111.trie.klm',
"Filepath for language model.") "Filepath for language model.")
add_arg('model_path', str, add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz', './checkpoints/libri/params.latest.tar.gz',
"If None, the training starts from scratch, " "If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.") "otherwise, it resumes from the pre-trained model.")
add_arg('error_rate_type', str, add_arg('error_rate_type', str,
......
...@@ -6,7 +6,7 @@ from __future__ import print_function ...@@ -6,7 +6,7 @@ from __future__ import print_function
import argparse import argparse
import functools import functools
import paddle.v2 as paddle import paddle.v2 as paddle
from models.model import DeepSpeech2Model from model_utils.model import DeepSpeech2Model
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from utils.utility import add_arguments, print_arguments from utils.utility import add_arguments, print_arguments
...@@ -41,14 +41,14 @@ add_arg('mean_std_path', str, ...@@ -41,14 +41,14 @@ add_arg('mean_std_path', str,
'data/librispeech/mean_std.npz', 'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.") "Filepath of normalizer's mean & std.")
add_arg('vocab_path', str, add_arg('vocab_path', str,
'data/librispeech/eng_vocab.txt', 'data/librispeech/vocab.txt',
"Filepath of vocabulary.") "Filepath of vocabulary.")
add_arg('init_model_path', str, add_arg('init_model_path', str,
None, None,
"If None, the training starts from scratch, " "If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model.") "otherwise, it resumes from the pre-trained model.")
add_arg('output_model_dir', str, add_arg('output_model_dir', str,
"./checkpoints", "./checkpoints/libri",
"Directory for saving checkpoints.") "Directory for saving checkpoints.")
add_arg('augment_conf_path',str, add_arg('augment_conf_path',str,
'conf/augmentation.config', 'conf/augmentation.config',
......
# Download a file and verify it against an expected MD5 checksum.
#
# Arguments:
#   $1 - URL to fetch.
#   $2 - expected MD5 checksum (hex string).
#   $3 - path where the downloaded file is expected to end up; wget is
#        pointed at the directory of this path.
# Returns:
#   0 if the target already exists with a matching checksum, or if the
#   checksum matches after downloading; 1 on checksum mismatch.
download() {
    URL=$1
    MD5=$2
    TARGET=$3

    # Skip the download when a verified copy is already present.
    if [ -e "$TARGET" ]; then
        md5_result=$(md5sum "$TARGET" | awk '{print $1}')
        if [ "$MD5" = "$md5_result" ]; then
            echo "$TARGET already exists, download skipped."
            return 0
        fi
    fi

    # -c resumes a partially downloaded file instead of starting over.
    wget -c "$URL" -P "$(dirname "$TARGET")"

    # Checksums are hex strings, so they must be compared as strings;
    # an integer operator such as -ne makes `[` fail and hides mismatches.
    md5_result=$(md5sum "$TARGET" | awk '{print $1}')
    if [ "$MD5" != "$md5_result" ]; then
        echo "Fail to download the language model!"
        return 1
    fi
}
此差异已折叠。
"""
Contains data utilities.
"""
def reader_append_wrapper(reader, append_tuple):
    """Wrap a data reader so that every instance gets extra fields appended.

    :param reader: Callable returning an iterable of instances (tuples).
    :param append_tuple: Tuple concatenated to the end of every instance.
    :return: A new reader callable yielding ``instance + append_tuple``.
    """

    def appended_reader():
        # The wrapped reader is only invoked once iteration actually starts.
        for instance in reader():
            extended = instance + append_tuple
            yield extended

    return appended_reader
"""
External neural memory class.
"""
import paddle.v2 as paddle
class ExternalMemory(object):
    """External neural memory class.

    A simplified Neural Turing Machines (NTM) with only content-based
    addressing (including content addressing and interpolation, but excluding
    convolutional shift and sharpening). It serves as an external differential
    memory bank, with differential write/read head controllers to store
    and read information dynamically. Simple feedforward networks are
    used as the write/read head controllers.

    The ExternalMemory class could be utilized by many neural network structures
    to easily expand their memory bandwidth and accomplish a long-term memory
    handling. Besides, some existing mechanism can be realized directly with
    the ExternalMemory class, e.g. the attention mechanism in Seq2Seq (i.e. an
    unbounded external memory).

    Besides, the ExternalMemory class must be used together with
    paddle.layer.recurrent_group (within its step function). It can never be
    used in a standalone manner.

    For more details, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.

    :param name: Memory name.
    :type name: basestring
    :param mem_slot_size: Size of memory slot/vector.
    :type mem_slot_size: int
    :param boot_layer: Boot layer for initializing the external memory. The
                       sequence layer has sequence length indicating the number
                       of memory slots, and size as memory slot size.
    :type boot_layer: LayerOutput
    :param readonly: If true, the memory is read-only, and write function cannot
                     be called. Default is false.
    :type readonly: bool
    :param enable_interpolation: If set true, the read/write addressing weights
                                 will be interpolated with the weights in the
                                 last step, with the affine coefficients being
                                 a learnable gate function.
    :type enable_interpolation: bool
    """

    def __init__(self,
                 name,
                 mem_slot_size,
                 boot_layer,
                 readonly=False,
                 enable_interpolation=True):
        self.name = name
        self.mem_slot_size = mem_slot_size
        self.readonly = readonly
        self.enable_interpolation = enable_interpolation
        self.external_memory = paddle.layer.memory(
            name=self.name, size=self.mem_slot_size, boot_layer=boot_layer)
        # prepare a constant (zero) initializer for addressing weights
        self.zero_addressing_init = paddle.layer.slope_intercept(
            input=paddle.layer.fc(input=boot_layer, size=1),
            slope=0.0,
            intercept=0.0)
        # set memory to constant when readonly=True; for a writable memory
        # `updated_external_memory` only exists after write() has been called.
        if self.readonly:
            self.updated_external_memory = paddle.layer.mixed(
                name=self.name,
                input=[
                    paddle.layer.identity_projection(input=self.external_memory)
                ],
                size=self.mem_slot_size)

    def _content_addressing(self, key_vector):
        """Get write/read head's addressing weights via content-based addressing.

        :param key_vector: Key vector emitted by the head controller.
        :type key_vector: LayerOutput
        :return: Addressing weights over the memory slots.
        :rtype: LayerOutput
        """
        # content-based addressing: a=tanh(W*M + U*key)
        key_projection = paddle.layer.fc(
            input=key_vector,
            size=self.mem_slot_size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        key_proj_expanded = paddle.layer.expand(
            input=key_projection, expand_as=self.external_memory)
        memory_projection = paddle.layer.fc(
            input=self.external_memory,
            size=self.mem_slot_size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        merged_projection = paddle.layer.addto(
            input=[key_proj_expanded, memory_projection],
            act=paddle.activation.Tanh())
        # softmax addressing weight: w=softmax(v^T a)
        addressing_weight = paddle.layer.fc(
            input=merged_projection,
            size=1,
            act=paddle.activation.SequenceSoftmax(),
            bias_attr=False)
        return addressing_weight

    def _interpolation(self, head_name, key_vector, addressing_weight):
        """Interpolate between previous and current addressing weights.

        :param head_name: Head identifier, used to build the layer name that
                          carries the weights from one step to the next.
        :type head_name: basestring
        :param key_vector: Key vector from which the scalar gate is computed.
        :type key_vector: LayerOutput
        :param addressing_weight: Content-based weights of the current step.
        :type addressing_weight: LayerOutput
        :return: Gated mix of the current and the previous step's weights.
        :rtype: LayerOutput
        """
        # prepare interpolation scalar gate: g=sigmoid(W*key)
        gate = paddle.layer.fc(
            input=key_vector,
            size=1,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        # interpolation: w_t = g*w_t+(1-g)*w_{t-1}
        # The memory layer shares its name with the interpolation layer below,
        # so it holds that layer's output from the previous step.
        last_addressing_weight = paddle.layer.memory(
            name=self.name + "_addressing_weight_" + head_name,
            size=1,
            boot_layer=self.zero_addressing_init)
        # The interpolation layer yields weight*input[0] + (1-weight)*input[1];
        # the second input must be the previous weights, otherwise the gate
        # has no effect at all.
        interpolated_weight = paddle.layer.interpolation(
            name=self.name + "_addressing_weight_" + head_name,
            input=[addressing_weight, last_addressing_weight],
            weight=paddle.layer.expand(input=gate, expand_as=addressing_weight))
        return interpolated_weight

    def _get_addressing_weight(self, head_name, key_vector):
        """Get final addressing weights for read/write heads, including content
        addressing and interpolation.

        :param head_name: Head identifier forwarded to the interpolation step.
        :type head_name: basestring
        :param key_vector: Key vector emitted by the head controller.
        :type key_vector: LayerOutput
        :return: Addressing weights, interpolated if enable_interpolation is
                 set.
        :rtype: LayerOutput
        """
        # current content-based addressing
        addressing_weight = self._content_addressing(key_vector)
        # interpolation with previous addressing weight
        if self.enable_interpolation:
            return self._interpolation(head_name, key_vector, addressing_weight)
        else:
            return addressing_weight

    def write(self, write_key):
        """Write onto the external memory.
        It cannot be called if "readonly" set True.

        :param write_key: Key vector for write heads to generate writing
                          content and addressing signals.
        :type write_key: LayerOutput
        :raises ValueError: If the memory was created with readonly=True.
        """
        # check readonly
        if self.readonly:
            raise ValueError("ExternalMemory with readonly=True cannot write.")
        # get addressing weight for write head
        write_weight = self._get_addressing_weight("write_head", write_key)
        # prepare add_vector and erase_vector
        erase_vector = paddle.layer.fc(
            input=write_key,
            size=self.mem_slot_size,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        add_vector = paddle.layer.fc(
            input=write_key,
            size=self.mem_slot_size,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        erase_vector_expand = paddle.layer.expand(
            input=erase_vector, expand_as=self.external_memory)
        add_vector_expand = paddle.layer.expand(
            input=add_vector, expand_as=self.external_memory)
        # prepare scaled add part and erase part
        scaled_erase_vector_expand = paddle.layer.scaling(
            weight=write_weight, input=erase_vector_expand)
        erase_memory_part = paddle.layer.mixed(
            input=paddle.layer.dotmul_operator(
                a=self.external_memory,
                b=scaled_erase_vector_expand,
                scale=-1.0))
        add_memory_part = paddle.layer.scaling(
            weight=write_weight, input=add_vector_expand)
        # update external memory
        self.updated_external_memory = paddle.layer.addto(
            input=[self.external_memory, add_memory_part, erase_memory_part],
            name=self.name)

    def read(self, read_key):
        """Read from the external memory.

        For a writable memory, write() must have been called beforehand,
        because it is what creates the updated memory that is read here.

        :param read_key: Key vector for read head to generate addressing
                         signals.
        :type read_key: LayerOutput
        :return: Content (vector) read from external memory.
        :rtype: LayerOutput
        """
        # get addressing weight for read head
        read_weight = self._get_addressing_weight("read_head", read_key)
        # read content from external memory
        scaled = paddle.layer.scaling(
            weight=read_weight, input=self.updated_external_memory)
        return paddle.layer.pooling(
            input=scaled, pooling_type=paddle.pooling.Sum())
"""
Contains infering script for machine translation with external memory.
"""
import distutils.util
import argparse
import gzip
import paddle.v2 as paddle
from external_memory import ExternalMemory
from model import memory_enhanced_seq2seq
from data_utils import reader_append_wrapper
# Command-line options of the inference script; the module docstring is
# reused as the description shown by --help.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--dict_size",
    default=30000,
    type=int,
    help="Vocabulary size. (default: %(default)s)")
parser.add_argument(
    "--word_vec_dim",
    default=512,
    type=int,
    help="Word embedding size. (default: %(default)s)")
parser.add_argument(
    "--hidden_size",
    default=1024,
    type=int,
    help="Hidden cell number in RNN. (default: %(default)s)")
parser.add_argument(
    "--memory_slot_num",
    default=8,
    type=int,
    help="External memory slot number. (default: %(default)s)")
parser.add_argument(
    "--beam_size",
    default=3,
    type=int,
    help="Beam search width. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=False,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=1,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--batch_size",
    default=5,
    type=int,
    help="Batch size. (default: %(default)s)")
parser.add_argument(
    "--infer_data_num",
    default=3,
    type=int,
    help="Instance num to infer. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default="checkpoints/params.latest.tar.gz",
    type=str,
    help="Model filepath. (default: %(default)s)")
# The two help fragments are joined by implicit string concatenation, so the
# separating space has to be part of the first literal.
parser.add_argument(
    "--memory_perturb_stddev",
    default=0.1,
    type=float,
    help="Memory perturb stddev for memory initialization. "
    "(default: %(default)s)")
# Parsed once at import time; the functions below read this module-level value.
args = parser.parse_args()
def parse_beam_search_result(beam_result, dictionary):
    """Turn a raw beam search result into readable sentences.

    ``beam_result[0]`` holds one list of probabilities per input instance and
    ``beam_result[1]`` is a flat list of word ids in which ``-1`` terminates
    each candidate. The first id of every candidate is dropped before the
    remaining ids are looked up in ``dictionary`` and joined with spaces.

    :param beam_result: Pair of (probabilities, flat id list).
    :param dictionary: Mapping from word id to word string.
    :return: Tuple of (probabilities, sentences grouped per input instance).
    """
    beam_probs = beam_result[0]
    flat_ids = beam_result[1]

    decoded = []
    pending = []
    for token in flat_ids:
        if token == -1:
            # End of one candidate: skip its leading id and decode the rest.
            words = [dictionary.get(token_id) for token_id in pending[1:]]
            decoded.append(' '.join(words))
            pending = []
            continue
        pending.append(token)

    # Candidates arrive consecutively, `width` of them per input instance.
    width = len(beam_probs[0])
    grouped = []
    for start in range(0, len(decoded), width):
        grouped.append(decoded[start:start + width])
    return beam_probs, grouped
def infer():
    """
    For inferencing.

    Builds the generating network, loads the parameters from
    ``args.model_filepath``, runs beam search over the first
    ``args.infer_data_num`` instances of the WMT14 test set and prints the
    candidates together with their probabilities.
    """
    # Imported locally because the module-level imports do not provide it.
    import random

    # create network config
    source_words = paddle.layer.data(
        name="source_words",
        type=paddle.data_type.integer_value_sequence(args.dict_size))
    beam_gen = memory_enhanced_seq2seq(
        encoder_input=source_words,
        decoder_input=None,
        decoder_target=None,
        hidden_size=args.hidden_size,
        word_vec_dim=args.word_vec_dim,
        dict_size=args.dict_size,
        is_generating=True,
        beam_size=args.beam_size)

    # load parameters (the archive handle is closed once loading is done)
    with gzip.open(args.model_filepath) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # prepare infer data
    infer_data = []
    random.seed(0)  # for keeping consistency across multiple runs
    bounded_memory_perturbation = [[
        random.gauss(0, args.memory_perturb_stddev)
        for i in xrange(args.hidden_size)
    ] for j in xrange(args.memory_slot_num)]
    test_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.test(args.dict_size),
        append_tuple=(bounded_memory_perturbation, ))
    for i, item in enumerate(test_append_reader()):
        # Stop as soon as enough instances are collected instead of draining
        # the whole test set.
        if i >= args.infer_data_num:
            break
        # item[0] is the source sentence, item[3] the appended perturbation.
        infer_data.append((item[0], item[3], ))

    # run inference
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=infer_data,
        field=['prob', 'id'])

    # parse beam result and print
    source_dict, target_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
    beam_probs, beam_sentences = parse_beam_search_result(beam_result,
                                                          target_dict)
    # Iterate over what was actually collected, which may be fewer than
    # args.infer_data_num instances.
    for i in xrange(len(infer_data)):
        print("\n***************************************************\n")
        print("src: %s \n" % ' '.join(
            [source_dict.get(word) for word in infer_data[i][0]]))
        for j in xrange(args.beam_size):
            print("prob = %f : %s" % (beam_probs[i][j], beam_sentences[i][j]))
def main():
    """Initialize PaddlePaddle from the command-line options and run inference.

    The --use_gpu and --trainer_count options are honoured here; their
    defaults (False and 1) match the values that used to be hard-coded.
    """
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    infer()


if __name__ == '__main__':
    main()
"""
Contains model configuration for external-memory-enhanced seq2seq.
The "external memory" refers to two types of memories.
- Unbounded memory: i.e. vanilla attention mechanism in Seq2Seq.
- Bounded memory: i.e. external memory in NTM.
Both types of external memories are exploited to enhance the vanilla
Seq2Seq neural machine translation.
The implementation primarily follows the paper
`Memory-enhanced Decoder for Neural Machine Translation
<https://arxiv.org/abs/1606.02003>`_,
with some minor differences (will be listed in README.md).
For details about "external memory", please also refer to
`Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
"""
import paddle.v2 as paddle
from external_memory import ExternalMemory
def bidirectional_gru_encoder(input, size, word_vec_dim):
    """Encode a token sequence with a forward and a backward GRU.

    :param input: Layer providing the token sequence to encode.
    :type input: LayerOutput
    :param size: Hidden cell number of each GRU.
    :type size: int
    :param word_vec_dim: Word embedding size.
    :type word_vec_dim: int
    :return: Tuple of 1. concatenated forward and backward hidden sequence.
                      2. first element of the backward hidden sequence.
    :rtype: tuple of LayerOutput
    """
    # Layers are created in the same order as before: embedding, forward GRU,
    # backward GRU, concatenation, sequence-level summary.
    word_vectors = paddle.layer.embedding(input=input, size=word_vec_dim)

    # token-level encodings, later used as the attention memory
    fwd_states = paddle.networks.simple_gru(
        input=word_vectors, size=size, reverse=False)
    bwd_states = paddle.networks.simple_gru(
        input=word_vectors, size=size, reverse=True)
    token_context = paddle.layer.concat(input=[fwd_states, bwd_states])

    # sequence-level encoding
    sequence_summary = paddle.layer.first_seq(input=bwd_states)
    return token_context, sequence_summary
def memory_enhanced_decoder(input, target, initial_state, source_context, size,
                            word_vec_dim, dict_size, is_generating, beam_size):
    """GRU sequence decoder enhanced with external memory.

    The "external memory" refers to two types of memories.
    - Unbounded memory: i.e. attention mechanism in Seq2Seq.
    - Bounded memory: i.e. external memory in NTM.
    Both types of external memories can be implemented with
    ExternalMemory class, and are both exploited in this enhanced RNN decoder.

    The vanilla RNN/LSTM/GRU also has a narrow memory mechanism, namely the
    hidden state vector (or cell state in LSTM) carrying information through
    a span of sequence time, which is a successful design enriching the model
    with the capability to "remember" things in the long run. However, such a
    vector state is somewhat limited to a very narrow memory bandwidth. External
    memory introduced here could easily increase the memory capacity with linear
    complexity cost (rather than quadratic for vector state).

    This enhanced decoder expands its "memory passage" through two
    ExternalMemory objects:
    - Bounded memory for handling long-term information exchange within decoder
      itself. A direct expansion of traditional "vector" state.
    - Unbounded memory for handling source language's token-wise information.
      Exactly the attention mechanism over Seq2Seq.

    Notice that we take the attention mechanism as a particular form of external
    memory, with read-only memory bank initialized with encoder states, and a
    read head with content-based addressing (attention). From this view point,
    we arrive at a better understanding of attention mechanism itself and other
    external memory, and a concise and unified implementation for them.

    For more details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.

    For more details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation is highly
    correlated to this paper, but with minor differences (e.g. put "write"
    before "read" to bypass a potential bug in V2 APIs, see
    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_).

    Besides its arguments, the network reads a data layer named
    'bounded_memory_perturbation', which has to be supplied by the data
    feeder.

    :param input: Decoder input. Only used when is_generating is False.
    :type input: LayerOutput
    :param target: Decoder target. Only used when is_generating is False.
    :type target: LayerOutput
    :param initial_state: Initial hidden state.
    :type initial_state: LayerOutput
    :param source_context: Group of context hidden states for each token in the
                           source sentence, for attention mechanism.
    :type source_context: LayerOutput
    :param size: Hidden cell number in decoder rnn.
    :type size: int
    :param word_vec_dim: Word embedding size.
    :type word_vec_dim: int
    :param dict_size: Vocabulary size.
    :type dict_size: int
    :param is_generating: Whether for beam search inferencing (True) or
                          for training (False).
    :type is_generating: bool
    :param beam_size: Beam search width. Only used when is_generating is True.
    :type beam_size: int
    :return: Cost layer if is_generating=False; Beam search layer if
             is_generating = True.
    :rtype: LayerOutput
    """
    # prepare initial bounded and unbounded memory
    # Each bounded memory slot starts from the same projection of the averaged
    # source context ...
    bounded_memory_slot_init = paddle.layer.fc(
        input=paddle.layer.pooling(
            input=source_context, pooling_type=paddle.pooling.Avg()),
        size=size,
        act=paddle.activation.Sigmoid())
    # ... to which an externally fed perturbation sequence is added; the slot
    # vector is expanded to the length of that sequence.
    bounded_memory_perturbation = paddle.layer.data(
        name='bounded_memory_perturbation',
        type=paddle.data_type.dense_vector_sequence(size))
    bounded_memory_init = paddle.layer.addto(
        input=[
            paddle.layer.expand(
                input=bounded_memory_slot_init,
                expand_as=bounded_memory_perturbation),
            bounded_memory_perturbation
        ],
        act=paddle.activation.Linear())
    # The unbounded memory is the encoder's token-wise context itself.
    unbounded_memory_init = source_context

    # prepare step function for recurrent group
    def recurrent_decoder_step(cur_embedding):
        # create hidden state, bounded and unbounded memory.
        # The state memory shares the name "gru_decoder" with the gru_step
        # layer created below.
        state = paddle.layer.memory(
            name="gru_decoder", size=size, boot_layer=initial_state)
        bounded_memory = ExternalMemory(
            name="bounded_memory",
            mem_slot_size=size,
            boot_layer=bounded_memory_init,
            readonly=False,
            enable_interpolation=True)
        # Slot size is size * 2 -- presumably because source_context is the
        # concatenation of two `size`-wide encoder outputs; confirm at caller.
        unbounded_memory = ExternalMemory(
            name="unbounded_memory",
            mem_slot_size=size * 2,
            boot_layer=unbounded_memory_init,
            readonly=True,
            enable_interpolation=False)
        # write bounded memory (done before reading, see the docstring)
        bounded_memory.write(state)
        # read bounded memory
        bounded_memory_read = bounded_memory.read(state)
        # prepare key for unbounded memory
        key_for_unbounded_memory = paddle.layer.fc(
            input=[bounded_memory_read, cur_embedding],
            size=size,
            act=paddle.activation.Tanh(),
            bias_attr=False)
        # read unbounded memory (i.e. attention mechanism)
        context = unbounded_memory.read(key_for_unbounded_memory)
        # gated recurrent unit
        gru_inputs = paddle.layer.fc(
            input=[context, cur_embedding, bounded_memory_read],
            size=size * 3,
            act=paddle.activation.Linear(),
            bias_attr=False)
        gru_output = paddle.layer.gru_step(
            name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
        # step output
        return paddle.layer.fc(
            input=[gru_output, context, cur_embedding],
            size=dict_size,
            act=paddle.activation.Softmax(),
            bias_attr=True)

    if not is_generating:
        # Training: embed the given decoder input; the embedding parameter is
        # named so that the generating branch can refer to the same name.
        target_embeddings = paddle.layer.embedding(
            input=input,
            size=word_vec_dim,
            param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
        decoder_result = paddle.layer.recurrent_group(
            name="decoder_group",
            step=recurrent_decoder_step,
            input=[target_embeddings])
        cost = paddle.layer.classification_cost(
            input=decoder_result, label=target)
        return cost
    else:
        # Generating: the step input is produced during beam search, using the
        # embedding parameter named "_decoder_word_embedding".
        target_embeddings = paddle.layer.GeneratedInput(
            size=dict_size,
            embedding_name="_decoder_word_embedding",
            embedding_size=word_vec_dim)
        beam_gen = paddle.layer.beam_search(
            name="decoder_group",
            step=recurrent_decoder_step,
            input=[target_embeddings],
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=100)
        return beam_gen
def memory_enhanced_seq2seq(encoder_input, decoder_input, decoder_target,
                            hidden_size, word_vec_dim, dict_size, is_generating,
                            beam_size):
    """Build the complete Seq2Seq network enhanced with external memory.

    The source sequence is encoded by bidirectional_gru_encoder(...); its two
    outputs are handed to memory_enhanced_decoder(...) as the attention
    context and the initial hidden state. See the documentation of
    memory_enhanced_decoder(...) for how the two kinds of external memory
    (unbounded, i.e. attention, and bounded, i.e. NTM-style) are used.

    References:
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_ and
    `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_.

    :param encoder_input: Encoder input.
    :type encoder_input: LayerOutput
    :param decoder_input: Decoder input.
    :type decoder_input: LayerOutput
    :param decoder_target: Decoder target.
    :type decoder_target: LayerOutput
    :param hidden_size: Hidden cell number, both in encoder and decoder rnn.
    :type hidden_size: int
    :param word_vec_dim: Word embedding size.
    :type word_vec_dim: int
    :param dict_size: Vocabulary size.
    :type dict_size: int
    :param is_generating: Whether for beam search inferencing (True) or
                          for training (False).
    :type is_generating: bool
    :param beam_size: Beam search width.
    :type beam_size: int
    :return: Cost layer if is_generating=False; Beam search layer if
             is_generating = True.
    :rtype: LayerOutput
    """
    # encoder: token-wise states plus one sequence-level state
    token_states, sentence_state = bidirectional_gru_encoder(
        input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)

    # decoder: consumes both encoder outputs
    decoder_options = dict(
        input=decoder_input,
        target=decoder_target,
        initial_state=sentence_state,
        source_context=token_states,
        size=hidden_size,
        word_vec_dim=word_vec_dim,
        dict_size=dict_size,
        is_generating=is_generating,
        beam_size=beam_size)
    return memory_enhanced_decoder(**decoder_options)
"""
Contains training script for machine translation with external memory.
"""
import argparse
import sys
import gzip
import distutils.util
import random
import paddle.v2 as paddle
from external_memory import ExternalMemory
from model import memory_enhanced_seq2seq
from data_utils import reader_append_wrapper
# Command-line options of the training script; the module docstring is
# reused as the description shown by --help.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--dict_size",
    default=30000,
    type=int,
    help="Vocabulary size. (default: %(default)s)")
parser.add_argument(
    "--word_vec_dim",
    default=512,
    type=int,
    help="Word embedding size. (default: %(default)s)")
parser.add_argument(
    "--hidden_size",
    default=1024,
    type=int,
    help="Hidden cell number in RNN. (default: %(default)s)")
parser.add_argument(
    "--memory_slot_num",
    default=8,
    type=int,
    help="External memory slot number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=False,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--trainer_count",
    default=1,
    type=int,
    help="Trainer number. (default: %(default)s)")
parser.add_argument(
    "--num_passes",
    default=100,
    type=int,
    help="Training epochs. (default: %(default)s)")
parser.add_argument(
    "--batch_size",
    default=5,
    type=int,
    help="Batch size. (default: %(default)s)")
# The two help fragments are joined by implicit string concatenation, so the
# separating space has to be part of the first literal.
parser.add_argument(
    "--memory_perturb_stddev",
    default=0.1,
    type=float,
    help="Memory perturb stddev for memory initialization. "
    "(default: %(default)s)")
# Parsed once at import time; the functions below read this module-level value.
args = parser.parse_args()
def train():
    """
    For training.

    Builds the training network, wraps the WMT14 train/test readers so that
    every instance carries the bounded memory perturbation, and runs the SGD
    trainer for ``args.num_passes`` passes. Parameters are written to the
    ``checkpoints`` directory every 10 batches and at the end of each pass.
    """
    # Imported locally because the module-level imports do not provide it.
    import os

    # create optimizer
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-5,
        gradient_clipping_threshold=5,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4))

    # create network config
    source_words = paddle.layer.data(
        name="source_words",
        type=paddle.data_type.integer_value_sequence(args.dict_size))
    target_words = paddle.layer.data(
        name="target_words",
        type=paddle.data_type.integer_value_sequence(args.dict_size))
    target_next_words = paddle.layer.data(
        name='target_next_words',
        type=paddle.data_type.integer_value_sequence(args.dict_size))
    cost = memory_enhanced_seq2seq(
        encoder_input=source_words,
        decoder_input=target_words,
        decoder_target=target_next_words,
        hidden_size=args.hidden_size,
        word_vec_dim=args.word_vec_dim,
        dict_size=args.dict_size,
        is_generating=False,
        beam_size=None)

    # create parameters and trainer
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # create data readers
    # Position of each data layer inside a reader instance; index 3 is the
    # perturbation appended by reader_append_wrapper below.
    feeding = {
        "source_words": 0,
        "target_words": 1,
        "target_next_words": 2,
        "bounded_memory_perturbation": 3
    }
    random.seed(0)  # for keeping consistency across multiple runs
    bounded_memory_perturbation = [[
        random.gauss(0, args.memory_perturb_stddev)
        for i in xrange(args.hidden_size)
    ] for j in xrange(args.memory_slot_num)]
    train_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.train(args.dict_size),
        append_tuple=(bounded_memory_perturbation, ))
    train_batch_reader = paddle.batch(
        reader=paddle.reader.shuffle(reader=train_append_reader, buf_size=8192),
        batch_size=args.batch_size)
    test_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.test(args.dict_size),
        append_tuple=(bounded_memory_perturbation, ))
    test_batch_reader = paddle.batch(
        reader=paddle.reader.shuffle(reader=test_append_reader, buf_size=8192),
        batch_size=args.batch_size)

    # create event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print("Pass: %d, Batch: %d, TrainCost: %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
                with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
                    parameters.to_tar(f)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_batch_reader, feeding=feeding)
            # The test cost is reported on the test result, not on the
            # end-of-pass event.
            print("Pass: %d, TestCost: %f, %s" % (event.pass_id, result.cost,
                                                  result.metrics))
            with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
                           'w') as f:
                parameters.to_tar(f)

    # run train
    # The event handler writes into this directory, so create it up front.
    if not os.path.exists('checkpoints'):
        os.mkdir('checkpoints')
    trainer.train(
        reader=train_batch_reader,
        event_handler=event_handler,
        num_passes=args.num_passes,
        feeding=feeding)
def main():
    """Initialize PaddlePaddle from the command-line options and run training."""
    init_options = {
        "use_gpu": args.use_gpu,
        "trainer_count": args.trainer_count,
    }
    paddle.init(**init_options)
    train()


if __name__ == "__main__":
    main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册