提交 fd84eb57 编写于 作者: X Xinghai Sun

Re-organize folder structure and hierarchy for DS2.

上级 b56a548e
文件模式从 100755 更改为 100644
文件模式从 100755 更改为 100644
TRAIN_MANIFEST="cloud/cloud.manifest.train"
DEV_MANIFEST="cloud/cloud.manifest.dev"
#! /usr/bin/bash
TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train"
DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev"
CLOUD_MODEL_DIR="./checkpoints"
BATCH_SIZE=256
BATCH_SIZE=512
NUM_GPU=8
NUM_NODE=1
IS_LOCAL="True"
......
#! /usr/bin/bash
TRAIN_MANIFEST=$1
DEV_MANIFEST=$2
MODEL_PATH=$3
......@@ -14,11 +16,29 @@ python ./cloud/split_data.py \
--out_manifest_path='/local.manifest.dev'
python -u train.py \
--batch_size=$BATCH_SIZE \
--use_gpu=1 \
--batch_size=${BATCH_SIZE} \
--trainer_count=${NUM_GPU} \
--num_threads_data=${NUM_GPU} \
--num_passes=200 \
--num_proc_data=${NUM_GPU} \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=${IS_LOCAL} \
--train_manifest_path='/local.manifest.train' \
--dev_manifest_path='/local.manifest.dev' \
--output_model_dir=${MODEL_PATH} 2>&1 | tee ./log/train.log
--share_rnn_weights=True \
--train_manifest='/local.manifest.train' \
--dev_manifest='/local.manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--output_model_dir='./checkpoints' \
--output_model_dir=${MODEL_PATH} \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped' \
2>&1 | tee ./log/train.log
IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test"
OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test"
#! /usr/bin/bash
mkdir cloud_manifests
IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean"
OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test"
CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech"
NUM_SHARDS=50
......@@ -14,4 +18,5 @@ then
echo "Upload Data Failed!"
exit 1
fi
echo "All Done."
......@@ -4,23 +4,22 @@ from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
class ImpulseResponseAugmentor(AugmentorBase):
"""Augmentation model for adding impulse response effect.
:param rng: Random generator object.
:type rng: random.Random
:param impulse_manifest_path: Manifest path for impulse audio data.
:type impulse_manifest_path: basestring
:type impulse_manifest_path: basestring
"""
def __init__(self, rng, impulse_manifest_path):
self._rng = rng
self._impulse_manifest = utils.read_manifest(
manifest_path=impulse_manifest_path)
self._impulse_manifest = read_manifest(impulse_manifest_path)
def transform_audio(self, audio_segment):
"""Add impulse response effect.
......
......@@ -4,13 +4,13 @@ from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
class NoisePerturbAugmentor(AugmentorBase):
"""Augmentation model for adding background noise.
:param rng: Random generator object.
:type rng: random.Random
:param min_snr_dB: Minimal signal noise ratio, in decibels.
......@@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase):
:param max_snr_dB: Maximal signal noise ratio, in decibels.
:type max_snr_dB: float
:param noise_manifest_path: Manifest path for noise audio data.
:type noise_manifest_path: basestring
:type noise_manifest_path: basestring
"""
def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
self._min_snr_dB = min_snr_dB
self._max_snr_dB = max_snr_dB
self._rng = rng
self._noise_manifest = utils.read_manifest(
manifest_path=noise_manifest_path)
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
def transform_audio(self, audio_segment):
"""Add background noise audio.
......
......@@ -11,7 +11,7 @@ import multiprocessing
import numpy as np
import paddle.v2 as paddle
from threading import local
from data_utils import utils
from data_utils.utility import read_manifest
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
......@@ -159,7 +159,7 @@ class DataGenerator(object):
def batch_reader():
# read manifest
manifest = utils.read_manifest(
manifest = read_manifest(
manifest_path=manifest_path,
max_duration=self._max_duration,
min_duration=self._min_duration)
......
......@@ -4,7 +4,7 @@ from __future__ import division
from __future__ import print_function
import numpy as np
from data_utils import utils
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
from python_speech_features import mfcc
from python_speech_features import delta
......
......@@ -5,7 +5,7 @@ from __future__ import print_function
import numpy as np
import random
import data_utils.utils as utils
from data_utils.utility import read_manifest
from data_utils.audio import AudioSegment
......@@ -75,7 +75,7 @@ class FeatureNormalizer(object):
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
"""Compute mean and std from randomly sampled instances."""
manifest = utils.read_manifest(manifest_path)
manifest = read_manifest(manifest_path)
sampled_manifest = self._rng.sample(manifest, num_samples)
features = []
for instance in sampled_manifest:
......
cd librispeech
python librispeech.py
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
cd -
cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test
echo "All done."
cd noise
python chime3_background.py
if [ $? -ne 0 ]; then
echo "Prepare CHiME3 background noise failed. Terminated."
exit 1
fi
cd -
cat noise/manifest.* > manifest.noise
echo "All done."
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os.path
import sys
def add_path(path):
if path not in sys.path:
sys.path.insert(0, path)
this_dir = os.path.dirname(__file__)
# Add project path to PYTHONPATH
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
......@@ -9,10 +9,11 @@ import SocketServer
import struct
import wave
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from models.model import DeepSpeech2Model
from data_utils.utils import read_manifest
from utils import add_arguments, print_arguments
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -36,13 +37,13 @@ add_arg('speech_save_dir', str,
'demo_cache',
"Directory to save demo audios.")
add_arg('warmup_manifest', str,
'datasets/manifest.test',
'data/librispeech/manifest.test-clean',
"Filepath of manifest to warm up.")
add_arg('mean_std_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
......
......@@ -7,9 +7,9 @@ import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer
from utils import add_arguments, print_arguments
from models.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -30,13 +30,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('test_manifest', str,
'datasets/manifest.test',
'data/librispeech/manifest.test-clean',
"Filepath of manifest to evaluate.")
add_arg('mean_std_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('model_path', str,
'./checkpoints/params.latest.tar.gz',
......
#! /usr/bin/bash
pushd ../..
CUDA_VISIBLE_DEVICES=0 \
python -u infer.py \
--num_samples=10 \
--trainer_count=1 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--infer_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
#! /usr/bin/bash
pushd ../..
# download data, generate manifests
python data/librispeech/librispeech.py \
--manifest_prefix='data/librispeech/manifest' \
--full_download='True' \
--target_dir='~/.cache/paddle/dataset/speech/Libri'
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train
# compute mean and stddev for normalizer
python tools/compute_mean_std.py \
--manifest_path='data/librispeech/manifest.train' \
--num_samples=2000 \
--specgram_type='linear' \
--output_path='data/librispeech/mean_std.npz'
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
echo "LibriSpeech Data preparation done."
#! /usr/bin/bash
pushd ../..
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u evaluate.py \
--batch_size=128 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--decoding_method='ctc_beam_search' \
--error_rate_type='wer' \
--specgram_type='linear'
#! /usr/bin/bash
pushd ../..
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u train.py \
--batch_size=256 \
--trainer_count=8 \
--num_passes=200 \
--num_proc_data=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_iter_print=100 \
--learning_rate=5e-4 \
--max_duration=27.0 \
--min_duration=0.0 \
--use_sortagrad=True \
--use_gru=False \
--use_gpu=True \
--is_local=True \
--share_rnn_weights=True \
--train_manifest='data/librispeech/manifest.train' \
--dev_manifest='data/librispeech/manifest.dev' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--output_model_dir='./checkpoints' \
--augment_conf_path='conf/augmentation.config' \
--specgram_type='linear' \
--shuffle_method='batch_shuffle_clipped'
#! /usr/bin/bash
pushd ../..
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -u tools/tune.py \
--num_samples=100 \
--trainer_count=8 \
--beam_size=500 \
--num_proc_bsearch=12 \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--num_alphas=14 \
--num_betas=20 \
--alpha_from=0.1 \
--alpha_to=0.36 \
--beta_from=0.05 \
--beta_to=1.0 \
--cutoff_prob=0.99 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--tune_manifest='data/librispeech/manifest.dev-clean' \
--mean_std_path='data/librispeech/mean_std.npz' \
--vocab_path='data/librispeech/eng_vocab.txt' \
--model_path='checkpoints/params.latest.tar.gz' \
--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \
--error_rate_type='wer' \
--specgram_type='linear'
......@@ -7,9 +7,9 @@ import argparse
import functools
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer, cer
from utils import add_arguments, print_arguments
from models.model import DeepSpeech2Model
from utils.error_rate import wer, cer
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -29,13 +29,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('infer_manifest', str,
'datasets/manifest.dev',
'data/librispeech/manifest.dev-clean',
"Filepath of manifest to infer.")
add_arg('mean_std_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
......
......@@ -7,10 +7,10 @@ import sys
import os
import time
import gzip
from decoder import *
from lm.lm_scorer import LmScorer
import paddle.v2 as paddle
from layer import *
from utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder
from lm.lm_scorer import LmScorer
from models.network import deep_speech_v2_network
class DeepSpeech2Model(object):
......@@ -241,7 +241,7 @@ class DeepSpeech2Model(object):
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
self._log_probs, self._loss = deep_speech2(
self._log_probs, self._loss = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
dict_size=vocab_size,
......
"""Contains DeepSpeech2 layers."""
"""Contains DeepSpeech2 layers and networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......@@ -205,16 +205,15 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
return output
def deep_speech2(audio_data,
text_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256,
use_gru=False,
share_rnn_weights=True):
"""
The whole DeepSpeech2 model structure.
def deep_speech_v2_network(audio_data,
text_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256,
use_gru=False,
share_rnn_weights=True):
"""The DeepSpeech2 network structure.
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
......
......@@ -13,8 +13,8 @@ import json
from collections import Counter
import os.path
import _init_paths
from data_utils import utils
from utils import add_arguments, print_arguments
from data_utils.utility import read_manifest
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......
......@@ -9,7 +9,7 @@ import _init_paths
from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from utils import add_arguments, print_arguments
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......
......@@ -7,10 +7,11 @@ import numpy as np
import argparse
import functools
import paddle.v2 as paddle
import _init_paths
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from error_rate import wer
from utils import add_arguments, print_arguments
from models.model import DeepSpeech2Model
from utils.error_rate import wer
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -27,20 +28,20 @@ add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.")
add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
add_arg('beta_to', float, 0.36, "Where beta ends tuning with.")
add_arg('beta_to', float, 1.0, "Where beta ends tuning with.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('tune_manifest', str,
'datasets/manifest.test',
'data/librispeech/manifest.dev',
"Filepath of manifest to tune.")
add_arg('mean_std_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('lang_model_path', str,
'lm/data/common_crawl_00.prune01111.trie.klm',
......
......@@ -6,9 +6,9 @@ from __future__ import print_function
import argparse
import functools
import paddle.v2 as paddle
from model import DeepSpeech2Model
from models.model import DeepSpeech2Model
from data_utils.data import DataGenerator
from utils import add_arguments, print_arguments
from utils.utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -27,21 +27,21 @@ add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('is_local', bool, True, "Use pserver or not.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('is_local', bool, True, "Use pserver or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"bi-directional RNNs. Not for GRU.")
add_arg('train_manifest', str,
'datasets/manifest.train',
'data/librispeech/manifest.train',
"Filepath of train manifest.")
add_arg('dev_manifest', str,
'datasets/manifest.dev',
'data/librispeech/manifest.dev-clean',
"Filepath of validation manifest.")
add_arg('mean_std_path', str,
'mean_std.npz',
'data/librispeech/mean_std.npz',
"Filepath of normalizer's mean & std.")
add_arg('vocab_path', str,
'datasets/vocab/eng_vocab.txt',
'data/librispeech/eng_vocab.txt',
"Filepath of vocabulary.")
add_arg('init_model_path', str,
None,
......@@ -101,7 +101,7 @@ def train():
rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.init_model_path,
share_rnn_weights=args.share_weights)
share_rnn_weights=args.share_rnn_weights)
ds2_model.train(
train_batch_reader=train_batch_reader,
dev_batch_reader=dev_batch_reader,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册