diff --git a/README.md b/README.md
index 33d6f94f3a0de2dd82709af50fc9ec55663a1ffd..1f9e62067141c1dd29b2e90fb665c2030ad71dfb 100644
--- a/README.md
+++ b/README.md
@@ -17,13 +17,11 @@ PaddlePaddle provides a rich set of operators, helping everyone build models in a modular way

 - 1.2 [Accelerating word-embedding training with noise-contrastive estimation](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)

-## 2. Language model
+## 2. Generating text with a recurrent neural network language model

-A language model is an important basic model in natural language processing. It is a probability distribution model that can tell which word sequence is more likely, or, given several words, predict the next most likely word. Language models are used in many areas, such as automatic writing, QA, machine translation, spell checking, speech recognition and part-of-speech tagging.
+A language model is an important basic model in natural language processing. Besides yielding word embeddings (a by-product of language model training), it can also help us generate text. Given several words, a language model can predict the next most likely word. In this text-generation example we focus on the recurrent neural network language model; following the usage instructions in the documentation, you can quickly adapt it to your own training corpus and build fun models that write poetry, prose and so on.

-In the language model example we take text generation as the task and provide an RNN LM (including LSTM and GRU) and an N-Gram LM for study and use. Users can get started quickly via the "usage instructions" in the documentation: adapt the training corpus and train fun models such as "automatic poetry writing" and "automatic prose writing".
-
-- 2.1 [Text generation models based on LSTM, GRU and N-Gram](https://github.com/PaddlePaddle/models/tree/develop/language_model)
+- 2.1 [Generating text with a recurrent neural network language model](https://github.com/PaddlePaddle/models/tree/develop/generate_sequence_by_rnn_lm)

 ## 3. Click-through rate prediction

@@ -65,6 +63,14 @@ PaddlePaddle provides a rich set of operators, helping everyone build models in a modular way

 - 7.1 [Encoder-decoder model without attention mechanism](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention)

+## 8. Image classification
+Compared with text, images convey information that is more vivid, easier to understand and more artistic, making them an important medium for people to transfer and exchange information. In the image classification example we show how to train AlexNet, VGG, GoogLeNet and ResNet models in PaddlePaddle. A conversion tool is also provided to turn model files trained with Caffe into PaddlePaddle model files.
+
+- 8.1 [Converting Caffe model files to PaddlePaddle model files](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle)
+- 8.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+- 8.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+- 8.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).

diff --git a/deep_speech_2/data_utils/audio.py b/deep_speech_2/data_utils/audio.py
index d55fae1efc951bf6025b2a6ba02852b1640fa10f..3891f5b923f6d73c6b87dcb90bede0183b0e081c 100644
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import numpy as np
 import io
 import soundfile
-import scikits.samplerate
+import resampy
 from scipy import signal
 import random
 import copy
@@ -308,7 +308,7 @@ class AudioSegment(object):
         prior_mean_squared = 10.**(prior_db / 10.)
         prior_sum_of_squares = prior_mean_squared * prior_samples
         cumsum_of_squares = np.cumsum(self.samples**2)
-        sample_count = np.arange(len(self.num_samples)) + 1
+        sample_count = np.arange(self.num_samples) + 1
         if startup_sample_idx > 0:
             cumsum_of_squares[:startup_sample_idx] = \
                 cumsum_of_squares[startup_sample_idx]
@@ -321,21 +321,19 @@ class AudioSegment(object):
         gain_db = target_db - rms_estimate_db
         self.gain_db(gain_db)

-    def resample(self, target_sample_rate, quality='sinc_medium'):
+    def resample(self, target_sample_rate, filter='kaiser_best'):
         """Resample the audio to a target sample rate.

         Note that this is an in-place transformation.

         :param target_sample_rate: Target sample rate.
         :type target_sample_rate: int
-        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
-                        Sets resampling speed/quality tradeoff.
-                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
-        :type quality: str
+        :param filter: The resampling filter to use, one of {'kaiser_best',
+                       'kaiser_fast'}.
+        :type filter: str
         """
-        resample_ratio = target_sample_rate / self._sample_rate
-        self._samples = scikits.samplerate.resample(
-            self._samples, r=resample_ratio, type=quality)
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
         self._sample_rate = target_sample_rate

     def pad_silence(self, duration, sides='both'):
diff --git a/deep_speech_2/data_utils/augmentor/augmentation.py b/deep_speech_2/data_utils/augmentor/augmentation.py
index 0d60bbdb9cdd25b6df9177140576cb2bd6641fac..9dced47314a81f52dc0eafd6e592e240953f291d 100644
--- a/deep_speech_2/data_utils/augmentor/augmentation.py
+++ b/deep_speech_2/data_utils/augmentor/augmentation.py
@@ -7,6 +7,10 @@ import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
 from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
+from data_utils.augmentor.resample import ResampleAugmentor
+from data_utils.augmentor.online_bayesian_normalization import \
+    OnlineBayesianNormalizationAugmentor


 class AugmentationPipeline(object):
@@ -79,5 +83,11 @@ class AugmentationPipeline(object):
             return VolumePerturbAugmentor(self._rng, **params)
         elif augmentor_type == "shift":
             return ShiftPerturbAugmentor(self._rng, **params)
+        elif augmentor_type == "speed":
+            return SpeedPerturbAugmentor(self._rng, **params)
+        elif augmentor_type == "resample":
+            return ResampleAugmentor(self._rng, **params)
+        elif augmentor_type == "bayesian_normal":
+            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
         else:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
new file mode 100755
index 0000000000000000000000000000000000000000..e488ac7d67833631919f88b9e660a99b363b90d0
--- /dev/null
+++ b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
@@ -0,0 +1,48 @@
+"""Contain the online Bayesian normalization augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class OnlineBayesianNormalizationAugmentor(AugmentorBase):
+    """Augmentation model for adding online Bayesian normalization.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param target_db: Target RMS value in decibels.
+    :type target_db: float
+    :param prior_db: Prior RMS estimate in decibels.
+    :type prior_db: float
+    :param prior_samples: Prior strength in number of samples.
+    :type prior_samples: int
+    :param startup_delay: Default 0.0s. If provided, this function will
+                          accrue statistics for the first startup_delay
+                          seconds before applying online normalization.
+    :type startup_delay: float
+    """
+
+    def __init__(self,
+                 rng,
+                 target_db,
+                 prior_db,
+                 prior_samples,
+                 startup_delay=0.0):
+        self._target_db = target_db
+        self._prior_db = prior_db
+        self._prior_samples = prior_samples
+        self._rng = rng
+        self._startup_delay = startup_delay
+
+    def transform_audio(self, audio_segment):
+        """Normalizes the input audio using the online Bayesian approach.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
+                                                self._prior_samples,
+                                                self._startup_delay)
diff --git a/deep_speech_2/data_utils/augmentor/resample.py b/deep_speech_2/data_utils/augmentor/resample.py
new file mode 100755
index 0000000000000000000000000000000000000000..8df17f3a869420bca1e4e6c0ae9b4035f7d50d8d
--- /dev/null
+++ b/deep_speech_2/data_utils/augmentor/resample.py
@@ -0,0 +1,33 @@
+"""Contain the resample augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class ResampleAugmentor(AugmentorBase):
+    """Augmentation model for resampling.
+
+    See more info here:
+    https://ccrma.stanford.edu/~jos/resample/index.html
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param new_sample_rate: New sample rate in Hz.
+    :type new_sample_rate: int
+    """
+
+    def __init__(self, rng, new_sample_rate):
+        self._new_sample_rate = new_sample_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Resamples the input audio to a target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.resample(self._new_sample_rate)
diff --git a/deep_speech_2/data_utils/augmentor/speed_perturb.py b/deep_speech_2/data_utils/augmentor/speed_perturb.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc5738bd155a5871817039f5ccb3c4707ff87a6c
--- /dev/null
+++ b/deep_speech_2/data_utils/augmentor/speed_perturb.py
@@ -0,0 +1,47 @@
+"""Contain the speed perturbation augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class SpeedPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding speed perturbation.
+
+    See reference paper here:
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_speed_rate: Lower bound of the new speed rate to sample; it
+                           should not be smaller than 0.9.
+    :type min_speed_rate: float
+    :param max_speed_rate: Upper bound of the new speed rate to sample; it
+                           should not be larger than 1.1.
+    :type max_speed_rate: float
+    """
+
+    def __init__(self, rng, min_speed_rate, max_speed_rate):
+        if min_speed_rate < 0.9:
+            raise ValueError(
+                "Sampling speed below 0.9 can cause unnatural effects")
+        if max_speed_rate > 1.1:
+            raise ValueError(
+                "Sampling speed above 1.1 can cause unnatural effects")
+        self._min_speed_rate = min_speed_rate
+        self._max_speed_rate = max_speed_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Sample a new speed rate from the given range and change the speed
+        of the given audio clip.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+ :type audio_segment: AudioSegment|SpeechSegment + """ + sampled_speed = self._rng.uniform(self._min_speed_rate, + self._max_speed_rate) + audio_segment.change_speed(sampled_speed) diff --git a/deep_speech_2/data_utils/augmentor/volume_perturb.py b/deep_speech_2/data_utils/augmentor/volume_perturb.py index 62631fb041c45350811b2cd2dd78d6758a622db8..758676d558d8e4d77191504d0d7b75cefe020549 100644 --- a/deep_speech_2/data_utils/augmentor/volume_perturb.py +++ b/deep_speech_2/data_utils/augmentor/volume_perturb.py @@ -37,4 +37,4 @@ class VolumePerturbAugmentor(AugmentorBase): :type audio_segment: AudioSegmenet|SpeechSegment """ gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS) - audio_segment.apply_gain(gain) + audio_segment.gain_db(gain) diff --git a/deep_speech_2/requirements.txt b/deep_speech_2/requirements.txt old mode 100644 new mode 100755 index ce0245916b17933bdb99b6faa772f2255e80bb69..2ae7d0895a3594059e995e20d106f7c30ef92568 --- a/deep_speech_2/requirements.txt +++ b/deep_speech_2/requirements.txt @@ -1,4 +1,4 @@ -SoundFile==0.9.0.post1 wget==3.2 scipy==0.13.1 +resampy==0.1.5 https://github.com/kpu/kenlm/archive/master.zip diff --git a/deep_speech_2/setup.sh b/deep_speech_2/setup.sh index cdec34ff07048a691f19658711a855ada40db9f0..8cba91ecdb68b42125181331471f9ee323062a24 100644 --- a/deep_speech_2/setup.sh +++ b/deep_speech_2/setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # install python dependencies -if [ -f 'requirements.txt' ]; then +if [ -f "requirements.txt" ]; then pip install -r requirements.txt fi if [ $? != 0 ]; then @@ -9,21 +9,21 @@ if [ $? != 0 ]; then exit 1 fi -# install scikits.samplerate -curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz" +# install package Soundfile +curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" if [ $? != 0 ]; then - echo "Download libsamplerate-0.1.9.tar.gz failed !!!" + echo "Download libsndfile-1.0.28.tar.gz failed !!!" exit 1 fi -tar -xvf libsamplerate-0.1.9.tar.gz -cd libsamplerate-0.1.9 +tar -zxvf libsndfile-1.0.28.tar.gz +cd libsndfile-1.0.28 ./configure && make && make install cd - -rm -rf libsamplerate-0.1.9 -rm libsamplerate-0.1.9.tar.gz -pip install scikits.samplerate==0.3.3 +rm -rf libsndfile-1.0.28 +rm libsndfile-1.0.28.tar.gz +pip install SoundFile==0.9.0.post1 if [ $? != 0 ]; then - echo "Install scikits.samplerate failed !!!" + echo "Install SoundFile failed !!!" exit 1 fi
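Note on the resampling switch: `AudioSegment.resample` now delegates to resampy instead of scikits.samplerate, so the new `filter` argument maps directly onto resampy's `'kaiser_best'`/`'kaiser_fast'` windows (a quality/speed trade-off). The sketch below shows the equivalent call on a bare numpy array; the file name and the 16 kHz target rate are made up for illustration, and the input is assumed to be mono.

```python
import resampy
import soundfile

# Hypothetical mono input file and target rate, for illustration only.
samples, sample_rate = soundfile.read("speech.wav", dtype="float32")

# The same call AudioSegment.resample() now makes internally:
# high-quality Kaiser-windowed resampling to 16 kHz.
resampled = resampy.resample(samples, sample_rate, 16000, filter="kaiser_best")

# 'kaiser_fast' gives a quicker but slightly lower-quality result,
# playing the role of the old 'sinc_fastest' option.
resampled_fast = resampy.resample(samples, sample_rate, 16000, filter="kaiser_fast")
```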
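The three augmentor types newly registered in `AugmentationPipeline._get_augmentor` ("speed", "resample", "bayesian_normal") can also be exercised one at a time, which is handy when checking a single effect in isolation. A minimal sketch, assuming an `AudioSegment` can be loaded with a `from_file`-style constructor; that constructor name, the file path, and the parameter values are illustrative and not part of this diff.

```python
import random

from data_utils.audio import AudioSegment
from data_utils.augmentor.online_bayesian_normalization import \
    OnlineBayesianNormalizationAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor

rng = random.Random(0)

# Illustrative parameters; the speed range stays inside the [0.9, 1.1]
# limits enforced by SpeedPerturbAugmentor.__init__.
augmentors = [
    SpeedPerturbAugmentor(rng, min_speed_rate=0.95, max_speed_rate=1.05),
    ResampleAugmentor(rng, new_sample_rate=8000),
    OnlineBayesianNormalizationAugmentor(
        rng, target_db=-20, prior_db=-20, prior_samples=100, startup_delay=0.0),
]

# Assumed loader; adapt to however AudioSegment instances are created.
audio = AudioSegment.from_file("some_utterance.wav")

# Each augmentor modifies the segment in place via transform_audio().
for augmentor in augmentors:
    augmentor.transform_audio(audio)
```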