resolve conflicts in requirements.txt

2b594b4e · Yibing Liu · 20b50ca4 · 08ab956f · 2b594b4e · 2b594b4e
9 changed file
--- a/README.md
+++ b/README.md
@@ -17,13 +17,11 @@ PaddlePaddle提供了丰富的运算单元，帮助大家以模块化的方式
 - 1.2 [噪声对比估计加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)


-## 2. 语言模型
+## 2. 使用循环神经网络语言模型生成文本

-语言模型是自然语言处理领域里一个重要的基础模型，它是一个概率分布模型，利用它可以确定哪个词序列的可能性更大，或者给定若干个词，可以预测下一个最可能出现的词。语言模型被应用在很多领域，如：自动写作、QA、机器翻译、拼写检查、语音识别、词性标注等。
+语言模型是自然语言处理领域里一个重要的基础模型，除了得到词向量（语言模型训练的副产物），还可以帮助我们生成文本。给定若干个词，语言模型可以帮助我们预测下一个最可能出现的词。在利用语言模型生成文本的例子中，我们重点介绍循环神经网络语言模型，大家可以通过文档中的使用说明快速适配到自己的训练语料，完成自动写诗、自动写散文等有趣的模型。

-在语言模型的例子中，我们以文本生成为例，提供了RNN LM（包括LSTM、GRU）和N-Gram LM，供大家学习和使用。用户可以通过文档中的 “使用说明” 快速上手：适配训练语料，以训练 “自动写诗”、“自动写散文” 等有趣的模型。
-
- 2.1 [基于LSTM、GRU、N-Gram的文本生成模型](https://github.com/PaddlePaddle/models/tree/develop/language_model)
+- 2.1 [使用循环神经网络语言模型生成文本](https://github.com/PaddlePaddle/models/tree/develop/generate_sequence_by_rnn_lm)

 ## 3. 点击率预估

@@ -65,6 +63,14 @@ PaddlePaddle提供了丰富的运算单元，帮助大家以模块化的方式

 - 7.1 [无注意力机制的编码器解码器模型](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention)

+## 8. 图像分类
+图像相比文字能够提供更加生动、容易理解及更具艺术感的信息，是人们传递与交换信息的重要来源。在图像分类的例子中，我们向大家介绍如何在PaddlePaddle中训练AlexNet、VGG、GoogLeNet和ResNet模型。同时还提供了一个模型转换工具，能够将Caffe训练好的模型文件，转换为PaddlePaddle的模型文件。
+
+- 8.1 [将Caffe模型文件转换为PaddlePaddle模型文件](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle)
+- 8.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+- 8.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+- 8.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
+

 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import numpy as np
 import io
 import soundfile
-import scikits.samplerate
+import resampy
 from scipy import signal
 import random
 import copy
@@ -308,7 +308,7 @@ class AudioSegment(object):
        prior_mean_squared = 10.**(prior_db / 10.)
        prior_sum_of_squares = prior_mean_squared * prior_samples
        cumsum_of_squares = np.cumsum(self.samples**2)
-        sample_count = np.arange(len(self.num_samples)) + 1
+        sample_count = np.arange(self.num_samples) + 1
        if startup_sample_idx > 0:
            cumsum_of_squares[:startup_sample_idx] = \
                cumsum_of_squares[startup_sample_idx]
@@ -321,21 +321,19 @@ class AudioSegment(object):
        gain_db = target_db - rms_estimate_db
        self.gain_db(gain_db)

-    def resample(self, target_sample_rate, quality='sinc_medium'):
+    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
-        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
-                        Sets resampling speed/quality tradeoff.
-                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
-        :type quality: str
+        :param filter: The resampling filter to use, one of
+                       {'kaiser_best', 'kaiser_fast'}.
+        :type filter: str
        """
-        resample_ratio = target_sample_rate / self._sample_rate
-        self._samples = scikits.samplerate.resample(
-            self._samples, r=resample_ratio, type=quality)
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
        self._sample_rate = target_sample_rate

    def pad_silence(self, duration, sides='both'):

--- a/deep_speech_2/data_utils/augmentor/augmentation.py
+++ b/deep_speech_2/data_utils/augmentor/augmentation.py
@@ -7,6 +7,10 @@ import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
 from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
+from data_utils.augmentor.resample import ResampleAugmentor
+from data_utils.augmentor.online_bayesian_normalization import \
+     OnlineBayesianNormalizationAugmentor


 class AugmentationPipeline(object):
@@ -79,5 +83,11 @@ class AugmentationPipeline(object):
            return VolumePerturbAugmentor(self._rng, **params)
        elif augmentor_type == "shift":
            return ShiftPerturbAugmentor(self._rng, **params)
+        elif augmentor_type == "speed":
+            return SpeedPerturbAugmentor(self._rng, **params)
+        elif augmentor_type == "resample":
+            return ResampleAugmentor(self._rng, **params)
+        elif augmentor_type == "bayesian_normal":
+            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
--- a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
+++ b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
+"""Contain the online bayesian normalization augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class OnlineBayesianNormalizationAugmentor(AugmentorBase):
+    """Augmentation model for adding online bayesian normalization.
+
+    The constructor only stores its arguments; the actual normalization is
+    delegated to ``normalize_online_bayesian`` of the audio segment passed
+    to :meth:`transform_audio`.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param target_db: Target RMS value in decibels.
+    :type target_db: float
+    :param prior_db: Prior RMS estimate in decibels.
+    :type prior_db: float
+    :param prior_samples: Prior strength in number of samples.
+    :type prior_samples: int
+    :param startup_delay: Default 0.0s. If provided, this function will
+                          accrue statistics for the first startup_delay
+                          seconds before applying online normalization.
+    :type startup_delay: float
+    """
+
+    def __init__(self,
+                 rng,
+                 target_db,
+                 prior_db,
+                 prior_samples,
+                 startup_delay=0.0):
+        self._target_db = target_db
+        self._prior_db = prior_db
+        self._prior_samples = prior_samples
+        # Stored, but not read by transform_audio in this class.
+        self._rng = rng
+        self._startup_delay = startup_delay
+
+    def transform_audio(self, audio_segment):
+        """Normalizes the input audio using the online Bayesian approach.
+
+        Calls ``audio_segment.normalize_online_bayesian`` with the target
+        level, prior level, prior strength and startup delay given at
+        construction time.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
+                                                self._prior_samples,
+                                                self._startup_delay)
--- a/deep_speech_2/data_utils/augmentor/resample.py
+++ b/deep_speech_2/data_utils/augmentor/resample.py
+"""Contain the resample augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class ResampleAugmentor(AugmentorBase):
+    """Augmentation model for resampling.
+
+    See more info here:
+    https://ccrma.stanford.edu/~jos/resample/index.html
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param new_sample_rate: New sample rate in Hz.
+    :type new_sample_rate: int
+    """
+
+    def __init__(self, rng, new_sample_rate):
+        self._new_sample_rate = new_sample_rate
+        # Stored, but not read by transform_audio in this class.
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Resamples the input audio to a target sample rate.
+
+        Calls ``audio_segment.resample`` with only the new sample rate, so
+        any other argument of that method keeps its default value.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.resample(self._new_sample_rate)
--- a/deep_speech_2/data_utils/augmentor/speed_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/speed_perturb.py
+"""Contain the speed perturbation augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+
+
+class SpeedPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding speed perturbation.
+
+    See reference paper here:
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_speed_rate: Lower bound of new speed rate to sample and should
+                           not be smaller than 0.9.
+    :type min_speed_rate: float
+    :param max_speed_rate: Upper bound of new speed rate to sample and should
+                           not be larger than 1.1.
+    :type max_speed_rate: float
+    :raises ValueError: If min_speed_rate is below 0.9 or max_speed_rate is
+                        above 1.1.
+    """
+
+    def __init__(self, rng, min_speed_rate, max_speed_rate):
+        # Reject bounds outside [0.9, 1.1] up front, before storing anything.
+        if min_speed_rate < 0.9:
+            raise ValueError(
+                "Sampling speed below 0.9 can cause unnatural effects")
+        if max_speed_rate > 1.1:
+            raise ValueError(
+                "Sampling speed above 1.1 can cause unnatural effects")
+        self._min_speed_rate = min_speed_rate
+        self._max_speed_rate = max_speed_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Sample a new speed rate from the given range and change the
+        speed of the given audio clip accordingly.
+
+        The rate is drawn with ``rng.uniform(min_speed_rate, max_speed_rate)``
+        on every call and passed to ``audio_segment.change_speed``.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        sampled_speed = self._rng.uniform(self._min_speed_rate,
+                                          self._max_speed_rate)
+        audio_segment.change_speed(sampled_speed)
--- a/deep_speech_2/data_utils/augmentor/volume_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/volume_perturb.py
@@ -37,4 +37,4 @@ class VolumePerturbAugmentor(AugmentorBase):
        :type audio_segment: AudioSegmenet|SpeechSegment
        """
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
-        audio_segment.apply_gain(gain)
+        audio_segment.gain_db(gain)
--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
-SoundFile==0.9.0.post1
 wget==3.2
 scipy==0.13.1
+resampy==0.1.5
 https://github.com/kpu/kenlm/archive/master.zip
--- a/deep_speech_2/setup.sh
+++ b/deep_speech_2/setup.sh
 #!/bin/bash

 # install python dependencies
-if [ -f 'requirements.txt' ]; then
+if [ -f "requirements.txt" ]; then
    pip install -r requirements.txt
 fi
 if [ $? != 0 ]; then
@@ -9,21 +9,21 @@ if [ $? != 0 ]; then
    exit 1
 fi

-# install scikits.samplerate
-curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
+# install package Soundfile
+curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
 if [ $? != 0 ]; then
-    echo "Download libsamplerate-0.1.9.tar.gz failed !!!"
+    echo "Download libsndfile-1.0.28.tar.gz failed !!!"
    exit 1
 fi
-tar -xvf libsamplerate-0.1.9.tar.gz
-cd libsamplerate-0.1.9
+tar -zxvf libsndfile-1.0.28.tar.gz
+cd libsndfile-1.0.28
 ./configure && make && make install
 cd -
-rm -rf libsamplerate-0.1.9
-rm libsamplerate-0.1.9.tar.gz
-pip install scikits.samplerate==0.3.3
+rm -rf libsndfile-1.0.28
+rm libsndfile-1.0.28.tar.gz
+pip install SoundFile==0.9.0.post1
 if [ $? != 0 ]; then
-    echo "Install scikits.samplerate failed !!!"
+    echo "Install SoundFile failed !!!"
    exit 1
 fi