Commit 9ec357ed authored by X xushaoyong

add 3 augmentors and unit tests

group: deprecated-2017Q2
language: cpp
cache: ccache
sudo: required
......
@@ -13,17 +13,15 @@ PaddlePaddle provides a rich set of computation units so that models can be built in a modular way

In the word-embedding examples, we show how to use Hierarchical Sigmoid (Hsigmoid) and Noise Contrastive Estimation (NCE) to speed up the training of word vectors.

- 1.1 [Accelerating word-vector training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
- 1.2 [Accelerating word-vector training with NCE](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)

## 2. Generating text with an RNN language model

A language model is a fundamental model in natural language processing. Besides yielding word vectors as a by-product of training, it can help us generate text: given several words, it predicts the next most likely word. In the text-generation example we focus on the recurrent neural network language model; following the usage notes in its documentation, you can quickly adapt it to your own corpus and train fun models such as automatic poem or prose writers.

- 2.1 [Generating text with an RNN language model](https://github.com/PaddlePaddle/models/tree/develop/generate_sequence_by_rnn_lm)

## 3. Click-through rate prediction

@@ -65,6 +63,14 @@ PaddlePaddle provides a rich set of computation units so that models can be built in a modular way

- 7.1 [Encoder-decoder model without attention](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention)

## 8. Image classification

Compared with text, images convey information that is more vivid, easier to understand, and more expressive, making them an important medium for people to share and exchange information. The image-classification example shows how to train AlexNet, VGG, GoogLeNet, and ResNet models in PaddlePaddle. It also provides a conversion tool that turns model files trained with Caffe into PaddlePaddle model files.

- 8.1 [Converting Caffe model files to PaddlePaddle model files](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle)
- 8.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
@@ -51,13 +51,13 @@ python compute_mean_std.py --help
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py
```
For CPU Training:
```
python train.py --use_gpu False
```
More help for arguments:
......
@@ -66,6 +66,54 @@ class AudioSegment(object):
samples, sample_rate = soundfile.read(file, dtype='float32')
return cls(samples, sample_rate)
@classmethod
def slice_from_file(cls, file, start=None, end=None):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = 0. if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)
@classmethod
def from_bytes(cls, bytes):
"""Create audio segment from a byte string containing audio samples.
@@ -105,6 +153,20 @@ class AudioSegment(object):
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)
def to_wav_file(self, filepath, dtype='float32'):
"""Save audio segment to disk as wav file.
@@ -130,68 +192,6 @@ class AudioSegment(object):
format='WAV',
subtype=subtype_map[dtype])
def superimpose(self, other):
"""Add samples from another segment to those of this segment
(sample-wise addition, not segment concatenation).
@@ -225,7 +225,7 @@ class AudioSegment(object):
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples.tostring()
def gain_db(self, gain):
"""Apply gain in decibels to samples.
Note that this is an in-place transformation.
@@ -278,7 +278,7 @@ class AudioSegment(object):
"Unable to normalize segment to %f dB because the "
"probable gain exceeds max_gain_db (%f dB)" %
(target_db, max_gain_db))
self.gain_db(min(max_gain_db, target_db - self.rms_db))
def normalize_online_bayesian(self,
target_db,
@@ -319,7 +319,7 @@ class AudioSegment(object):
rms_estimate_db = 10 * np.log10(mean_squared_estimate)
# Compute required time-varying gain.
gain_db = target_db - rms_estimate_db
self.gain_db(gain_db)
def resample(self, target_sample_rate, filter='kaiser_best'):
"""Resample the audio to a target sample rate.
@@ -329,9 +329,10 @@ class AudioSegment(object):
:param target_sample_rate: Target sample rate.
:type target_sample_rate: int
:param filter: The resampling filter to use, one of {'kaiser_best',
'kaiser_fast'}.
:type filter: str
"""
resample_ratio = target_sample_rate / self._sample_rate
self._samples = resampy.resample(
self.samples, self.sample_rate, target_sample_rate, filter=filter)
self._sample_rate = target_sample_rate
@@ -364,6 +365,31 @@ class AudioSegment(object):
raise ValueError("Unknown value for the sides %s" % sides)
self._samples = padded._samples
def shift(self, shift_ms):
"""Shift the audio in time. If `shift_ms` is positive, shift with time
advance; if negative, shift with time delay. Silence is padded to
keep the duration unchanged.
Note that this is an in-place transformation.
:param shift_ms: Shift time in milliseconds. If positive, shift with
time advance; if negative, shift with time delay.
:type shift_ms: float
:raises ValueError: If the absolute value of shift_ms exceeds the audio duration.
"""
if abs(shift_ms) / 1000.0 > self.duration:
raise ValueError("Absolute value of shift_ms should be smaller "
"than audio duration.")
shift_samples = int(shift_ms * self._sample_rate / 1000)
if shift_samples > 0:
# time advance
self._samples[:-shift_samples] = self._samples[shift_samples:]
self._samples[-shift_samples:] = 0
elif shift_samples < 0:
# time delay
self._samples[-shift_samples:] = self._samples[:shift_samples]
self._samples[:-shift_samples] = 0
def subsegment(self, start_sec=None, end_sec=None):
"""Cut the AudioSegment between given boundaries.
@@ -503,7 +529,7 @@ class AudioSegment(object):
noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
noise_new = copy.deepcopy(noise)
noise_new.random_subsegment(self.duration, rng=rng)
noise_new.gain_db(noise_gain_db)
self.superimpose(noise_new)
@property
......
@@ -6,6 +6,7 @@ from __future__ import print_function
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
@@ -79,11 +80,13 @@ class AugmentationPipeline(object):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
elif augmentor_type == "speed":
return SpeedPerturbAugmentor(self._rng, **params)
elif augmentor_type == "resample":
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
@@ -30,4 +30,4 @@ class ResampleAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
audio_segment.resample(self._new_sample_rate)
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class ShiftPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random shift perturbation.
:param rng: Random generator object.
:type rng: random.Random
:param min_shift_ms: Minimal shift in milliseconds.
:type min_shift_ms: float
:param max_shift_ms: Maximal shift in milliseconds.
:type max_shift_ms: float
"""
def __init__(self, rng, min_shift_ms, max_shift_ms):
self._min_shift_ms = min_shift_ms
self._max_shift_ms = max_shift_ms
self._rng = rng
def transform_audio(self, audio_segment):
"""Shift audio.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
audio_segment.shift(shift_ms)
@@ -14,20 +14,21 @@ class SpeedPerturbAugmentor(AugmentorBase):
:param rng: Random generator object.
:type rng: random.Random
:param min_speed_rate: Lower bound of new speed rate to sample, which should
not be below 0.9.
:type min_speed_rate: float
:param max_speed_rate: Upper bound of new speed rate to sample, which should
not be above 1.1.
:type max_speed_rate: float
"""
def __init__(self, rng, min_speed_rate, max_speed_rate):
if min_speed_rate < 0.9:
raise ValueError(
"Sampling speed below 0.9 can cause unnatural effects")
if max_speed_rate > 1.1:
raise ValueError(
"Sampling speed above 1.1 can cause unnatural effects")
self._min_speed_rate = min_speed_rate
self._max_speed_rate = max_speed_rate
self._rng = rng
......
@@ -37,4 +37,4 @@ class VolumePerturbAugmentor(AugmentorBase):
:type audio_segment: AudioSegment|SpeechSegment
"""
gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
audio_segment.gain_db(gain)
@@ -7,6 +7,7 @@ from __future__ import print_function
import random
import numpy as np
import multiprocessing
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
@@ -44,6 +45,11 @@ class DataGenerator(object):
:type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param use_dB_normalization: Whether to normalize the audio to -20 dB
before extracting the features.
:type use_dB_normalization: bool
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
"""
@@ -58,6 +64,8 @@ class DataGenerator(object):
window_ms=20.0,
max_freq=None,
specgram_type='linear',
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
self._min_duration = min_duration
@@ -69,7 +77,9 @@ class DataGenerator(object):
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
use_dB_normalization=use_dB_normalization)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0
@@ -207,10 +217,14 @@ class DataGenerator(object):
def reader():
for instance in manifest:
yield instance
def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])
return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)
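The `xmap_readers` call above is what parallelizes preprocessing: `mapper` (which calls `_process_utterance`) runs in `self._num_threads` worker threads over the manifest entries produced by `reader`, with a buffer of 1024 items, and `order=True` keeps results in input order. A minimal standalone sketch of the same pattern, with a toy reader and mapper that are illustrative only:

```python
import paddle.v2 as paddle

def reader():
    # stands in for iterating over manifest entries
    for i in range(8):
        yield i

def mapper(x):
    # stands in for the expensive per-utterance feature extraction
    return x * x

# 4 worker threads, buffer of 1024 items, results kept in input order
parallel_reader = paddle.reader.xmap_readers(mapper, reader, 4, 1024, order=True)
for value in parallel_reader():
    print(value)
```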
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
......
@@ -24,26 +24,64 @@ class AudioFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Audio is resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
""" """
def __init__(self,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
self._stride_ms = stride_ms
self._window_ms = window_ms
self._max_freq = max_freq
self._target_sample_rate = target_sample_rate
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB
def featurize(self,
audio_segment,
allow_downsampling=True,
allow_upsampling=True):
"""Extract audio features from AudioSegment or SpeechSegment.
:param audio_segment: Audio/speech segment to extract features from.
:type audio_segment: AudioSegment|SpeechSegment
:param allow_downsampling: Whether to allow audio downsampling before
featurizing.
:type allow_downsampling: bool
:param allow_upsampling: Whether to allow audio upsampling before
featurizing.
:type allow_upsampling: bool
:return: Spectrogram audio feature in 2darray.
:rtype: ndarray
:raises ValueError: If audio sample rate is not supported.
"""
# upsampling or downsampling
if ((audio_segment.sample_rate > self._target_sample_rate and
allow_downsampling) or
(audio_segment.sample_rate < self._target_sample_rate and
allow_upsampling)):
audio_segment.resample(self._target_sample_rate)
if audio_segment.sample_rate != self._target_sample_rate:
raise ValueError("Audio sample rate is not supported. "
"Turn allow_downsampling or allow up_sampling on.")
# decibel normalization
if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)
......
@@ -29,6 +29,15 @@ class SpeechFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Speech is resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
""" """
def __init__(self, def __init__(self,
...@@ -36,9 +45,18 @@ class SpeechFeaturizer(object): ...@@ -36,9 +45,18 @@ class SpeechFeaturizer(object):
specgram_type='linear', specgram_type='linear',
stride_ms=10.0, stride_ms=10.0,
window_ms=20.0, window_ms=20.0,
max_freq=None): max_freq=None,
self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms, target_sample_rate=16000,
window_ms, max_freq) use_dB_normalization=True,
target_dB=-20):
self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB)
self._text_featurizer = TextFeaturizer(vocab_filepath)
def featurize(self, speech_segment):
......
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
return cls(samples, sample_rate, transcripts)
@classmethod
def slice_from_file(cls, filepath, transcript, start=None, end=None):
"""Loads a small section of a speech file without having to load
the entire file into memory, which can be incredibly wasteful.
......
@@ -6,6 +6,7 @@ from __future__ import print_function
import argparse
import gzip
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import deep_speech2
@@ -38,6 +39,11 @@ parser.add_argument(
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
@@ -50,7 +56,7 @@ parser.add_argument(
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
@@ -67,7 +73,8 @@ def infer():
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}',
num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
......
"""Test augmentor class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
from data_utils import audio
from data_utils.augmentor.augmentation import AugmentationPipeline
import random
import numpy as np
random_seed = 0
#audio instance
audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\
-1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.1057189e-03,\
-2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.4604492e-04,\
-1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.3193354e-03,\
-2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.3803719e-03]
audio_data = np.array(audio_data)
samplerate = 10
class TestAugmentor(unittest.TestCase):
def test_volume(self):
config_json = '[{"type": "volume","params": {"min_gain_dBFS": -15, '\
'"max_gain_dBFS": 15},"prob": 1.0}]'
aug_pipeline = AugmentationPipeline(
augmentation_config=config_json, random_seed=random_seed)
audio_seg = audio.AudioSegment(audio_data, samplerate)
aug_pipeline.transform_audio(audio_seg)
orig_audio = audio.AudioSegment(audio_data, samplerate)
self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
def test_speed(self):
config_json = '[{"type":"speed","params": {"min_speed_rate": 0.9,' \
'"max_speed_rate": 1.1},"prob": 1.0}]'
aug_pipeline = AugmentationPipeline(
augmentation_config=config_json, random_seed=random_seed)
audio_seg = audio.AudioSegment(audio_data, samplerate)
aug_pipeline.transform_audio(audio_seg)
orig_audio = audio.AudioSegment(audio_data, samplerate)
self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
def test_resample(self):
config_json = '[{"type":"resample","params": {"new_sample_rate":5},'\
'"prob": 1.0}]'
aug_pipeline = AugmentationPipeline(
augmentation_config=config_json, random_seed=random_seed)
audio_seg = audio.AudioSegment(audio_data, samplerate)
aug_pipeline.transform_audio(audio_seg)
self.assertTrue(audio_seg.sample_rate == 5)
def test_bayesial(self):
config_json = '[{"type":"bayesian_normal","params":{"target_db":-20,' \
'"prior_db":-4, "prior_samples": -8, "startup_delay": 0.0},"prob":1.0}]'
aug_pipeline = AugmentationPipeline(
augmentation_config=config_json, random_seed=random_seed)
audio_seg = audio.AudioSegment(audio_data, samplerate)
aug_pipeline.transform_audio(audio_seg)
orig_audio = audio.AudioSegment(audio_data, samplerate)
self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
if __name__ == '__main__':
unittest.main()
@@ -9,6 +9,7 @@ import argparse
import gzip
import time
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from model import deep_speech2
from data_utils.data import DataGenerator
@@ -16,10 +17,10 @@ import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size", default=256, type=int, help="Minibatch size.")
parser.add_argument(
"--num_passes",
default=200,
type=int,
help="Training pass number. (default: %(default)s)")
parser.add_argument(
@@ -52,17 +53,34 @@ parser.add_argument(
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
@@ -92,7 +110,9 @@ parser.add_argument(
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='[{"type": "shift", '
'"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
'"prob": 1.0}]',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
@@ -107,7 +127,10 @@ def train():
return DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config,
max_duration=args.max_duration,
min_duration=args.min_duration,
num_threads=args.num_threads_data)
train_generator = data_generator()
test_generator = data_generator()
@@ -168,7 +191,7 @@ def train():
print("\nPass: %d, Batch: %d, TrainCost: %f" % (
event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
cost_sum, cost_counter = 0.0, 0
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
@@ -181,6 +204,9 @@ def train():
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
......
@@ -50,7 +50,7 @@ def train_data(filename, word_dict, n):
```

## Network structure

We obtain word vectors by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the corresponding word vectors are looked up from the embedding table, the 4 word vectors are concatenated and passed through a fully connected hidden layer, followed by the `Hsigmoid` layer. The detailed network structure is shown in Figure 2:

<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
@@ -60,41 +60,27 @@ def train_data(filename, word_dict, n):

The code is implemented as follows:

```python
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
@@ -105,27 +91,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
Note that at prediction time we transpose the hsigmoid parameters; the number of output classes here is the dictionary size minus 1, which equals the number of non-leaf nodes.

## Training

Training is straightforward: simply run ``` python train.py ```. On its first run the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if it is not. During training, the training and test losses are printed every 100 iterations, and the model is saved once per pass.

## Prediction

For prediction, run ``` python infer.py ```. The program first loads the model, then predicts batch by batch and prints the results. The key step of the prediction stage is deriving the encoding path from the probabilities and then traversing that path to obtain the final predicted class; this logic is shown below:

```python
def decode_res(infer_res, dict_size):
......
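The elided `decode_res` above walks the binary tree encoded by the Hsigmoid output. As an illustration only (the traversal convention below is an assumption, and the `decode_res` shipped with this example may differ), decoding a predicted word id from the `dict_size - 1` non-leaf probabilities could look like this:

```python
# Illustrative sketch, not the shipped implementation: walk a heap-ordered
# complete binary tree whose internal nodes carry the dict_size - 1 sigmoid
# outputs; the leaf reached is taken as the predicted word id.
def decode_res_sketch(infer_res, dict_size):
    predict_lbls = []
    for probs in infer_res:               # one row of non-leaf probabilities per sample
        idx = 0
        while idx < dict_size - 1:        # internal nodes occupy indices [0, dict_size - 2]
            go_right = int(probs[idx] > 0.5)
            idx = 2 * idx + 1 + go_right  # left child if 0, right child if 1 (assumed convention)
        predict_lbls.append(idx - (dict_size - 1))  # leaf offset == word id
    return predict_lbls
```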
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def decode_res(infer_res, dict_size):
@@ -36,32 +41,32 @@ def decode_res(infer_res, dict_size):
return predict_lbls
def predict(batch_ins, idx_word_dict, dict_size, inferer):
infer_res = inferer.infer(input=batch_ins)
predict_lbls = decode_res(infer_res, dict_size)
predict_words = [idx_word_dict[lbl] for lbl in predict_lbls]  # map to word
# Output format: word1 word2 word3 word4 -> predict label
for i, ins in enumerate(batch_ins):
print(" ".join([idx_word_dict[w]
for w in ins]) + " -> " + predict_words[i])
def main(model_path):
assert os.path.exists(model_path), "trained model does not exist."
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
prediction_layer = ngram_lm(
is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
with gzip.open(model_path, "r") as f:
parameters = paddle.parameters.Parameters.from_tar(f)
inferer = paddle.inference.Inference(
output_layer=prediction_layer, parameters=parameters)
idx_word_dict = dict((v, k) for k, v in word_dict.items())
batch_size = 64
batch_ins = []
@@ -70,14 +75,12 @@ def main():
for ins in ins_iter():
batch_ins.append(ins[:-1])
if len(batch_ins) == batch_size:
predict(batch_ins, idx_word_dict, dict_size, inferer)
batch_ins = []
if len(batch_ins) > 0:
predict(batch_ins, idx_word_dict, dict_size, inferer)
if __name__ == "__main__":
main("models/hsigmoid_batch_00010.tar.gz")
@@ -5,32 +5,22 @@ import math
import paddle.v2 as paddle
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
@@ -46,15 +36,14 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def main(save_dir="models"):
if not os.path.exists(save_dir):
os.mkdir(save_dir)
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
def event_handler(event):
if isinstance(event, paddle.event.EndPass):
model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
event.pass_id)
logger.info("Save model into %s ..." % model_name)
with gzip.open(model_name, "w") as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndIteration):
if event.batch_id and event.batch_id % 10 == 0:
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, 5), 32))
logger.info(
"Pass %d, Batch %d, Cost %f, Test Cost %f" %
(event.pass_id, event.batch_id, event.cost, result.cost))
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
@@ -48,9 +49,8 @@ def main():
lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
buf_size=1000), 64),
num_passes=30,
event_handler=event_handler)
if __name__ == "__main__":
main()
# Named Entity Recognition

The brief directory structure of this example is as follows:

```text
.
├── data                # data this example depends on
│   ├── download.sh
├── images              # figures used in the README
├── index.html
├── infer.py            # inference script
├── network_conf.py     # model definition
├── reader.py           # data reading interface
├── README.md           # documentation
├── train.py            # training script
└── utils.py            # common helper functions
```
## Introduction

Named Entity Recognition (NER), also called "proper-name recognition", is the task of identifying entities with specific meaning in text, mainly person names, place names, organization names, and other proper nouns. It is a fundamental problem in natural language processing. An NER task usually involves two parts, identifying entity boundaries and determining entity categories, and can be solved as a sequence labeling problem.

Sequence labeling can be divided into three categories: Sequence Classification, Segment Classification, and Temporal Classification [[1](#参考文献)]. This example only considers Segment Classification, i.e., producing a label in the output sequence for every element of the input sequence. For NER, since entity boundaries have to be marked, the label set defined by the [BIO tagging scheme](http://book.paddlepaddle.org/07.label_semantic_roles/) is commonly used. Below is an example of an NER annotation:

<div align="center">
<img src="images/ner_label_ins.png" width = "80%" align=center /><br>
Figure 1. Example of BIO tagging
</div>

Entity boundaries and entity categories can be read directly off the sequence labeling result. Similarly, word segmentation, part-of-speech tagging, chunking, [semantic role labeling](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html), and other tasks can all be solved through sequence labeling. The usual way to solve such problems with neural networks is: the earlier layers learn a feature representation of the input, and the last layer completes the final task on top of those features. For sequence labeling, the common practice is to learn features with an RNN-based network and feed them into a CRF layer that performs the labeling; in effect, the linear model of a traditional CRF is replaced by a non-linear neural network. CRF is kept because it uses sentence-level likelihood, which better handles the label-bias problem [[2](#参考文献)]. This example builds its model along these lines; although NER is used as the running example, the model can be applied to many other sequence labeling tasks.

Because sequence labeling is so widely applicable, classic sequence models such as the [CRF](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html) were developed, but most of them can only use local information or require hand-crafted features. With the progress of deep learning, sequence models such as recurrent neural networks (RNN) can model dependencies between sequence elements and learn feature representations directly from the raw input text, making them a better fit for sequence labeling; see the [semantic role labeling](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md) chapter of the PaddleBook for more background.

## Model details

The input to the NER task is a sentence, and the goal is to identify the entity boundaries and categories in it. Following paper \[[2](#参考文献)\], we only apply some simple preprocessing to the raw sentence: every word is lower-cased, and whether the original word was capitalized is kept as an extra feature that is fed to the model together with the words. The model is shown in Figure 2 and works as follows:
1. 构造输入 1. 构造输入
- 输入1是句子序列,采用one-hot方式表示 - 输入1是句子序列,采用one-hot方式表示
...@@ -28,51 +45,47 @@ NER任务的输入是"一句话",目标是识别句子中的实体边界及类
<div align="center">
<img src="images/ner_network.png" width = "40%" align=center /><br>
图2. NER 模型网络结构图
</div>
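为帮助理解图2的整体结构,下面给出一个极简的示意配置:它只保留“双输入 embedding 拼接 → RNN 学习特征 → emission 打分 → CRF”这条主线,层宽、词典大小等数值均为示意取值,也省略了实际模型中堆叠的多层双向 RNN;完整、实际使用的定义请以后文 `network_conf.py` 为准。

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# 词典大小与标注类别数均为示意取值
word_dict_len = 10000
label_dict_len = 9
mark_dict_len = 2  # 是否首字母大写只有 0/1 两种取值

# 输入1:句子序列;输入2:大写标记序列
word = paddle.layer.data(
    name="word",
    type=paddle.data_type.integer_value_sequence(word_dict_len))
mark = paddle.layer.data(
    name="mark",
    type=paddle.data_type.integer_value_sequence(mark_dict_len))

# 特征表示:分别查表得到词向量与大写标记向量,再拼接成每个词的输入特征
word_embedding = paddle.layer.embedding(input=word, size=50)
mark_embedding = paddle.layer.embedding(input=mark, size=5)
word_caps_vector = paddle.layer.concat(
    input=[word_embedding, mark_embedding])

# 序列特征:一次非线性变换 + 一层 RNN(实际模型为多层双向结构)
hidden = paddle.layer.fc(
    input=word_caps_vector, size=128, act=paddle.activation.Tanh())
rnn = paddle.layer.recurrent(input=hidden, act=paddle.activation.Relu())

# emission:每个词对应各个标签的打分,替代传统 CRF 的线性特征函数
emission = paddle.layer.fc(input=rnn, size=label_dict_len, bias_attr=False)

# CRF:训练时以句子级别的似然作为代价,预测时用 crf_decoding 解码
target = paddle.layer.data(
    name="target",
    type=paddle.data_type.integer_value_sequence(label_dict_len))
crf_cost = paddle.layer.crf(
    size=label_dict_len,
    input=emission,
    label=target,
    param_attr=paddle.attr.Param(name="crfw"))
```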
## 数据说明
在本例中,我们以 [CoNLL 2003 NER任务](http://www.clips.uantwerpen.be/conll2003/ner/)为例,原始Reuters数据由于版权原因需另外申请免费下载,请大家按照原网站说明获取。

+ 我们仅在`data`目录下的`train`和`test`文件中放置少数样本用以示例输入数据格式。
+ 本例依赖的数据还包括:
    1. 输入文本的词典
    2. 为词典中的词语提供预训练好的词向量
    3. 标记标签的词典

CoNLL 2003原始数据格式如下:
```
U.N. NNP I-NP I-ORG
official NN I-NP O
Ekeus NNP I-NP I-PER
heads VBZ I-VP O
for IN I-PP O
Baghdad NNP I-NP I-LOC
. . O O
```
- 第一列为原始句子序列
- 第二、三列分别为词性标签和句法分析中的语块标签,本例不使用
- 第四列为采用了 I-TYPE 方式表示的NER标签(I-TYPE 和 BIO 方式的主要区别在于语块开始标记的使用上,I-TYPE只有在出现相邻的同类别实体时对后者使用B标记,其他均使用I标记)
- 句子之间以空行分隔
我们在`reader.py`脚本中完成对原始数据的处理以及读取,主要包括下面几个步骤:

1. 从原始数据文件中抽取出句子和标签,构造句子序列和标签序列;
2. 将 I-TYPE 表示的标签转换为 BIO 方式表示的标签(转换逻辑可参考本列表后的示意代码);
3. 将句子序列中的单词转换为小写,并构造大写标记序列;
4. 依据词典获取词对应的整数索引。
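其中第 2 步的转换逻辑可以用下面的示意代码来理解,它与后文 `reader.py` 中的实现一致,函数名 `i_type_to_bio` 仅为示意:

```python
def i_type_to_bio(tags):
    """将一句话的 I-TYPE 标签序列转换为 BIO 标签序列(示意实现)。"""
    labels = []
    for tag in tags:
        # 当前词的类别与前一个标签不同(或位于句首)时,
        # 说明一个新语块从这里开始,改写为 B 标记;否则保留原标记
        if tag != "O" and (len(labels) == 0 or labels[-1][1:] != tag[1:]):
            labels.append("B" + tag[1:])
        else:
            labels.append(tag)
    return labels


# 例如上文 “U.N. official Ekeus heads for Baghdad .” 一句的标签:
print(i_type_to_bio(["I-ORG", "O", "I-PER", "O", "O", "I-LOC", "O"]))
# 输出:['B-ORG', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
```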
预处理完成后,一条训练样本包含3个部分,作为神经网络的输入信息用于训练:(1)句子序列;(2)首字母大写标记序列;(3)标注序列。下表是一条训练样本的示例:

| 句子序列 | 大写标记序列 | 标注序列 |
|---|---|---|
...@@ -84,165 +97,74 @@ NER任务的输入是"一句话",目标是识别句子中的实体边界及类
| baghdad | 1 | B-LOC |
| . | 0 | O |
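表中“大写标记序列”一列的构造方式很简单:对照原始句子判断每个词的首字母是否大写,与后文 `reader.py` 中的写法一致:

```python
sentence = ["U.N.", "official", "Ekeus", "heads", "for", "Baghdad", "."]

# 首字母大写记为 1,否则记为 0
mark = [1 if w[0].isupper() else 0 for w in sentence]
print(mark)  # [1, 0, 1, 0, 0, 1, 0]
```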
## 运行

### 编写数据读取接口

自定义数据读取接口只需编写一个 Python 生成器,实现从原始输入文本中解析一条训练样本的逻辑。[reader.py](./reader.py) 中的 `data_reader` 函数实现了读取原始数据的功能:它返回类型均为 `paddle.data_type.integer_value_sequence` 的 3 个输入(分别对应:词语在字典中的序号、是否为大写、标注结果在字典中的序号),提供给 `network_conf.ner_net` 中定义的 3 个 `data_layer`。下面的示意代码展示了该 reader 在训练脚本中的典型用法。
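以下片段与后文 `train.py` 中的用法一致(路径为本例默认值),展示了如何把 `data_reader` 包装成训练用的 batch reader,并通过 `feeding` 建立生成器输出与 `data_layer` 的对应关系:

```python
import paddle.v2 as paddle

import reader
from utils import load_dict

# 词典与数据路径均为本例的默认路径
word_dict = load_dict("data/vocab.txt")
label_dict = load_dict("data/target.txt")

# data_reader 返回一个生成器创建函数,可直接配合 shuffle / batch 使用
train_reader = paddle.batch(
    paddle.reader.shuffle(
        reader.data_reader("data/train", word_dict, label_dict),
        buf_size=1000),
    batch_size=32)

# feeding 指明生成器产出的 3 个字段与 3 个 data_layer 的对应关系
feeding = {"word": 0, "mark": 1, "target": 2}
```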
### 训练
1. 运行 `sh data/download.sh`
2. 修改 `train.py` 的 `main` 函数,指定数据路径

    ```python
    main(
        train_data_file='data/train',
        test_data_file='data/test',
        vocab_file='data/vocab.txt',
        target_file='data/target.txt',
        emb_file='data/wordVectors.txt')
    ```
3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换真实的标记数据。**

    ```text
    commandline:  --use_gpu=False --trainer_count=1
    Initing parameters..
    Init parameters done.
    Pass 0, Batch 0, Cost 41.430110, {'ner_chunk.precision': 0.01587301678955555, 'ner_chunk.F1-score': 0.028368793427944183, 'ner_chunk.recall': 0.13333334028720856, 'error': 0.939393937587738}
    Test with Pass 0, Batch 0, {'ner_chunk.precision': 0.0, 'ner_chunk.F1-score': 0.0, 'ner_chunk.recall': 0.0, 'error': 0.16260161995887756}
    ```
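日志中的 `ner_chunk.precision`、`ner_chunk.recall` 与 `ner_chunk.F1-score` 来自模型配置中添加的 chunk evaluator,`error` 来自对 CRF 解码结果的 sum evaluator;它们对应后文 `train.py` 中的如下配置摘录(`crf_dec`、`target`、`label_dict_len` 均在该脚本中定义):

```python
import paddle.v2.evaluator as evaluator

# 摘自后文 train.py:按 IOB 方式统计语块级别的 Precision / Recall / F1
evaluator.sum(name="error", input=crf_dec)
evaluator.chunk(
    name="ner_chunk",
    input=crf_dec,
    label=target,
    chunk_scheme="IOB",
    num_chunk_types=(label_dict_len - 1) / 2)
```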
### 预测

1. 修改 [infer.py](./infer.py) 的 `main` 函数,指定:需要测试的模型的路径、测试数据、字典文件,预测标记文件的路径,默认参数如下:

    ```python
    infer(
        model_path="models/params_pass_0.tar.gz",
        batch_size=2,
        test_data_file="data/test",
        vocab_file="data/vocab.txt",
        target_file="data/target.txt")
    ```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练500个pass所得模型的部分预测结果):

    ```text
    cricket             O
    -                   O
    leicestershire      B-ORG
    take                O
    over                O
    at                  O
    top                 O
    after               O
    innings             O
    victory             O
    .                   O
    london              B-LOC
    1996-08-30          O
    west                B-MISC
    indian              I-MISC
    all-rounder         O
    phil                B-PER
    simmons             I-PER
    took                O
    four                O
    ```
    输出分为两列,以“\t”分隔,第一列是输入的词语,第二列是标记结果。多条输入序列之间以空行分隔。
## 参考文献
......
wget http://cs224d.stanford.edu/assignment2/assignment2.zip

if [ $? -eq 0 ];then
    unzip assignment2.zip
    cp assignment2_release/data/ner/wordVectors.txt ./data
    cp assignment2_release/data/ner/vocab.txt ./data
    rm -rf assignment2.zip assignment2_release
else
    echo "download data error!" >> /dev/stderr
    exit 1
fi
-DOCSTART- -X- O O
CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
......
-DOCSTART- -X- O O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
......
......
import gzip
import reader
from network_conf import *
from utils import *
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
probs = inferer.infer(input=test_data, field=["id"])
assert len(probs) == sum(len(x[0]) for x in test_data)
for idx, test_sample in enumerate(test_data):
start_id = 0
for w, tag in zip(test_sample[0],
probs[start_id:start_id + len(test_sample[0])]):
print("%s\t%s" % (id_2_word[w], id_2_label[tag]))
print("\n")
start_id += len(test_sample[0])
word_dict = load_dict(vocab_file)
word_dict_len = len(word_dict)
word_reverse_dict = load_reverse_dict(vocab_file)
label_dict = load_dict(target_file)
label_reverse_dict = load_reverse_dict(target_file)
label_dict_len = len(label_dict)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
predict = ner_net(
word_dict_len=word_dict_len,
label_dict_len=label_dict_len,
is_train=False)
inferer = paddle.inference.Inference(
output_layer=predict, parameters=parameters)
test_data = []
for i, item in enumerate(
reader.data_reader(test_data_file, word_dict, label_dict)()):
test_data.append([item[0], item[1]])
if len(test_data) == batch_size:
_infer_a_batch(inferer, test_data, word_reverse_dict,
label_reverse_dict)
test_data = []
_infer_a_batch(inferer, test_data, word_reverse_dict, label_reverse_dict)
test_data = []
if __name__ == "__main__":
infer(
model_path="models/params_pass_0.tar.gz",
batch_size=2,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
import math
import gzip
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
import conll03
import itertools
# init dataset
train_data_file = 'data/train'
test_data_file = 'data/test'
vocab_file = 'data/vocab.txt'
target_file = 'data/target.txt'
emb_file = 'data/wordVectors.txt'
train_data_reader = conll03.train(train_data_file, vocab_file, target_file)
test_data_reader = conll03.test(test_data_file, vocab_file, target_file)
word_dict, label_dict = conll03.get_dict(vocab_file, target_file)
word_vector_values = conll03.get_embedding(emb_file)
# init hyper-params
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 300
mix_hidden_lr = 1e-3
default_std = 1 / math.sqrt(hidden_dim) / 3.0
emb_para = paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True)
std_0 = paddle.attr.Param(initial_std=0.)
std_default = paddle.attr.Param(initial_std=default_std)
def d_type(size):
return paddle.data_type.integer_value_sequence(size)
def ner_net(is_train):
word = paddle.layer.data(name='word', type=d_type(word_dict_len))
mark = paddle.layer.data(name='mark', type=d_type(mark_dict_len))
word_embedding = paddle.layer.mixed(
name='word_embedding',
size=word_dim,
input=paddle.layer.table_projection(input=word, param_attr=emb_para))
mark_embedding = paddle.layer.mixed(
name='mark_embedding',
size=mark_dim,
input=paddle.layer.table_projection(input=mark, param_attr=std_0))
emb_layers = [word_embedding, mark_embedding]
word_caps_vector = paddle.layer.concat(
name='word_caps_vector', input=emb_layers)
hidden_1 = paddle.layer.mixed(
name='hidden1',
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=word_caps_vector, param_attr=std_default)
])
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=default_std, learning_rate=mix_hidden_lr)
rnn_1_1 = paddle.layer.recurrent(
name='rnn1-1',
input=hidden_1,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_1_2 = paddle.layer.recurrent(
name='rnn1-2',
input=hidden_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_2_1 = paddle.layer.mixed(
name='hidden2-1',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_1, param_attr=rnn_para_attr)
])
hidden_2_2 = paddle.layer.mixed(
name='hidden2-2',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_2, param_attr=rnn_para_attr)
])
rnn_2_1 = paddle.layer.recurrent(
name='rnn2-1',
input=hidden_2_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_2_2 = paddle.layer.recurrent(
name='rnn2-2',
input=hidden_2_2,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_3 = paddle.layer.mixed(
name='hidden3',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_2_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_1,
param_attr=rnn_para_attr), paddle.layer.full_matrix_projection(
input=hidden_2_2, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_2, param_attr=rnn_para_attr)
])
output = paddle.layer.mixed(
name='output',
size=label_dict_len,
bias_attr=False,
input=[
paddle.layer.full_matrix_projection(
input=hidden_3, param_attr=std_default)
])
if is_train:
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(
name='crfw',
initial_std=default_std,
learning_rate=mix_hidden_lr))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf_cost, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
param_attr=paddle.attr.Param(name='crfw'))
return predict
def ner_net_train(data_reader=train_data_reader, num_passes=1):
# define network topology
crf_cost, crf_dec, target = ner_net(is_train=True)
evaluator.sum(name='error', input=crf_dec)
evaluator.chunk(
name='ner_chunk',
input=crf_dec,
label=target,
chunk_scheme='IOB',
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set('emb', word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
reader = paddle.batch(
paddle.reader.shuffle(data_reader, buf_size=8192), batch_size=64)
feeding = {'word': 0, 'mark': 1, 'target': 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (
event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
return parameters
def ner_net_infer(data_reader=test_data_reader, model_file='ner_model.tar.gz'):
test_data = []
test_sentences = []
for item in data_reader():
test_data.append([item[0], item[1]])
test_sentences.append(item[-1])
if len(test_data) == 10:
break
predict = ner_net(is_train=False)
lab_ids = paddle.infer(
output_layer=predict,
parameters=paddle.parameters.Parameters.from_tar(gzip.open(model_file)),
input=test_data,
field='id')
flat_data = [word for word in itertools.chain.from_iterable(test_sentences)]
labels_reverse = {}
for (k, v) in label_dict.items():
labels_reverse[v] = k
pre_lab = [labels_reverse[lab_id] for lab_id in lab_ids]
for word, label in zip(flat_data, pre_lab):
print word, label
if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=1)
ner_net_train(data_reader=train_data_reader, num_passes=1)
ner_net_infer(
data_reader=test_data_reader, model_file='params_pass_0.tar.gz')
import math
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 128
word = paddle.layer.data(
name='word',
type=paddle.data_type.integer_value_sequence(word_dict_len))
word_embedding = paddle.layer.embedding(
input=word,
size=word_dim,
param_attr=paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True))
mark = paddle.layer.data(
name='mark',
type=paddle.data_type.integer_value_sequence(mark_dict_len))
mark_embedding = paddle.layer.embedding(
input=mark,
size=mark_dim,
param_attr=paddle.attr.Param(initial_std=math.sqrt(1. / word_dim)))
word_caps_vector = paddle.layer.concat(
input=[word_embedding, mark_embedding])
mix_hidden_lr = 1e-3
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=1 / math.sqrt(hidden_dim), learning_rate=mix_hidden_lr)
# the first rnn layer shares the input-to-hidden mappings.
hidden = paddle.layer.fc(
name="__hidden00__",
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=word_caps_vector,
param_attr=hidden_para_attr)
fea = []
for direction in ["fwd", "bwd"]:
for i in range(stack_num):
if i:
hidden = paddle.layer.fc(
name="__hidden%02d_%s__" % (i, direction),
size=hidden_dim,
act=paddle.activation.STanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=[hidden, rnn],
param_attr=[hidden_para_attr, rnn_para_attr])
rnn = paddle.layer.recurrent(
name="__rnn%02d_%s__" % (i, direction),
input=hidden,
act=paddle.activation.Relu(),
bias_attr=paddle.attr.Param(initial_std=1.),
reverse=i % 2 if direction == "fwd" else not i % 2,
param_attr=rnn_para_attr)
fea += [hidden, rnn]
rnn_fea = paddle.layer.fc(
size=hidden_dim,
bias_attr=paddle.attr.Param(initial_std=1.),
act=paddle.activation.STanh(),
input=fea,
param_attr=[hidden_para_attr, rnn_para_attr] * 2)
emission = paddle.layer.fc(
size=label_dict_len,
bias_attr=False,
input=rnn_fea,
param_attr=rnn_para_attr)
if is_train:
target = paddle.layer.data(
name='target',
type=paddle.data_type.integer_value_sequence(label_dict_len))
crf = paddle.layer.crf(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw', initial_std=1e-3))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
param_attr=paddle.attr.Param(name='crfw'))
return predict
...@@ -2,16 +2,9 @@
Conll03 dataset.
"""
from utils import *

__all__ = ["data_reader"]


def canonicalize_digits(word):
...@@ -28,96 +21,46 @@ def canonicalize_word(word, wordset=None, digits=True):
    if (wordset != None) and (word in wordset): return word
    word = canonicalize_digits(word)  # try to canonicalize numbers
    if (wordset == None) or (word in wordset): return word
    else: return "<UNK>"  # unknown token
def data_reader(data_file, word_dict, label_dict):
    """
    The dataset can be obtained according to
    http://www.clips.uantwerpen.be/conll2003/ner/.
    It returns a reader creator, each sample in the reader includes:
    word id sequence, mark sequence and label id sequence.

    :return: reader creator
    :rtype: callable
    """

    def reader():
        UNK_IDX = word_dict["<UNK>"]

        sentence = []
        labels = []
        with open(data_file, "r") as f:
            for line in f:
                if len(line.strip()) == 0:
                    if len(sentence) > 0:
                        word_idx = [
                            word_dict.get(
                                canonicalize_word(w, word_dict), UNK_IDX)
                            for w in sentence
                        ]
                        mark = [1 if w[0].isupper() else 0 for w in sentence]
                        label_idx = [label_dict[l] for l in labels]
                        yield word_idx, mark, label_idx
                    sentence = []
                    labels = []
                else:
                    segs = line.strip().split()
                    sentence.append(segs[0])
                    # transform I-TYPE to BIO schema
                    if segs[-1] != "O" and (len(labels) == 0 or
                                            labels[-1][1:] != segs[-1][1:]):
                        labels.append("B" + segs[-1][1:])
                    else:
                        labels.append(segs[-1])

    return reader
import gzip
import numpy as np
import reader
from utils import *
from network_conf import *
def main(train_data_file,
test_data_file,
vocab_file,
target_file,
emb_file,
num_passes=10,
batch_size=32):
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
word_vector_values = get_embedding(emb_file)
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
evaluator.sum(name="error", input=crf_dec)
evaluator.chunk(
name="ner_chunk",
input=crf_dec,
label=target,
chunk_scheme="IOB",
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set("emb", word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
feeding = {"word": 0, "mark": 1, "target": 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if event.batch_id % 1 == 0:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, Batch %d, %s" %
(event.pass_id, event.batch_id, result.metrics))
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open("models/params_pass_%d.tar.gz" % event.pass_id,
"w") as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, %s" % (event.pass_id,
result.metrics))
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
if __name__ == "__main__":
main(
train_data_file='data/train',
test_data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt',
emb_file='data/wordVectors.txt')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import argparse
import numpy as np
from collections import defaultdict
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype=float)
def load_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which can include multiple columns
seperated by tab.
This function takes the first column (columns in a line are seperated by
tab) as key and takes line number of a line as the key (index of the word
in the dictionary).
"""
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which can include multiple columns
seperated by tab.
This function takes line number of a line as the key (index of the word in
the dictionary) and the first column (columns in a line are seperated by
tab) as the value.
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))