Commit 6224d367 authored by Yibing Liu

Merge branch 'develop' of https://github.com/PaddlePaddle/models into ctc_decoder_dev

group: deprecated-2017Q2
language: cpp
cache: ccache
sudo: required
......
......@@ -13,7 +13,7 @@ PaddlePaddle provides a rich set of computational units, helping users build models in a modular way
In the word embedding example, we show how to use Hierarchical Sigmoid and Noise Contrastive Estimation (NCE) to speed up the training of word embeddings.
- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/word_embedding)
- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
- 1.2 [Accelerating word embedding training with Noise Contrastive Estimation](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)
......
......@@ -51,13 +51,13 @@ python compute_mean_std.py --help
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py
```
For CPU Training:
```
python train.py --trainer_count 8 --use_gpu False
python train.py --use_gpu False
```
More help for arguments:
......
......@@ -66,6 +66,54 @@ class AudioSegment(object):
samples, sample_rate = soundfile.read(file, dtype='float32')
return cls(samples, sample_rate)
@classmethod
def slice_from_file(cls, file, start=None, end=None):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = 0. if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)
@classmethod
def from_bytes(cls, bytes):
"""Create audio segment from a byte string containing audio samples.
......@@ -105,6 +153,20 @@ class AudioSegment(object):
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)
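A brief usage sketch of the two new class methods above (the module path and the file name are assumptions for illustration, not part of this commit):
```python
# Illustrative sketch only. Assumes the AudioSegment class is importable from
# data_utils.audio and that "example.wav" exists locally.
from data_utils.audio import AudioSegment

# Read only the 1.0 s - 2.5 s window without loading the whole file.
clip = AudioSegment.slice_from_file("example.wav", start=1.0, end=2.5)

# Create half a second of silence at the clip's sample rate.
pad = AudioSegment.make_silence(duration=0.5, sample_rate=clip.sample_rate)
```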
def to_wav_file(self, filepath, dtype='float32'):
"""Save audio segment to disk as wav file.
......@@ -130,68 +192,6 @@ class AudioSegment(object):
format='WAV',
subtype=subtype_map[dtype])
@classmethod
def slice_from_file(cls, file, start=None, end=None):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = 0. if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)
def superimpose(self, other):
"""Add samples from another segment to those of this segment
(sample-wise addition, not segment concatenation).
......@@ -225,7 +225,7 @@ class AudioSegment(object):
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples.tostring()
def apply_gain(self, gain):
def gain_db(self, gain):
"""Apply gain in decibels to samples.
Note that this is an in-place transformation.
......@@ -278,7 +278,7 @@ class AudioSegment(object):
"Unable to normalize segment to %f dB because the "
"the probable gain have exceeds max_gain_db (%f dB)" %
(target_db, max_gain_db))
self.apply_gain(min(max_gain_db, target_db - self.rms_db))
self.gain_db(min(max_gain_db, target_db - self.rms_db))
def normalize_online_bayesian(self,
target_db,
......@@ -319,7 +319,7 @@ class AudioSegment(object):
rms_estimate_db = 10 * np.log10(mean_squared_estimate)
# Compute required time-varying gain.
gain_db = target_db - rms_estimate_db
self.apply_gain(gain_db)
self.gain_db(gain_db)
def resample(self, target_sample_rate, quality='sinc_medium'):
"""Resample the audio to a target sample rate.
......@@ -366,6 +366,31 @@ class AudioSegment(object):
raise ValueError("Unknown value for the sides %s" % sides)
self._samples = padded._samples
def shift(self, shift_ms):
"""Shift the audio in time. If `shift_ms` is positive, shift with time
advance; if negative, shift with time delay. Silence is padded to
keep the duration unchanged.
Note that this is an in-place transformation.
:param shift_ms: Shift time in milliseconds. If positive, shift with
time advance; if negative, shift with time delay.
:type shift_ms: float
:raises ValueError: If the absolute value of shift_ms exceeds the audio duration.
"""
if abs(shift_ms) / 1000.0 > self.duration:
raise ValueError("Absolute value of shift_ms should be smaller "
"than audio duration.")
shift_samples = int(shift_ms * self._sample_rate / 1000)
if shift_samples > 0:
# time advance
self._samples[:-shift_samples] = self._samples[shift_samples:]
self._samples[-shift_samples:] = 0
elif shift_samples < 0:
# time delay
self._samples[-shift_samples:] = self._samples[:shift_samples]
self._samples[:-shift_samples] = 0
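A small worked example of the `shift` method above (values chosen for readability; not part of the original code):
```python
# Worked example (illustrative only). With sample_rate = 1000 Hz and
# samples = [1, 2, 3, 4], shift_ms = 2 gives shift_samples = 2:
#   time advance:  [1, 2, 3, 4] -> [3, 4, 0, 0]
# Starting again from [1, 2, 3, 4], shift_ms = -2 gives shift_samples = -2:
#   time delay:    [1, 2, 3, 4] -> [0, 0, 1, 2]
```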
def subsegment(self, start_sec=None, end_sec=None):
"""Cut the AudioSegment between given boundaries.
......@@ -505,7 +530,7 @@ class AudioSegment(object):
noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
noise_new = copy.deepcopy(noise)
noise_new.random_subsegment(self.duration, rng=rng)
noise_new.apply_gain(noise_gain_db)
noise_new.gain_db(noise_gain_db)
self.superimpose(noise_new)
@property
......
......@@ -6,6 +6,7 @@ from __future__ import print_function
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
class AugmentationPipeline(object):
......@@ -76,5 +77,7 @@ class AugmentationPipeline(object):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
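For context, the configuration consumed by this pipeline is a JSON list of augmentor entries; the same format is passed via the `--augmentation_config` option of `train.py` later in this commit. A small illustrative sketch, with the volume parameter names assumed from `VolumePerturbAugmentor` below:
```python
# Illustrative sketch only (not part of this commit): build an
# --augmentation_config value that enables both registered augmentor types.
# The volume parameter names (min_gain_dBFS / max_gain_dBFS) are assumptions
# based on VolumePerturbAugmentor below.
import json

augmentation_config = json.dumps([{
    "type": "shift",
    "params": {"min_shift_ms": -5, "max_shift_ms": 5},
    "prob": 1.0   # apply to every utterance
}, {
    "type": "volume",
    "params": {"min_gain_dBFS": -15, "max_gain_dBFS": 15},
    "prob": 0.5   # apply to half of the utterances
}])
```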
......@@ -36,5 +36,5 @@ class VolumePerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS)
gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
audio_segment.gain_db(gain)
......@@ -45,6 +45,9 @@ class DataGenerator(object):
:type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param use_dB_normalization: Whether to normalize the audio to -20 dB
before extracting the features.
:type use_dB_normalization: bool
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
......@@ -61,6 +64,7 @@ class DataGenerator(object):
window_ms=20.0,
max_freq=None,
specgram_type='linear',
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
......@@ -73,7 +77,8 @@ class DataGenerator(object):
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq)
max_freq=max_freq,
use_dB_normalization=use_dB_normalization)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0
......
......@@ -24,26 +24,64 @@ class AudioFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Audio is resampled (if upsampling or
downsampling is allowed) to this sample rate
before extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel level before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None):
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
self._stride_ms = stride_ms
self._window_ms = window_ms
self._max_freq = max_freq
self._target_sample_rate = target_sample_rate
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB
def featurize(self, audio_segment):
def featurize(self,
audio_segment,
allow_downsampling=True,
allow_upsampling=True):
"""Extract audio features from AudioSegment or SpeechSegment.
:param audio_segment: Audio/speech segment to extract features from.
:type audio_segment: AudioSegment|SpeechSegment
:param allow_downsampling: Whether to allow audio downsampling before
featurizing.
:type allow_downsampling: bool
:param allow_upsampling: Whether to allow audio upsampling before
featurizing.
:type allow_upsampling: bool
:return: Spectrogram audio feature in 2darray.
:rtype: ndarray
:raises ValueError: If audio sample rate is not supported.
"""
# upsampling or downsampling
if ((audio_segment.sample_rate > self._target_sample_rate and
allow_downsampling) or
(audio_segment.sample_rate < self._target_sample_rate and
allow_upsampling)):
audio_segment.resample(self._target_sample_rate)
if audio_segment.sample_rate != self._target_sample_rate:
raise ValueError("Audio sample rate is not supported. "
"Turn allow_downsampling or allow up_sampling on.")
# decibel normalization
if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)
......
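A rough usage sketch of the featurizer with the new normalization options (import paths and the wav file name are assumptions for illustration):
```python
# Illustrative sketch, not part of this commit. Import paths and the wav file
# name are assumptions made for the example.
from data_utils.audio import AudioSegment
from data_utils.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    specgram_type='linear',
    target_sample_rate=16000,
    use_dB_normalization=True,   # normalize to target_dB before featurizing
    target_dB=-20)

# slice_from_file with no start/end reads the whole file.
segment = AudioSegment.slice_from_file("example.wav")

# The segment is resampled to 16 kHz if needed (and allowed), normalized to
# -20 dB, and a 2-D spectrogram feature array is returned.
feature = featurizer.featurize(
    segment, allow_downsampling=True, allow_upsampling=True)
```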
......@@ -29,6 +29,15 @@ class SpeechFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Speech is resampled (if upsampling or
downsampling is allowed) to this sample rate
before extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel level before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
......@@ -36,9 +45,18 @@ class SpeechFeaturizer(object):
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None):
self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms,
window_ms, max_freq)
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB)
self._text_featurizer = TextFeaturizer(vocab_filepath)
def featurize(self, speech_segment):
......
......@@ -58,7 +58,7 @@ parser.add_argument(
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='./params.tar.gz',
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
......
......@@ -27,4 +27,7 @@ if [ $? != 0 ]; then
exit 1
fi
# prepare ./checkpoints
mkdir checkpoints
echo "Install all dependencies successfully."
......@@ -17,10 +17,10 @@ import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size", default=32, type=int, help="Minibatch size.")
"--batch_size", default=256, type=int, help="Minibatch size.")
parser.add_argument(
"--num_passes",
default=20,
default=200,
type=int,
help="Training pass number. (default: %(default)s)")
parser.add_argument(
......@@ -55,7 +55,7 @@ parser.add_argument(
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=100.0,
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
......@@ -67,13 +67,13 @@ parser.add_argument(
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='instance_shuffle',
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=4,
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
......@@ -110,7 +110,9 @@ parser.add_argument(
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='{}',
default='[{"type": "shift", '
'"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
'"prob": 1.0}]',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
......@@ -189,7 +191,7 @@ def train():
print("\nPass: %d, Batch: %d, TrainCost: %f" % (
event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
cost_sum, cost_counter = 0.0, 0
with gzip.open("params.tar.gz", 'w') as f:
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
......@@ -202,6 +204,9 @@ def train():
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
......
......@@ -50,7 +50,7 @@ def train_data(filename, word_dict, n):
```
## Network Structure
This tutorial obtains word embeddings by training an N-gram language model; specifically, the preceding 4 words are used to predict the current word. The network takes the words' ids in the dictionary as input, looks them up in the embedding table to get the word vectors, concatenates the 4 word vectors, feeds the result into a fully connected hidden layer, and finally into an Hsigmoid layer. The detailed network structure is shown in Figure 2:
This tutorial obtains word embeddings by training an N-gram language model; specifically, the preceding 4 words are used to predict the current word. The network takes the words' ids in the dictionary as input, looks them up in the embedding table to get the word vectors, concatenates the 4 word vectors, feeds the result into a fully connected hidden layer, and finally into an `Hsigmoid` layer. The detailed network structure is shown in Figure 2:
<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
......@@ -60,41 +60,27 @@ def train_data(filename, word_dict, n):
The code implementation is as follows:
```python
import math
import paddle.v2 as paddle
def network_conf(hidden_size, embed_size, dict_size, is_train=True):
first_word = paddle.layer.data(
name='firstw', type=paddle.data_type.integer_value(dict_size))
second_word = paddle.layer.data(
name='secondw', type=paddle.data_type.integer_value(dict_size))
third_word = paddle.layer.data(
name='thirdw', type=paddle.data_type.integer_value(dict_size))
fourth_word = paddle.layer.data(
name='fourthw', type=paddle.data_type.integer_value(dict_size))
target_word = paddle.layer.data(
name='fifthw', type=paddle.data_type.integer_value(dict_size))
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
embed_first_word = paddle.layer.embedding(
input=first_word, size=embed_size, param_attr=embed_param_attr)
embed_second_word = paddle.layer.embedding(
input=second_word, size=embed_size, param_attr=embed_param_attr)
embed_third_word = paddle.layer.embedding(
input=third_word, size=embed_size, param_attr=embed_param_attr)
embed_fourth_word = paddle.layer.embedding(
input=fourth_word, size=embed_size, param_attr=embed_param_attr)
embed_context = paddle.layer.concat(input=[
embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
])
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
act=paddle.activation.Sigmoid(),
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
......@@ -105,27 +91,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name='sigmoid_w'),
bias_attr=paddle.attr.Param(name='sigmoid_b'))
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
with paddle.layer.mixed(
size=dict_size - 1,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
prediction += paddle.layer.trans_full_matrix_projection(
input=hidden_layer,
param_attr=paddle.attr.Param(name='sigmoid_w'))
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
Note that at prediction time the hsigmoid parameters need to be transposed once; the number of output classes here is the dictionary size minus 1, which corresponds to the number of non-leaf nodes.
## Training
Training is straightforward: simply run ``` python hsigmoid_train.py ```. On its first run, the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if not. During training, model information (mainly the training cost and test cost) is printed every 100 iterations, and the model is saved once per pass.
Training is straightforward: simply run ``` python train.py ```. On its first run, the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if not. During training, model information (mainly the training cost and test cost) is printed every 100 iterations, and the model is saved once per pass.
## Prediction
For prediction, simply run ``` python hsigmoid_predict.py ```. The program first loads the model, then runs prediction batch by batch and prints the results. The key step in prediction is to derive the encoding path from the probabilities and then traverse the path to obtain the final predicted class. This logic is implemented as follows:
For prediction, simply run ``` python infer.py ```. The program first loads the model, then runs prediction batch by batch and prints the results. The key step in prediction is to derive the encoding path from the probabilities and then traverse the path to obtain the final predicted class. This logic is implemented as follows:
```python
def decode_res(infer_res, dict_size):
......
......@@ -92,7 +92,7 @@ def train_data(filename, word_dict, n):
```
## Network Structure
This tutorial obtains word embeddings by training an N-gram language model; specifically, the preceding 4 words are used to predict the current word. The network takes the words' ids in the dictionary as input, looks them up in the embedding table to get the word vectors, concatenates the 4 word vectors, feeds the result into a fully connected hidden layer, and finally into an Hsigmoid layer. The detailed network structure is shown in Figure 2:
This tutorial obtains word embeddings by training an N-gram language model; specifically, the preceding 4 words are used to predict the current word. The network takes the words' ids in the dictionary as input, looks them up in the embedding table to get the word vectors, concatenates the 4 word vectors, feeds the result into a fully connected hidden layer, and finally into an `Hsigmoid` layer. The detailed network structure is shown in Figure 2:
<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
......@@ -102,41 +102,27 @@ def train_data(filename, word_dict, n):
The code implementation is as follows:
```python
import math
import paddle.v2 as paddle
def network_conf(hidden_size, embed_size, dict_size, is_train=True):
first_word = paddle.layer.data(
name='firstw', type=paddle.data_type.integer_value(dict_size))
second_word = paddle.layer.data(
name='secondw', type=paddle.data_type.integer_value(dict_size))
third_word = paddle.layer.data(
name='thirdw', type=paddle.data_type.integer_value(dict_size))
fourth_word = paddle.layer.data(
name='fourthw', type=paddle.data_type.integer_value(dict_size))
target_word = paddle.layer.data(
name='fifthw', type=paddle.data_type.integer_value(dict_size))
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
embed_first_word = paddle.layer.embedding(
input=first_word, size=embed_size, param_attr=embed_param_attr)
embed_second_word = paddle.layer.embedding(
input=second_word, size=embed_size, param_attr=embed_param_attr)
embed_third_word = paddle.layer.embedding(
input=third_word, size=embed_size, param_attr=embed_param_attr)
embed_fourth_word = paddle.layer.embedding(
input=fourth_word, size=embed_size, param_attr=embed_param_attr)
embed_context = paddle.layer.concat(input=[
embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
])
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
act=paddle.activation.Sigmoid(),
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
......@@ -147,27 +133,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name='sigmoid_w'),
bias_attr=paddle.attr.Param(name='sigmoid_b'))
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
with paddle.layer.mixed(
size=dict_size - 1,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
prediction += paddle.layer.trans_full_matrix_projection(
input=hidden_layer,
param_attr=paddle.attr.Param(name='sigmoid_w'))
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
Note that at prediction time the hsigmoid parameters need to be transposed once; the number of output classes here is the dictionary size minus 1, which corresponds to the number of non-leaf nodes.
## Training
Training is straightforward: simply run ``` python hsigmoid_train.py ```. On its first run, the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if not. During training, model information (mainly the training cost and test cost) is printed every 100 iterations, and the model is saved once per pass.
Training is straightforward: simply run ``` python train.py ```. On its first run, the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if not. During training, model information (mainly the training cost and test cost) is printed every 100 iterations, and the model is saved once per pass.
## Prediction
For prediction, simply run ``` python hsigmoid_predict.py ```. The program first loads the model, then runs prediction batch by batch and prints the results. The key step in prediction is to derive the encoding path from the probabilities and then traverse the path to obtain the final predicted class. This logic is implemented as follows:
For prediction, simply run ``` python infer.py ```. The program first loads the model, then runs prediction batch by batch and prints the results. The key step in prediction is to derive the encoding path from the probabilities and then traverse the path to obtain the final predicted class. This logic is implemented as follows:
```python
def decode_res(infer_res, dict_size):
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from hsigmoid_conf import network_conf
import gzip
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def decode_res(infer_res, dict_size):
......@@ -36,32 +41,32 @@ def decode_res(infer_res, dict_size):
return predict_lbls
def predict(batch_ins, idx_word_dict, dict_size, prediction_layer, parameters):
infer_res = paddle.infer(
output_layer=prediction_layer, parameters=parameters, input=batch_ins)
def predict(batch_ins, idx_word_dict, dict_size, inferer):
infer_res = inferer.infer(input=batch_ins)
predict_lbls = decode_res(infer_res, dict_size)
predict_words = [idx_word_dict[lbl] for lbl in predict_lbls] # map to word
# Output format: word1 word2 word3 word4 -> predicted label
for i, ins in enumerate(batch_ins):
print(idx_word_dict[ins[0]] + ' ' + \
idx_word_dict[ins[1]] + ' ' + \
idx_word_dict[ins[2]] + ' ' + \
idx_word_dict[ins[3]] + ' ' + \
' -> ' + predict_words[i])
print(" ".join([idx_word_dict[w]
for w in ins]) + " -> " + predict_words[i])
def main(model_path):
assert os.path.exists(model_path), "trained model does not exist."
def main():
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
prediction_layer = network_conf(
prediction_layer = ngram_lm(
is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
with gzip.open('./models/model_pass_00000.tar.gz') as f:
with gzip.open(model_path, "r") as f:
parameters = paddle.parameters.Parameters.from_tar(f)
inferer = paddle.inference.Inference(
output_layer=prediction_layer, parameters=parameters)
idx_word_dict = dict((v, k) for k, v in word_dict.items())
batch_size = 64
batch_ins = []
......@@ -70,14 +75,12 @@ def main():
for ins in ins_iter():
batch_ins.append(ins[:-1])
if len(batch_ins) == batch_size:
predict(batch_ins, idx_word_dict, dict_size, prediction_layer,
parameters)
predict(batch_ins, idx_word_dict, dict_size, inferer)
batch_ins = []
if len(batch_ins) > 0:
predict(batch_ins, idx_word_dict, dict_size, prediction_layer,
parameters)
predict(batch_ins, idx_word_dict, dict_size, inferer)
if __name__ == '__main__':
main()
if __name__ == "__main__":
main("models/hsigmoid_batch_00010.tar.gz")
......@@ -5,32 +5,22 @@ import math
import paddle.v2 as paddle
def network_conf(hidden_size, embed_size, dict_size, is_train=True):
first_word = paddle.layer.data(
name='firstw', type=paddle.data_type.integer_value(dict_size))
second_word = paddle.layer.data(
name='secondw', type=paddle.data_type.integer_value(dict_size))
third_word = paddle.layer.data(
name='thirdw', type=paddle.data_type.integer_value(dict_size))
fourth_word = paddle.layer.data(
name='fourthw', type=paddle.data_type.integer_value(dict_size))
target_word = paddle.layer.data(
name='fifthw', type=paddle.data_type.integer_value(dict_size))
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
embed_first_word = paddle.layer.embedding(
input=first_word, size=embed_size, param_attr=embed_param_attr)
embed_second_word = paddle.layer.embedding(
input=second_word, size=embed_size, param_attr=embed_param_attr)
embed_third_word = paddle.layer.embedding(
input=third_word, size=embed_size, param_attr=embed_param_attr)
embed_fourth_word = paddle.layer.embedding(
input=fourth_word, size=embed_size, param_attr=embed_param_attr)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
embed_context = paddle.layer.concat(input=[
embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
])
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
......@@ -46,15 +36,14 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name='sigmoid_w'),
bias_attr=paddle.attr.Param(name='sigmoid_b'))
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
with paddle.layer.mixed(
size=dict_size - 1,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
prediction += paddle.layer.trans_full_matrix_projection(
input=hidden_layer,
param_attr=paddle.attr.Param(name='sigmoid_w'))
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from hsigmoid_conf import network_conf
import gzip
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def main():
def main(save_dir="models"):
if not os.path.exists(save_dir):
os.mkdir(save_dir)
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
cost = network_conf(
is_train=True, hidden_size=256, embed_size=32, dict_size=dict_size)
cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
def event_handler(event):
if isinstance(event, paddle.event.EndPass):
model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
print("Save model into %s ..." % model_name)
with gzip.open(model_name, 'w') as f:
model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
event.pass_id)
logger.info("Save model into %s ..." % model_name)
with gzip.open(model_name, "w") as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
if event.batch_id and event.batch_id % 10 == 0:
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, 5), 32))
print("Pass %d, Batch %d, Cost %f, Test Cost %f" %
(event.pass_id, event.batch_id, event.cost, result.cost))
feeding = {
'firstw': 0,
'secondw': 1,
'thirdw': 2,
'fourthw': 3,
'fifthw': 4
}
logger.info(
"Pass %d, Batch %d, Cost %f, Test Cost %f" %
(event.pass_id, event.batch_id, event.cost, result.cost))
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
......@@ -48,9 +49,8 @@ def main():
lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
buf_size=1000), 64),
num_passes=30,
event_handler=event_handler,
feeding=feeding)
event_handler=event_handler)
if __name__ == '__main__':
if __name__ == "__main__":
main()
wget http://cs224d.stanford.edu/assignment2/assignment2.zip
unzip assignment2.zip
cp assignment2_release/data/ner/wordVectors.txt data/
cp assignment2_release/data/ner/vocab.txt data/
rm -rf assignment2.zip assignment2_release
if [ $? -eq 0 ];then
unzip assignment2.zip
cp assignment2_release/data/ner/wordVectors.txt ./data
cp assignment2_release/data/ner/vocab.txt ./data
rm -rf assignment2.zip assignment2_release
else
echo "download data error!" >> /dev/stderr
exit 1
fi
-DOCSTART- -X- O O
CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
......
-DOCSTART- -X- O O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
......
import gzip
import reader
from network_conf import *
from utils import *
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
probs = inferer.infer(input=test_data, field=["id"])
assert len(probs) == sum(len(x[0]) for x in test_data)
start_id = 0
for idx, test_sample in enumerate(test_data):
for w, tag in zip(test_sample[0],
probs[start_id:start_id + len(test_sample[0])]):
print("%s\t%s" % (id_2_word[w], id_2_label[tag]))
print("\n")
start_id += len(test_sample[0])
word_dict = load_dict(vocab_file)
word_dict_len = len(word_dict)
word_reverse_dict = load_reverse_dict(vocab_file)
label_dict = load_dict(target_file)
label_reverse_dict = load_reverse_dict(target_file)
label_dict_len = len(label_dict)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
predict = ner_net(
word_dict_len=word_dict_len,
label_dict_len=label_dict_len,
is_train=False)
inferer = paddle.inference.Inference(
output_layer=predict, parameters=parameters)
test_data = []
for i, item in enumerate(
reader.data_reader(test_data_file, word_dict, label_dict)()):
test_data.append([item[0], item[1]])
if len(test_data) == batch_size:
_infer_a_batch(inferer, test_data, word_reverse_dict,
label_reverse_dict)
test_data = []
_infer_a_batch(inferer, test_data, word_reverse_dict, label_reverse_dict)
test_data = []
if __name__ == "__main__":
infer(
model_path="models/params_pass_0.tar.gz",
batch_size=2,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
import math
import gzip
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
import conll03
import itertools
# init dataset
train_data_file = 'data/train'
test_data_file = 'data/test'
vocab_file = 'data/vocab.txt'
target_file = 'data/target.txt'
emb_file = 'data/wordVectors.txt'
train_data_reader = conll03.train(train_data_file, vocab_file, target_file)
test_data_reader = conll03.test(test_data_file, vocab_file, target_file)
word_dict, label_dict = conll03.get_dict(vocab_file, target_file)
word_vector_values = conll03.get_embedding(emb_file)
# init hyper-params
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 300
mix_hidden_lr = 1e-3
default_std = 1 / math.sqrt(hidden_dim) / 3.0
emb_para = paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True)
std_0 = paddle.attr.Param(initial_std=0.)
std_default = paddle.attr.Param(initial_std=default_std)
def d_type(size):
return paddle.data_type.integer_value_sequence(size)
def ner_net(is_train):
word = paddle.layer.data(name='word', type=d_type(word_dict_len))
mark = paddle.layer.data(name='mark', type=d_type(mark_dict_len))
word_embedding = paddle.layer.mixed(
name='word_embedding',
size=word_dim,
input=paddle.layer.table_projection(input=word, param_attr=emb_para))
mark_embedding = paddle.layer.mixed(
name='mark_embedding',
size=mark_dim,
input=paddle.layer.table_projection(input=mark, param_attr=std_0))
emb_layers = [word_embedding, mark_embedding]
word_caps_vector = paddle.layer.concat(
name='word_caps_vector', input=emb_layers)
hidden_1 = paddle.layer.mixed(
name='hidden1',
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=word_caps_vector, param_attr=std_default)
])
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=default_std, learning_rate=mix_hidden_lr)
rnn_1_1 = paddle.layer.recurrent(
name='rnn1-1',
input=hidden_1,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_1_2 = paddle.layer.recurrent(
name='rnn1-2',
input=hidden_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_2_1 = paddle.layer.mixed(
name='hidden2-1',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_1, param_attr=rnn_para_attr)
])
hidden_2_2 = paddle.layer.mixed(
name='hidden2-2',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_2, param_attr=rnn_para_attr)
])
rnn_2_1 = paddle.layer.recurrent(
name='rnn2-1',
input=hidden_2_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_2_2 = paddle.layer.recurrent(
name='rnn2-2',
input=hidden_2_2,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_3 = paddle.layer.mixed(
name='hidden3',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_2_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_1,
param_attr=rnn_para_attr), paddle.layer.full_matrix_projection(
input=hidden_2_2, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_2, param_attr=rnn_para_attr)
])
output = paddle.layer.mixed(
name='output',
size=label_dict_len,
bias_attr=False,
input=[
paddle.layer.full_matrix_projection(
input=hidden_3, param_attr=std_default)
])
if is_train:
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(
name='crfw',
initial_std=default_std,
learning_rate=mix_hidden_lr))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf_cost, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
param_attr=paddle.attr.Param(name='crfw'))
return predict
def ner_net_train(data_reader=train_data_reader, num_passes=1):
# define network topology
crf_cost, crf_dec, target = ner_net(is_train=True)
evaluator.sum(name='error', input=crf_dec)
evaluator.chunk(
name='ner_chunk',
input=crf_dec,
label=target,
chunk_scheme='IOB',
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set('emb', word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
reader = paddle.batch(
paddle.reader.shuffle(data_reader, buf_size=8192), batch_size=64)
feeding = {'word': 0, 'mark': 1, 'target': 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (
event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
return parameters
def ner_net_infer(data_reader=test_data_reader, model_file='ner_model.tar.gz'):
test_data = []
test_sentences = []
for item in data_reader():
test_data.append([item[0], item[1]])
test_sentences.append(item[-1])
if len(test_data) == 10:
break
predict = ner_net(is_train=False)
lab_ids = paddle.infer(
output_layer=predict,
parameters=paddle.parameters.Parameters.from_tar(gzip.open(model_file)),
input=test_data,
field='id')
flat_data = [word for word in itertools.chain.from_iterable(test_sentences)]
labels_reverse = {}
for (k, v) in label_dict.items():
labels_reverse[v] = k
pre_lab = [labels_reverse[lab_id] for lab_id in lab_ids]
for word, label in zip(flat_data, pre_lab):
print word, label
if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=1)
ner_net_train(data_reader=train_data_reader, num_passes=1)
ner_net_infer(
data_reader=test_data_reader, model_file='params_pass_0.tar.gz')
import math
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 128
word = paddle.layer.data(
name='word',
type=paddle.data_type.integer_value_sequence(word_dict_len))
word_embedding = paddle.layer.embedding(
input=word,
size=word_dim,
param_attr=paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True))
mark = paddle.layer.data(
name='mark',
type=paddle.data_type.integer_value_sequence(mark_dict_len))
mark_embedding = paddle.layer.embedding(
input=mark,
size=mark_dim,
param_attr=paddle.attr.Param(initial_std=math.sqrt(1. / word_dim)))
word_caps_vector = paddle.layer.concat(
input=[word_embedding, mark_embedding])
mix_hidden_lr = 1e-3
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=1 / math.sqrt(hidden_dim), learning_rate=mix_hidden_lr)
# the first rnn layer shares the input-to-hidden mappings.
hidden = paddle.layer.fc(
name="__hidden00__",
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=word_caps_vector,
param_attr=hidden_para_attr)
fea = []
for direction in ["fwd", "bwd"]:
for i in range(stack_num):
if i:
hidden = paddle.layer.fc(
name="__hidden%02d_%s__" % (i, direction),
size=hidden_dim,
act=paddle.activation.STanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=[hidden, rnn],
param_attr=[hidden_para_attr, rnn_para_attr])
rnn = paddle.layer.recurrent(
name="__rnn%02d_%s__" % (i, direction),
input=hidden,
act=paddle.activation.Relu(),
bias_attr=paddle.attr.Param(initial_std=1.),
reverse=i % 2 if direction == "fwd" else not i % 2,
param_attr=rnn_para_attr)
fea += [hidden, rnn]
rnn_fea = paddle.layer.fc(
size=hidden_dim,
bias_attr=paddle.attr.Param(initial_std=1.),
act=paddle.activation.STanh(),
input=fea,
param_attr=[hidden_para_attr, rnn_para_attr] * 2)
emission = paddle.layer.fc(
size=label_dict_len,
bias_attr=False,
input=rnn_fea,
param_attr=rnn_para_attr)
if is_train:
target = paddle.layer.data(
name='target',
type=paddle.data_type.integer_value_sequence(label_dict_len))
crf = paddle.layer.crf(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw', initial_std=1e-3))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
param_attr=paddle.attr.Param(name='crfw'))
return predict
......@@ -2,16 +2,9 @@
Conll03 dataset.
"""
import tarfile
import gzip
import itertools
import collections
import re
import numpy as np
from utils import *
__all__ = ['train', 'test', 'get_dict', 'get_embedding']
UNK_IDX = 0
__all__ = ["data_reader"]
def canonicalize_digits(word):
......@@ -28,96 +21,46 @@ def canonicalize_word(word, wordset=None, digits=True):
if (wordset != None) and (word in wordset): return word
word = canonicalize_digits(word) # try to canonicalize numbers
if (wordset == None) or (word in wordset): return word
else: return "UUUNKKK" # unknown token
else: return "<UNK>" # unknown token
def load_dict(filename):
d = dict()
with open(filename, 'r') as f:
for i, line in enumerate(f):
d[line.strip()] = i
return d
def get_dict(vocab_file='data/vocab.txt', target_file='data/target.txt'):
"""
Get the word and label dictionary.
def data_reader(data_file, word_dict, label_dict):
"""
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
return word_dict, label_dict
The dataset can be obtained from http://www.clips.uantwerpen.be/conll2003/ner/.
It returns a reader creator; each sample produced by the reader contains:
a word id sequence, a capitalization mark sequence and a label id sequence.
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
:return: reader creator
:rtype: callable
"""
return np.loadtxt(emb_file, dtype=float)
def corpus_reader(filename='data/train'):
def reader():
UNK_IDX = word_dict["<UNK>"]
sentence = []
labels = []
with open(filename) as f:
with open(data_file, "r") as f:
for line in f:
if re.match(r"-DOCSTART-.+", line) or (len(line.strip()) == 0):
if len(line.strip()) == 0:
if len(sentence) > 0:
yield sentence, labels
word_idx = [
word_dict.get(
canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict[l] for l in labels]
yield word_idx, mark, label_idx
sentence = []
labels = []
else:
segs = line.strip().split()
sentence.append(segs[0])
# transform from I-TYPE to BIO schema
if segs[-1] != 'O' and (len(labels) == 0 or
# transform I-TYPE to BIO schema
if segs[-1] != "O" and (len(labels) == 0 or
labels[-1][1:] != segs[-1][1:]):
labels.append('B' + segs[-1][1:])
labels.append("B" + segs[-1][1:])
else:
labels.append(segs[-1])
f.close()
return reader
def reader_creator(corpus_reader, word_dict, label_dict):
"""
Conll03 train set creator.
The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
It returns a reader creator, each sample in the reader includes word id sequence, label id sequence and raw sentence for purpose of print.
:return: Training reader creator
:rtype: callable
"""
def reader():
for sentence, labels in corpus_reader():
word_idx = [
word_dict.get(canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict.get(w) for w in labels]
yield word_idx, mark, label_idx, sentence
return reader
def train(data_file='data/train',
vocab_file='data/vocab.txt',
target_file='data/target.txt'):
return reader_creator(
corpus_reader(data_file),
word_dict=load_dict(vocab_file),
label_dict=load_dict(target_file))
def test(data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt'):
return reader_creator(
corpus_reader(data_file),
word_dict=load_dict(vocab_file),
label_dict=load_dict(target_file))
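A minimal sketch of how the new `data_reader` is consumed (it uses the same data files referenced by `train.py` and `infer.py` in this change; illustrative only):
```python
# Illustrative sketch, not part of this commit.
import reader
from utils import load_dict

word_dict = load_dict("data/vocab.txt")
label_dict = load_dict("data/target.txt")

# data_reader returns a reader creator; calling it yields, per sentence,
# (word id sequence, capitalization mark sequence, label id sequence).
for word_ids, marks, label_ids in reader.data_reader(
        "data/test", word_dict, label_dict)():
    print(word_ids, marks, label_ids)
    break
```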
import gzip
import numpy as np
import reader
from utils import *
from network_conf import *
def main(train_data_file,
test_data_file,
vocab_file,
target_file,
emb_file,
num_passes=10,
batch_size=32):
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
word_vector_values = get_embedding(emb_file)
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
evaluator.sum(name="error", input=crf_dec)
evaluator.chunk(
name="ner_chunk",
input=crf_dec,
label=target,
chunk_scheme="IOB",
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set("emb", word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
feeding = {"word": 0, "mark": 1, "target": 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if event.batch_id % 1 == 0:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, Batch %d, %s" %
(event.pass_id, event.batch_id, result.metrics))
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open("models/params_pass_%d.tar.gz" % event.pass_id,
"w") as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, %s" % (event.pass_id,
result.metrics))
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
if __name__ == "__main__":
main(
train_data_file='data/train',
test_data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt',
emb_file='data/wordVectors.txt')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import argparse
import numpy as np
from collections import defaultdict
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype=float)
def load_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which may contain multiple columns
separated by tabs.
This function takes the first column (columns in a line are separated by
tabs) as the key and the line number (the index of the word in the
dictionary) as the value.
"""
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
"""
Load the word dictionary from the given file.
Each line of the given file is a word, which may contain multiple columns
separated by tabs.
This function takes the line number (the index of the word in the
dictionary) as the key and the first column (columns in a line are
separated by tabs) as the value.
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
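To make the two helpers above concrete, a tiny usage sketch (the dictionary contents shown are hypothetical):
```python
# Illustrative sketch, not part of this commit. Suppose data/vocab.txt starts
# with the lines "<UNK>", "the", "EU" (one word per line; only the first
# tab-separated column is used).
from utils import load_dict, load_reverse_dict

word_dict = load_dict("data/vocab.txt")
# -> {"<UNK>": 0, "the": 1, "EU": 2, ...}

reverse_dict = load_reverse_dict("data/vocab.txt")
# -> {0: "<UNK>", 1: "the", 2: "EU", ...}
```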