diff --git a/README.md b/README.md
index fb0e20bf42560748f1c9633f19eb0d77090d1b32..33d6f94f3a0de2dd82709af50fc9ec55663a1ffd 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ PaddlePaddle provides a rich collection of operators that help users build mode
 
 In the word embedding examples we show how to use Hierarchical Sigmoid and Noise Contrastive Estimation (NCE) to speed up the training of word embeddings.
 
-- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/word_embedding)
+- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
 - 1.2 [Accelerating word embedding training with NCE](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)
 
diff --git a/hsigmoid/.gitignore b/hsigmoid/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29a9367f0e91889df8654ad4293f0649de2074f0
--- /dev/null
+++ b/hsigmoid/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+models
+
diff --git a/hsigmoid/README.md b/hsigmoid/README.md
index 66798f9a2fe8e7921dd819a444b19183bd70de67..b8af766ba3712e55c8447b5f0fcd5763209ff6b4 100644
--- a/hsigmoid/README.md
+++ b/hsigmoid/README.md
@@ -50,7 +50,7 @@ def train_data(filename, word_dict, n):
 ```
 
 ## Network structure
-This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the Hsigmoid layer. Figure 2 shows the detailed network structure:
+This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the `Hsigmoid` layer. Figure 2 shows the detailed network structure:
 
 [Figure 2: network structure diagram — image embed omitted from this excerpt]
@@ -60,41 +60,27 @@ def train_data(filename, word_dict, n):
 The code is implemented as follows:
 
 ```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
     embed_param_attr = paddle.attr.Param(
         name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
 
     hidden_layer = paddle.layer.fc(
         input=embed_context,
         size=hidden_size,
         act=paddle.activation.Sigmoid(),
         layer_attr=paddle.attr.Extra(drop_rate=0.5),
         bias_attr=paddle.attr.Param(learning_rate=2),
         param_attr=paddle.attr.Param(
@@ -105,27 +91,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
             input=hidden_layer,
             label=target_word,
             num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
         return cost
     else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
         return prediction
 ```
 
 Note that at prediction time the hsigmoid parameters have to be transposed once. The number of output classes here is the dictionary size minus 1, matching the number of non-leaf nodes: a full binary coding tree with N leaves (words) has exactly N - 1 internal nodes.
 
 ## Training
-Training is straightforward: simply run `python hsigmoid_train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 100 iterations, and the model is saved once per pass.
+Training is straightforward: simply run `python train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 10 batches, and the model is saved once per pass.
 
 ## Prediction
-For prediction, simply run `python hsigmoid_predict.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
+For prediction, simply run `python infer.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
 
 ```python
 def decode_res(infer_res, dict_size):
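For reference, a minimal sketch of the path decoding that `decode_res` performs, reconstructed from the description above. It assumes the standard hsigmoid layout: each row of `infer_res` holds one sigmoid activation per internal node of a complete binary tree (`dict_size - 1` of them), and an activation above 0.5 selects the right child. The helper name and threshold handling are illustrative, not the patch's exact code:

```python
import numpy as np


def decode_res_sketch(infer_res, dict_size):
    """Map hsigmoid node activations back to word ids (illustrative)."""
    predict_lbls = []
    # Each row: dict_size - 1 activations, one per internal tree node;
    # > 0.5 is read as "take the right child".
    for bits in np.asarray(infer_res) > 0.5:
        node, code = 0, 1
        while node < len(bits):
            code = (code << 1) | int(bits[node])        # append one path bit
            node = node * 2 + (2 if bits[node] else 1)  # right / left child
        predict_lbls.append(code - dict_size)           # leaf code -> word id
    return predict_lbls
```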
diff --git a/hsigmoid/hsigmoid_conf.py b/hsigmoid/hsigmoid_conf.py
deleted file mode 100644
index be6b7462a1487e906278fa2682d65add256aaa2d..0000000000000000000000000000000000000000
--- a/hsigmoid/hsigmoid_conf.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
-    embed_param_attr = paddle.attr.Param(
-        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
-
-    hidden_layer = paddle.layer.fc(
-        input=embed_context,
-        size=hidden_size,
-        act=paddle.activation.Sigmoid(),
-        layer_attr=paddle.attr.Extra(drop_rate=0.5),
-        bias_attr=paddle.attr.Param(learning_rate=2),
-        param_attr=paddle.attr.Param(
-            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
-
-    if is_train == True:
-        cost = paddle.layer.hsigmoid(
-            input=hidden_layer,
-            label=target_word,
-            num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
-        return cost
-    else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
-        return prediction
diff --git a/hsigmoid/index.html b/hsigmoid/index.html
index 83f6809d669d9ec6e0dd002f414ba8247068e270..c53e110fdb80fabe5d82709af50fc9ec5d4eb3512007d4a 100644
--- a/hsigmoid/index.html
+++ b/hsigmoid/index.html
@@ -92,7 +92,7 @@ def train_data(filename, word_dict, n):
 ```
 
 ## Network structure
-This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the Hsigmoid layer. Figure 2 shows the detailed network structure:
+This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the `Hsigmoid` layer. Figure 2 shows the detailed network structure:
 
 [Figure 2: network structure diagram — image embed omitted from this excerpt]
@@ -102,41 +102,27 @@ def train_data(filename, word_dict, n):
 The code is implemented as follows:
 
 ```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
     embed_param_attr = paddle.attr.Param(
         name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
 
     hidden_layer = paddle.layer.fc(
         input=embed_context,
         size=hidden_size,
         act=paddle.activation.Sigmoid(),
         layer_attr=paddle.attr.Extra(drop_rate=0.5),
         bias_attr=paddle.attr.Param(learning_rate=2),
         param_attr=paddle.attr.Param(
@@ -147,27 +133,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
             input=hidden_layer,
             label=target_word,
             num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
         return cost
     else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
         return prediction
 ```
 
 Note that at prediction time the hsigmoid parameters have to be transposed once. The number of output classes here is the dictionary size minus 1, matching the number of non-leaf nodes: a full binary coding tree with N leaves (words) has exactly N - 1 internal nodes.
 
 ## Training
-Training is straightforward: simply run `python hsigmoid_train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 100 iterations, and the model is saved once per pass.
+Training is straightforward: simply run `python train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 10 batches, and the model is saved once per pass.
 
 ## Prediction
-For prediction, simply run `python hsigmoid_predict.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
+For prediction, simply run `python infer.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
 
 ```python
 def decode_res(infer_res, dict_size):
diff --git a/hsigmoid/hsigmoid_predict.py b/hsigmoid/infer.py
similarity index 82%
rename from hsigmoid/hsigmoid_predict.py
rename to hsigmoid/infer.py
index 210f87ee103a2ac145e3c42cea536cd00d2994bb..32000238ee715e6ad8fcb9cb2484e7c532974987 100644
--- a/hsigmoid/hsigmoid_predict.py
+++ b/hsigmoid/infer.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
 import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.WARNING)
 
 
 def decode_res(infer_res, dict_size):
@@ -45,21 +50,20 @@ def predict(batch_ins, idx_word_dict, dict_size, prediction_layer, parameters):
 
     # Output format: word1 word2 word3 word4 -> predict label
     for i, ins in enumerate(batch_ins):
-        print(idx_word_dict[ins[0]] + ' ' + \
-              idx_word_dict[ins[1]] + ' ' + \
-              idx_word_dict[ins[2]] + ' ' + \
-              idx_word_dict[ins[3]] + ' ' + \
-              ' -> ' + predict_words[i])
+        print(" ".join([idx_word_dict[w]
+                        for w in ins]) + " -> " + predict_words[i])
+
+
+def main(model_path):
+    assert os.path.exists(model_path), "trained model does not exist."
 
-def main():
     paddle.init(use_gpu=False, trainer_count=1)
     word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
     dict_size = len(word_dict)
-    prediction_layer = network_conf(
+    prediction_layer = ngram_lm(
         is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
 
-    with gzip.open('./models/model_pass_00000.tar.gz') as f:
+    with gzip.open(model_path, "r") as f:
         parameters = paddle.parameters.Parameters.from_tar(f)
 
     idx_word_dict = dict((v, k) for k, v in word_dict.items())
@@ -79,5 +83,5 @@ def main():
         parameters)
 
 
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main("models/hsigmoid_pass_00000.tar.gz")
diff --git a/hsigmoid/network_conf.py b/hsigmoid/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..494494788c015fd76ab5914ba6c2a8161bde5785
--- /dev/null
+++ b/hsigmoid/network_conf.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import paddle.v2 as paddle
+
+
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
+    embed_param_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
+
+    hidden_layer = paddle.layer.fc(
+        input=embed_context,
+        size=hidden_size,
+        act=paddle.activation.Sigmoid(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5),
+        bias_attr=paddle.attr.Param(learning_rate=2),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
+
+    if is_train:
+        cost = paddle.layer.hsigmoid(
+            input=hidden_layer,
+            label=target_word,
+            num_classes=dict_size,
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
+        return cost
+    else:
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
+        return prediction
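Because the training branch (`paddle.layer.hsigmoid`) and the inference branch (`paddle.layer.fc`) of `ngram_lm` deliberately share the parameter names `"sigmoid_w"` and `"sigmoid_b"`, parameters trained through the first can be loaded into the second. A minimal sketch of that round trip, mirroring what `infer.py` does (the checkpoint path and word ids below are illustrative):

```python
import gzip

import paddle.v2 as paddle
from network_conf import ngram_lm

paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)

# Inference branch: dict_size - 1 sigmoid activations, one per internal
# node of the coding tree.
prediction = ngram_lm(
    hidden_size=256, embed_size=32, dict_size=len(word_dict), is_train=False)

# The fc layer picks up the weights learned through paddle.layer.hsigmoid
# because both branches name them "sigmoid_w" / "sigmoid_b".
# (Checkpoint path is illustrative -- use one produced by train.py.)
with gzip.open("models/hsigmoid_pass_00000.tar.gz", "r") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)

# Predict the word following one 4-gram of word ids (illustrative values).
probs = paddle.infer(
    output_layer=prediction, parameters=parameters, input=[(4, 42, 7, 93)])
```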
diff --git a/hsigmoid/hsigmoid_train.py b/hsigmoid/train.py
similarity index 55%
rename from hsigmoid/hsigmoid_train.py
rename to hsigmoid/train.py
index 0c2e1b236b284c3dfb32988b0d917eb830f365be..809c842af55b22daff3428db9b674065a16f1700 100644
--- a/hsigmoid/hsigmoid_train.py
+++ b/hsigmoid/train.py
@@ -1,40 +1,41 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
 import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
 
 
-def main():
+def main(save_dir="models"):
+    if not os.path.exists(save_dir):
+        os.mkdir(save_dir)
+
     paddle.init(use_gpu=False, trainer_count=1)
     word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
     dict_size = len(word_dict)
-    cost = network_conf(
-        is_train=True, hidden_size=256, embed_size=32, dict_size=dict_size)
+    cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
 
     def event_handler(event):
         if isinstance(event, paddle.event.EndPass):
-            model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
-            print("Save model into %s ..." % model_name)
-            with gzip.open(model_name, 'w') as f:
+            model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
+                                      event.pass_id)
+            logger.info("Save model into %s ..." % model_name)
+            with gzip.open(model_name, "w") as f:
                 parameters.to_tar(f)
 
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
+            if event.batch_id and event.batch_id % 10 == 0:
                 result = trainer.test(
                     paddle.batch(
                         paddle.dataset.imikolov.test(word_dict, 5), 32))
-                print("Pass %d, Batch %d, Cost %f, Test Cost %f" %
-                      (event.pass_id, event.batch_id, event.cost, result.cost))
-
-    feeding = {
-        'firstw': 0,
-        'secondw': 1,
-        'thirdw': 2,
-        'fourthw': 3,
-        'fifthw': 4
-    }
+                logger.info(
+                    "Pass %d, Batch %d, Cost %f, Test Cost %f" %
+                    (event.pass_id, event.batch_id, event.cost, result.cost))
 
     parameters = paddle.parameters.create(cost)
     adam_optimizer = paddle.optimizer.Adam(
@@ -48,9 +49,8 @@ def main():
             lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
             buf_size=1000), 64),
         num_passes=30,
-        event_handler=event_handler,
-        feeding=feeding)
+        event_handler=event_handler)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
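A note on the dropped `feeding` dict: `paddle.dataset.imikolov.train(word_dict, 5)` yields 5-tuples of word ids, and `ngram_lm` now declares its data layers in exactly that order (`__word00__` through `__word03__`, then `__target_word__`), so PaddlePaddle's default feeding order already matches the tuple order. A quick sketch to inspect the reader contract (assumes the imikolov dataset is already cached):

```python
import paddle.v2 as paddle

word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
# Each sample is five integer ids: four context words followed by the
# target word, matching the order of the data layers in ngram_lm.
sample = next(paddle.dataset.imikolov.train(word_dict, 5)())
print(len(sample))  # 5
```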