diff --git a/.copyright.hook b/.copyright.hook
index e28e88e2664b25b5aecce6660470c485b20a1a6e..2446e27248125134ab624ed557823993c90fafc5 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -49,12 +49,12 @@ def generate_copyright(template, lang='C'):
         LANG_COMMENT_MARK = "//"
 
     lines = template.split(NEW_LINE_MARK)
-    ans = LANG_COMMENT_MARK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    ans = LANG_COMMENT_MARK + " " + COPYRIGHT_HEADER + NEW_LINE_MARK
     for lino, line in enumerate(lines):
         if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
-        ans += LANG_COMMENT_MARK + line + NEW_LINE_MARK
+        ans += LANG_COMMENT_MARK + " " + line + NEW_LINE_MARK
 
-    return ans
+    return ans + "\n"
 
 
 def lang_type(filename):
@@ -90,7 +90,7 @@ def main(argv=None):
     retv = 0
     for filename in args.filenames:
         first_line = io.open(filename).readline()
-        if "Copyright" in first_line: continue
+        if "COPYRIGHT" in first_line.upper() : continue
         original_contents = io.open(filename).read()
         new_contents = generate_copyright(
             COPYRIGHT, lang_type(filename)) + original_contents
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..54131b48eca463aef817a4b96ba1b64de4b60aab
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,46 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/
diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..c3a22f29cce443de1865f30e67663ba5a60a3040
--- /dev/null
+++ b/CODE_OF_CONDUCT_cn.md
@@ -0,0 +1,50 @@
+# 貢獻者公約
+
+## 我們的承諾
+
+為了促進一個開放透明且受歡迎的環境，我們作為貢獻者和維護者保證，無論年齡、種族、民族、性別認同和表達、體型、殘疾、經驗水平、國籍、個人表現、宗教或性別取向，在我們的專案以及社群的參與者都有不被騷擾的體驗。
+
+## 我們的準則
+
+舉例來說有助於創造正面環境的行為包括：
+* 使用歡迎和包容性語言
+* 尊重不同的觀點和經驗
+* 優雅地接受建設性批評
+* 關注在對於社群最好的事情上
+* 對其他社群成員的表現友善
+
+舉例來說身為參與者不能接受的行為包括：
+* 使用與性有關的言語或是圖像，以及不受歡迎的性騷擾
+* 酸民/反串/釣魚行為或進行侮辱/貶損的評論，人身攻擊及政治攻擊
+* 公開或私下的騷擾
+* 未經許可地發布他人的個人資料，例如住址或是電子地址
+* 其他可以被合理地認定為不恰當或者違反職業操守的行為
+
+## 我們的責任
+
+專案維護者有責任為"可接受的行為"準則做出詮釋，以及對已發生的不被接受的行為採取恰當且公平的糾正措施。
+
+專案維護者有權力及責任去刪除、編輯、拒絕與本行為準則有所違背的評論(comments)、提交(commits)、程式碼、wiki 編輯、問題(issues)和其他貢獻，以及專案維護者可暫時或永久性的禁止任何他們認為有不適當、威脅、冒犯、有害行為的貢獻者。
+
+## 使用範圍
+
+當一個人代表該專案或是其社群時，本行為準則適用於其專案平台和公共平台。
+
+代表專案或是社群的情況，舉例來說包括使用官方專案的電子郵件地址、通過官方的社群媒體帳號發布或線上或線下事件中擔任指定代表。
+
+該專案的呈現方式可由其專案維護者進行進一步的定義及解釋。
+
+## 強制執行
+
+可以透過paddle-dev@baidu.com，來聯繫專案團隊來報告濫用、騷擾或其他不被接受的行為。
+
+任何維護團隊認為有必要且適合的所有投訴都將進行審查及調查，並做出相對應的回應。專案小組對事件回報者有保密的義務。具體執行方針的進一步細節可能會單獨公佈。
+
+沒有真誠的遵守或是執行本行為準則的專案維護人員，可能會因專案領導人或是其他成員的決定，暫時或是永久的取消其身份。
+
+## 來源
+
+本行為準則改編自[貢獻者公約][首頁]，版本 1.4
+可在此觀看https://www.contributor-covenant.org/zh-tw/version/1/4/code-of-conduct.html
+
+[首頁]: https://www.contributor-covenant.org
diff --git a/adversarial/advbox/attacks/gradientsign.py b/adversarial/advbox/attacks/gradientsign.py
index 77d93bd793936abbaae0302050e2bcf714adfa1a..cc26ffb69020a87f559c537f03de84f7c2bea2de 100644
--- a/adversarial/advbox/attacks/gradientsign.py
+++ b/adversarial/advbox/attacks/gradientsign.py
@@ -49,3 +49,39 @@ class GradientSignAttack(Attack):
 
 
 FGSM = GradientSignAttack
+
+
+class IteratorGradientSignAttack(Attack):
+    """
+    This attack was originally implemented by Alexey Kurakin(Google Brain).
+    Paper link: https://arxiv.org/pdf/1607.02533.pdf
+    """
+
+    def _apply(self, image_label, epsilons=100, steps=10):
+        """
+        Apply the iterative gradient sign attack.
+        Args:
+            image_label(list): The image and label tuple list of one element.
+            epsilons(list|tuple|int): The epsilon (input variation parameter).
+            steps(int): The number of iterator steps.
+        Return:
+            numpy.ndarray: The adversarail sample generated by the algorithm.
+        """
+        assert len(image_label) == 1
+        pre_label = np.argmax(self.model.predict(image_label))
+        gradient = self.model.gradient(image_label)
+        min_, max_ = self.model.bounds()
+
+        if not isinstance(epsilons, Iterable):
+            epsilons = np.linspace(0, 1, num=epsilons + 1)
+
+        for epsilon in epsilons:
+            adv_img = image_label[0][0].reshape(gradient.shape)
+            for _ in range(steps):
+                gradient = self.model.gradient([(adv_img, image_label[0][1])])
+                gradient_sign = np.sign(gradient) * (max_ - min_)
+                adv_img = adv_img + epsilon * gradient_sign
+                adv_img = np.clip(adv_img, min_, max_)
+                adv_label = np.argmax(self.model.predict([(adv_img, 0)]))
+                if pre_label != adv_label:
+                    return adv_img
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
index 31466faa37c47c66e4fe4628e28c867875e89f2e..44de3800a8a42d90debae2c567795789f3eb0a7d 100644
--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 62c154e65dcff1bdfb00109bf1b724c34731652e..0eb531cf021bd63c5ad93d9a1d12e9be811e4407 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -364,6 +364,12 @@ split
 ..  autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+
+matmul
+------
+..  autofunction:: paddle.v2.fluid.layers.matmul
+    :noindex:
+
 logsigmoid
 ----------
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
@@ -493,3 +499,8 @@ swish
 ------
 ..  autofunction:: paddle.v2.fluid.layers.swish
     :noindex:
+
+l2_normalize
+------------
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+    :noindex:
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index cca0dcdf082c0d809fab1aebba2c0b6c7b8efa2a..f6b1cb4ba10659fb336899f08376c265c67290f1 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -25,3 +25,9 @@ glu
 ..  autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
+
+dot_product_attention
+---------------------
+..  autofunction:: paddle.v2.fluid.nets.dot_product_attention
+    :noindex:
+
diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py
index 4cc58dfee0bd6dade0340b4fd0ee1adb49ffebf6..ee71cd7a9a4fbddb93fa3aa2d9349f01f3673982 100644
--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import paddle.v2 as paddle
 import numpy as np
 
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
index 563ec5ca21ec5d75800fa201943d65e6d6fe51ea..a889ae4ffab7be02468b4a5ac5a18e3cc77803c9 100644
--- a/doc/howto/usage/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
@@ -19,7 +19,7 @@
 
 ### 基本使用概念
 
-- 在PaddlePaddle内部，神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体，如果神经网络有多个输入或者多个输入，每一个输入/输入都会对应有自己的`Argument`。
+- 在PaddlePaddle内部，神经网络中一个计算层的输入/输出被组织为一个 `Argument` 结构体，如果神经网络有多个输入或者多个输出，每一个输入/输出都会对应有自己的`Argument`。
 - `Argument` 并不真正“存储”数据，而是将输入/输出信息有机地组织在一起。
 - 在`Argument`内部由`IVector`（对应着上文提到的一维整型数组）和`Matrix`（对应着上文提到的二维浮点型矩阵）来实际存储数据；由 `Sequence Start Positions` (下文详细解释) 来描述输入/输出的序列信息。
 
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..419eac51aa52c765a202856b3f1620e742b29cb6
--- /dev/null
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -0,0 +1,138 @@
+# Fluid Distributed Training
+
+## Introduction
+
+In this article, we'll explain how to config and run distributed training jobs with PaddlePaddle Fluid in a bare metal cluster.
+
+## Preparations
+
+### Get your cluster ready
+
+Prepare the compute nodes in your cluster. Nodes can be of any specification that runs PaddlePaddle, and each node must have a unique IP address assigned to it. Make sure they can communicate with each other.
+
+### Have PaddlePaddle installed
+
+PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries.
+
+The PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
+
+### Update training script
+
+#### Non-cluster training script
+
+Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example.
+
+This demo's non-cluster version with fluid API is as follows:
+
+``` python
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+We created a simple fully connected neural network training program and handed it to the fluid executor to run for 100 passes.
+
+Now let's try to convert it to a distributed version to run in a cluster.
+
+#### Introducing parameter server
+
+As you can see from the non-cluster version of the training script, there is only one role in it: the trainer, which does the computing as well as holding the parameters. In cluster training, since multiple trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle.
+
+![parameter server architect](src/trainer.png)
+
+Parameter Server in fluid does not only hold parameters but is also assigned with a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more tech detail, please refer to this [document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md).
+
+Now we need to create programs for both trainers and parameter servers; the question is how?
+
+#### Slice the program
+
+Fluid provides a tool called "Distribute Transpiler" to automatically convert a non-cluster program into a cluster program.
+
+The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into 2 pieces, and connect them with send/receive OPs.
+
+Optimize OPs and gradient parameters can be found from the return values of optimizer's minimize function.
+
+To put them together:
+
+``` python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) 
+
+... #create executor
+
+# in pserver, run this
+exe.run(fluid.default_startup_program())
+#current_endpoint here means current pserver IP:PORT you wish to run on
+exe.run(t.get_pserver_program(current_endpoint, optimize_ops)) 
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+
+
+```
+
+### E2E demo
+
+Please find the complete demo [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). On the parameter server node, run this on the command line:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+```
+
+*please note we assume that your parameter server runs at 192.168.1.2:6174*
+
+Wait until the prompt `Server listening on 192.168.1.2:6174`
+
+Then on each of your 2 trainer nodes, run this:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
+```
+
+*You need to run this command on 2 nodes because the script sets the trainer count to 2. You can change this setting on line 50 of the script.*
+
+Now you have 2 trainers and 1 parameter server up and running.
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 87a57d095141cc456af2cbabbc227715a02375e9..3e239e9911d03a43987825ffa7824298a748ebda 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -135,6 +135,65 @@ bool operator==(const LoD &a, const LoD &b) {
   return true;
 }
 
+// Returns true if `in` is a well-formed relative-offset LoD (empty is valid).
+// A positive `tensor_height` must equal the lowest level's last offset.
+bool CheckLoD(const LoD &in, int tensor_height) {
+  if (in.empty()) return true;
+  for (const auto &level : in) {
+    // check: there should be at least 2 offsets in each level.
+    if (level.size() < 2) return false;
+    // check: the first offset(the begin offset) of each level should be 0.
+    if (level.front() != 0) return false;
+    // check: the offsets of a level should be sorted over the WHOLE level.
+    // NOTE(review): `<` lets equal neighbours pass; confirm if that is wanted.
+    if (!std::is_sorted(level.begin(), level.end(),
+                        [](size_t a, size_t b) { return a < b; })) {
+      LOG(INFO) << "ascending error";
+      return false;
+    }
+  }
+  // check: the lowest level's last offset should equal `tensor_height` if
+  //        tensor_height>0.
+  if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
+    return false;
+
+  // check: the higher level's last offset should equal the lower level's
+  // size-1.
+  // NOTE LoD stores the levels from top to bottom, so the higher level goes
+  // first.
+  for (size_t level = 0; level < in.size() - 1; level++) {
+    if (in[level].back() != in[level + 1].size() - 1) return false;
+  }
+  return true;
+}
+
+// Returns true if `in` is a well-formed absolute-offset LoD (empty is valid).
+// A non-negative `tensor_height` must equal every level's last offset.
+bool CheckAbsLoD(const LoD &in, int tensor_height) {
+  if (in.empty()) return true;
+  for (const auto &level : in) {
+    // check: the offsets of a level should be sorted over the WHOLE level.
+    // NOTE(review): `<` lets equal neighbours pass; confirm if that is wanted.
+    if (!std::is_sorted(level.begin(), level.end(),
+                        [](size_t a, size_t b) { return a < b; })) {
+      return false;
+    }
+
+    // check: there should be at least 2 offsets in each level.
+    if (level.size() < 2) return false;
+
+    // check: the first offset of each level should be 0, and the last should be
+    // the same for all levels (the height of the underlying tensor).
+    if (level.front() != 0) return false;
+    if (tensor_height < 0) {
+      tensor_height = level.back();  // adopt the first level's height
+    } else if ((size_t)tensor_height != level.back()) {
+      return false;
+    }
+  }
+  return true;
+}
+
 using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
 LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
                                         size_t end_idx, size_t start_level) {
@@ -232,23 +291,32 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   check_memory_size();
   PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
-  PADDLE_ENFORCE(dims()[0] % places.size() == 0,
-                 "Batch size should be divided by places size");
-
-  std::vector<LoDTensor> lods;
-  for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
-    int begin = place_idx * dims()[0] / places.size();
-    int end = (place_idx + 1) * dims()[0] / places.size();
+  size_t result_size = std::min(static_cast<size_t>(dims()[0]), places.size());
+  size_t remainder = dims()[0] % places.size();
+
+  std::vector<LoDTensor> results;
+  results.reserve(result_size);
+
+  int step_width = static_cast<int>(dims()[0] / result_size);
+  for (size_t i = 0; i < result_size; ++i) {
+    int begin = static_cast<int>(i * step_width);
+    int end = static_cast<int>((i + 1) * step_width);
+    if (i + 1 == places.size()) {  // last
+      end += remainder;
+    }
 
     auto src = Slice(begin, end);
-    auto &dst_place = places[place_idx];
+    auto &dst_place = places[i];
     LoDTensor dst;
-    framework::Copy(src, dst_place, &dst);
-
-    lods.emplace_back(dst);
+    if (!(dst_place == place())) {
+      framework::Copy(src, dst_place, &dst);
+    } else {  // It is no need to copy if src_place and dst_place are same.
+      dst.ShareDataWith(src);
+    }
+    results.emplace_back(dst);
   }
 
-  return lods;
+  return results;
 }
 
 // TODO(tonyyang-svail): make this function support LoD
@@ -259,12 +327,17 @@ void LoDTensor::MergeLoDTensor(
   framework::DDim new_dim = lod_tensors[0]->dims();
   std::type_index new_type = lod_tensors[0]->type();
   auto new_layout = lod_tensors[0]->layout();
+  int64_t new_height = 0;
   for (auto *lod : lod_tensors) {
-    PADDLE_ENFORCE(new_dim == lod->dims());
-    PADDLE_ENFORCE(new_type == lod->type());
-    PADDLE_ENFORCE(new_layout == lod->layout());
+    new_height += lod->dims()[0];
+    for (int i = 1; i < new_dim.size(); ++i) {
+      PADDLE_ENFORCE_EQ(new_dim[i], lod->dims()[i]);
+    }
+
+    PADDLE_ENFORCE_EQ(new_type, lod->type());
+    PADDLE_ENFORCE_EQ(new_layout, lod->layout());
   }
-  new_dim[0] *= lod_tensors.size();
+  new_dim[0] = new_height;
   Resize(new_dim);
   set_layout(new_layout);
 
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 88ea78f2682b2ffc962c9663f6b3c636dedb931d..9d1294fdeb9bd76bf944f7ec3687e3c5bb333241 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -71,6 +71,38 @@ LoD ToAbsOffset(const LoD& in);
 
 bool operator==(const LoD& a, const LoD& b);
 
+/*
+ * Check whether this lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check five things:
+ *
+ *  1. all the offsets in a level should be ascending(no same items allowed).
+ *  2. there should be at least 2 offsets existing in each level.
+ *  3. the higher level's last offset should equal the lower level's size-1.
+ *  4. the first offset(the begin offset) of each level should be 0.
+ *  5. the lowest level's last offset should equal `tensor_height` if
+ * tensor_height>0.
+ */
+
+bool CheckLoD(const LoD& in, int tensor_height = -1);
+/*
+ * Check whether this absolute lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check three things:
+ *  1. all the offsets in a level should be ascending(no same items allowed)
+ *  2. there should be at least 2 offsets existing in each level.
+ *  3. the first offset of each level should be 0, and the last should be the
+ *     same(the height of underlying tensor) or `tensor_height` if
+ *     tensor_height>0.
+ */
+bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
+
 /*
  * LoDTensor (Level of details Tensor)
  * see https://en.wikipedia.org/wiki/Level_of_details for reference.
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 19ae7815cc1008f9f468821d61b26f221e266d5e..9c7ad6c7b47952bd137eeedf302e2b9182fe8279 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -1,28 +1,16 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
 
@@ -35,38 +23,6 @@
 namespace paddle {
 namespace framework {
 
-const int kLodTensorSize = 20 * 128;
-
-class LoDTensorTester : public ::testing::Test {
- public:
-  virtual void SetUp() override {
-    // tensor's batch_size: 30
-    // 3 levels
-    // 0 10 20
-    // 0 5 10 15 20
-    // 0 2 5 7 10 12 15 20
-    LoD lod;
-    lod.push_back(std::vector<size_t>{0, 2, 3});
-    lod.push_back(std::vector<size_t>{0, 2, 5, 8});
-    lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
-
-    ASSERT_EQ(lod.size(), 3UL);
-
-    lod_tensor_.Resize({20 /*batch size*/, 128 /*dim*/});
-    // malloc memory
-    float* dst_ptr = lod_tensor_.mutable_data<float>(place);
-    for (int i = 0; i < kLodTensorSize; ++i) {
-      dst_ptr[i] = i;
-    }
-
-    lod_tensor_.set_lod(lod);
-  }
-
- protected:
-  platform::CPUPlace place;
-  LoDTensor lod_tensor_;
-};
-
 TEST(LodExpand, test) {
   LoD lod{{0, 2}};
   LoDTensor tensor;
@@ -144,5 +100,53 @@ TEST(LoD, ToAbsOffset) {
   EXPECT_EQ(abs_lod, expected);
 }
 
+TEST(LoD, CheckLoD) {
+  // An empty LoD is always accepted.
+  LoD no_levels;
+  ASSERT_TRUE(CheckLoD(no_levels));
+
+  // A level holding fewer than 2 offsets is rejected.
+  LoD short_level;
+  short_level.push_back(std::vector<size_t>({0}));
+  ASSERT_FALSE(CheckLoD(short_level));
+
+  // Three consistent levels: each level's last offset indexes the next one.
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2}));
+  lod.push_back(std::vector<size_t>({0, 1, 3}));
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  ASSERT_TRUE(CheckLoD(lod));
+  // Breaking the link between level 1 and level 2 must be detected.
+  ++lod[1].back();
+  ASSERT_FALSE(CheckLoD(lod));
+  --lod[1].back();  // undo the corruption
+
+  // The lowest level's last offset has to match a given tensor height.
+  ASSERT_TRUE(CheckLoD(lod, 5));
+  ASSERT_FALSE(CheckLoD(lod, 9));
+}
+
+TEST(LoD, CheckAbsLoD) {
+  // A level holding fewer than 2 offsets is rejected.
+  LoD short_level;
+  short_level.push_back(std::vector<size_t>({0}));
+  ASSERT_FALSE(CheckAbsLoD(short_level));
+
+  // Build a relative LoD and convert it to absolute offsets.
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2}));
+  lod.push_back(std::vector<size_t>({0, 1, 3}));
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  LoD absolute = ToAbsOffset(lod);
+
+  // The converted LoD is valid as is.
+  ASSERT_TRUE(CheckAbsLoD(absolute));
+
+  // Every level must end at the same height, so bumping the last offset of
+  // the lowest level makes the LoD invalid.
+  ++absolute.back().back();
+  ASSERT_FALSE(CheckAbsLoD(absolute));
+  --absolute.back().back();  // restore
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index d75c0233e8e0134ddf4edc50c07490a234b65cd0..5de9ae559c435439f30931c7840e54e0d2bb744c 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -177,16 +177,16 @@ class OpKernelRegistrar : public Registrar {
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, DEVICE_TYPE, place_class, ...)        \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
-      "REGISTER_OP_KERNEL must be called in global namespace");           \
-  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type,       \
-                                                          #DEVICE_TYPE);  \
-  int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() {                \
-    __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch();          \
-    return 0;                                                             \
+#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      "REGISTER_OP_KERNEL must be called in global namespace");            \
+  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
+      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
+                                                           #LIBRARY_TYPE); \
+  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
+    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+    return 0;                                                              \
   }
 
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
@@ -208,14 +208,14 @@ class OpKernelRegistrar : public Registrar {
   static int use_op_itself_##op_type##_ __attribute__((unused)) = \
       TouchOpRegistrar_##op_type()
 
-#define USE_OP_DEVICE_KERNEL(op_type, DEVICE_TYPE)               \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                \
-      __use_op_kernel_##op_type##_##DEVICE_TYPE##__,             \
-      "USE_OP_DEVICE_KERNEL must be in global namespace");       \
-  extern int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE(); \
-  static int use_op_kernel_##op_type##_##DEVICE_TYPE##_          \
-      __attribute__((unused)) =                                  \
-          TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE()
+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)               \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
+      __use_op_kernel_##op_type##_##LIBRARY_TYPE##__,             \
+      "USE_OP_DEVICE_KERNEL must be in global namespace");        \
+  extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
+  static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_          \
+      __attribute__((unused)) =                                   \
+          TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
 
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
index 44bb0883b89c712d70e2d4fdfe16bdfde86f81b7..520ccc1a995e966de73080b61a8c20cbee722267 100644
--- a/paddle/gserver/layers/MKLDNNConcatLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -43,7 +43,7 @@ void MKLDNNConcatLayer::reshape(
   channels_[0] = ic;
   oc = ic;
   for (size_t i = 1; i < inputLayers_.size(); i++) {
-    int batchsize, height, witdh;
+    int batchsize = 0, height = 0, witdh = 0;
     reshapeInput(batchsize, height, witdh, i);
     CHECK_EQ(bs, batchsize);
     CHECK_EQ(ih, height);
@@ -84,6 +84,8 @@ void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
   bool has8c = false, has16c = false, hasnc = false;
   for (size_t i = 0; i < inputs.size(); i++) {
     resetInValue(inputs[i], nullptr, i, channels_[i]);
     CHECK(inputs[i]);
+    // Dereference inputs[i] only after the null check above has passed.
+    inputs[i]->downSpatial();
     auto dm = inputs[i]->getDims();
     // inputs format can be different, but ndims must equal
diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/gserver/tests/img_conv_cudnn.py
index e424261bda2c0fea4b1796ce443810e8e4e4599b..0ea6d6bae66b0a307748bd0d0fa9a53ed5f7927d 100644
--- a/paddle/gserver/tests/img_conv_cudnn.py
+++ b/paddle/gserver/tests/img_conv_cudnn.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/gserver/tests/img_conv_exconv.py
index 3b59cd80b8b8349f022b774b36d4e4db2d517b73..c618cdab27c52d70042b0a118f7f6fe935a6b9d7 100644
--- a/paddle/gserver/tests/img_conv_exconv.py
+++ b/paddle/gserver/tests/img_conv_exconv.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/gserver/tests/pyDataProvider.py
index 7235a239439b7544805d1bd06dfb1a72c2e0e937..d2ad5888b5a4c79d8b663ce8c2f313184151beb6 100644
--- a/paddle/gserver/tests/pyDataProvider.py
+++ b/paddle/gserver/tests/pyDataProvider.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import numpy
 import struct
 import traceback
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
index 913365a5a4037d14fcba1e1546508ba89668e0d6..063a4127e542d23012359a2eac0045bf69a51356 100644
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 from paddle.trainer.PyDataProvider2 import *
 
 # Note that each config should has an independent provider
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index fd725727c04677b5ea8918f6721f0c007e80915d..04a1732d61c8618984d16550acf7c94da1bd3578 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import os
 import sys
 
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
index 7303d088043d5096a3491d3b3b32b231bde09a0a..aeaaa221f9fab981af88cfd63c30349e1b02a0ee 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
@@ -1,18 +1,16 @@
-# edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 from paddle.trainer_config_helpers import *
 
 ######################## data source ################################
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
index d20d3331ae6887aa3bca9df7e7817e08ebb91a48..8786a5465db82d786d3772357b02ab837073a576 100644
--- a/paddle/gserver/tests/sequence_recurrent.py
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
index a1d54542e3bc4e89f70d31d5e89c0f44953c9f90..8b5a3d49838c9bb49321a9d7514fc0241e6d67cd 100644
--- a/paddle/gserver/tests/sequence_recurrent_group.py
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -1,18 +1,16 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from paddle.trainer_config_helpers import *
 
 ######################## data source ################################
diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
index b156fa9baaf386a8625d63020d72828047ddca2f..0c55f2cf9d07b194aa06f88892f831f1a9ce6436 100644
--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-# edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
index c2c98aae5f5c5f73c61b309382f2e3d7d649ef47..22b376b91aa4736d16fead698105466d679dd248 100644
--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-# edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
index c812c6ced2458b6c40e141606655aa45e2bbc55a..3ce87490bbd0f30a3c42b947b073adb2a6c5b51c 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
@@ -1,4 +1,4 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
 #Licensed under the Apache License, Version 2.0 (the "License");
 #you may not use this file except in compliance with the License.
@@ -11,20 +11,6 @@
 #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #See the License for the specific language governing permissions and
 #limitations under the License.
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 0d0fe476ff5eac8bf8ad1c9fe09b32c1a8f73ebc..044aede98e684a432c48b3ea5bb82a4a677682d4 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import random
 
 from paddle.trainer.PyDataProvider2 import *
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index 573bb9c7dfdac2366c2458dd9f27a035a9f9b813..7adb74eab78dcdd0251b8db60781f6e24e348634 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -51,8 +51,8 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Clip Operator.
 
-The clip operator limits the value of given input within an interval. The interval is
-specified with arguments 'min' and 'max':
+The clip operator limits the value of given input within an interval. The
+interval is specified with arguments 'min' and 'max':
 
 $$
 Out = \min(\max(X, min), max)
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index 6478e1e0c2e1cfc8a1be5e8842113ec8ca33d762..a8389429f26c17ceab1db22175c90888546ead6f 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -28,39 +28,9 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    using Tensor = framework::Tensor;
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
-    TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
-        x, y, z, ctx.template device_context<DeviceContext>(), AddFunctor<T>());
-
-    auto x_dims = x->dims();
-    auto y_dims = y->dims();
-    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.");
-
-    if (x_dims == y_dims) {
-      functor.Run();
-      return;
-    }
-
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                   "Axis should be in range [0, x_dims)");
-
-    int pre, n, post;
-    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-    if (post == 1) {
-      functor.RunRowWise(n, pre);
-      return;
-    } else {
-      functor.RunMidWise(n, pre, post);
-      return;
-    }
+    // All of the work is delegated to ElementwiseComputeEx, instantiated with
+    // AddFunctor<T> as the binary functor.
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
   }
 };
 
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index 7783875e24ef0d94e50f49b3598c93ff8bd5d73f..ef26cb6c914f50ded07cc9d0d8de3f49f2151129 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -19,11 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+// Binary functor returning the quotient a / b of its two arguments.
+template <typename T>
+struct DivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+};
+
 template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenDivFunctor, DeviceContext, T>(ctx);
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
   }
 };
 
diff --git a/paddle/operators/elementwise_max_op.cc b/paddle/operators/elementwise_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53c27ae5be4cbfe85ce61aa27196594ae152eea4
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_max_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+// Op maker for elementwise_max: extends ElementwiseOpMaker and sets the
+// operator comment to "Max" / "Out = max(X, Y)".
+class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Max", "Out = max(X, Y)");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the op together with elementwise_max_grad, then the CPU forward
+// and gradient kernels for float, double, int and int64_t.
+REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
+            elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_max,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_max_grad,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_max_op.cu b/paddle/operators/elementwise_max_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ff4af17477cbd35b765cc00d46c95fda620e2df
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.cu
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_max_op.h"
+
+namespace ops = paddle::operators;
+
+// Register the CUDA forward and gradient kernels of elementwise_max for
+// float, double, int and int64_t.
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_max,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_max_grad,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..255728e8e620665a7de225b228c19d6c510da1c8
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Binary functor: returns a when a > b, otherwise b.
+template <typename T>
+struct MaxFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? a : b; }
+};
+
+// Forward kernel: forwards the execution context to ElementwiseComputeEx
+// instantiated with MaxFunctor<T>.
+template <typename DeviceContext, typename T>
+class ElementwiseMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+// Gradient functor that applies no broadcasting: x, y and dz are flattened to
+// 1-D, then dx = (x > y) * dz and dy = (x <= y) * dz. Each of dx / dy is
+// written only when its pointer is non-null. z is unused.
+template <typename T>
+struct ElementwiseMaxGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e).template cast<T>() * dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e <= y_e).template cast<T>() * dz_e;
+    }
+  }
+};
+
+// Gradient functor with y broadcast over a leading dimension: y (n elements)
+// is viewed as (1, n), repeated pre times, and flattened to x's size.
+// dx = (x > y_bcast) * dz; dy is (x <= y_bcast) * dz reshaped to (pre, n) and
+// summed over axis 0. z is unused.
+template <typename T>
+struct ElementwiseMaxBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+// Gradient functor with y broadcast over outer and inner dimensions: y
+// (n elements) is viewed as (1, n, 1), repeated to (pre, n, post), and
+// flattened to x's size. dx = (x > y_bcast) * dz; dy is (x <= y_bcast) * dz
+// reshaped to (pre, n, post) and summed over axes 0 and 2. z is unused.
+template <typename T>
+struct ElementwiseMaxBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+// Backward kernel: forwards the execution context to ElementwiseGradCompute
+// instantiated with the three gradient functors defined in this header.
+template <typename DeviceContext, typename T>
+class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
+                           ElementwiseMaxBroadCastGradFunctor<T>,
+                           ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_min_op.cc b/paddle/operators/elementwise_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99482e1bf60c88062087c5fe0105e90aa0a8677c
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_min_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseMinOpMaker : public ElementwiseOpMaker {
+ public:  // proto/doc maker for the elementwise_min operator
+  ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Min", "Out = min(X, Y)");  // op name and equation for the doc
+    AddComment(comment_);  // comment_ is the base maker's doc text
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
+            elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(  // forward CPU kernels: float, double, int, int64_t
+    elementwise_min,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(  // gradient CPU kernels for the same element types
+    elementwise_min_grad,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_min_op.cu b/paddle/operators/elementwise_min_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3547e6ccb77177002b1ecbee4e4604b602f72209
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_min_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+
+REGISTER_OP_CUDA_KERNEL(  // forward CUDA kernels: float, double, int, int64_t
+    elementwise_min,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(  // gradient CUDA kernels for the same element types
+    elementwise_min_grad,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6627a0f1bb468c8e4661b83489cb964b72dddb0
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct MinFunctor {  // binary functor: yields a when a < b, otherwise b
+  inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? a : b; }
+};
+
+template <typename DeviceContext, typename T>  // forward kernel of min
+class ElementwiseMinKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
+  }  // all work is delegated to ElementwiseComputeEx with MinFunctor<T>
+};
+
+template <typename T>
+struct ElementwiseMinGradFunctor {
+  // Same-shape gradient of min(x, y): dz goes to x where x < y, else to y.
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto lhs = framework::EigenVector<T>::Flatten(*x);
+    auto rhs = framework::EigenVector<T>::Flatten(*y);
+    auto out_grad = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {  // optional output: x receives dz only where x < y
+      auto x_grad = framework::EigenVector<T>::Flatten(*dx);
+      x_grad.device(d) = (lhs < rhs).template cast<T>() * out_grad;
+    }
+    if (dy) {  // optional output: y receives dz where x >= y (ties included)
+      auto y_grad = framework::EigenVector<T>::Flatten(*dy);
+      y_grad.device(d) = (lhs >= rhs).template cast<T>() * out_grad;
+    }
+  }
+};
+
+template <typename T>  // T: element type used to flatten x, y, dz, dx and dy
+struct ElementwiseMinBroadCastGradFunctor {  // min grad, y spans last axis
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    // Tile y, viewed as (1, n), by (pre, 1) and flatten it to x's size.
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {  // dx is optional; x receives dz only where x < y
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
+    }
+    // dy is optional; y receives dz where x >= y, summed over axis 0 (pre).
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>  // T: element type used to flatten x, y, dz, dx and dy
+struct ElementwiseMinBroadCast2GradFunctor {  // min grad, y spans a mid axis
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {  // z is accepted but never read here
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    // Tile y, viewed as (1, n, 1), by (pre, 1, post) and flatten to x's size.
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {  // dx is optional; x receives dz only where x < y
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
+    }
+    // dy is optional; y receives dz where x >= y, summed over axes 0 and 2.
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>  // backward kernel of min
+class ElementwiseMinGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
+                           ElementwiseMinBroadCastGradFunctor<T>,
+                           ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
+  }  // NOTE(review): which functor runs is decided in ElementwiseGradCompute
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 0e6559eacc0f90fec78397ce66372071370291e5..4b86b00b5a095ae898f9ce0c17cde2cc91060ba9 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -18,11 +18,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename T>
+struct MulFunctor {  // binary functor: returns the product a * b
+  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
+};
+
 template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenMulFunctor, DeviceContext, T>(ctx);
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
   }
 };
 
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index a342595b546bfca1a344cf8a549597df6a29adec..1a0131d8b943da3deebd0c461f78cb02b34e6dc2 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -26,9 +26,9 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null");
+                   "Input(X) of elementwise op should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null");
+                   "Input(Y) of elementwise op should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of elementwise op should not be null.");
 
@@ -45,12 +45,12 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The first input tensor of elementwise op");
-    AddInput("Y", "(Tensor) The second input tensor of elementwise op");
-    AddOutput("Out", "The output of elementwise op");
+    AddInput("X", "(Tensor), The first input tensor of elementwise op.");
+    AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
+    AddOutput("Out", "The output of elementwise op.");
     AddAttr<int>("axis",
-                 "(int, default -1) The starting dimension index "
-                 "for broadcasting Y onto X")
+                 "(int, default -1). The start dimension index "
+                 "for broadcasting Y onto X.")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     comment_ = R"DOC(
@@ -58,19 +58,18 @@ Limited Elementwise {name} Operator.
 
 The equation is:
 
-.. math::
-  {equation}
+$${equation}$$
 
-X is a tensor of any dimension and the dimensions of tensor Y must be smaller than
-or equal to the dimensions of X. 
+$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
+smaller than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
-1. The shape of Y is same with X;
-2. The shape of Y is a subset of X.
+1. The shape of $Y$ is same with $X$;
+2. The shape of $Y$ is a subset of $X$.
 
 For case 2:
-Y will be broadcasted to match the shape of X and axis should be 
-the starting dimension index for broadcasting Y onto X.
+$Y$ will be broadcasted to match the shape of $X$ and axis should be
+set to index of the start dimension to broadcast $Y$ onto $X$.
 
 For example
   .. code-block:: python
@@ -81,7 +80,8 @@ For example
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
 
-Either of the inputs X and Y or none can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X.
+Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
+information. However, the output only shares the LoD information with input $X$.
 
 )DOC";
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index 0c75276b03140473cee4b57a4022ff3c6989ab4c..db5d30c1af286913f8decd7ab74058fd732ead65 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -340,6 +340,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     return;
   }
 
+  if (y_dims.size() == 1 && y_dims[0] == 1) {
+    // y is a scalar
+    auto extended_dims = framework::vectorize(x_dims);
+    extended_dims.push_back(1);
+    x_dims = framework::make_ddim(extended_dims);
+  }
+
   int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
 
@@ -356,5 +363,50 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     return;
   }
 }
+
+template <typename Functor, typename DeviceContext, typename T>
+void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
+  // Runs Functor over inputs "X" and "Y" and writes the result to "Out".
+  // Equal shapes run directly; otherwise Y is matched to X from `axis` on.
+  using Tensor = framework::Tensor;
+  auto* lhs = ctx.Input<Tensor>("X");
+  auto* rhs = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Output<Tensor>("Out");
+  out->mutable_data<T>(ctx.GetPlace());
+  TransformFunctor<Functor, T, DeviceContext> transform(
+      lhs, rhs, out, ctx.template device_context<DeviceContext>(), Functor());
+
+  auto x_shape = lhs->dims();
+  auto y_shape = rhs->dims();
+  PADDLE_ENFORCE_GE(x_shape.size(), y_shape.size(),
+                    "Rank of first input must >= rank of second input.");
+
+  if (x_shape == y_shape) {  // identical shapes: plain element-wise run
+    transform.Run();
+    return;
+  }
+
+  const bool y_is_scalar = (y_shape.size() == 1 && y_shape[0] == 1);
+  if (y_is_scalar) {
+    // y is a scalar: append a size-1 dimension to the shape used for x.
+    auto padded = framework::vectorize(x_shape);
+    padded.push_back(1);
+    x_shape = framework::make_ddim(padded);
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  if (axis == -1) axis = x_shape.size() - y_shape.size();  // default axis
+  PADDLE_ENFORCE(axis >= 0 && axis < x_shape.size(),
+                 "Axis should be in range [0, x_dims)");
+
+  int pre, n, post;
+  get_mid_dims(x_shape, y_shape, axis, pre, n, post);
+  if (post == 1) {
+    transform.RunRowWise(n, pre);
+  } else {
+    transform.RunMidWise(n, pre, post);
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index 347e92f87c7a46d3935f69864718d63661bef05f..a2aca793026189ec87e00b52d7c351689f870400 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -18,11 +18,16 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename T>
+struct SubFunctor {  // binary functor: returns the difference a - b
+  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
+};
+
 template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenSubFunctor, DeviceContext, T>(ctx);
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
   }
 };
 
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
index 08fa91ed72aa41ed2f513c090b9085410bb5cc47..043c93654d33f7c105c89960e18ec72d3557237d 100644
--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -58,21 +58,21 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
   ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
-             "X is the input tensor to be expanded.");
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
     AddOutput("Out",
-              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
-              "The rank of Output(Out) is same as Input(X) except that each "
-              "dimension size of Output(Out) is equal to corresponding "
-              "dimension size of Input(X) multiplying corresponding value of "
-              "Attr(expand_times).");
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
     AddAttr<std::vector<int>>("expand_times",
                               "Expand times number for each dimension.");
     AddComment(R"DOC(
 Expand operator tiles the input by given times number. You should set times
 number for each dimension by providing attribute 'expand_times'. The rank of X
-should be in [1, 6]. Please notice that size of 'expand_times' must be same with
-X's rank. Following is a using case:
+should be in [1, 6]. Please note that size of 'expand_times' must be the same
+with X's rank. Following is a using case:
 
 Input(X) is a 3-D tensor with shape [2, 3, 1]:
 
diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md
index 80948adf2b9047a9685dbdd90b2296b5a955f9c1..d1392619c42d9206bf4bddcd33ad11b033e6cbdb 100644
--- a/paddle/operators/op_documentation/batch_norm_op.md
+++ b/paddle/operators/op_documentation/batch_norm_op.md
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
 
 The following graph showes the training computational process of `batch_norm_op`:
 
-<img src="./images/batch_norm_op_kernel.png" width="800"/>
+<img src="../images/batch_norm_op_kernel.png" width="800"/>
 
 cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
 
@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
 `is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
 
 <div align=center>
-<img src="./images/batch_norm_fork.png" width="500"/>
+<img src="../images/batch_norm_fork.png" width="500"/>
 </div>
 
 Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate. 
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index e1bec0421e76143bef669a4f6fa373cdf01226b2..c2561fa2bf3aa0992f32ed1295c6640d55e6322b 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -30,16 +30,13 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
 
 static constexpr char kParallelBlock[] = "sub_block";
 
-// using ParallelScopeVar = std::vector<framework::Scope *>;
 using LoDTensor = framework::LoDTensor;
-using OperatorBase = framework::OperatorBase;
 
-void SplitTensorAndMoveTensorToScopes(
-    const framework::Scope &scope,
-    const std::vector<framework::Scope *> &sub_scopes,
+static void SplitTensorAndMoveTensorToScopes(
+    const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
     const std::vector<platform::Place> &places,
     const std::vector<std::string> &names) {
-  PADDLE_ENFORCE_EQ(sub_scopes.size(), places.size());
+  size_t num_sub_scopes = 0;
   for (auto &argu : names) {
     auto *var = scope.FindVar(argu);
     const auto &tensor = var->Get<LoDTensor>();
@@ -48,9 +45,21 @@ void SplitTensorAndMoveTensorToScopes(
     for (auto &lod : lod_tensors) {
       VLOG(3) << lod.dims();
     }
+    if (num_sub_scopes == 0) {
+      num_sub_scopes = lod_tensors.size();
+    } else {
+      PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
+    }
+    PADDLE_ENFORCE_NE(num_sub_scopes, 0);
+    if (sub_scopes->size() == 0) {
+      sub_scopes->reserve(num_sub_scopes);
+      for (size_t i = 0; i < num_sub_scopes; ++i) {
+        sub_scopes->emplace_back(&scope.NewScope());
+      }
+    }
 
-    for (size_t i = 0; i < sub_scopes.size(); ++i) {
-      *sub_scopes[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i];
+    for (size_t i = 0; i < lod_tensors.size(); ++i) {
+      *(*sub_scopes)[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i];
     }
   }
 }
@@ -70,7 +79,7 @@ class ParallelDoOp : public framework::OperatorBase {
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
            const platform::Place &place) const override {
@@ -85,19 +94,17 @@ class ParallelDoOp : public framework::OperatorBase {
 
     auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
                             ->GetMutable<std::vector<framework::Scope *>>();
-    for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
-      sub_scopes.push_back(&scope.NewScope());
-    }
 
     // split input
-    SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places,
+    SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
                                      Inputs(kInputs));
+
     // copy parameter
     for (auto &param : Inputs(kParameters)) {
       PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
                      "Only support parameter type as LoDTensor");
       auto &src = scope.FindVar(param)->Get<LoDTensor>();
-      for (size_t i = 0; i < places.size(); ++i) {
+      for (size_t i = 0; i < sub_scopes.size(); ++i) {
         auto &place = places[i];
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
@@ -108,9 +115,7 @@ class ParallelDoOp : public framework::OperatorBase {
 
     std::vector<std::future<void>> workers;
     workers.reserve(places.size());
-    for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
-      VLOG(3) << "Run " << place_idx;
-
+    for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
       auto &place = places[place_idx];
       auto *cur_scope = sub_scopes[place_idx];
 
@@ -157,21 +162,16 @@ ParallelDo Operator.
   }
 };
 
-class ParallelDoGradOp : public OperatorBase {
+class ParallelDoGradOp : public framework::OperatorBase {
  public:
   ParallelDoGradOp(const std::string &type,
                    const framework::VariableNameMap &inputs,
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
   void Run(const framework::Scope &scope,
            const platform::Place &place) const override {
-    // // get device context from pool
-    // platform::DeviceContextPool &pool =
-    //        platform::DeviceContextPool::Instance();
-    // auto &dev_ctx = *pool.Get(place);
-
     auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
     auto *program = block->Program();
 
@@ -181,26 +181,16 @@ class ParallelDoGradOp : public OperatorBase {
     auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
 
     // feed output@grad
-    SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places,
-                                     Inputs(framework::GradVarName(kOutputs)));
+    SplitTensorAndMoveTensorToScopes(
+        scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
+        places, Inputs(framework::GradVarName(kOutputs)));
     WaitOnPlaces(places);
 
-    // for debugging
-    for (auto &s : Inputs(framework::GradVarName(kOutputs))) {
-      VLOG(3) << s;
-      VLOG(3) << scope.FindVar(s)->Get<LoDTensor>();
-      for (auto *sub_scope : sub_scopes) {
-        VLOG(3) << sub_scope->FindVar(s)->Get<LoDTensor>();
-      }
-    }
-
     // exe run
     std::vector<std::future<void>> workers;
-    for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
-      VLOG(3) << "Run " << place_idx;
-
-      auto &place = places[place_idx];
-      auto *cur_scope = sub_scopes[place_idx];
+    for (size_t i = 0; i < sub_scopes.size(); ++i) {
+      auto &place = places[i];
+      auto *cur_scope = sub_scopes[i];
 
       // execute
       workers.emplace_back(framework::Async([program, cur_scope, place, block] {
@@ -216,33 +206,38 @@ class ParallelDoGradOp : public OperatorBase {
 
     // merge grad
     for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      VLOG(3) << "merge grad " << s;
-
-      auto &t = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
-      VLOG(3) << t;
-
-      std::string s_buf = s + "@BUF";
-      auto *t_buf = sub_scopes[0]->Var(s_buf)->GetMutable<LoDTensor>();
-
-      for (size_t place_idx = 1; place_idx < places.size(); ++place_idx) {
-        auto &tt = sub_scopes[place_idx]->FindVar(s)->Get<LoDTensor>();
-        VLOG(3) << place_idx;
-        VLOG(3) << tt;
-        framework::Copy(tt, places[0], t_buf);
+      auto &result = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
+      std::string tmp_name;
+      auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable<LoDTensor>();
+
+      for (size_t i = 1; i < sub_scopes.size(); ++i) {
+        auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>();
+        if (!(places[i] == places[0])) {
+          framework::Copy(tensor_to_merge, places[0], tmp);
+        } else {
+          tmp->ShareDataWith(tensor_to_merge);
+        }
 
         auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {s, s_buf}}}, {{"Out", {s}}},
+            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlaces(places);
       }
 
-      VLOG(3) << t;
-      framework::Copy(t, place, scope.FindVar(s)->GetMutable<LoDTensor>());
+      VLOG(3) << result;
+      framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
     }
   }
 };
 
+std::ostream &operator<<(std::ostream &sout,
+                         const std::vector<std::string> &strs) {
+  // Write each element followed by "," (so non-empty output ends with ",").
+  for (const auto &item : strs) sout << item << ",";
+  return sout;
+}
+
 class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
@@ -283,18 +278,30 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *ctx) const override {
     std::vector<std::string> input{kParameters, kInputs};
     std::vector<std::string> output{kOutputs};
-    for (auto &s : input) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
-                     "Cannot find the gradient variable %s",
-                     framework::GradVarName(s));
-    }
+
+    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+    PADDLE_ENFORCE(ctx->HasInput(kInputs));
+
     for (auto &s : output) {
       PADDLE_ENFORCE(ctx->HasInputs(s));
     }
-    for (auto &s : input) {
-      ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s));
+
+    ctx->SetOutputsDim(framework::GradVarName(kParameters),
+                       ctx->GetInputsDim(kParameters));
+
+    auto i_dims = ctx->GetInputsDim(kInputs);
+    auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
+
+    for (size_t i = 0; i < ig_names.size(); ++i) {
+      auto &ig_name = ig_names[i];
+      if (ig_name == framework::kEmptyVarName) {
+        continue;
+      }
+
+      ctx->SetDims({ig_name}, {i_dims[i]});
     }
+
     if (ctx->HasInputs(kParameters)) {
       PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
       ctx->SetOutputsDim(framework::GradVarName(kParameters),
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 172d28bb3b647901d4de7bc03c9de21e3468a364..09b7091358e65221374a604122b742d763cfbafc 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -129,7 +129,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
   }
 
   void SetComment(std::string name, std::string op) {
-    Replace(comment_, "{ReduceOP}", name);
+    Replace(comment_, "{ReduceOp}", name);
     Replace(comment_, "{reduce}", op);
   }
 };
diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h
index 3b3bcc69a478045156225728236174fd601461dd..178edc18951f94291a63566516df36956f25f67b 100644
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
@@ -15,9 +15,15 @@ limitations under the License. */
 #pragma once
 #include <sstream>
 #include <string>
+#include <typeindex>
 
 namespace paddle {
 namespace string {
+// Streams the implementation-defined name() of a std::type_index.
+inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) {
+  return s << t.name();
+}
+
 template <typename T>
 inline std::string to_string(T v) {
   std::ostringstream sout;
@@ -25,6 +31,11 @@ inline std::string to_string(T v) {
   return sout.str();
 }
 
+template <>  // specialization for std::type_index: returns its name() text
+inline std::string to_string(std::type_index t) {
+  return t.name();  // const char* converted to std::string, no stream used
+}
+
 // Faster std::string/const char* type
 template <>
 inline std::string to_string(std::string v) {
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 95797fba8f67bacb421f5c2813ad6332bc53cbc9..0eeaf7eabb179f19d2af8dafe821f7baa153fead 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -16,13 +16,22 @@ from paddle.trainer.config_parser import *
 from default_decorators import *
 
 __all__ = [
-    "evaluator_base", "classification_error_evaluator", "auc_evaluator",
-    "pnpair_evaluator", "precision_recall_evaluator", "ctc_error_evaluator",
-    "chunk_evaluator", "sum_evaluator", "column_sum_evaluator",
-    "value_printer_evaluator", "gradient_printer_evaluator",
-    "maxid_printer_evaluator", "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator", "classification_error_printer_evaluator",
-    "detection_map_evaluator"
+    "evaluator_base",
+    "classification_error_evaluator",
+    "auc_evaluator",
+    "pnpair_evaluator",
+    "precision_recall_evaluator",
+    "ctc_error_evaluator",
+    "chunk_evaluator",
+    "sum_evaluator",
+    "column_sum_evaluator",
+    "value_printer_evaluator",
+    "gradient_printer_evaluator",
+    "maxid_printer_evaluator",
+    "maxframe_printer_evaluator",
+    "seqtext_printer_evaluator",
+    "classification_error_printer_evaluator",
+    "detection_map_evaluator",
 ]
 
 
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 8042febfed7ed7db2f6d1507142b17079aa00fd8..4f8366b64039b2edcd4c273439c87397bdc33595 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -116,8 +116,8 @@ def _debug_string_(proto, throw_on_error=True):
     """
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
-        raise ValueError("{0} are not initialized\nThe message is {1}".format(
-            error_fields, proto))
+        raise ValueError("{0} are not initialized.\nThe message is {1}:\n".
+                         format(error_fields, proto))
     return proto.__str__()
 
 
@@ -374,12 +374,13 @@ class Operator(object):
         >>>                     outputs={"Out": [var1]})
 
         Args:
-            block(Block): The block has the current operator
-            desc(core.OpDesc): The protobuf description
+            block(Block): The block has the current operator.
+            desc(core.OpDesc): The protobuf description.
             type(str): The type of operator.
             inputs(dict): The input dictionary. Key is the input parameter name.
                 Value is a list of variables.
-            outputs(dict): The output dictionary. Has same format with inputs
+            outputs(dict): The output dictionary which has the same format as
+                           inputs.
             attrs(dict): The attributes dictionary. Key is attribute name. Value
                 is the attribute value. The attribute type should be as same as
                 the type registered in C++
@@ -436,10 +437,11 @@ class Operator(object):
             for m in proto.outputs:
                 need.add(m.name)
             if not given == need:
-                raise ValueError(
-                    "Incorrect setting for output(s) of operator \"%s\". Need: [%s] Given: [%s]"
-                    % (type, ", ".join(str(e) for e in need), ", ".join(
-                        str(e) for e in given)))
+                raise ValueError(("Incorrect setting for output(s) of "
+                                  "operator \"%s\". Need: [%s] Given: [%s]") %
+                                 (type, ", ".join(str(e)
+                                                  for e in need), ", ".join(
+                                                      str(e) for e in given)))
 
             for out_proto in proto.outputs:
                 out_args = outputs[out_proto.name]
@@ -818,9 +820,8 @@ class Program(object):
                 if isinstance(t, Variable):
                     t = t.op
                 else:
-                    raise ValueError(
-                        "All targets of prune() can only be Variable or Operator."
-                    )
+                    raise ValueError(("All targets of prune() can only be "
+                                      "Variable or Operator."))
 
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
index 6177f0b4d71cdda8637fc40e3e65c72842bf7439..a43e0ee4def668bf7033f37cfa1a3f59d10a88d0 100644
--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -28,9 +28,9 @@ def data(name,
     **Data Layer**
 
     This function takes in the input and based on whether data has
-    to be returned back as a minibatch, it creates the global variable using
+    to be returned back as a minibatch, it creates the global variable by using
     the helper functions. The global variables can be accessed by all the
-    following operations and layers in the graph.
+    following operators in the graph.
 
     All the input variables of this function are passed in as local variables
     to the LayerHelper constructor.
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index faaa68c91832cadf8092eec423b3003192bd9d25..d9fe1ca0aca0a15f2b9742cc7749bb03268d7371 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -50,6 +50,8 @@ __all__ = [
     'sequence_last_step',
     'dropout',
     'split',
+    'l2_normalize',
+    'matmul',
 ]
 
 
@@ -1040,7 +1042,8 @@ def pool2d(input,
            pool_type,
            pool_stride=None,
            pool_padding=None,
-           global_pooling=False):
+           global_pooling=False,
+           name=None):
     """
     This function adds the operator for pooling in 2 dimensions, using the
     pooling configurations mentioned in input parameters.
@@ -1086,7 +1089,8 @@ def batch_norm(input,
                epsilon=1e-05,
                param_attr=None,
                bias_attr=None,
-               data_layout='NCHW'):
+               data_layout='NCHW',
+               name=None):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1162,7 +1166,7 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
-def beam_search_decode(ids, scores):
+def beam_search_decode(ids, scores, name=None):
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
     sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1186,7 +1190,8 @@ def conv2d_transpose(input,
                      padding=None,
                      stride=None,
                      dilation=None,
-                     param_attr=None):
+                     param_attr=None,
+                     name=None):
     """
     The transpose of conv2d layer.
 
@@ -1213,8 +1218,8 @@ def conv2d_transpose(input,
             contain two integers, (dilation_H, dilation_W). Otherwise, the
             dilation_H = dilation_W = dilation.
         param_attr: Parameter Attribute.
-        main_program(Program): the main program
-        startup_program(Program): the startup program
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: Output image.
@@ -1278,7 +1283,7 @@ def conv2d_transpose(input,
     return out
 
 
-def sequence_expand(x, y):
+def sequence_expand(x, y, name=None):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to LoD information of **y**. And the following examples will
     explain how sequence_expand works:
@@ -1322,6 +1327,8 @@ def sequence_expand(x, y):
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -1348,7 +1355,8 @@ def lstm_unit(x_t,
               cell_t_prev,
               forget_bias=0.0,
               param_attr=None,
-              bias_attr=None):
+              bias_attr=None,
+              name=None):
     """Lstm unit layer. The equation of a lstm step is:
 
         .. math::
@@ -1395,6 +1403,8 @@ def lstm_unit(x_t,
             initializer, name etc.
         bias_attr (ParamAttr): The attributes of bias weights, if not False,
             bias weights will be created and be set to default value.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         tuple: The hidden value and cell value of lstm unit.
@@ -1460,7 +1470,7 @@ def lstm_unit(x_t,
     return h, c
 
 
-def reduce_sum(input, dim=None, keep_dim=False):
+def reduce_sum(input, dim=None, keep_dim=False, name=None):
     """
     Computes the sum of tensor elements over the given dimension.
 
@@ -1474,6 +1484,8 @@ def reduce_sum(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1504,7 +1516,7 @@ def reduce_sum(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_mean(input, dim=None, keep_dim=False):
+def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
     Computes the mean of tensor elements over the given dimension.
 
@@ -1518,6 +1530,8 @@ def reduce_mean(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1548,7 +1562,7 @@ def reduce_mean(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_max(input, dim=None, keep_dim=False):
+def reduce_max(input, dim=None, keep_dim=False, name=None):
     """
     Computes the maximum of tensor elements over the given dimension.
 
@@ -1562,6 +1576,8 @@ def reduce_max(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1592,7 +1608,7 @@ def reduce_max(input, dim=None, keep_dim=False):
     return out
 
 
-def reduce_min(input, dim=None, keep_dim=False):
+def reduce_min(input, dim=None, keep_dim=False, name=None):
     """
     Computes the minimum of tensor elements over the given dimension.
 
@@ -1606,6 +1622,8 @@ def reduce_min(input, dim=None, keep_dim=False):
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: The reduced Tensor variable.
@@ -1636,20 +1654,22 @@ def reduce_min(input, dim=None, keep_dim=False):
     return out
 
 
-def split(input, num_or_sections, dim=-1):
+def split(input, num_or_sections, dim=-1, name=None):
     """
-    Splits the tensor into multiple sub-tensors.
+    Split the input tensor into multiple sub-tensors.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        num_or_sections (int|list): If :attr:`num_or_sections` is an integer, 
-            then the integer indicates the number of equal sized sub-tensors 
-            that the tensor will be divided into. If :attr:`num_or_sections` 
-            is a list of integers, the length of list indicates the number of 
-            sub-tensors and the integers indicate the sizes of sub-tensors' 
+        num_or_sections (int|list): If :attr:`num_or_sections` is an integer,
+            then the integer indicates the number of equal sized sub-tensors
+            that the tensor will be divided into. If :attr:`num_or_sections`
+            is a list of integers, the length of list indicates the number of
+            sub-tensors and the integers indicate the sizes of sub-tensors'
             :attr:`dim` dimension orderly.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the 
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         List: The list of segmented tensor variables.
@@ -1692,3 +1712,155 @@ def split(input, num_or_sections, dim=-1):
             'axis': dim
         })
     return outs
+
+
+def l2_normalize(x, axis, epsilon=1e-12, name=None):
+    """
+    **L2 normalize Layer**
+
+    The l2 normalize layer normalizes `x` along dimension `axis` using an L2
+    norm. For a 1-D tensor (`axis` is fixed to 0), this layer computes
+
+    output = x / sqrt(max(sum(x**2), epsilon))
+
+    For `x` with more dimensions, this layer independently normalizes each 1-D
+    slice along dimension `axis`.
+
+    Args:
+       x(Variable|list): The input tensor to l2_normalize layer.
+       axis(int): Dimension along which to normalize the input.
+       epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
+                       be used as the divisor if the l2 norm of `x` is less than
+                       sqrt(epsilon).
+       name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+
+    Returns:
+        Variable: The output tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data",
+                                   shape=(3, 17, 13),
+                                   dtype="float32")
+          fc = fluid.layers.l2_normalize(x=data, axis=1)
+    """
+
+    if len(x.shape) == 1: axis = 0
+
+    helper = LayerHelper("l2_normalize", **locals())
+
+    square = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
+
+    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reduce_sum",
+        inputs={"X": square},
+        outputs={"Out": reduced_sum},
+        attrs={
+            "dim": 1 if axis is None else axis,
+            "keep_dim": True,
+            "reduce_all": False
+        })
+
+    # TODO(caoying) A lower bound value epsilon for the norm is needed to
+    # improve the numeric stability of reciprocal. This requires a maximum_op.
+    rsquare = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
+
+    # TODO(caoying) the current elementwise_mul operator does not support a
+    # general broadcast rule which broadcasts input(Y) to have the same
+    # dimension with Input(X) starting from a specified dimension. So this
+    # expansion is required. Once a general broadcast rule is supported, this
+    # expanding can be removed.
+    rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
+    expand_times = [1] * len(x.shape)
+    expand_times[axis] = int(x.shape[axis])
+    helper.append_op(
+        type="expand",
+        inputs={"X": rsquare},
+        outputs={"Out": rsquare_expanded},
+        attrs={"expand_times": expand_times})
+
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="elementwise_mul",
+        inputs={"X": x,
+                "Y": rsquare_expanded},
+        outputs={"Out": out})
+    return out
+
+
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
+    """
+    Applies matrix multiplication to two tensors. Currently only rank 1 to rank
+    3 input tensors are supported.
+
+    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the 
+    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
+
+    - If a transpose flag is specified, the last two dimensions of the tensor 
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for 
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as 
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the 
+      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as 
+      :math:`[1, D]` in transposed form.
+
+    - After transpose, the two tensors are 2-D or 3-D and matrix multiplication
+      performs in the following way.
+
+      - If both are 2-D, they are multiplied like conventional matrices.
+      - If either is 3-D, it is treated as a stack of matrices residing in the 
+        last two dimensions and a batched matrix multiply supporting broadcast 
+        applies on the two tensors.
+
+    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and 
+    nontransposed, the prepended or appended dimension :math:`1` will be 
+    removed after matrix multiplication.
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a Tensor or LoDTensor.
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
+        name(str|None): A name for this layer(optional). If set None, the layer 
+            will be named automatically.
+
+    Returns:
+        Variable: The product Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # Examples to clarify shapes of the inputs and output
+            # x: [B, M, K], y: [B, K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+            # x: [B, M, K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [B, M]
+            # x: [M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [M, N]
+            # x: [K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [1]
+            # x: [M], y: [N]
+
+            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
+    """
+    helper = LayerHelper('matmul', **locals())
+    assert max(
+        len(x.shape), len(y.shape)
+    ) <= 3, 'Currently only rank 1 to rank 3 input tensors are supported.'
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={'transpose_X': transpose_x,
+               'transpose_Y': transpose_y})
+    return out
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 73d7c895806ef28ffc98db88809e317a86762769..21945edf0827e7a86ec2f8ce8f84c9093808c68b 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -55,6 +55,8 @@ __all__ = [
     'elementwise_div',
     'elementwise_sub',
     'elementwise_mul',
+    'elementwise_max',
+    'elementwise_min',
     'clip',
     'sequence_softmax',
 ] + __activations__
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index b5c26e713db4e5c48d6c0851fd68275aeabeea64..ee6f70b89940a1daa076c5617b05e5a845078d14 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -17,6 +17,7 @@ __all__ = [
     "simple_img_conv_pool",
     "sequence_conv_pool",
     "glu",
+    "dot_product_attention",
 ]
 
 
@@ -150,3 +151,55 @@ def glu(input, dim=-1):
     act_b = layers.sigmoid(x=b)
     out = layers.elementwise_mul(x=a, y=act_b)
     return out
+
+
+def dot_product_attention(querys, keys, values):
+    """
+    The dot-product attention.
+
+    Attention mechanism can be seen as mapping a query and a set of key-value 
+    pairs to an output. The output is computed as a weighted sum of the values, 
+    where the weight assigned to each value is computed by a compatibility 
+    function (dot-product here) of the query with the corresponding key.
+    
+    The dot-product attention can be implemented through (batch) matrix 
+    multiplication as follows:
+
+        .. math::
+
+            Attention(Q, K, V)= softmax(QK^\mathrm{T})V
+
+    Refer to `Attention Is All You Need 
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Note that batch data containing sequences with different lengths is not 
+    supported by this because of the (batch) matrix multiplication.
+    
+    Args:
+        querys (Variable): The input variable which is a Tensor or LoDTensor.
+        keys (Variable): The input variable which is a Tensor or LoDTensor.
+        values (Variable): The input variable which is a Tensor or LoDTensor.
+
+    Returns:
+        tuple: The Tensor variables representing the output and attention scores.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose q, k, v are tensor variables with the following shape:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+            out, attn_scores = fluid.nets.dot_product_attention(q, k, v)
+            out.shape  # [3, 5, 10]
+            attn_scores.shape  # [3, 5, 6]
+    """
+    assert keys.shape[-2] == values.shape[
+        -2], 'The shapes of keys and values mismatch.'
+    assert querys.shape[-1] == keys.shape[
+        -1], 'The shapes of querys and keys mismatch.'
+    product = layers.matmul(x=querys, y=keys, transpose_y=True)
+    attn_scores = layers.reshape(
+        x=layers.reshape(
+            x=product, shape=[-1, product.shape[-1]], act='softmax'),
+        shape=product.shape)
+    out = layers.matmul(attn_scores, values)
+    return out, attn_scores
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
index 9b138a6207f760ddfbfa3ad70dfa7e7875727901..d6878f0b6d07b74612dd47794e254e4d7d98a124 100644
--- a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import unittest
 import paddle.v2 as paddle
 import paddle.v2.fluid.core as core
diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
index 38e87728b387bb70a8921a2fe73a4e69701aabe9..cf118df634bb8288456009ebd4954f08d5eb4323 100644
--- a/python/paddle/v2/fluid/tests/test_edit_distance_op.py
+++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
index 1e88231877f869052d859a30fbbd8f7690b64095..3564772fb52882e9e58ea88caeb12c5e91137525 100644
--- a/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
@@ -1,16 +1,16 @@
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -40,6 +40,16 @@ class TestElementwiseOp(OpTest):
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
 
 
+class TestElementwiseAddOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+
+
 class TestElementwiseAddOp_Vector(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_add"
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
index fbabc79be2406efa916d2acafe9ce3d69112b160..77b113af7693c4a71a5a13c791cfb3e0420f4ff8 100644
--- a/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
@@ -1,16 +1,16 @@
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -45,6 +45,16 @@ class ElementwiseDivOp(OpTest):
             ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
 
 
+class TestElementwiseDivOp_scalar(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']}
+
+
 class TestElementwiseDivOp_Vector(ElementwiseDivOp):
     def setUp(self):
         self.op_type = "elementwise_div"
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_max_op.py b/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9526f0199bf2f1a537a18010ef6b87105589ed07
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
@@ -0,0 +1,129 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        # If x and y have the same value, the max() is not differentiable.
+        # So we generate test data by the following method
+        # to avoid them being too close to each other.
+        x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseMaxOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.random_integers(-5, 5, [2, 3, 4]).astype("float32")
+        y = np.array([0.5]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMaxOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.random((32, )).astype("float32")
+        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
+        y = x[:, 0, 0] + sgn * \
+            np.random.uniform(1, 2, (2, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
+        y = x[0, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
+        y = x[0, 0, :] + sgn * \
+            np.random.uniform(1, 2, (4, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
+        y = x[0, :, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_min_op.py b/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9007282335a32f85defcd1e2b190745abd38794
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
@@ -0,0 +1,129 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        # min() has no gradient where both operands tie, so every element
+        # of Y is pushed at least 0.1 away from the matching element of X.
+        self.op_type = "elementwise_min"
+        lhs = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        sign = np.random.choice([-1, 1], [13, 17]).astype("float32")
+        gap = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        rhs = lhs + sign * gap
+        self.inputs = {'X': lhs, 'Y': rhs}
+        self.outputs = {'Out': np.minimum(lhs, rhs)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ingore_x(self):
+        options = dict(max_relative_error=0.005, no_grad_set=set("X"))
+        self.check_grad(['Y'], 'Out', **options)
+
+    def test_check_grad_ingore_y(self):
+        options = dict(max_relative_error=0.005, no_grad_set=set('Y'))
+        self.check_grad(['X'], 'Out', **options)
+
+
+class TestElementwiseMinOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        # randint(-5, 6) yields integers in [-5, 5]; none can tie with Y = 0.5.
+        self.op_type = "elementwise_min"
+        x = np.random.randint(-5, 6, [2, 3, 4]).astype("float32")
+        self.inputs = {'X': x, 'Y': np.array([0.5]).astype("float32")}
+        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMinOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        # 1-D operands; Y is kept at least 0.1 away from X so they never tie.
+        self.op_type = "elementwise_min"
+        x = np.random.random((32, )).astype("float32")
+        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
+        self.inputs, self.outputs = {'X': x, 'Y': y}, {'Out': np.minimum(x, y)}
+
+
+class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
+        y = x[:, 0, 0] + sgn * \
+            np.random.uniform(1, 2, (2, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+        # Y (shape (2,)) lines up with dimension 0 of X, hence axis=0.
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
+        y = x[0, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+        # Y (shape (3,)) lines up with dimension 1 of X, hence axis=1.
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
+        y = x[0, 0, :] + sgn * \
+            np.random.uniform(1, 2, (4, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+        # No 'axis' is set; Y (shape (4,)) is matched to X's last dimension.
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
+        y = x[0, :, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+        # Y (shape (3, 4)) lines up with dimensions 1 and 2 of X (axis=1).
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
index ef3a829abc65f3af60b899ba50927424fd563607..12dfa6599cd634e1d806980f89e7c013b8eb8754 100644
--- a/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
@@ -1,16 +1,16 @@
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -38,6 +38,16 @@ class ElementwiseMulOp(OpTest):
         self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
 
 
+class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+    def setUp(self):
+        # A single-element Y is multiplied into every element of X.
+        self.op_type = "elementwise_mul"
+        tensor = np.random.rand(2, 3, 4).astype(np.float32)
+        scalar = np.random.rand(1).astype(np.float32)
+        self.inputs = {'X': tensor, 'Y': scalar}
+        self.outputs = {'Out': tensor * scalar}
+
+
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
index db24db7b304e2ef8b946e128b22ed62d71260ddc..cf53d85bbad81f393a6263f8742fc942d357135f 100644
--- a/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
+++ b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
@@ -1,16 +1,16 @@
 #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -40,6 +40,16 @@ class TestElementwiseOp(OpTest):
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
 
 
+class TestElementwiseSubOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        # A single-element Y is subtracted from every element of X.
+        self.op_type = "elementwise_sub"
+        tensor = np.random.rand(2, 3, 4).astype(np.float32)
+        scalar = np.random.rand(1).astype(np.float32)
+        self.inputs = {'X': tensor, 'Y': scalar}
+        self.outputs = {'Out': tensor - scalar}
+
+
 class TestElementwiseSubOp_Vector(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_sub"
diff --git a/python/paddle/v2/fluid/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
index 0220cfe1287ab6908f379f9f7451cd257dd7f0a6..f7dc4e053217dcceaf9c64e3605286fc0698593b 100644
--- a/python/paddle/v2/fluid/tests/test_matmul_op.py
+++ b/python/paddle/v2/fluid/tests/test_matmul_op.py
@@ -96,18 +96,18 @@ class Generator(object):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output(atol=1e-2)
+        self.check_output(atol=1e-3)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
 
     def test_check_grad_ignore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
 
     def test_check_grad_ignore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
 
 
 # Generate test cases for all possibilities
diff --git a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..caff63011d038d785472cb38a26a51f3f4cc9288
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
@@ -0,0 +1,95 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import numpy as np
+
+
+class TestNormalization(unittest.TestCase):
+    data_desc = {"name": "input", "shape": (2, 3, 7)}
+
+    def gen_random_input(self):
+        """Store a random float32 array of shape data_desc["shape"] in self.data.
+        """
+        self.data = np.random.random(
+            size=self.data_desc["shape"]).astype("float32")
+
+    def set_program(self, axis, epsilon):
+        """Build l2_normalize + reduce_sum with backward ops; fetch l2_normalize.
+        """
+        data = fluid.layers.data(
+            name=self.data_desc["name"],
+            shape=self.data_desc["shape"],
+            dtype="float32",
+            append_batch_size=False)
+        data.stop_gradient = False
+        l2_norm = fluid.layers.l2_normalize(x=data, axis=axis, epsilon=epsilon)
+        out = fluid.layers.reduce_sum(l2_norm, dim=None)
+
+        fluid.backward.append_backward(loss=out)
+        self.fetch_list = [l2_norm]
+
+    def run_program(self):
+        """Run the main program on CPU, then on CUDAPlace(0) if GPU is compiled in.
+        """
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        # NOTE: each pass overwrites self.op_output; only the last one is kept.
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=True)
+            self.op_output = output
+
+    def set_inputs(self, place):
+        """Copy self.data into a Tensor on `place` and store it as the feed dict.
+        """
+        self.inputs = {}
+        tensor = fluid.Tensor()
+        tensor.set(self.data, place)
+        self.inputs[self.data_desc["name"]] = tensor
+
+    def l2_normalize(self, data, axis, epsilon):
+        """Reference: data / sum(data ** 2, axis); no sqrt, epsilon unused -- TODO confirm.
+        """
+        output = data * np.reciprocal(
+            np.sum(np.square(data), axis=axis, keepdims=True))
+        return output
+
+    def test_l2_normalize(self):
+        """Compare fluid.layers.l2_normalize along axis 1 with the numpy reference.
+        """
+        axis = 1
+        # TODO(caoying): epsilon is not supported due to lack of a maximum_op.
+        epsilon = 1e-6
+
+        self.gen_random_input()
+
+        self.set_program(axis, epsilon)
+        self.run_program()
+
+        expect_output = self.l2_normalize(self.data, axis, epsilon)
+
+        # Compare the fetched results with the reference (absolute tol. 1e-3).
+        self.assertTrue(np.allclose(self.op_output, expect_output, atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
index 3c190477d167c6aa48078869a9abb15488d21dd3..45196ef6fe5230a6b3ead0b64fee09492188da82 100644
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -151,24 +151,28 @@ class BaseParallelForTest(unittest.TestCase):
 
 
 class ParallelOpTest(BaseParallelForTest):
-    def test_simple_fc(self):
-        def __network__():
-            x = fluid.layers.data(shape=[784], dtype='float32', name='img')
-            # FIXME: This is a bug of parallel.do
-            x.stop_gradient = False
-            x = yield x
-            hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-            loss = fluid.layers.mean(x=hidden)
-            yield loss
+    @staticmethod
+    def __network__():
+        x = fluid.layers.data(shape=[784], dtype='float32', name='img')
+        x = yield x
+        hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+        loss = fluid.layers.mean(x=hidden)
+        yield loss
 
+    def test_simple_fc(self):
         self.run_test(
-            callback=__network__,
+            callback=ParallelOpTest.__network__,
             feed={
-                'img':
-                numpy.random.random(size=(128 * 3, 784)).astype('float32')
+                'img': numpy.random.random(size=(51, 784)).astype('float32')
             },
             fetch='fc1.w@GRAD')
 
+    def test_fc_with_tiny_data(self):
+        self.run_test(
+            callback=ParallelOpTest.__network__,
+            feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
+            fetch='fc1.w@GRAD')
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/v1_api_demo/README.md b/v1_api_demo/README.md
deleted file mode 100644
index 0460a85fae078800332982751a5d4a9644c50bd6..0000000000000000000000000000000000000000
--- a/v1_api_demo/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-The examples in v1_api_demo are using v1_api currently, and will be upgraded to v2_api later.
-Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future.
-
-Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and 
-[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
diff --git a/v1_api_demo/gan/.gitignore b/v1_api_demo/gan/.gitignore
deleted file mode 100644
index 93a6f5080a16a601cffb0bff51af9aef3ba3bae7..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-output/
-uniform_params/
-cifar_params/
-mnist_params/
-*.png
-.pydevproject
-.project
-*.log
-*.pyc
-data/mnist_data/
-data/cifar-10-batches-py/
diff --git a/v1_api_demo/gan/README.md b/v1_api_demo/gan/README.md
deleted file mode 100644
index 1908b534b0c1f63904d5503399b961d74ce0037c..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Generative Adversarial Networks (GAN) 
-
-This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
-
-The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
-
-In order to run the model, first download the corresponding data by running the shell script in ./data.
-Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
-
-$python gan_trainer.py -d cifar --use_gpu 1
-
-The generated images will be stored in ./cifar_samples/
-The corresponding models will be stored in ./cifar_params/
diff --git a/v1_api_demo/gan/data/download_cifar.sh b/v1_api_demo/gan/data/download_cifar.sh
deleted file mode 100755
index bbadc7c10c73e45a0948018b8812f79040d14bc4..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/data/download_cifar.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
-tar zxf cifar-10-python.tar.gz
-rm cifar-10-python.tar.gz
diff --git a/v1_api_demo/gan/data/get_mnist_data.sh b/v1_api_demo/gan/data/get_mnist_data.sh
deleted file mode 100755
index a77c81bf5af9ddb6634ff89460797ca543c5e517..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/data/get_mnist_data.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env sh
-# This script downloads the mnist data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/mnist_data"
-mkdir "$DIR/mnist_data"
-cd "$DIR/mnist_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
diff --git a/v1_api_demo/gan/gan_conf.py b/v1_api_demo/gan/gan_conf.py
deleted file mode 100644
index 86ac2dffe5f4490a88e12d1fa5e8cd9fa61a69f4..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/gan_conf.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-mode = get_config_arg("mode", str, "generator")
-assert mode in set([
-    "generator", "discriminator", "generator_training", "discriminator_training"
-])
-
-is_generator_training = mode == "generator_training"
-is_discriminator_training = mode == "discriminator_training"
-is_generator = mode == "generator"
-is_discriminator = mode == "discriminator"
-
-# The network structure below follows the ref https://arxiv.org/abs/1406.2661
-# Here we used two hidden layers and batch_norm
-
-print('mode=%s' % mode)
-# the dim of the noise (z) as the input of the generator network
-noise_dim = 10
-# the dim of the hidden layer
-hidden_dim = 10
-# the dim of the generated sample
-sample_dim = 2
-
-settings(
-    batch_size=128,
-    learning_rate=1e-4,
-    learning_method=AdamOptimizer(beta1=0.5))
-
-
-def discriminator(sample):
-    """
-    discriminator ouputs the probablity of a sample is from generator
-    or real data.
-    The output has two dimenstional: dimension 0 is the probablity
-    of the sample is from generator and dimension 1 is the probabblity
-    of the sample is from real data.
-    """
-    param_attr = ParamAttr(is_static=is_generator_training)
-    bias_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=1.0, initial_std=0)
-
-    hidden = fc_layer(
-        input=sample,
-        name="dis_hidden",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=ReluActivation())
-
-    hidden2 = fc_layer(
-        input=hidden,
-        name="dis_hidden2",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    hidden_bn = batch_norm_layer(
-        hidden2,
-        act=ReluActivation(),
-        name="dis_hidden_bn",
-        bias_attr=bias_attr,
-        param_attr=ParamAttr(
-            is_static=is_generator_training, initial_mean=1.0,
-            initial_std=0.02),
-        use_global_stats=False)
-
-    return fc_layer(
-        input=hidden_bn,
-        name="dis_prob",
-        size=2,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=SoftmaxActivation())
-
-
-def generator(noise):
-    """
-    generator generates a sample given noise
-    """
-    param_attr = ParamAttr(is_static=is_discriminator_training)
-    bias_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
-
-    hidden = fc_layer(
-        input=noise,
-        name="gen_layer_hidden",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=ReluActivation())
-
-    hidden2 = fc_layer(
-        input=hidden,
-        name="gen_hidden2",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    hidden_bn = batch_norm_layer(
-        hidden2,
-        act=ReluActivation(),
-        name="gen_layer_hidden_bn",
-        bias_attr=bias_attr,
-        param_attr=ParamAttr(
-            is_static=is_discriminator_training,
-            initial_mean=1.0,
-            initial_std=0.02),
-        use_global_stats=False)
-
-    return fc_layer(
-        input=hidden_bn,
-        name="gen_layer1",
-        size=sample_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim)
-
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
diff --git a/v1_api_demo/gan/gan_conf_image.py b/v1_api_demo/gan/gan_conf_image.py
deleted file mode 100644
index c469227994c1a84d1aa73e03bbc74ebeac41d30e..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/gan_conf_image.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-mode = get_config_arg("mode", str, "generator")
-dataSource = get_config_arg("data", str, "mnist")
-assert mode in set([
-    "generator", "discriminator", "generator_training", "discriminator_training"
-])
-
-is_generator_training = mode == "generator_training"
-is_discriminator_training = mode == "discriminator_training"
-is_generator = mode == "generator"
-is_discriminator = mode == "discriminator"
-
-# The network structure below follows the dcgan paper 
-# (https://arxiv.org/abs/1511.06434)
-
-print('mode=%s' % mode)
-# the dim of the noise (z) as the input of the generator network
-noise_dim = 100
-# the number of filters in the layer in generator/discriminator that is 
-# closet to the image
-gf_dim = 64
-df_dim = 64
-if dataSource == "mnist":
-    sample_dim = 28  # image dim
-    c_dim = 1  # image color
-else:
-    sample_dim = 32
-    c_dim = 3
-s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
-s8, s16 = int(sample_dim / 8), int(sample_dim / 16)
-
-settings(
-    batch_size=128,
-    learning_rate=2e-4,
-    learning_method=AdamOptimizer(beta1=0.5))
-
-
-def conv_bn(input,
-            channels,
-            imgSize,
-            num_filters,
-            output_x,
-            stride,
-            name,
-            param_attr,
-            bias_attr,
-            param_attr_bn,
-            bn,
-            trans=False,
-            act=ReluActivation()):
-    """
-    conv_bn is a utility function that constructs a convolution/deconv layer 
-    with an optional batch_norm layer
-
-    :param bn: whether to use batch_norm_layer
-    :type bn: bool
-    :param trans: whether to use conv (False) or deconv (True)
-    :type trans: bool
-    """
-
-    # calculate the filter_size and padding size based on the given
-    # imgSize and ouput size
-    tmp = imgSize - (output_x - 1) * stride
-    if tmp <= 1 or tmp > 5:
-        raise ValueError("conv input-output dimension does not fit")
-    elif tmp <= 3:
-        filter_size = tmp + 2
-        padding = 1
-    else:
-        filter_size = tmp
-        padding = 0
-
-    print(imgSize, output_x, stride, filter_size, padding)
-
-    if trans:
-        nameApx = "_convt"
-    else:
-        nameApx = "_conv"
-
-    if bn:
-        conv = img_conv_layer(
-            input,
-            filter_size=filter_size,
-            num_filters=num_filters,
-            name=name + nameApx,
-            num_channels=channels,
-            act=LinearActivation(),
-            groups=1,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias_attr,
-            param_attr=param_attr,
-            shared_biases=True,
-            layer_attr=None,
-            filter_size_y=None,
-            stride_y=None,
-            padding_y=None,
-            trans=trans)
-
-        conv_bn = batch_norm_layer(
-            conv,
-            act=act,
-            name=name + nameApx + "_bn",
-            bias_attr=bias_attr,
-            param_attr=param_attr_bn,
-            use_global_stats=False)
-
-        return conv_bn
-    else:
-        conv = img_conv_layer(
-            input,
-            filter_size=filter_size,
-            num_filters=num_filters,
-            name=name + nameApx,
-            num_channels=channels,
-            act=act,
-            groups=1,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias_attr,
-            param_attr=param_attr,
-            shared_biases=True,
-            layer_attr=None,
-            filter_size_y=None,
-            stride_y=None,
-            padding_y=None,
-            trans=trans)
-        return conv
-
-
-def generator(noise):
-    """
-    generator generates a sample given noise
-    """
-    param_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
-    bias_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
-
-    param_attr_bn = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
-
-    h1 = fc_layer(
-        input=noise,
-        name="gen_layer_h1",
-        size=s8 * s8 * gf_dim * 4,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    h1_bn = batch_norm_layer(
-        h1,
-        act=ReluActivation(),
-        name="gen_layer_h1_bn",
-        bias_attr=bias_attr,
-        param_attr=param_attr_bn,
-        use_global_stats=False)
-
-    h2_bn = conv_bn(
-        h1_bn,
-        channels=gf_dim * 4,
-        output_x=s8,
-        num_filters=gf_dim * 2,
-        imgSize=s4,
-        stride=2,
-        name="gen_layer_h2",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True,
-        trans=True)
-
-    h3_bn = conv_bn(
-        h2_bn,
-        channels=gf_dim * 2,
-        output_x=s4,
-        num_filters=gf_dim,
-        imgSize=s2,
-        stride=2,
-        name="gen_layer_h3",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True,
-        trans=True)
-
-    return conv_bn(
-        h3_bn,
-        channels=gf_dim,
-        output_x=s2,
-        num_filters=c_dim,
-        imgSize=sample_dim,
-        stride=2,
-        name="gen_layer_h4",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=False,
-        trans=True,
-        act=TanhActivation())
-
-
-def discriminator(sample):
-    """
-    discriminator ouputs the probablity of a sample is from generator
-    or real data.
-    The output has two dimenstional: dimension 0 is the probablity
-    of the sample is from generator and dimension 1 is the probabblity
-    of the sample is from real data.
-    """
-    param_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
-    bias_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
-
-    param_attr_bn = ParamAttr(
-        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
-
-    h0 = conv_bn(
-        sample,
-        channels=c_dim,
-        imgSize=sample_dim,
-        num_filters=df_dim,
-        output_x=s2,
-        stride=2,
-        name="dis_h0",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=False)
-
-    h1_bn = conv_bn(
-        h0,
-        channels=df_dim,
-        imgSize=s2,
-        num_filters=df_dim * 2,
-        output_x=s4,
-        stride=2,
-        name="dis_h1",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True)
-
-    h2_bn = conv_bn(
-        h1_bn,
-        channels=df_dim * 2,
-        imgSize=s4,
-        num_filters=df_dim * 4,
-        output_x=s8,
-        stride=2,
-        name="dis_h2",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True)
-
-    return fc_layer(
-        input=h2_bn,
-        name="dis_prob",
-        size=2,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=SoftmaxActivation())
-
-
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)
-
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
diff --git a/v1_api_demo/gan/gan_trainer.py b/v1_api_demo/gan/gan_trainer.py
deleted file mode 100644
index 4a26c230f7a21cc6dd4a3cdb52e32730b1ce73ca..0000000000000000000000000000000000000000
--- a/v1_api_demo/gan/gan_trainer.py
+++ /dev/null
@@ -1,349 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import random
-import numpy
-import cPickle
-import sys, os
-from PIL import Image
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-import py_paddle.swig_paddle as api
-import matplotlib.pyplot as plt
-
-
-def plot2DScatter(data, outputfile):
-    '''
-    Plot the data as a 2D scatter plot and save to outputfile
-    data needs to be two dimensinoal
-    '''
-    x = data[:, 0]
-    y = data[:, 1]
-    logger.info("The mean vector is %s" % numpy.mean(data, 0))
-    logger.info("The std vector is %s" % numpy.std(data, 0))
-
-    heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
-    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
-
-    plt.clf()
-    plt.scatter(x, y)
-    plt.savefig(outputfile, bbox_inches='tight')
-
-
-def CHECK_EQ(a, b):
-    assert a == b, "a=%s, b=%s" % (a, b)
-
-
-def copy_shared_parameters(src, dst):
-    '''
-    copy the parameters from src to dst
-    :param src: the source of the parameters
-    :type src: GradientMachine
-    :param dst: the destination of the parameters
-    :type dst: GradientMachine
-    '''
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-    src_params = dict([(p.getName(), p) for p in src_params])
-
-    for i in xrange(dst.getParameterSize()):
-        dst_param = dst.getParameter(i)
-        src_param = src_params.get(dst_param.getName(), None)
-        if src_param is None:
-            continue
-        src_value = src_param.getBuf(api.PARAMETER_VALUE)
-        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
-        CHECK_EQ(len(src_value), len(dst_value))
-        dst_value.copyFrom(src_value)
-        dst_param.setValueUpdated()
-
-
-def print_parameters(src):
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-
-    print "***************"
-    for p in src_params:
-        print "Name is %s" % p.getName()
-        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
-        )
-
-
-def load_mnist_data(imageFile):
-    f = open(imageFile, "rb")
-    f.read(16)
-
-    # Define number of samples for train/test
-    if "train" in imageFile:
-        n = 60000
-    else:
-        n = 10000
-
-    data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
-    data = data / 255.0 * 2.0 - 1.0
-
-    f.close()
-    return data.astype('float32')
-
-
-def load_cifar_data(cifar_path):
-    batch_size = 10000
-    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
-    for i in range(1, 6):
-        file = cifar_path + "/data_batch_" + str(i)
-        fo = open(file, 'rb')
-        dict = cPickle.load(fo)
-        fo.close()
-        data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
-
-    data = data / 255.0 * 2.0 - 1.0
-    return data
-
-
-# synthesize 2-D uniform data
-def load_uniform_data():
-    data = numpy.random.rand(1000000, 2).astype('float32')
-    return data
-
-
-def merge(images, size):
-    if images.shape[1] == 28 * 28:
-        h, w, c = 28, 28, 1
-    else:
-        h, w, c = 32, 32, 3
-    img = numpy.zeros((h * size[0], w * size[1], c))
-    for idx in xrange(size[0] * size[1]):
-        i = idx % size[1]
-        j = idx // size[1]
-        img[j*h:j*h+h, i*w:i*w+w, :] = \
-          ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
-    return img.astype('uint8')
-
-
-def save_images(images, path):
-    merged_img = merge(images, [8, 8])
-    if merged_img.shape[2] == 1:
-        im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
-    else:
-        im = Image.fromarray(merged_img, mode="RGB")
-    im.save(path)
-
-
-def get_real_samples(batch_size, data_np):
-    return data_np[numpy.random.choice(
-        data_np.shape[0], batch_size, replace=False), :]
-
-
-def get_noise(batch_size, noise_dim):
-    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
-
-
-def get_fake_samples(generator_machine, batch_size, noise):
-    gen_inputs = api.Arguments.createArguments(1)
-    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    gen_outputs = api.Arguments.createArguments(0)
-    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
-    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
-    return fake_samples
-
-
-def get_training_loss(training_machine, inputs):
-    outputs = api.Arguments.createArguments(0)
-    training_machine.forward(inputs, outputs, api.PASS_TEST)
-    loss = outputs.getSlotValue(0).copyToNumpyMat()
-    return numpy.mean(loss)
-
-
-def prepare_discriminator_data_batch_pos(batch_size, data_np):
-    real_samples = get_real_samples(batch_size, data_np)
-    labels = numpy.ones(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
-    return inputs
-
-
-def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
-    fake_samples = get_fake_samples(generator_machine, batch_size, noise)
-    labels = numpy.zeros(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
-    return inputs
-
-
-def prepare_generator_data_batch(batch_size, noise):
-    label = numpy.ones(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
-    return inputs
-
-
-def find(iterable, cond):
-    for item in iterable:
-        if cond(item):
-            return item
-    return None
-
-
-def get_layer_size(model_conf, layer_name):
-    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
-    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
-    return layer_conf.size
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
-    parser.add_argument(
-        "--use_gpu", default="1", help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
-    args = parser.parse_args()
-    data_source = args.data_source
-    use_gpu = args.use_gpu
-    assert data_source in ["mnist", "cifar", "uniform"]
-    assert use_gpu in ["0", "1"]
-
-    if not os.path.exists("./%s_samples/" % data_source):
-        os.makedirs("./%s_samples/" % data_source)
-
-    if not os.path.exists("./%s_params/" % data_source):
-        os.makedirs("./%s_params/" % data_source)
-
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-                   '--log_period=100', '--gpu_id=' + args.gpu_id,
-                   '--save_dir=' + "./%s_params/" % data_source)
-
-    if data_source == "uniform":
-        conf = "gan_conf.py"
-        num_iter = 10000
-    else:
-        conf = "gan_conf_image.py"
-        num_iter = 1000
-
-    gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-    dis_conf = parse_config(conf,
-                            "mode=discriminator_training,data=" + data_source)
-    generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
-    batch_size = dis_conf.opt_config.batch_size
-    noise_dim = get_layer_size(gen_conf.model_config, "noise")
-
-    if data_source == "mnist":
-        data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
-    elif data_source == "cifar":
-        data_np = load_cifar_data("./data/cifar-10-batches-py/")
-    else:
-        data_np = load_uniform_data()
-
-    # this creates a gradient machine for discriminator
-    dis_training_machine = api.GradientMachine.createFromConfigProto(
-        dis_conf.model_config)
-    # this create a gradient machine for generator    
-    gen_training_machine = api.GradientMachine.createFromConfigProto(
-        gen_conf.model_config)
-
-    # generator_machine is used to generate data only, which is used for
-    # training discriminator
-    logger.info(str(generator_conf.model_config))
-    generator_machine = api.GradientMachine.createFromConfigProto(
-        generator_conf.model_config)
-
-    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
-
-    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
-
-    dis_trainer.startTrain()
-    gen_trainer.startTrain()
-
-    # Sync parameters between networks (GradientMachine) at the beginning
-    copy_shared_parameters(gen_training_machine, dis_training_machine)
-    copy_shared_parameters(gen_training_machine, generator_machine)
-
-    # constrain that either discriminator or generator can not be trained
-    # consecutively more than MAX_strike times
-    curr_train = "dis"
-    curr_strike = 0
-    MAX_strike = 5
-
-    for train_pass in xrange(100):
-        dis_trainer.startTrainPass()
-        gen_trainer.startTrainPass()
-        for i in xrange(num_iter):
-            # Do forward pass in discriminator to get the dis_loss
-            noise = get_noise(batch_size, noise_dim)
-            data_batch_dis_pos = prepare_discriminator_data_batch_pos(
-                batch_size, data_np)
-            dis_loss_pos = get_training_loss(dis_training_machine,
-                                             data_batch_dis_pos)
-
-            data_batch_dis_neg = prepare_discriminator_data_batch_neg(
-                generator_machine, batch_size, noise)
-            dis_loss_neg = get_training_loss(dis_training_machine,
-                                             data_batch_dis_neg)
-
-            dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
-
-            # Do forward pass in generator to get the gen_loss
-            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
-            gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
-
-            if i % 100 == 0:
-                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
-                                                                 dis_loss_neg)
-                print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)
-
-            # Decide which network to train based on the training history
-            # And the relative size of the loss        
-            if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
-               ((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
-                if curr_train == "dis":
-                    curr_strike += 1
-                else:
-                    curr_train = "dis"
-                    curr_strike = 1
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
-                copy_shared_parameters(dis_training_machine,
-                                       gen_training_machine)
-
-            else:
-                if curr_train == "gen":
-                    curr_strike += 1
-                else:
-                    curr_train = "gen"
-                    curr_strike = 1
-                gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
-                # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
-                # so that we do not need to copy shared parameters. 
-                copy_shared_parameters(gen_training_machine,
-                                       dis_training_machine)
-                copy_shared_parameters(gen_training_machine, generator_machine)
-
-        dis_trainer.finishTrainPass()
-        gen_trainer.finishTrainPass()
-        # At the end of each pass, save the generated samples/images
-        fake_samples = get_fake_samples(generator_machine, batch_size, noise)
-        if data_source == "uniform":
-            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
-                          (data_source, train_pass))
-        else:
-            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
-                        (data_source, train_pass))
-    dis_trainer.finishTrain()
-    gen_trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/mnist/.gitignore b/v1_api_demo/mnist/.gitignore
deleted file mode 100644
index 7e61d5e3a0cabd46d4185454d46610ac2ee2e63f..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-data/raw_data
-data/*.list
-mnist_vgg_model
-plot.png
-train.log
-*pyc
-.ipynb_checkpoints
-params.pkl
-params.tar
-params.tar.gz
diff --git a/v1_api_demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py
deleted file mode 100644
index e42c6cbb7e0eed4f3a3625f18d79b3de64fd8e26..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/api_train.py
+++ /dev/null
@@ -1,209 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-"""
-A very basic example for how to use current Raw SWIG API to train mnist network.
-
-Current implementation uses Raw SWIG, which means the API call is directly \
-passed to C++ side of Paddle.
-
-The user api could be simpler and carefully designed.
-"""
-import random
-
-import numpy as np
-import paddle.v2 as paddle_v2
-import py_paddle.swig_paddle as api
-from paddle.trainer_config_helpers import *
-from py_paddle import DataProviderConverter
-
-from mnist_util import read_from_mnist
-
-
-def init_parameter(network):
-    assert isinstance(network, api.GradientMachine)
-    for each_param in network.getParameters():
-        assert isinstance(each_param, api.Parameter)
-        array_size = len(each_param)
-        array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
-        each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
-
-
-def generator_to_batch(generator, batch_size):
-    ret_val = list()
-    for each_item in generator:
-        ret_val.append(each_item)
-        if len(ret_val) == batch_size:
-            yield ret_val
-            ret_val = list()
-    if len(ret_val) != 0:
-        yield ret_val
-
-
-class BatchPool(object):
-    def __init__(self, generator, batch_size):
-        self.data = list(generator)
-        self.batch_size = batch_size
-
-    def __call__(self):
-        random.shuffle(self.data)
-        for offset in xrange(0, len(self.data), self.batch_size):
-            limit = min(offset + self.batch_size, len(self.data))
-            yield self.data[offset:limit]
-
-
-def input_order_converter(generator):
-    for each_item in generator:
-        yield each_item['pixel'], each_item['label']
-
-
-def main():
-    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
-
-    optimizer = paddle_v2.optimizer.Adam(
-        learning_rate=1e-4,
-        batch_size=1000,
-        model_average=ModelAverage(average_window=0.5),
-        regularization=L2Regularization(rate=0.5))
-
-    # Create Local Updater. Local means not run in cluster.
-    # For a cluster training, here we can change to createRemoteUpdater
-    # in future.
-    updater = optimizer.create_local_updater()
-    assert isinstance(updater, api.ParameterUpdater)
-
-    # define network
-    images = paddle_v2.layer.data(
-        name='pixel', type=paddle_v2.data_type.dense_vector(784))
-    label = paddle_v2.layer.data(
-        name='label', type=paddle_v2.data_type.integer_value(10))
-    hidden1 = paddle_v2.layer.fc(input=images, size=200)
-    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
-    inference = paddle_v2.layer.fc(input=hidden2,
-                                   size=10,
-                                   act=paddle_v2.activation.Softmax())
-    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
-
-    # Create Simple Gradient Machine.
-    model_config = paddle_v2.layer.parse_network(cost)
-    m = api.GradientMachine.createFromConfigProto(model_config,
-                                                  api.CREATE_MODE_NORMAL,
-                                                  optimizer.enable_types())
-
-    # This type check is not useful. Only enable type hint in IDE.
-    # Such as PyCharm
-    assert isinstance(m, api.GradientMachine)
-
-    # Initialize Parameter by numpy.
-    init_parameter(network=m)
-
-    # Initialize ParameterUpdater.
-    updater.init(m)
-
-    # DataProvider Converter is a utility convert Python Object to Paddle C++
-    # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(input_types=[images.type, label.type])
-
-    train_file = './data/raw_data/train'
-    test_file = './data/raw_data/t10k'
-
-    # start gradient machine.
-    # the gradient machine must be started before invoke forward/backward.
-    # not just for training, but also for inference.
-    m.start()
-
-    # evaluator can print error rate, etc. It is a C++ class.
-    batch_evaluator = m.makeEvaluator()
-    test_evaluator = m.makeEvaluator()
-
-    # Get Train Data.
-    # TrainData will stored in a data pool. Currently implementation is not care
-    # about memory, speed. Just a very naive implementation.
-    train_data_generator = input_order_converter(read_from_mnist(train_file))
-    train_data = BatchPool(train_data_generator, 512)
-
-    # outArgs is Neural Network forward result. Here is not useful, just passed
-    # to gradient_machine.forward
-    outArgs = api.Arguments.createArguments(0)
-
-    for pass_id in xrange(2):  # we train 2 passes.
-        updater.startPass()
-
-        for batch_id, data_batch in enumerate(train_data()):
-            # data_batch is input images.
-            # here, for online learning, we could get data_batch from network.
-
-            # Start update one batch.
-            pass_type = updater.startBatch(len(data_batch))
-
-            # Start BatchEvaluator.
-            # batch_evaluator can be used between start/finish.
-            batch_evaluator.start()
-
-            # forwardBackward is a shortcut for forward and backward.
-            # It is sometimes faster than invoke forward/backward separately,
-            # because in GradientMachine, it may be async.
-            m.forwardBackward(converter(data_batch), outArgs, pass_type)
-
-            for each_param in m.getParameters():
-                updater.update(each_param)
-
-            # Get cost. We use numpy to calculate total cost for this batch.
-            cost_vec = outArgs.getSlotValue(0)
-            cost_vec = cost_vec.copyToNumpyMat()
-            cost = cost_vec.sum() / len(data_batch)
-
-            # Make evaluator works.
-            m.eval(batch_evaluator)
-
-            # Print logs.
-            print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
-                cost, batch_evaluator
-
-            batch_evaluator.finish()
-            # Finish batch.
-            #  * will clear gradient.
-            #  * ensure all values should be updated.
-            updater.finishBatch(cost)
-
-        # testing stage. use test data set to test current network.
-        updater.apply()
-        test_evaluator.start()
-        test_data_generator = input_order_converter(read_from_mnist(test_file))
-        for data_batch in generator_to_batch(test_data_generator, 512):
-            # in testing stage, only forward is needed.
-            m.forward(converter(data_batch), outArgs, api.PASS_TEST)
-            m.eval(test_evaluator)
-
-        # print error rate for test data set
-        print 'Pass', pass_id, ' test evaluator: ', test_evaluator
-        test_evaluator.finish()
-        updater.restore()
-
-        updater.catchUpWith()
-        params = m.getParameters()
-        for each_param in params:
-            assert isinstance(each_param, api.Parameter)
-            value = each_param.getBuf(api.PARAMETER_VALUE)
-            value = value.copyToNumpyArray()
-
-            # Here, we could save parameter to every where you want
-            print each_param.getName(), value
-
-        updater.finishPass()
-
-    m.finish()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/mnist/data/generate_list.py b/v1_api_demo/mnist/data/generate_list.py
deleted file mode 100644
index 49981cc7a93308bc96ad5097eba749440e958525..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/data/generate_list.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-o = open("./" + "train.list", "w")
-o.write("./data/raw_data/train" + "\n")
-o.close()
-
-o = open("./" + "test.list", "w")
-o.write("./data/raw_data/t10k" + "\n")
-o.close()
diff --git a/v1_api_demo/mnist/data/get_mnist_data.sh b/v1_api_demo/mnist/data/get_mnist_data.sh
deleted file mode 100755
index 5a2e34026d4fe7f8315d4f5453bec7c4ee4f6885..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/data/get_mnist_data.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-# This scripts downloads the mnist data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/raw_data"
-mkdir "$DIR/raw_data"
-cd "$DIR/raw_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
-
-cd $DIR
-rm -f *.list
-python generate_list.py
diff --git a/v1_api_demo/mnist/light_mnist.py b/v1_api_demo/mnist/light_mnist.py
deleted file mode 100644
index 33409054357d2f0c6a765b3ab3164eb2e584467e..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/light_mnist.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg("is_predict", bool, False)
-
-####################Data Configuration ##################
-
-if not is_predict:
-    data_dir = './data/'
-    define_py_data_sources2(
-        train_list=data_dir + 'train.list',
-        test_list=data_dir + 'test.list',
-        module='mnist_provider',
-        obj='process')
-
-######################Algorithm Configuration #############
-settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
-
-#######################Network Configuration #############
-
-data_size = 1 * 28 * 28
-label_size = 10
-img = data_layer(name='pixel', size=data_size)
-
-
-# light cnn
-# A shallower cnn model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1
-# Easier to train for mnist dataset and quite efficient
-# Final performance is close to deeper ones on tasks such as digital and character classification 
-def light_cnn(input_image, num_channels, num_classes):
-    def __light__(ipt,
-                  num_filter=128,
-                  times=1,
-                  conv_filter_size=3,
-                  dropouts=0,
-                  num_channels_=None):
-        return img_conv_group(
-            input=ipt,
-            num_channels=num_channels_,
-            pool_size=2,
-            pool_stride=2,
-            conv_padding=0,
-            conv_num_filter=[num_filter] * times,
-            conv_filter_size=conv_filter_size,
-            conv_act=ReluActivation(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=MaxPooling())
-
-    tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
-    tmp = __light__(tmp, num_filter=128)
-    tmp = __light__(tmp, num_filter=128)
-    tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
-
-    tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-    return tmp
-
-
-predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
-
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    inputs(img, lbl)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
diff --git a/v1_api_demo/mnist/mnist_provider.py b/v1_api_demo/mnist/mnist_provider.py
deleted file mode 100644
index 4192339837620aada84b64a92fef3e05953971c2..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/mnist_provider.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-from mnist_util import read_from_mnist
-
-
-# Define a py data provider
-@provider(
-    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)},
-    cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):  # settings is not used currently.
-    for each in read_from_mnist(filename):
-        yield each
diff --git a/v1_api_demo/mnist/mnist_util.py b/v1_api_demo/mnist/mnist_util.py
deleted file mode 100644
index 3fd88ae7edc821296ca0accbf6dedc083e411744..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/mnist_util.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy
-
-__all__ = ['read_from_mnist']
-
-
-def read_from_mnist(filename):
-    imgf = filename + "-images-idx3-ubyte"
-    labelf = filename + "-labels-idx1-ubyte"
-    f = open(imgf, "rb")
-    l = open(labelf, "rb")
-
-    f.read(16)
-    l.read(8)
-
-    # Define number of samples for train/test
-    if "train" in filename:
-        n = 60000
-    else:
-        n = 10000
-
-    images = numpy.fromfile(
-        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
-    images = images / 255.0 * 2.0 - 1.0
-    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
-
-    for i in xrange(n):
-        yield {"pixel": images[i, :], 'label': labels[i]}
-
-    f.close()
-    l.close()
diff --git a/v1_api_demo/mnist/train.sh b/v1_api_demo/mnist/train.sh
deleted file mode 100755
index ca2b1ad9eb960685b95b0f294a9b929e1a4acab1..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/train.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-config=vgg_16_mnist.py
-output=./mnist_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=0 \
---trainer_count=1 \
---num_passes=100 \
---save_dir=$output \
-2>&1 | tee $log
-paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
-
-python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/v1_api_demo/mnist/vgg_16_mnist.py b/v1_api_demo/mnist/vgg_16_mnist.py
deleted file mode 100644
index a819b391c690fb473801eb2e7ba3161cc31b5b4b..0000000000000000000000000000000000000000
--- a/v1_api_demo/mnist/vgg_16_mnist.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg("is_predict", bool, False)
-
-####################Data Configuration ##################
-
-if not is_predict:
-    data_dir = './data/'
-    define_py_data_sources2(
-        train_list=data_dir + 'train.list',
-        test_list=data_dir + 'test.list',
-        module='mnist_provider',
-        obj='process')
-
-######################Algorithm Configuration #############
-settings(
-    batch_size=128,
-    learning_rate=0.1 / 128.0,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * 128))
-
-#######################Network Configuration #############
-
-data_size = 1 * 28 * 28
-label_size = 10
-img = data_layer(name='pixel', size=data_size)
-
-# small_vgg is predined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
-
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    inputs(img, lbl)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
diff --git a/v1_api_demo/model_zoo/embedding/.gitignore b/v1_api_demo/model_zoo/embedding/.gitignore
deleted file mode 100644
index 908f5a3fb2f7c34368ea24d0fc3ac9cac29a4fdb..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/embedding/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-baidu.dict
-model_*.emb
diff --git a/v1_api_demo/model_zoo/embedding/extract_para.py b/v1_api_demo/model_zoo/embedding/extract_para.py
deleted file mode 100755
index 570b90c1f772c8f6abfc6cda02560fd3471ef0b6..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/embedding/extract_para.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python extract_para.py --preModel PREMODEL --preDict PREDICT \
-                            --usrModel USRMODEL --usrDict USRDICT -d DIM
-
-Options:
-    -h, --help          show this help message and exit
-    --preModel PREMODEL the name of pretrained embedding model
-    --preDict PREDICT   the name of pretrained dictionary
-    --usrModel usrModel the name of output usr embedding model
-    --usrDict usrDict   the name of user specified dictionary
-    -d DIM              dimension of parameter
-"""
-from optparse import OptionParser
-import struct
-
-
-def get_row_index(preDict, usrDict):
-    """
-    Get the row positions for all words in user dictionary from pre-trained dictionary.
-    return: a list of row positions
-    Example: preDict='a\nb\nc\n', usrDict='a\nc\n', then return [0,2]
-    """
-    pos = []
-    index = dict()
-    with open(preDict, "r") as f:
-        for line_index, line in enumerate(f):
-            word = line.strip().split()[0]
-            index[word] = line_index
-    with open(usrDict, "r") as f:
-        for line in f:
-            word = line.strip().split()[0]
-            pos.append(index[word])
-    return pos
-
-
-def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
-                                  paraDim):
-    """
-    Extract desired parameters from a pretrained embedding model based on user dictionary
-    """
-    if paraDim not in [32, 64, 128, 256]:
-        raise RuntimeError("We only support 32, 64, 128, 256 dimensions now")
-
-    fi = open(preModel, "rb")
-    fo = open(usrModel, "wb")
-
-    # write filehead
-    rowIndex = get_row_index(preDict, usrDict)
-    newHead = struct.pack("iil", 0, 4, len(rowIndex) * paraDim)
-    fo.write(newHead)
-    bytes = 4 * paraDim
-    for i in range(0, len(rowIndex)):
-        # find the absolute position of input file
-        fi.seek(rowIndex[i] * bytes + 16, 0)
-        fo.write(fi.read(bytes))
-
-    print "extract parameters finish, total", len(rowIndex), "lines"
-    fi.close()
-
-
-def main():
-    """
-    Main entry for running paraconvert.py 
-    """
-    usage = "usage: \n" \
-            "python %prog --preModel PREMODEL --preDict PREDICT" \
-            " --usrModel USRMODEL --usrDict USRDICT -d DIM"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "--preModel",
-        action="store",
-        dest="preModel",
-        help="the name of pretrained embedding model")
-    parser.add_option(
-        "--preDict",
-        action="store",
-        dest="preDict",
-        help="the name of pretrained dictionary")
-    parser.add_option(
-        "--usrModel",
-        action="store",
-        dest="usrModel",
-        help="the name of output usr embedding model")
-    parser.add_option(
-        "--usrDict",
-        action="store",
-        dest="usrDict",
-        help="the name of user specified dictionary")
-    parser.add_option(
-        "-d", action="store", dest="dim", help="dimension of parameter")
-    (options, args) = parser.parse_args()
-    extract_parameters_by_usrDict(options.preModel, options.preDict,
-                                  options.usrModel, options.usrDict,
-                                  int(options.dim))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/model_zoo/embedding/paraconvert.py b/v1_api_demo/model_zoo/embedding/paraconvert.py
deleted file mode 100755
index ce7a70efc43d7f85708f1e12bb94739f3588370c..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/embedding/paraconvert.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
-
-Options:
-    -h, --help  show this help message and exit
-    --b2t       convert parameter file of embedding model from binary to text
-    --t2b       convert parameter file of embedding model from text to binary
-    -i INPUT    input parameter file name
-    -o OUTPUT   output parameter file name
-    -d DIM      dimension of parameter
-"""
-from optparse import OptionParser
-import struct
-
-
-def binary2text(input, output, paraDim):
-    """
-    Convert a binary parameter file of embedding model to be a text file.  
-    input: the name of input binary parameter file, the format is:
-           1) the first 16 bytes is filehead:
-                version(4 bytes): version of paddle, default = 0
-                floatSize(4 bytes): sizeof(float) = 4
-                paraCount(8 bytes): total number of parameter
-           2) the next (paraCount * 4) bytes is parameters, each has 4 bytes 
-    output: the name of output text parameter file, for example:
-           0,4,32156096
-           -0.7845433,1.1937413,-0.1704215,...
-           0.0000909,0.0009465,-0.0008813,...
-           ...
-           the format is:
-           1) the first line is filehead: 
-              version=0, floatSize=4, paraCount=32156096
-           2) other lines print the paramters
-              a) each line prints paraDim paramters splitted by ','
-              b) there is paraCount/paraDim lines (embedding words)
-    paraDim: dimension of parameters 
-    """
-    fi = open(input, "rb")
-    fo = open(output, "w")
-    """
-    """
-    version, floatSize, paraCount = struct.unpack("iil", fi.read(16))
-    newHead = ','.join([str(version), str(floatSize), str(paraCount)])
-    print >> fo, newHead
-
-    bytes = 4 * int(paraDim)
-    format = "%df" % int(paraDim)
-    context = fi.read(bytes)
-    line = 0
-
-    while context:
-        numbers = struct.unpack(format, context)
-        lst = []
-        for i in numbers:
-            lst.append('%8.7f' % i)
-        print >> fo, ','.join(lst)
-        context = fi.read(bytes)
-        line += 1
-    fi.close()
-    fo.close()
-    print "binary2text finish, total", line, "lines"
-
-
-def get_para_count(input):
-    """
-    Compute the total number of embedding parameters in input text file. 
-    input: the name of input text file
-    """
-    numRows = 1
-    paraDim = 0
-    with open(input) as f:
-        line = f.readline()
-        paraDim = len(line.split(","))
-        for line in f:
-            numRows += 1
-    return numRows * paraDim
-
-
-def text2binary(input, output, paddle_head=True):
-    """
-    Convert a text parameter file of embedding model to be a binary file.
-    input: the name of input text parameter file, for example:
-           -0.7845433,1.1937413,-0.1704215,...
-           0.0000909,0.0009465,-0.0008813,... 
-           ...
-           the format is:
-           1) it doesn't have filehead
-           2) each line stores the same dimension of parameters, 
-              the separator is commas ','
-    output: the name of output binary parameter file, the format is:
-           1) the first 16 bytes is filehead: 
-             version(4 bytes), floatSize(4 bytes), paraCount(8 bytes)
-           2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
-    """
-    fi = open(input, "r")
-    fo = open(output, "wb")
-
-    newHead = struct.pack("iil", 0, 4, get_para_count(input))
-    fo.write(newHead)
-
-    count = 0
-    for line in fi:
-        line = line.strip().split(",")
-        for i in range(0, len(line)):
-            binary_data = struct.pack("f", float(line[i]))
-            fo.write(binary_data)
-        count += 1
-    fi.close()
-    fo.close()
-    print "text2binary finish, total", count, "lines"
-
-
-def main():
-    """
-    Main entry for running paraconvert.py 
-    """
-    usage = "usage: \n" \
-            "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
-            "python %prog --t2b -i INPUT -o OUTPUT"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "--b2t",
-        action="store_true",
-        help="convert parameter file of embedding model from binary to text")
-    parser.add_option(
-        "--t2b",
-        action="store_true",
-        help="convert parameter file of embedding model from text to binary")
-    parser.add_option(
-        "-i", action="store", dest="input", help="input parameter file name")
-    parser.add_option(
-        "-o", action="store", dest="output", help="output parameter file name")
-    parser.add_option(
-        "-d", action="store", dest="dim", help="dimension of parameter")
-    (options, args) = parser.parse_args()
-    if options.b2t:
-        binary2text(options.input, options.output, options.dim)
-    if options.t2b:
-        text2binary(options.input, options.output)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh b/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
deleted file mode 100755
index f61c65a935c76032a06613cfe0b50f1c90bc50d9..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
-
-DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
-ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
-          f88c8325ee6da6187f1080e8fe66c1cd
-          927cf70f27f860aff1a5703ebf7f1584
-	  a52e43655cd25d279777ed509a1ae27b
-	  b92c67fe9ff70fea53596080e351ac80)
-
-for ((i=0; i<${#ITEM_MD5[@]}; i++))
-do
-  FILENAME=${DOWNLOAD_ITEMS[${i}]}
-  REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
-  EXPECTED_MD5=${ITEM_MD5[${i}]}
-  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
-done
diff --git a/v1_api_demo/model_zoo/resnet/.gitignore b/v1_api_demo/model_zoo/resnet/.gitignore
deleted file mode 100644
index 7a64209b62340a5c5a51626821028e63ed5e588e..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-fea_output/
-features/
-model.list
-ResNet_50.dot
-ResNet_50.png
diff --git a/v1_api_demo/model_zoo/resnet/classify.py b/v1_api_demo/model_zoo/resnet/classify.py
deleted file mode 100755
index 6074cc1d3a85e13e3e8d336d81e22104f9d8e7cf..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/classify.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import cPickle
-import logging
-from PIL import Image
-import numpy as np
-from optparse import OptionParser
-
-import paddle.utils.image_util as image_util
-
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-
-
-class ImageClassifier():
-    def __init__(self,
-                 train_conf,
-                 model_dir=None,
-                 resize_dim=256,
-                 crop_dim=224,
-                 use_gpu=True,
-                 mean_file=None,
-                 output_layer=None,
-                 oversample=False,
-                 is_color=True):
-        """
-        train_conf: network configure.
-        model_dir: string, directory of model.
-        resize_dim: int, resized image size.
-        crop_dim: int, crop size.
-        mean_file: string, image mean file.
-        oversample: bool, oversample means multiple crops, namely five
-                    patches (the four corner patches and the center
-                    patch) as well as their horizontal reflections,
-                    ten crops in all.
-        """
-        self.train_conf = train_conf
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.resize_dim = resize_dim
-        self.crop_dims = [crop_dim, crop_dim]
-        self.oversample = oversample
-        self.is_color = is_color
-
-        self.output_layer = output_layer
-        if self.output_layer:
-            assert isinstance(self.output_layer, basestring)
-            self.output_layer = self.output_layer.split(",")
-
-        self.transformer = image_util.ImageTransformer(is_color=is_color)
-        self.transformer.set_transpose((2, 0, 1))
-        self.transformer.set_channel_swap((2, 1, 0))
-
-        self.mean_file = mean_file
-        if self.mean_file is not None:
-            mean = np.load(self.mean_file)['data_mean']
-            mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
-            self.transformer.set_mean(mean)  # mean pixel
-        else:
-            # if you use three mean value, set like:
-            # this three mean value is calculated from ImageNet.
-            self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
-
-        conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
-        conf = parse_config(train_conf, conf_args)
-        swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        assert isinstance(self.network, swig_paddle.GradientMachine)
-        self.network.loadParameters(self.model_dir)
-
-        data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
-        slots = [dense_vector(data_size)]
-        self.converter = DataProviderConverter(slots)
-
-    def get_data(self, img_path):
-        """
-        1. load image from img_path.
-        2. resize or oversampling.
-        3. transformer data: transpose, channel swap, sub mean.
-        return K x H x W ndarray.
-
-        img_path: image path.
-        """
-        image = image_util.load_image(img_path, self.is_color)
-        # Another way to extract oversampled features is that
-        # cropping and averaging from large feature map which is
-        # calculated by large size of image.
-        # This way reduces the computation.
-        if self.oversample:
-            # image_util.resize_image: short side is self.resize_dim
-            image = image_util.resize_image(image, self.resize_dim)
-            image = np.array(image)
-            input = np.zeros(
-                (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
-            input[0] = image.astype(np.float32)
-            input = image_util.oversample(input, self.crop_dims)
-        else:
-            image = image.resize(self.crop_dims, Image.ANTIALIAS)
-            input = np.zeros(
-                (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
-            input[0] = np.array(image).astype(np.float32)
-
-        data_in = []
-        for img in input:
-            img = self.transformer.transformer(img).flatten()
-            data_in.append([img.tolist()])
-        # paddle input: [[[]],[[]],...], [[]] is one sample.
-        return data_in
-
-    def forward(self, input_data):
-        """
-        return output arguments which are the Outputs() in network configure.
-
-        input_data: py_paddle input data.
-        call forward.
-        """
-        in_arg = self.converter(input_data)
-        return self.network.forwardTest(in_arg)
-
-    def forward(self, data, output_layer):
-        """
-        return output arguments which are the Outputs() in network configure.
-
-        input_data: py_paddle input data.
-        call forward.
-        """
-        input = self.converter(data)
-        self.network.forwardTest(input)
-        output = self.network.getLayerOutputs(output_layer)
-        res = {}
-        if isinstance(output_layer, basestring):
-            output_layer = [output_layer]
-        for name in output_layer:
-            # For oversampling, average predictions across crops.
-            # If not, the shape of output[name]: (1, class_number),
-            # the mean is also applicable.
-            res[name] = output[name]['value'].mean(0)
-
-        return res
-
-    def predict(self, data_file):
-        """
-        call forward and predicting.
-
-        data_file: input image list.
-        """
-        image_files = open(data_file, 'rb').readlines()
-        results = {}
-        if self.output_layer is None:
-            self.output_layer = ["output"]
-        for line in image_files:
-            image = line.split()[0]
-            data = self.get_data(image)
-            prob = self.forward(data, self.output_layer)
-            lab = np.argsort(-prob[self.output_layer[0]])
-            results[image] = lab[0]
-            logging.info("Label of %s is: %d", image, lab[0])
-        return results
-
-    def extract(self, data_file, output_dir, batch_size=10000):
-        """
-        extract and save features of output layers, which are
-        specify in Outputs() in network configure.
-
-        data_file: file name of input data.
-        output_dir: saved directory of extracted features.
-        batch_size: sample number of one batch file.
-        """
-        if not os.path.exists(output_dir):
-            os.mkdir(output_dir)
-
-        sample_num = 0
-        batch_num = 0
-        image_feature = {}
-        image_files = open(data_file, 'rb').readlines()
-        for idx, line in enumerate(image_files):
-            image = line.split()[0]
-            data = self.get_data(image)
-            feature = self.forward(data, self.output_layer)
-            # save extracted features
-            file_name = image.split("/")[-1]
-            image_feature[file_name] = feature
-            sample_num += 1
-            if sample_num == batch_size:
-                batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
-                self.save_file(image_feature, batch_name)
-                logging.info('Finish batch %d', batch_num)
-                batch_num += 1
-                sample_num = 0
-                image_feature = {}
-            if idx % 1000 == 0:
-                logging.info('%d/%d, %s', idx, len(image_files), file_name)
-        if sample_num > 0:
-            batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
-            self.save_file(image_feature, batch_name)
-            logging.info('Finish batch %d', batch_num)
-        logging.info('Done: make image feature batch')
-
-    def save_file(self, data, file):
-        of = open(file, 'wb')
-        cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
-
-
-def option_parser():
-    """
-    Main entry for predciting
-    """
-    usage = "%prog -c config -i data_list -w model_dir [options]"
-    parser = OptionParser(usage="usage: %s" % usage)
-    parser.add_option(
-        "-j",
-        "--job",
-        action="store",
-        dest="job_type",
-        help="job type: predict, extract\
-                            predict: predicting,\
-                            extract: extract features")
-    parser.add_option(
-        "-c",
-        "--conf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-i", "--data", action="store", dest="data_file", help="image list")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    parser.add_option(
-        "-g",
-        "--use_gpu",
-        action="store",
-        dest="use_gpu",
-        default=True,
-        help="Whether to use gpu mode.")
-    parser.add_option(
-        "-o",
-        "--output_dir",
-        action="store",
-        dest="output_dir",
-        default="output",
-        help="output path")
-    parser.add_option(
-        "-m",
-        "--mean",
-        action="store",
-        dest="mean",
-        default=None,
-        help="mean file.")
-    parser.add_option(
-        "-p",
-        "--multi_crop",
-        action="store_true",
-        dest="multi_crop",
-        default=False,
-        help="Wether to use multiple crops on image.")
-    parser.add_option("-l", "--output_layer", action="store",
-                      dest="output_layer", default=None,
-                      help="--job=extract, specify layers to extract "\
-                           "features, --job=predict, specify layer of "
-                           "classification probability, output in resnet.py.")
-    return parser.parse_args()
-
-
-def main():
-    """
-    1. parse input arguments.
-    2. predicting or extract features according job type.
-    """
-    options, args = option_parser()
-    obj = ImageClassifier(
-        options.train_conf,
-        options.model_path,
-        use_gpu=options.use_gpu,
-        mean_file=options.mean,
-        output_layer=options.output_layer,
-        oversample=options.multi_crop)
-    if options.job_type == "predict":
-        obj.predict(options.data_file)
-
-    elif options.job_type == "extract":
-        obj.extract(options.data_file, options.output_dir)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/model_zoo/resnet/example/.gitignore b/v1_api_demo/model_zoo/resnet/example/.gitignore
deleted file mode 100644
index 4a2b5962a6800f251cba655c026331f14648c86e..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/example/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*image_list_provider_copy_1.py
diff --git a/v1_api_demo/model_zoo/resnet/example/__init__.py b/v1_api_demo/model_zoo/resnet/example/__init__.py
deleted file mode 100644
index f662d6826321eb840739382558f76327d27b5847..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/example/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/v1_api_demo/model_zoo/resnet/example/cat.jpg b/v1_api_demo/model_zoo/resnet/example/cat.jpg
deleted file mode 100644
index 47b01db90eddc46ff845f10bc2accaf2364c272d..0000000000000000000000000000000000000000
Binary files a/v1_api_demo/model_zoo/resnet/example/cat.jpg and /dev/null differ
diff --git a/v1_api_demo/model_zoo/resnet/example/dog.jpg b/v1_api_demo/model_zoo/resnet/example/dog.jpg
deleted file mode 100644
index b9cc33cf069da5c453b97dbb7383838edd07c199..0000000000000000000000000000000000000000
Binary files a/v1_api_demo/model_zoo/resnet/example/dog.jpg and /dev/null differ
diff --git a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py b/v1_api_demo/model_zoo/resnet/example/image_list_provider.py
deleted file mode 100644
index 2cd8eb8bf850f41282ed5db2885dc0b7218c79f7..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.utils.image_util import *
-from paddle.trainer.PyDataProvider2 import *
-
-
-def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
-    """
-    Description: Init with a list of data file
-    file_list is the name list of input files.
-    kwargs["load_data_args"] is the value of 'load_data_args'
-    which can be set in config.
-    Each args is separated by a column.
-    image_size: the crop image size.
-    mean_meta: the path of the meta file to store the mean image.
-    mean_value: can be mean value, not a file.
-                can not set mean_meta and mean_value at the same time.
-    color: 'color' means a color image. Otherwise, it means a gray image.
-    is_train: whether the data provider is used for training.
-              Data argumentation might be different for training and testing.
-    """
-    settings.img_size = image_size
-    settings.crop_size = crop_size
-    settings.mean_img_size = settings.crop_size
-    settings.color = color  # default is color
-    settings.is_train = is_train
-
-    settings.is_swap_channel = kwargs.get('swap_channel', None)
-    if settings.is_swap_channel is not None:
-        settings.swap_channel = settings.is_swap_channel
-        settings.is_swap_channel = True
-
-    if settings.color:
-        settings.img_input_size = settings.crop_size * settings.crop_size * 3
-    else:
-        settings.img_input_size = settings.crop_size * settings.crop_size
-
-    settings.file_list = file_list
-    settings.mean_meta = kwargs.get('mean_meta', None)
-    settings.mean_value = kwargs.get('mean_value', None)
-    # can not specify both mean_meta and mean_value.
-    assert not (settings.mean_meta and settings.mean_value)
-    if not settings.mean_meta:
-        settings.mean_value = kwargs.get('mean_value')
-        sz = settings.crop_size * settings.crop_size
-        settings.img_mean = np.zeros(sz * 3, dtype=np.single)
-        for idx, value in enumerate(settings.mean_value):
-            settings.img_mean[idx * sz:(idx + 1) * sz] = value
-        settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
-                                                      settings.crop_size)
-
-    else:
-        settings.img_mean = load_meta(settings.mean_meta,
-                                      settings.mean_img_size,
-                                      settings.crop_size, settings.color)
-
-    settings.input_types = [
-        dense_vector(settings.img_input_size),  # image feature
-        integer_value(1)
-    ]  # labels
-
-    settings.logger.info('Image short side: %s', settings.img_size)
-    settings.logger.info('Crop size: %s', settings.crop_size)
-    settings.logger.info('Meta path: %s', settings.mean_meta)
-    if settings.is_swap_channel:
-        settings.logger.info('swap channel: %s', settings.swap_channel)
-    settings.logger.info('DataProvider Initialization finished')
-
-
-@provider(init_hook=hook, should_shuffle=False)
-def processData(settings, file_list):
-    """
-    The main function for loading data.
-    Load the batch, iterate all the images and labels in this batch.
-    file_name: the batch file name.
-    """
-    img_path, lab = file_list.strip().split(' ')
-    img = Image.open(img_path)
-    img.load()
-    img = img.resize((settings.img_size, settings.img_size), Image.ANTIALIAS)
-    img = np.array(img).astype(np.float32)
-    if len(img.shape) == 3:
-        img = np.swapaxes(img, 1, 2)
-        img = np.swapaxes(img, 1, 0)
-    # swap channel
-    if settings.is_swap_channel:
-        img = img[settings.swap_channel, :, :]
-    img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
-                              settings.is_train, settings.color)
-    yield img_feat.tolist(), int(lab.strip())
diff --git a/v1_api_demo/model_zoo/resnet/example/test.list b/v1_api_demo/model_zoo/resnet/example/test.list
deleted file mode 100644
index 30bbf630b640a26239fc104c9c08f6ebc9dfaa82..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/example/test.list
+++ /dev/null
@@ -1,2 +0,0 @@
-example/dog.jpg 0
-example/cat.jpg 0
diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh b/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
deleted file mode 100755
index 5447aa92dfb5facd3433eb4a1893e96e3c786c73..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-#set names of layer which you want to extract feature
-#in Outputs() of resnet.py 
-#like: Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-layer_num=50
-configure=./resnet.py
-model_path=./model/resnet_$layer_num
-fea_dir=fea_output
-#Output is text file.
-#Each line is one sample's features.
-#If you set N layer names in Outputs()
-#each line contains N features sperated by ";". 
-
-# create model list file.
-model_list=./model.list
-touch $model_list | echo $model_path > $model_list
-
-paddle train \
-  --local=true \
-  --job=test \
-  --config=$configure \
-  --model_list=$model_list \
-  --use_gpu=1 \
-  --predict_output_dir=$fea_dir \
-  --config_args=is_test=1,layer_num=$layer_num
diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh b/v1_api_demo/model_zoo/resnet/extract_fea_py.sh
deleted file mode 100755
index 2e87152f7f8598f487870291271cdee646105044..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-#Note if you use CPU mode, you need to set use_gpu=0 in classify.py. like this:
-#conf_args = "is_test=0,use_gpu=1,is_predict=1"
-#conf = parse_config(train_conf, conf_args)
-#swig_paddle.initPaddle("--use_gpu=0")
-python classify.py \
-     --job=extract \
-     --conf=resnet.py \
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
diff --git a/v1_api_demo/model_zoo/resnet/get_model.sh b/v1_api_demo/model_zoo/resnet/get_model.sh
deleted file mode 100755
index b33d8178ab7859fc0b0d514fb19bec2c28a77c3d..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/get_model.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-mkdir model
-cd model
-
-echo "Downloading ResNet models..."
-
-for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz 
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
-  tar -xvf $file 
-  rm $file
-done
-
-echo "Done."
diff --git a/v1_api_demo/model_zoo/resnet/load_feature.py b/v1_api_demo/model_zoo/resnet/load_feature.py
deleted file mode 100644
index 5d3d0c0d30ef710c37c98e93a51b2f813d636b59..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/load_feature.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import cPickle
-import logging
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-
-
-def load_feature_c(file):
-    """
-    Load feature extracted by C++ interface.
-    Return a list.
-    file: feature file.
-    """
-    features = []
-    f = open(file, 'r')
-    for line in f:
-        sample = []
-        for slot in line.strip().split(";"):
-            fea = [float(val) for val in slot.strip().split()]
-            if fea:
-                sample.append(fea)
-        features.append(sample)
-    f.close()
-    return features
-
-
-def load_feature_py(feature_dir):
-    """
-    Load feature extracted by python interface.
-    Return a dictionary.
-    feature_dir: directory of feature file.
-    """
-    file_list = os.listdir(feature_dir)
-    file_list = [os.path.join(feature_dir, f) for f in file_list]
-    features = {}
-    for file_name in file_list:
-        with open(file_name, 'rb') as f:
-            feature = cPickle.load(f)
-            features.update(feature)
-            logging.info('Load feature file %s', file_name)
-    return features
-
-
-if __name__ == '__main__':
-    print load_feature_py(sys.argv[1])
-    #print load_feature_c(sys.argv[1]) 
diff --git a/v1_api_demo/model_zoo/resnet/net_diagram.sh b/v1_api_demo/model_zoo/resnet/net_diagram.sh
deleted file mode 100755
index 1b06ffa44eec8a0f312420c35699d3902f9a6400..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/net_diagram.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-:'
-Visual deep residual network
-1. Using make_model_diagram.py to generate dot file.
-2. Using graphviz to convert dot file.
-
-Usage:
-./net_diagram.sh
-'
-
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-img_type=png
-img_fileprefix=ResNet_50
-conf_filename=resnet.py
-dot_filename=ResNet_50.dot
-config_str="layer_num=50,data_provider=0"
-
-python -m paddle.utils.make_model_diagram $conf_filename $dot_filename $config_str
-
-# If you have installed graphviz, running like this:
-# dot -Tpng -o ResNet.png ResNet.dot
diff --git a/v1_api_demo/model_zoo/resnet/predict.sh b/v1_api_demo/model_zoo/resnet/predict.sh
deleted file mode 100755
index 2b67b17c48c60cc8a7b7c46a1c80a3f2bf281870..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/predict.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --model=model/resnet_50 \
-     --multi_crop \
-     --use_gpu=1 \
-     --data=./example/test.list
diff --git a/v1_api_demo/model_zoo/resnet/resnet.py b/v1_api_demo/model_zoo/resnet/resnet.py
deleted file mode 100644
index 6fdd97fefc62392c93ecffae0fc918e8dc4b18c5..0000000000000000000000000000000000000000
--- a/v1_api_demo/model_zoo/resnet/resnet.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-"""
-paper: https://arxiv.org/abs/1512.03385
-"""
-is_test = get_config_arg("is_test", bool, False)
-is_predict = get_config_arg("is_predict", bool, False)
-data_provider = get_config_arg("data_provider", bool, True)
-layer_num = get_config_arg("layer_num", int, 50)
-
-if not is_predict and data_provider:
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args = {
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224,
-        'crop_size': 224,
-        'color': True,
-        'swap_channel:': [2, 1, 0]
-    }
-    define_py_data_sources2(
-        train_list,
-        'example/test.list',
-        module="example.image_list_provider",
-        obj="processData",
-        args=args)
-
-batch_size = 1
-learning_rate = 0.1 / batch_size
-momentum = 0.9
-weight_decay = 0.0001 * batch_size
-default_momentum(momentum)
-default_decay_rate(weight_decay)
-
-Settings(
-    algorithm='sgd',
-    batch_size=batch_size,
-    learning_rate=learning_rate,
-
-    # set the appropriate parameters according your schedule
-    learning_method='momentum',
-    learning_rate_decay_a=0.5,
-    learning_rate_decay_b=1200000 * 10,
-    learning_rate_schedule="discexp", )
-
-
-def conv_bn_layer(name,
-                  input,
-                  filter_size,
-                  num_filters,
-                  stride,
-                  padding,
-                  channels=None,
-                  active_type=ReluActivation()):
-    """
-    A wrapper for conv layer with batch normalization layers.
-    Note:
-    conv layer has no activation.
-    """
-
-    tmp = img_conv_layer(
-        name=name + "_conv",
-        input=input,
-        filter_size=filter_size,
-        num_channels=channels,
-        num_filters=num_filters,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=False)
-    return batch_norm_layer(
-        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
-
-
-def bottleneck_block(name, input, num_filters1, num_filters2):
-    """
-    A wrapper for bottlenect building block in ResNet.
-    Last conv_bn_layer has no activation.
-    Addto layer has activation of relu.
-    """
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=1,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[input, last_name], act=ReluActivation())
-
-
-def mid_projection(name, input, num_filters1, num_filters2, stride=2):
-    """
-    A wrapper for middile projection in ResNet.
-    projection shortcuts are used for increasing dimensions,
-    and other shortcuts are identity
-    branch1: projection shortcuts are used for increasing
-    dimensions, has no activation.
-    branch2x: bottleneck building block, shortcuts are identity.
-    """
-    # stride = 2
-    branch1 = conv_bn_layer(
-        name=name + '_branch1',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=stride,
-        padding=0,
-        active_type=LinearActivation())
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2a',
-        input=input,
-        filter_size=1,
-        num_filters=num_filters1,
-        stride=stride,
-        padding=0)
-    last_name = conv_bn_layer(
-        name=name + '_branch2b',
-        input=last_name,
-        filter_size=3,
-        num_filters=num_filters1,
-        stride=1,
-        padding=1)
-
-    last_name = conv_bn_layer(
-        name=name + '_branch2c',
-        input=last_name,
-        filter_size=1,
-        num_filters=num_filters2,
-        stride=1,
-        padding=0,
-        active_type=LinearActivation())
-
-    return addto_layer(
-        name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
-
-
-def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
-    """
-    A wrapper for 50,101,152 layers of ResNet.
-    res2_num: number of blocks stacked in conv2_x
-    res3_num: number of blocks stacked in conv3_x
-    res4_num: number of blocks stacked in conv4_x
-    res5_num: number of blocks stacked in conv5_x
-    """
-    # For ImageNet
-    # conv1: 112x112
-    img = data_layer(name='input', size=224 * 224 * 3)
-    tmp = conv_bn_layer(
-        "conv1",
-        img,
-        filter_size=7,
-        channels=3,
-        num_filters=64,
-        stride=2,
-        padding=3)
-    tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
-
-    # conv2_x: 56x56
-    tmp = mid_projection(
-        name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
-    for i in xrange(2, res2_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
-
-    # conv3_x: 28x28
-    tmp = mid_projection(
-        name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
-    for i in xrange(2, res3_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res3_" + str(i),
-            input=tmp,
-            num_filters1=128,
-            num_filters2=512)
-
-    # conv4_x: 14x14
-    tmp = mid_projection(
-        name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
-    for i in xrange(2, res4_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res4_" + str(i),
-            input=tmp,
-            num_filters1=256,
-            num_filters2=1024)
-
-    # conv5_x: 7x7
-    tmp = mid_projection(
-        name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
-    for i in xrange(2, res5_num + 1, 1):
-        tmp = bottleneck_block(
-            name="res5_" + str(i),
-            input=tmp,
-            num_filters1=512,
-            num_filters2=2048)
-
-    tmp = img_pool_layer(
-        name='avgpool',
-        input=tmp,
-        pool_size=7,
-        stride=1,
-        pool_type=AvgPooling())
-
-    output = fc_layer(
-        name='output', input=tmp, size=1000, act=SoftmaxActivation())
-
-    if not is_predict:
-        classification_cost(
-            input=output, label=data_layer(
-                name='label', size=1))
-
-
-def res_net_50():
-    deep_res_net(3, 4, 6, 3)
-
-
-def res_net_101():
-    deep_res_net(3, 4, 23, 3)
-
-
-def res_net_152():
-    deep_res_net(3, 8, 36, 3)
-
-
-if not is_predict:
-    Inputs("input", "label")
-else:
-    Inputs("input")
-# Outputs("cost-softmax" if not is_predict else "output")
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-
-if layer_num == 50:
-    res_net_50()
-elif layer_num == 101:
-    res_net_101()
-elif layer_num == 152:
-    res_net_152()
-else:
-    print("Wrong layer number.")
diff --git a/v1_api_demo/quick_start/.gitignore b/v1_api_demo/quick_start/.gitignore
deleted file mode 100644
index f71662563ff96d6227dd568d9951a90b0d09456e..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/.gitignore
+++ /dev/null
@@ -1,15 +0,0 @@
-*.pyc
-data/dict.txt
-data/dict_all.txt
-data/labels.list
-data/mosesdecoder-master/
-data/reviews_Electronics_5.json.gz
-data/test.list
-data/test.txt
-data/train.list
-data/train.txt
-data/pred.list
-data/pred.txt
-dataprovider_copy_1.py
-train.log
-output
diff --git a/v1_api_demo/quick_start/api_predict.py b/v1_api_demo/quick_start/api_predict.py
deleted file mode 100755
index 9bdffe1006281c58a595e2771561ba62e4c2d6bd..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/api_predict.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import sparse_binary_vector
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python api_predict.py -h
-"""
-
-
-class QuickStartPrediction():
-    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
-        """
-        train_conf: trainer configure.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-        self.train_conf = train_conf
-        self.dict_file = dict_file
-        self.word_dict = {}
-        self.dict_dim = self.load_dict()
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.label = None
-        if label_file is not None:
-            self.load_label(label_file)
-
-        conf = parse_config(train_conf, "is_predict=1")
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(self.model_dir)
-        input_types = [sparse_binary_vector(self.dict_dim)]
-        self.converter = DataProviderConverter(input_types)
-
-    def load_dict(self):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(self.dict_file, 'r')):
-            self.word_dict[line.strip().split('\t')[0]] = line_count
-        return len(self.word_dict)
-
-    def load_label(self, label_file):
-        """
-        Load label.
-        """
-        self.label = {}
-        for v in open(label_file, 'r'):
-            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
-
-    def get_index(self, data):
-        """
-        transform word into integer index according to the dictionary.
-        """
-        words = data.strip().split()
-        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
-        return word_slot
-
-    def batch_predict(self, data_batch):
-        input = self.converter(data_batch)
-        output = self.network.forwardTest(input)
-        prob = output[0]["id"].tolist()
-        print("predicting labels is:")
-        print prob
-
-
-def option_parser():
-    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-n",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-b",
-        "--label",
-        action="store",
-        dest="label",
-        default=None,
-        help="dictionary file")
-    parser.add_option(
-        "-c",
-        "--batch_size",
-        type="int",
-        action="store",
-        dest="batch_size",
-        default=1,
-        help="the batch size for prediction")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    return parser.parse_args()
-
-
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    batch_size = options.batch_size
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label = options.label
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
-
-    batch = []
-    labels = []
-    for line in sys.stdin:
-        [label, text] = line.split("\t")
-        labels.append(int(label))
-        batch.append([predict.get_index(text)])
-    print("labels is:")
-    print labels
-    predict.batch_predict(batch)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/quick_start/api_predict.sh b/v1_api_demo/quick_start/api_predict.sh
deleted file mode 100755
index 4d9aa9e8854ed79446a47dbc593f419cdda077b4..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/api_predict.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-#only test on trainer_config.lr.py
-model=output/model/pass-00001/
-config=trainer_config.lr.py
-label=data/labels.list
-dict=data/dict.txt
-batch_size=20
-head -n$batch_size data/test.txt | python api_predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=$dict \
-     --batch_size=$batch_size
diff --git a/v1_api_demo/quick_start/api_train.py b/v1_api_demo/quick_start/api_train.py
deleted file mode 100644
index 5699789daa4051661b0a72c69f4668f2d8bb9cb2..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/api_train.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import itertools
-import random
-
-from paddle.trainer.config_parser import parse_config
-from py_paddle import swig_paddle as api
-from py_paddle import DataProviderConverter
-from paddle.trainer.PyDataProvider2 \
-    import integer_value, integer_value_sequence, sparse_binary_vector
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--train_data", type=str, required=False, help="train data file")
-    parser.add_argument("--test_data", type=str, help="test data file")
-    parser.add_argument(
-        "--config", type=str, required=True, help="config file name")
-    parser.add_argument("--dict_file", required=True, help="dictionary file")
-    parser.add_argument(
-        "--seq", default=1, type=int, help="whether use sequence training")
-    parser.add_argument(
-        "--use_gpu", default=0, type=int, help="whether use GPU for training")
-    parser.add_argument(
-        "--trainer_count",
-        default=1,
-        type=int,
-        help="Number of threads for training")
-    parser.add_argument(
-        "--num_passes", default=5, type=int, help="Number of training passes")
-    return parser.parse_args()
-
-
-UNK_IDX = 0
-
-
-def load_data(file_name, word_dict):
-    with open(file_name, 'r') as f:
-        for line in f:
-            label, comment = line.strip().split('\t')
-            words = comment.split()
-            word_slot = [word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
-
-
-def load_dict(dict_file):
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(f):
-            w = line.strip().split()[0]
-            word_dict[w] = i
-    return word_dict
-
-
-def main():
-    options = parse_arguments()
-    api.initPaddle("--use_gpu=%s" % options.use_gpu,
-                   "--trainer_count=%s" % options.trainer_count)
-
-    word_dict = load_dict(options.dict_file)
-    train_dataset = list(load_data(options.train_data, word_dict))
-    if options.test_data:
-        test_dataset = list(load_data(options.test_data, word_dict))
-    else:
-        test_dataset = None
-
-    trainer_config = parse_config(options.config,
-                                  "dict_file=%s" % options.dict_file)
-    # No need to have data provider for trainer
-    trainer_config.ClearField('data_config')
-    trainer_config.ClearField('test_data_config')
-
-    # create a GradientMachine from the model configuratin
-    model = api.GradientMachine.createFromConfigProto(
-        trainer_config.model_config)
-    # create a trainer for the gradient machine
-    trainer = api.Trainer.create(trainer_config, model)
-
-    # create a data converter which converts data to PaddlePaddle
-    # internal format
-    input_types = [
-        integer_value_sequence(len(word_dict)) if options.seq else
-        sparse_binary_vector(len(word_dict)), integer_value(2)
-    ]
-    converter = DataProviderConverter(input_types)
-
-    batch_size = trainer_config.opt_config.batch_size
-    trainer.startTrain()
-    for train_pass in xrange(options.num_passes):
-        trainer.startTrainPass()
-        random.shuffle(train_dataset)
-        for pos in xrange(0, len(train_dataset), batch_size):
-            batch = itertools.islice(train_dataset, pos, pos + batch_size)
-            size = min(batch_size, len(train_dataset) - pos)
-            trainer.trainOneDataBatch(size, converter(batch))
-        trainer.finishTrainPass()
-        if test_dataset:
-            trainer.startTestPeriod()
-            for pos in xrange(0, len(test_dataset), batch_size):
-                batch = itertools.islice(test_dataset, pos, pos + batch_size)
-                size = min(batch_size, len(test_dataset) - pos)
-                trainer.testOneDataBatch(size, converter(batch))
-            trainer.finishTestPeriod()
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/quick_start/api_train.sh b/v1_api_demo/quick_start/api_train.sh
deleted file mode 100755
index 9b2a4e2f224b1677c458ede66a6a3bac09d8ad61..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/api_train.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Note: if using trainer_config.emb.py, trainer_config.cnn.py
-# or trainer_config.lstm.py, you need to change --seq to --seq=1
-# because they are sequence models.
-python api_train.py \
-  --config=trainer_config.lr.py \
-  --trainer_count=2 \
-  --num_passes=15 \
-  --use_gpu=0 \
-  --seq=0 \
-  --train_data=data/train.txt \
-  --test_data=data/test.txt \
-  --dict_file=data/dict.txt \
-  2>&1 | tee 'train.log'
diff --git a/v1_api_demo/quick_start/cluster/cluster_train.sh b/v1_api_demo/quick_start/cluster/cluster_train.sh
deleted file mode 100755
index a7b1f01064b29cf6abc4cd6b706ee466a6d6da36..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/cluster/cluster_train.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Should run pserver.sh before run this script.
-bin_dir=$(cd `dirname $0`; pwd)
-home_dir=$(cd "${bin_dir}/.."; pwd)
-source "$bin_dir/env.sh"
-
-model_dir="$bin_dir/output"
-log_file="$bin_dir/train.log"
-
-pushd "$home_dir"
-cfg=trainer_config.lr.py
-paddle train \
-  --start_pserver=false \
-  --config=$cfg \
-  --save_dir=${model_dir} \
-  --trainer_count=4 \
-  --local=0 \
-  --log_period=100 \
-  --num_passes=15 \
-  --use_gpu=false \
-  --show_parameter_stats_period=100 \
-  --test_all_data_in_one_period=1 \
-  --num_gradient_servers=1 \
-  --nics=`get_nics` \
-  --port=7164 \
-  --ports_num=1 \
-  --pservers="127.0.0.1" \
-  --comment="paddle_trainer" \
-  2>&1 | tee "$log_file"
-popd
diff --git a/v1_api_demo/quick_start/cluster/env.sh b/v1_api_demo/quick_start/cluster/env.sh
deleted file mode 100644
index a404993835d0e479f65c89c5561855293b7b66f0..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/cluster/env.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-function get_nics() {
-  machine=`uname -s`
-  local nics=""
-  if [ "$machine" == "Linux" ]; then
-    nics="lo"
-  elif [ "$machine" == "Darwin" ]; then
-    nics="lo0"
-  else
-    nics="unsupport"
-  fi
-  echo $nics
-}
diff --git a/v1_api_demo/quick_start/cluster/pserver.sh b/v1_api_demo/quick_start/cluster/pserver.sh
deleted file mode 100755
index b187c1d9b9108a607ed310253d54ecc096f0e792..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/cluster/pserver.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-bin_dir=$(cd `dirname $0`; pwd)
-source "$bin_dir/env.sh"
-
-paddle pserver \
-  --nics=`get_nics` \
-  --port=7164 \
-  --ports_num=1 \
-  --ports_num_for_sparse=1 \
-  --num_gradient_servers=1 \
-  --comment="paddle_pserver" \
-  2>&1 | tee 'pserver.log'
diff --git a/v1_api_demo/quick_start/data/README.md b/v1_api_demo/quick_start/data/README.md
deleted file mode 100644
index 63abcf7ebf31903213e44cf492b93e09f61db14e..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/data/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-This dataset consists of electronics product reviews associated with
-binary labels (positive/negative) for sentiment classification.
-
-The preprocessed data can be downloaded by script `get_data.sh`.
-The data was derived from reviews_Electronics_5.json.gz at
-
-http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-
-If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/v1_api_demo/quick_start/data/get_data.sh b/v1_api_demo/quick_start/data/get_data.sh
deleted file mode 100755
index a09a18f919e5a84f1f7c889a43f0a5fbf4a60a77..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/data/get_data.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-# Download the preprocessed data
-wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
-
-# Extract package
-tar zxvf preprocessed_data.tar.gz
-
-# Remove compressed package
-rm preprocessed_data.tar.gz
diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh b/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
deleted file mode 100755
index d976eaebfaa600778e0ab6bb0adbd7159f1cce2f..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# 1. size of pos : neg = 1:1.
-# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
-# 3. distinct train set and test set.
-
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-# Download data
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-echo "Downloading mosesdecoder..."
-# https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
-
-unzip master.zip
-rm master.zip
-
-##################
-# Preprocess data 
-echo "Preprocess data..."
-export LC_ALL=C
-UNAME_STR=`uname`
-
-if [ ${UNAME_STR} == 'Linux' ]; then
-  SHUF_PROG='shuf'
-else
-  SHUF_PROG='gshuf'
-fi
-
-mkdir -p tmp
-python preprocess.py -i reviews_Electronics_5.json.gz
-# uniq and shuffle
-cd tmp
-echo 'Uniq and shuffle...'
-cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
-cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
-
-min_len=`sed -n '$=' neg.shuffed`
-test_num=$((min_len/10))
-if [ $test_num -gt 12500 ];then
- test_num=12500
-fi
-train_num=$((min_len-test_num))
-
-head -n$train_num pos.shuffed >train.pos
-head -n$train_num neg.shuffed >train.neg
-tail -n$test_num pos.shuffed >test.pos
-tail -n$test_num neg.shuffed >test.neg
-
-cat train.pos train.neg | ${SHUF_PROG} >../train.txt
-cat test.pos test.neg | ${SHUF_PROG} >../test.txt
-
-cd -
-echo 'train.txt' > train.list
-echo 'test.txt' > test.list
-
-# use 30k dict
-rm -rf tmp
-mv dict.txt dict_all.txt
-cat dict_all.txt | head -n 30001 > dict.txt
-echo 'Done.'
diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py b/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
deleted file mode 100755
index 5706351a21fbd15d9bbf197156bb0fdabcb07295..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
+++ /dev/null
@@ -1,236 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# -*- coding: UTF-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-1. Tokenize the words and punctuation 
-2. pos sample : rating score 5; neg sample: rating score 1-2.
-
-Usage:
-    python preprocess.py -i data_file [random seed]
-"""
-
-import sys
-import os
-import operator
-import gzip
-from subprocess import Popen, PIPE
-from optparse import OptionParser
-import json
-from multiprocessing import Queue
-from multiprocessing import Pool
-import multiprocessing
-
-batch_size = 5000
-word_count = {}
-num_tokenize = max(1,
-                   multiprocessing.cpu_count() - 2)  # parse + tokenize + save
-max_queue_size = 8
-parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
-tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
-
-
-def create_dict(data):
-    """
-    Create dictionary based on data, and saved in data_dir/dict.txt.
-    The first line is unk \t -1.
-    data: list, input data by batch.
-    """
-    for seq in data:
-        try:
-            for w in seq.lower().split():
-                if w not in word_count:
-                    word_count[w] = 1
-                else:
-                    word_count[w] += 1
-        except:
-            sys.stderr.write(seq + "\tERROR\n")
-
-
-def parse(path):
-    """
-    Open .gz file.
-    """
-    sys.stderr.write(path)
-    g = gzip.open(path, 'r')
-    for l in g:
-        yield json.loads(l)
-    g.close()
-
-
-def tokenize(sentences):
-    """
-    Use tokenizer.perl to tokenize input sentences.
-    tokenizer.perl is tool of Moses.
-    sentences : a list of input sentences.
-    return: a list of processed text.
-    """
-    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
-    if not os.path.exists(dir):
-        sys.exit(
-            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
-        )
-    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
-    assert isinstance(sentences, list)
-    text = "\n".join(sentences)
-    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
-    tok_text, _ = tokenizer.communicate(text)
-    toks = tok_text.split('\n')[:-1]
-    return toks
-
-
-def save_data(instance, data_dir, pre_fix, batch_num):
-    """
-    save data by batch
-    """
-    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
-    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
-    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
-    file(file_name, 'w').write('\n'.join(lines) + '\n')
-
-
-def tokenize_batch(id):
-    """
-    tokenize data by batch
-    """
-    while True:
-        num_batch, instance, pre_fix = parse_queue.get()
-        if num_batch == -1:  ### parse_queue finished
-            tokenize_queue.put((-1, None, None))
-            sys.stderr.write("Thread %s finish\n" % (id))
-            break
-        tokenize_instance = tokenize(instance)
-        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
-        sys.stderr.write('.')
-
-
-def save_batch(data_dir, num_tokenize, data_dir_dict):
-    """
-        save data by batch
-        build dict.txt
-    """
-    token_count = 0
-    while True:
-        num_batch, instance, pre_fix = tokenize_queue.get()
-        if num_batch == -1:
-            token_count += 1
-            if token_count == num_tokenize:  #### tokenize finished.
-                break
-            else:
-                continue
-        save_data(instance, data_dir, pre_fix, num_batch)
-        create_dict(instance)  ## update dict
-
-    sys.stderr.write("save file finish\n")
-    f = open(data_dir_dict, 'w')
-    f.write('%s\t%s\n' % ('unk', '-1'))
-    for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \
-                       reverse=True):
-        f.write('%s\t%s\n' % (k, v))
-    f.close()
-    sys.stderr.write("build dict finish\n")
-
-
-def parse_batch(data, num_tokenize):
-    """
-    parse data by batch
-    parse -> tokenize -> save
-    """
-    raw_txt = parse(data)
-    neg, pos = [], []
-    count = 0
-    sys.stderr.write("extract raw data\n")
-    for l in raw_txt:
-        rating = l["overall"]
-        text = l["reviewText"].lower()  # # convert words to lower case
-        if rating == 5.0 and text:
-            pos.append(text)
-        if rating < 3.0 and text:
-            neg.append(text)
-        if len(pos) == batch_size or len(neg) == batch_size:
-            if len(pos) == batch_size:
-                batch = pos
-                pre_fix = 'pos'
-            else:
-                batch = neg
-                pre_fix = 'neg'
-
-            parse_queue.put((count, batch, pre_fix))
-            count += 1
-            if pre_fix == 'pos':
-                pos = []
-            else:
-                neg = []
-
-    if len(pos) > 0:
-        parse_queue.put((count, pos, 'pos'))
-        count += 1
-    if len(neg) > 0:
-        parse_queue.put((count, neg, 'neg'))
-        count += 1
-    for i in range(num_tokenize):
-        parse_queue.put((-1, None, None))  #### for tokenize's input finished
-    sys.stderr.write("parsing finish\n")
-
-
-def option_parser():
-    parser = OptionParser(usage="usage: python preprcoess.py "\
-                                "-i data_path [options]")
-    parser.add_option(
-        "-i", "--data", action="store", dest="input", help="Input data path.")
-    parser.add_option(
-        "-s",
-        "--seed",
-        action="store",
-        dest="seed",
-        default=1024,
-        help="Set random seed.")
-    return parser.parse_args()
-
-
-def main():
-    reload(sys)
-    sys.setdefaultencoding('utf-8')
-    options, args = option_parser()
-    data = options.input
-    seed = options.seed
-    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
-    data_dir = os.path.join(os.path.dirname(data), 'tmp')
-    pool = Pool(processes=num_tokenize + 2)
-    pool.apply_async(parse_batch, args=(data, num_tokenize))
-    for i in range(num_tokenize):
-        pool.apply_async(tokenize_batch, args=(str(i), ))
-    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
-    pool.close()
-    pool.join()
-
-    file(os.path.join(os.path.dirname(data), 'labels.list'),
-         'w').write('neg\t0\npos\t1\n')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/quick_start/dataprovider_bow.py b/v1_api_demo/quick_start/dataprovider_bow.py
deleted file mode 100644
index 2745495586449b5d1eb64ae570f73eb6b14dbdfe..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/dataprovider_bow.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer need to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # setting.input_types specifies what the data types the data provider
-    # generates.
-    settings.input_types = {
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        'word': sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        'label': integer_value(2)
-    }
-
-
-# Delaring a data provider. It has an initializer 'data_initialzer'.
-# It will cache the generated data of the first pass in memory, so that
-# during later pass, no on-the-fly data generation will be needed.
-# `setting` is the same object used by initializer()
-# `file_name` is the name of a file listed train_list or test_list file given
-# to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield {'word': word_vector, 'label': int(label)}
-
-
-def predict_initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
-
-
-# Declaring a data provider for prediction. The difference with process
-# is that label is not generated.
-@provider(init_hook=predict_initializer, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            comment = line.strip().split()
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield {'word': word_vector}
diff --git a/v1_api_demo/quick_start/dataprovider_emb.py b/v1_api_demo/quick_start/dataprovider_emb.py
deleted file mode 100755
index ddfa3ce9b73555cb3b7f5a44314ca35b12d41ede..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/dataprovider_emb.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-UNK_IDX = 0
-
-
-def initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {
-        # Define the type of the first input as sequence of integer.
-        # The value of the integers range from 0 to len(dictrionary)-1
-        'word': integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        'label': integer_value(2)
-    }
-
-
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            label, comment = line.strip().split('\t')
-            words = comment.split()
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield {'word': word_slot, 'label': int(label)}
-
-
-def predict_initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
-
-
-@provider(init_hook=predict_initializer, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            comment = line.strip().split()
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield {'word': word_slot}
diff --git a/v1_api_demo/quick_start/predict.sh b/v1_api_demo/quick_start/predict.sh
deleted file mode 100755
index e47c2dd01fb5c919203964e298018e6dc2bd366e..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/predict.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.lr.py
-#cfg=trainer_config.emb.py
-#cfg=trainer_config.cnn.py
-#cfg=trainer_config.lstm.py
-model="output/pass-00003"
-paddle train \
-    --config=$cfg \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. \
-2>&1 | tee 'predict.log'
-paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
-
-mv rank-00000 result.txt
diff --git a/v1_api_demo/quick_start/train.sh b/v1_api_demo/quick_start/train.sh
deleted file mode 100755
index 01697fed48054be8ad98a01d4cbb5029e6a1ead0..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/train.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.lr.py
-#cfg=trainer_config.emb.py
-#cfg=trainer_config.cnn.py
-#cfg=trainer_config.lstm.py
-#cfg=trainer_config.bidi-lstm.py
-#cfg=trainer_config.db-lstm.py
-#cfg=trainer_config.resnet-lstm.py
-paddle train \
-  --config=$cfg \
-  --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=100 \
-  --num_passes=15 \
-  --use_gpu=false \
-  --show_parameter_stats_period=100 \
-  --test_all_data_in_one_period=1 \
-  2>&1 | tee 'train.log'
-paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
diff --git a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py
deleted file mode 100644
index 3deff4aa00b1ea5d66097514867d1a392393a523..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-
-bi_lstm = bidirectional_lstm(input=emb, size=128)
-dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
-
-output = fc_layer(
-    input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.cnn.py b/v1_api_demo/quick_start/trainer_config.cnn.py
deleted file mode 100644
index e09e41484d30db385a1d276b7f346b444fe79d3d..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.cnn.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-data = data_layer(name="word", size=len(word_dict))
-embedding = embedding_layer(input=data, size=128)
-conv = sequence_conv_pool(input=embedding, context_len=3, hidden_size=512)
-output = fc_layer(input=conv, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.db-lstm.py b/v1_api_demo/quick_start/trainer_config.db-lstm.py
deleted file mode 100644
index fba802b4600b33cfbfd0820cce1f47e4d0f948ae..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.db-lstm.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-
-hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
-lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
-
-input_layers = [hidden_0, lstm_0]
-
-for i in range(1, 8):
-    fc = fc_layer(input=input_layers, size=128)
-    lstm = lstmemory(
-        input=fc,
-        layer_attr=ExtraAttr(drop_rate=0.1),
-        reverse=(i % 2) == 1, )
-    input_layers = [fc, lstm]
-
-lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-
-output = fc_layer(
-    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.emb.py b/v1_api_demo/quick_start/trainer_config.emb.py
deleted file mode 100644
index f69f98ff7fc885d3fe16d3aaf66967389b3b3240..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.emb.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
-
-data = data_layer(name="word", size=len(word_dict))
-embedding = embedding_layer(input=data, size=128)
-avg = pooling_layer(input=embedding, pooling_type=AvgPooling())
-output = fc_layer(input=avg, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.lr.py b/v1_api_demo/quick_start/trainer_config.lr.py
deleted file mode 100644
index b7b694940e338acbc40ffd3e5597f209bf07488f..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.lr.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includs word Ids.
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_bow",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-# Define the data for text features. The size of the data layer is the number
-# of words in the dictionary.
-data = data_layer(name="word", size=len(word_dict))
-
-# Define a fully connected layer with logistic activation.
-# (also called softmax activation).
-output = fc_layer(input=data, size=2, act=SoftmaxActivation())
-
-if not is_predict:
-    # For training, we need label and cost
-
-    # define the category id for each example.
-    # The size of the data layer is the number of labels.
-    label = data_layer(name="label", size=2)
-
-    # Define cross-entropy classification loss and error.
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-else:
-    # For prediction, no label is needed. We need to output
-    # We need to output classification result, and class probabilities.
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
diff --git a/v1_api_demo/quick_start/trainer_config.lstm.py b/v1_api_demo/quick_start/trainer_config.lstm.py
deleted file mode 100644
index 8967d78807b9bbf990f5dd36240c18199b86954e..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.lstm.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-lstm = simple_lstm(
-    input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
-lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py
deleted file mode 100644
index 32d0596f250c0f0c5a4004d3af7adb794b3f0f1b..0000000000000000000000000000000000000000
--- a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py
+++ /dev/null
@@ -1,104 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This configuration is a demonstration of how to implement the stacked LSTM
-with residual connections, i.e. an LSTM layer takes the sum of the hidden states
-and inputs of the previous LSTM layer instead of only the hidden states.
-This architecture is from:
-Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
-Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
-Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
-Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
-George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa,
-Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016.
-Google's Neural Machine Translation System: Bridging the Gap between Human and
-Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf
-Different from the architecture described in the paper, we use a stack single
-direction LSTM layers as the first layer instead of bi-directional LSTM. Also,
-since this is a demo code, to reduce computation time, we stacked 4 layers
-instead of 8 layers.
-"""
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
-
-previous_input, previous_hidden_state = emb, lstm
-
-for i in range(3):
-    # The input to the current layer is the sum of the hidden state
-    # and input of the previous layer.
-    current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(
-        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
-    previous_input, previous_hidden_state = current_input, hidden_state
-
-lstm = previous_hidden_state
-
-lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(
-    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/sequence_tagging/data/get_data.sh b/v1_api_demo/sequence_tagging/data/get_data.sh
deleted file mode 100755
index 0cdb394035e782b3a647f7f13e79d55b5d3dff48..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/data/get_data.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
-wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
diff --git a/v1_api_demo/sequence_tagging/data/test.list b/v1_api_demo/sequence_tagging/data/test.list
deleted file mode 100644
index 073c0a0c9063ac55f762ac261746aa73057d70e8..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/data/test.list
+++ /dev/null
@@ -1 +0,0 @@
-data/test.txt.gz
diff --git a/v1_api_demo/sequence_tagging/data/train.list b/v1_api_demo/sequence_tagging/data/train.list
deleted file mode 100644
index 43c24d5f6484a90fe883ad5516fe100d27c9ce47..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/data/train.list
+++ /dev/null
@@ -1 +0,0 @@
-data/train.txt.gz
diff --git a/v1_api_demo/sequence_tagging/dataprovider.py b/v1_api_demo/sequence_tagging/dataprovider.py
deleted file mode 100644
index bb4b4465bc7e032c50c1d21263651e2578af67be..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/dataprovider.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import gzip
-import logging
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at 
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-dict_label = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates end of a sequence.
-
-cutoff: a list of numbers. If count of a feature is smaller than this,
- it will be ignored.
-if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-i-th column.
-
-return a list of dict for each column
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = gzip.open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def initializer(settings, **xargs):
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
-    dicts[2] = dict_label
-    settings.dicts = dicts
-    settings.oov_policy = oov_policy
-    input_types = []
-    num_features = len(dicts)
-    for i in xrange(num_original_columns):
-        input_types.append(integer_sequence(len(dicts[i])))
-        logger.info("slot %s size=%s" % (i, len(dicts[i])))
-    if patterns:
-        dim = 0
-        for i in xrange(num_original_columns, num_features):
-            dim += len(dicts[i])
-        input_types.append(sparse_binary_vector_sequence(dim))
-        logger.info("feature size=%s" % dim)
-    settings.input_types = input_types
-
-
-'''
-if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
-existed in dicts[i] will be assigned to id 0.
-if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
-in dicts[i].
-'''
-
-
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):
-    input_file = filename
-    dicts = settings.dicts
-    oov_policy = settings.oov_policy
-
-    def gen_sample(sequence):
-        num_features = len(dicts)
-        sample = [list() for i in xrange(num_original_columns)]
-        if patterns:
-            sample.append([])
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample[i].append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample[i].append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample[i].append(0)
-
-            if patterns:
-                dim = 0
-                vec = []
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.ids.append(dim + 0)
-
-                    dim += len(dicts[i])
-                sample[-1].append(vec)
-        return sample
-
-    num_features = len(dicts)
-    f = gzip.open(input_file, 'rb')
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            yield gen_sample(sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
diff --git a/v1_api_demo/sequence_tagging/linear_crf.py b/v1_api_demo/sequence_tagging/linear_crf.py
deleted file mode 100644
index ea012ba1ae9c790ccefd3dd5f066aa92202128a2..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/linear_crf.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-import math
-
-define_py_data_sources2(
-    train_list="data/train.list",
-    test_list="data/test.list",
-    module="dataprovider",
-    obj="process")
-
-batch_size = 1
-settings(
-    learning_method=MomentumOptimizer(),
-    batch_size=batch_size,
-    regularization=L2Regularization(batch_size * 1e-4),
-    model_average=ModelAverage(0.5),
-    learning_rate=1e-1,
-    learning_rate_decay_a=1e-5,
-    learning_rate_decay_b=0.25, )
-
-num_label_types = 23
-
-
-def get_simd_size(size):
-    return int(math.ceil(float(size) / 8)) * 8
-
-
-# Currently, in order to use sparse_update=True,
-# the size has to be aligned.
-num_label_types = get_simd_size(num_label_types)
-
-features = data_layer(name="features", size=76328)
-word = data_layer(name="word", size=6778)
-pos = data_layer(name="pos", size=44)
-chunk = data_layer(name="chunk", size=num_label_types)
-
-crf_input = fc_layer(
-    input=features,
-    size=num_label_types,
-    act=LinearActivation(),
-    bias_attr=False,
-    param_attr=ParamAttr(
-        initial_std=0, sparse_update=True))
-
-crf = crf_layer(
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(
-        name="crfw", initial_std=0), )
-
-crf_decoding = crf_decoding_layer(
-    size=num_label_types,
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(name="crfw"), )
-
-sum_evaluator(
-    name="error",
-    input=crf_decoding, )
-
-chunk_evaluator(
-    name="chunk_f1",
-    input=crf_decoding,
-    label=chunk,
-    chunk_scheme="IOB",
-    num_chunk_types=11, )
-
-inputs(word, pos, chunk, features)
-outputs(crf)
diff --git a/v1_api_demo/sequence_tagging/readme.md b/v1_api_demo/sequence_tagging/readme.md
deleted file mode 100644
index 2e17fffb83c532f5e5fec1227f169c97c1f20e22..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/readme.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Sequence Tagging
-
-This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a> task.
-
-## Download data
-```bash
-cd demo/sequence_tagging
-./data/get_data.sh
-```
-
-## Train model
-```bash
-cd demo/sequence_tagging
-./train.sh
-```
-
-## Model description
-
-We provide two models. One is a linear CRF model (linear_crf.py) with is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">Model name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">F1 score</th>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">linear_crf</td>
-<td class="left"> 1.8M </td>
-<td class="left"> 0.937</td>
-</tr>
-
-<tr>
-<td class="left">rnn_crf</td>
-<td class="left"> 960K </td>
-<td class="left">0.941</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
diff --git a/v1_api_demo/sequence_tagging/rnn_crf.py b/v1_api_demo/sequence_tagging/rnn_crf.py
deleted file mode 100644
index 937a34df103663ecf0f0827bbfb9d82823c9b902..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/rnn_crf.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-import math
-
-define_py_data_sources2(
-    train_list="data/train.list",
-    test_list="data/test.list",
-    module="dataprovider",
-    obj="process")
-
-batch_size = 16
-settings(
-    learning_method=MomentumOptimizer(),
-    batch_size=batch_size,
-    regularization=L2Regularization(batch_size * 1e-5),
-    model_average=ModelAverage(0.5),
-    learning_rate=2e-3,
-    learning_rate_decay_a=5e-7,
-    learning_rate_decay_b=0.5, )
-
-word_dim = 128
-hidden_dim = 128
-with_rnn = True
-
-initial_std = 1 / math.sqrt(hidden_dim)
-param_attr = ParamAttr(initial_std=initial_std)
-cpu_layer_attr = ExtraLayerAttribute(device=-1)
-
-default_device(0)
-
-num_label_types = 23
-
-features = data_layer(name="features", size=76328)
-word = data_layer(name="word", size=6778)
-pos = data_layer(name="pos", size=44)
-chunk = data_layer(
-    name="chunk", size=num_label_types, layer_attr=cpu_layer_attr)
-
-emb = embedding_layer(
-    input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
-
-hidden1 = mixed_layer(
-    size=hidden_dim,
-    act=STanhActivation(),
-    bias_attr=True,
-    input=[
-        full_matrix_projection(emb), table_projection(
-            pos, param_attr=param_attr)
-    ])
-
-if with_rnn:
-    rnn1 = recurrent_layer(
-        act=ReluActivation(),
-        bias_attr=True,
-        input=hidden1,
-        param_attr=ParamAttr(initial_std=0), )
-
-hidden2 = mixed_layer(
-    size=hidden_dim,
-    act=STanhActivation(),
-    bias_attr=True,
-    input=[full_matrix_projection(hidden1)] +
-    ([full_matrix_projection(
-        rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
-
-if with_rnn:
-    rnn2 = recurrent_layer(
-        reverse=True,
-        act=ReluActivation(),
-        bias_attr=True,
-        input=hidden2,
-        param_attr=ParamAttr(initial_std=0), )
-
-crf_input = mixed_layer(
-    size=num_label_types,
-    bias_attr=False,
-    input=[full_matrix_projection(hidden2), ] +
-    ([full_matrix_projection(
-        rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
-
-crf = crf_layer(
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(
-        name="crfw", initial_std=0),
-    layer_attr=cpu_layer_attr, )
-
-crf_decoding = crf_decoding_layer(
-    size=num_label_types,
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(name="crfw"),
-    layer_attr=cpu_layer_attr, )
-
-sum_evaluator(
-    name="error",
-    input=crf_decoding, )
-
-chunk_evaluator(
-    name="chunk_f1",
-    input=crf_decoding,
-    label=chunk,
-    chunk_scheme="IOB",
-    num_chunk_types=11, )
-
-inputs(word, pos, chunk, features)
-outputs(crf)
diff --git a/v1_api_demo/sequence_tagging/train.sh b/v1_api_demo/sequence_tagging/train.sh
deleted file mode 100755
index 37e196c84200dc26ccb523076a81dbc393b1280f..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-paddle train \
-       --config rnn_crf.py \
-       --parallel_nn=1 \
-       --use_gpu=1 \
-       --dot_period=10 \
-       --log_period=1000 \
-       --test_period=0 \
-       --num_passes=10 \
-2>&1 | tee 'train.log'
-paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/v1_api_demo/sequence_tagging/train_linear.sh b/v1_api_demo/sequence_tagging/train_linear.sh
deleted file mode 100755
index ad6e2d8ee7f813c69f9dd250c6f7bbb4403a0ed5..0000000000000000000000000000000000000000
--- a/v1_api_demo/sequence_tagging/train_linear.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-paddle train \
-       --config linear_crf.py \
-       --use_gpu=0 \
-       --dot_period=100 \
-       --log_period=10000 \
-       --test_period=0 \
-       --num_passes=10
-2>&1 | tee 'train_linear.log'
-paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/v1_api_demo/traffic_prediction/README b/v1_api_demo/traffic_prediction/README
deleted file mode 100644
index 4c95188583513c332b7d7cb0a32d59336208e1aa..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/README
+++ /dev/null
@@ -1,7 +0,0 @@
-run by:
-cd ./data
-sh get_data.sh
-cd ..
-sh train.sh
-sh predict.sh
-
diff --git a/v1_api_demo/traffic_prediction/data/get_data.sh b/v1_api_demo/traffic_prediction/data/get_data.sh
deleted file mode 100755
index f2fa548d4709c0361334f117bfb49e18d83c32f4..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/data/get_data.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-set -x
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-#download the dataset
-echo "Downloading traffic data..."
-wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
-
-#extract package
-echo "Unzipping..."
-tar -zxvf traffic_data.tar.gz
-
-echo "data/speeds.csv" > train.list
-echo "data/speeds.csv" > test.list
-echo "data/speeds.csv" > pred.list
-
-echo "Done."
diff --git a/v1_api_demo/traffic_prediction/dataprovider.py b/v1_api_demo/traffic_prediction/dataprovider.py
deleted file mode 100644
index c7883b6950c369ee67c39b80ce1cefbbf9350459..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/dataprovider.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import sys
-import numpy as np
-TERM_NUM = 24
-FORECASTING_NUM = 24
-LABEL_VALUE_NUM = 4
-
-
-def initHook(settings, file_list, **kwargs):
-    """
-    Init hook is invoked before process data. It will set obj.slots and store data meta.
-
-    :param settings: global object. It will passed to process routine.
-    :type obj: object
-    :param file_list: the meta file object, which passed from trainer_config.py,but unused in this function.
-    :param kwargs: unused other arguments.
-    """
-    del kwargs  #unused 
-
-    settings.pool_size = sys.maxint
-    #Use a time seires of the past as feature.
-    #Dense_vector's expression form is [float,float,...,float]
-    settings.input_types = [dense_vector(TERM_NUM)]
-    #There are next FORECASTING_NUM fragments you need predict.
-    #Every predicted condition at time point has four states.
-    for i in range(FORECASTING_NUM):
-        settings.input_types.append(integer_value(LABEL_VALUE_NUM))
-
-
-@provider(
-    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
-def process(settings, file_name):
-    with open(file_name) as f:
-        #abandon fields name
-        f.next()
-        for row_num, line in enumerate(f):
-            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
-            # Get the max index.
-            end_time = len(speeds)
-            # Scanning and generating samples
-            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
-                # For dense slot
-                pre_spd = map(float, speeds[i - TERM_NUM:i])
-
-                # Integer value need predicting, values start from 0, so every one minus 1.
-                fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
-
-                # Predicting label is missing, abandon the sample.
-                if -1 in fol_spd:
-                    continue
-                yield [pre_spd] + fol_spd
-
-
-def predict_initHook(settings, file_list, **kwargs):
-    settings.pool_size = sys.maxint
-    settings.input_types = [dense_vector(TERM_NUM)]
-
-
-@provider(init_hook=predict_initHook, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name) as f:
-        #abandon fields name
-        f.next()
-        for row_num, line in enumerate(f):
-            speeds = map(int, line.rstrip('\r\n').split(","))
-            end_time = len(speeds)
-            pre_spd = map(float, speeds[end_time - TERM_NUM:end_time])
-            yield pre_spd
diff --git a/v1_api_demo/traffic_prediction/gen_result.py b/v1_api_demo/traffic_prediction/gen_result.py
deleted file mode 100644
index 3da70b30315f863fd3582583e9a29540a09c1e7f..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/gen_result.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-res = []
-with open('./rank-00000') as f:
-    for line in f:
-        pred = map(int, line.strip('\r\n;').split(";"))
-        #raw prediction range from 0 to 3
-        res.append([i + 1 for i in pred])
-
-file_name = open('./data/pred.list').read().strip('\r\n')
-
-FORECASTING_NUM = 24
-header = [
-    'id',
-    '201604200805',
-    '201604200810',
-    '201604200815',
-    '201604200820',
-    '201604200825',
-    '201604200830',
-    '201604200835',
-    '201604200840',
-    '201604200845',
-    '201604200850',
-    '201604200855',
-    '201604200900',
-    '201604200905',
-    '201604200910',
-    '201604200915',
-    '201604200920',
-    '201604200925',
-    '201604200930',
-    '201604200935',
-    '201604200940',
-    '201604200945',
-    '201604200950',
-    '201604200955',
-    '201604201000',
-]
-###################
-## To CSV format ##
-###################
-with open(file_name) as f:
-    f.next()
-    print ','.join(header)
-    for row_num, line in enumerate(f):
-        fields = line.rstrip('\r\n').split(',')
-        linkid = fields[0]
-        print linkid + ',' + ','.join(map(str, res[row_num]))
diff --git a/v1_api_demo/traffic_prediction/predict.sh b/v1_api_demo/traffic_prediction/predict.sh
deleted file mode 100755
index 2dbd5e8805dd97d35c7d58917f8ec6b5033bda03..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/predict.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.py
-# pass choice 
-model="output/pass-00000"
-paddle train \
-    --config=$cfg \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. 
-
-python gen_result.py > result.csv
-
-rm -rf rank-00000
diff --git a/v1_api_demo/traffic_prediction/train.sh b/v1_api_demo/traffic_prediction/train.sh
deleted file mode 100755
index 48dfc5604f80042598c5c779bd450a5808fdfb64..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/train.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.py
-paddle train \
-  --config=$cfg \
-  --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=1000 \
-  --dot_period=10 \
-  --num_passes=10 \
-  --use_gpu=false \
-  --show_parameter_stats_period=3000 \
-  2>&1 | tee 'train.log'
diff --git a/v1_api_demo/traffic_prediction/trainer_config.py b/v1_api_demo/traffic_prediction/trainer_config.py
deleted file mode 100755
index 52d678624aff7ca2264c3c20e320004217d14397..0000000000000000000000000000000000000000
--- a/v1_api_demo/traffic_prediction/trainer_config.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-################################### DATA Configuration #############################################
-is_predict = get_config_arg('is_predict', bool, False)
-trn = './data/train.list' if not is_predict else None
-tst = './data/test.list' if not is_predict else './data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn, test_list=tst, module="dataprovider", obj=process)
-################################### Parameter Configuaration #######################################
-TERM_NUM = 24
-FORECASTING_NUM = 24
-emb_size = 16
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=1e-3,
-    learning_method=RMSPropOptimizer())
-################################### Algorithm Configuration ########################################
-
-output_label = []
-
-link_encode = data_layer(name='link_encode', size=TERM_NUM)
-for i in xrange(FORECASTING_NUM):
-    # Each task share same weight.
-    link_param = ParamAttr(
-        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
-    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
-    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
-    if is_predict:
-        maxid = maxid_layer(score)
-        output_label.append(maxid)
-    else:
-        # Multi-task training.
-        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
-        cls = classification_cost(
-            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
-        output_label.append(cls)
-outputs(output_label)
diff --git a/v1_api_demo/vae/README.md b/v1_api_demo/vae/README.md
deleted file mode 100644
index e55d483b023773900729622a6cac44116fc79c76..0000000000000000000000000000000000000000
--- a/v1_api_demo/vae/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-#Variational Autoencoder (VAE)
-
-This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114).
-
-
-In order to run the model, first download the MNIST dataset by running the shell script in ./data.
-
-Then you can run the command below. The flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
-
-$python vae_train.py [--use_gpu 1]
-
-The generated images will be stored in ./samples/
-The corresponding models will be stored in ./params/
diff --git a/v1_api_demo/vae/data/get_mnist_data.sh b/v1_api_demo/vae/data/get_mnist_data.sh
deleted file mode 100755
index a77c81bf5af9ddb6634ff89460797ca543c5e517..0000000000000000000000000000000000000000
--- a/v1_api_demo/vae/data/get_mnist_data.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env sh
-# This script downloads the mnist data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/mnist_data"
-mkdir "$DIR/mnist_data"
-cd "$DIR/mnist_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
diff --git a/v1_api_demo/vae/dataloader.py b/v1_api_demo/vae/dataloader.py
deleted file mode 100644
index e9ff95d44f825cd941b5687f754618e66d491e7f..0000000000000000000000000000000000000000
--- a/v1_api_demo/vae/dataloader.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-class MNISTloader():
-    def __init__(self,
-                 data_path="./data/mnist_data/",
-                 batch_size=60,
-                 process='train'):
-        self.batch_size = batch_size
-        self.data_path = data_path
-        self._pointer = 0
-        self.image_batches = np.array([])
-        self.process = process
-
-    def _extract_images(self, filename, n):
-        f = open(filename, 'rb')
-        f.read(16)
-        data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
-        #Mapping data into [-1, 1]
-        data = data / 255. * 2. - 1
-        data_batches = np.split(data, 60000 / self.batch_size, 0)
-
-        f.close()
-
-        return data_batches
-
-    @property
-    def pointer(self):
-        return self._pointer
-
-    def load_data(self):
-        TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path
-        TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path
-
-        if self.process == 'train':
-            self.image_batches = self._extract_images(TRAIN_IMAGES, 60000)
-        else:
-            self.image_batches = self._extract_images(TEST_IMAGES, 10000)
-
-    def next_batch(self):
-        batch = self.image_batches[self._pointer]
-        self._pointer = (self._pointer + 1) % (60000 / self.batch_size)
-        return np.array(batch)
-
-    def reset_pointer(self):
-        self._pointer = 0
diff --git a/v1_api_demo/vae/vae_conf.py b/v1_api_demo/vae/vae_conf.py
deleted file mode 100644
index 301dd23793d19ec5946cc7bb07e32c53c04a972b..0000000000000000000000000000000000000000
--- a/v1_api_demo/vae/vae_conf.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-import numpy as np
-
-is_generating = get_config_arg("is_generating", bool, False)
-
-settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer())
-
-X_dim = 28 * 28
-h_dim = 128
-z_dim = 100
-
-
-def reparameterization(mu, logvar):
-    eps = ParamAttr(initial_mean=0., initial_std=1)
-    with mixed_layer() as sigma:
-        sigma += dotmul_projection(layer_math.exp(logvar) * 0.5, param_attr=eps)
-    return mu + sigma
-
-
-def q_func(X):
-    """
-    xavier initialization
-    """
-    param_attr = ParamAttr(
-        name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.))
-    mu_param = ParamAttr(
-        name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-    logvar_param = ParamAttr(
-        name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-
-    bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.)
-    mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.)
-    logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.)
-
-    share_layer = fc_layer(
-        X,
-        size=h_dim,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=ReluActivation())
-
-    return (fc_layer(
-        share_layer,
-        size=z_dim,
-        param_attr=mu_param,
-        bias_attr=mu_bias,
-        act=LinearActivation()), fc_layer(
-            share_layer,
-            size=z_dim,
-            param_attr=logvar_param,
-            bias_attr=logvar_bias,
-            act=LinearActivation()))
-
-
-def generator(z):
-
-    hidden_param = ParamAttr(
-        name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.))
-    hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.)
-    prob_param = ParamAttr(
-        name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-    prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.)
-
-    hidden_layer = fc_layer(
-        z,
-        size=h_dim,
-        act=ReluActivation(),
-        param_attr=hidden_param,
-        bias_attr=hidden_bias)
-    prob = fc_layer(
-        hidden_layer,
-        size=X_dim,
-        act=SigmoidActivation(),
-        param_attr=prob_param,
-        bias_attr=prob_bias)
-
-    return prob
-
-
-def reconstruct_error(prob, X):
-    cost = multi_binary_label_cross_entropy(input=prob, label=X)
-    return cost
-
-
-def KL_loss(mu, logvar):
-    with mixed_layer() as mu_square:
-        mu_square += dotmul_operator(mu, mu, scale=1.)
-
-    cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar)
-
-    return cost
-
-
-if not is_generating:
-    x_batch = data_layer(name='x_batch', size=X_dim)
-    mu, logvar = q_func(x_batch)
-    z_samples = reparameterization(mu, logvar)
-    prob = generator(z_samples)
-    outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar))
-else:
-    z_samples = data_layer(name='noise', size=z_dim)
-    outputs(generator(z_samples))
diff --git a/v1_api_demo/vae/vae_train.py b/v1_api_demo/vae/vae_train.py
deleted file mode 100644
index 1babb011c77b92861cc680a2e1aaa8c9ae5d97b5..0000000000000000000000000000000000000000
--- a/v1_api_demo/vae/vae_train.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import random
-import numpy as np
-import cPickle
-import sys, os
-from PIL import Image
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-import py_paddle.swig_paddle as api
-import dataloader
-import matplotlib.pyplot as plt
-
-
-def plot_samples(samples):
-    fig = plt.figure(figsize=(4, 4))
-    gs = gridspec.GridSpec(4, 4)
-    gs.update(wspace=0.05, hspace=0.05)
-    for i, sample in enumerate(samples):
-        plt.subplot(gs[i])
-        plt.axis('off')
-        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
-
-    return fig
-
-
-def CHECK_EQ(a, b):
-    assert a == b, "a=%s, b=%s" % (a, b)
-
-
-def get_fake_samples(generator_machine, batch_size, noise):
-    gen_inputs = api.Arguments.createArguments(1)
-    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    gen_outputs = api.Arguments.createArguments(0)
-    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
-    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
-    return fake_samples
-
-
-def copy_shared_parameters(src, dst):
-    '''
-    copy the parameters from src to dst
-    :param src: the source of the parameters
-    :type src: GradientMachine
-    :param dst: the destination of the parameters
-    :type dst: GradientMachine
-    '''
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-    src_params = dict([(p.getName(), p) for p in src_params])
-
-    for i in xrange(dst.getParameterSize()):
-        dst_param = dst.getParameter(i)
-        src_param = src_params.get(dst_param.getName(), None)
-        if src_param is None:
-            continue
-        src_value = src_param.getBuf(api.PARAMETER_VALUE)
-        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
-        CHECK_EQ(len(src_value), len(dst_value))
-        dst_value.copyFrom(src_value)
-        dst_param.setValueUpdated()
-
-
-def find(iterable, cond):
-    for item in iterable:
-        if cond(item):
-            return item
-    return None
-
-
-def get_layer_size(model_conf, layer_name):
-    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
-    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
-    return layer_conf.size
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--use_gpu", default="1", help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
-    args = parser.parse_args()
-    use_gpu = args.use_gpu
-    assert use_gpu in ["0", "1"]
-
-    if not os.path.exists("./samples/"):
-        os.makedirs("./samples/")
-
-    if not os.path.exists("./params/"):
-        os.makedirs("./params/")
-
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-                   '--log_period=1000', '--gpu_id=' + args.gpu_id,
-                   '--save_dir=' + "./params/")
-
-    conf = "vae_conf.py"
-
-    trainer_conf = parse_config(conf, "is_generating=False")
-    gener_conf = parse_config(conf, "is_generating=True")
-
-    batch_size = trainer_conf.opt_config.batch_size
-
-    noise_dim = get_layer_size(gener_conf.model_config, "noise")
-
-    mnist = dataloader.MNISTloader(batch_size=batch_size)
-    mnist.load_data()
-
-    training_machine = api.GradientMachine.createFromConfigProto(
-        trainer_conf.model_config)
-
-    generator_machine = api.GradientMachine.createFromConfigProto(
-        gener_conf.model_config)
-
-    trainer = api.Trainer.create(trainer_conf, training_machine)
-
-    trainer.startTrain()
-
-    for train_pass in xrange(100):
-        trainer.startTrainPass()
-        mnist.reset_pointer()
-        i = 0
-        it = 0
-        while mnist.pointer != 0 or i == 0:
-            X = mnist.next_batch().astype('float32')
-
-            inputs = api.Arguments.createArguments(1)
-            inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X))
-
-            trainer.trainOneDataBatch(batch_size, inputs)
-
-            if it % 1000 == 0:
-
-                outputs = api.Arguments.createArguments(0)
-                training_machine.forward(inputs, outputs, api.PASS_TEST)
-                loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat())
-                print "\niter: {}".format(str(it).zfill(3))
-                print "VAE loss: {}".format(str(loss).zfill(3))
-
-                #Sync parameters between networks (GradientMachine) at the beginning
-                copy_shared_parameters(training_machine, generator_machine)
-
-                z_samples = np.random.randn(batch_size,
-                                            noise_dim).astype('float32')
-                samples = get_fake_samples(generator_machine, batch_size,
-                                           z_samples)
-
-                #Generating the first 16 images for a picture. 
-                figure = plot_samples(samples[:16])
-                plt.savefig(
-                    "./samples/{}_{}.png".format(
-                        str(train_pass).zfill(3), str(i).zfill(3)),
-                    bbox_inches='tight')
-                plt.close(figure)
-                i += 1
-            it += 1
-
-        trainer.finishTrainPass()
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()