diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 832698b8bfaa5e2275cb5202fb42abb9f3afce75..0000000000000000000000000000000000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "book"]
-	path = book
-	url = https://github.com/PaddlePaddle/book.git
-	branch = develop
\ No newline at end of file
diff --git a/book b/book
deleted file mode 160000
index 6e3875eb62533de1f2c1088a477719eb57b9732c..0000000000000000000000000000000000000000
--- a/book
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6e3875eb62533de1f2c1088a477719eb57b9732c
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
index 84125c3b4b621a128fd488ff7fa374a75f620bf1..1ba971b3688ce3dec078998df2c0b183a4e449f8 100644
--- a/demo/introduction/api_train_v2.py
+++ b/demo/introduction/api_train_v2.py
@@ -14,7 +14,7 @@ def main():
                                 act=paddle.activation.Linear(),
                                 bias_attr=paddle.attr.Param(name='b'))
     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-    cost = paddle.layer.regression_cost(input=y_predict, label=y)
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
 
     # create parameters
     parameters = paddle.parameters.create(cost)
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index ecafe955f9e5c1062168d5d7b6b4c639d6e72a99..651dfaa4b7b4873810a0b393655541a62d1a311b 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -34,5 +34,5 @@ y_predict = fc_layer(
     size=1,
     act=LinearActivation(),
     bias_attr=ParamAttr(name='b'))
-cost = regression_cost(input=y_predict, label=y)
+cost = mse_cost(input=y_predict, label=y)
 outputs(cost)
diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py
index 9b254933a1de60bf8d74517f0d52401d334703b7..f6a061799e3ac50236a68beedaf700dd6c698a05 100644
--- a/demo/recommendation/api_train_v2.py
+++ b/demo/recommendation/api_train_v2.py
@@ -61,7 +61,7 @@ def main():
 
     inference = paddle.layer.cos_sim(
         a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
-    cost = paddle.layer.regression_cost(
+    cost = paddle.layer.mse_cost(
         input=inference,
         label=paddle.layer.data(
             name='score', type=paddle.data_type.dense_vector(1)))
diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py
index aabcd335253faf69c940024ac8098a54da030463..25f529d7d7c430f179107fb189ade34760ab309d 100755
--- a/demo/recommendation/trainer_config.py
+++ b/demo/recommendation/trainer_config.py
@@ -86,10 +86,7 @@ movie_feature = construct_feature("movie")
 user_feature = construct_feature("user")
 similarity = cos_sim(a=movie_feature, b=user_feature)
 if not is_predict:
-    outputs(
-        regression_cost(
-            input=similarity, label=data_layer(
-                'rating', size=1)))
+    outputs(mse_cost(input=similarity, label=data_layer('rating', size=1)))
 
     define_py_data_sources2(
         'data/train.list',
diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
index bbea823de4d870f8a4384b6a85ebb7e8182797fe..24389c2d8574dfda4bec9298776aa6b1aee51535 100644
--- a/doc/api/v1/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -432,6 +432,12 @@ multi_binary_label_cross_entropy
     :members: multi_binary_label_cross_entropy
     :noindex:
 
+mse_cost
+---------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: mse_cost
+    :noindex:
+
 huber_cost
 ----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -450,6 +456,12 @@ rank_cost
     :members: rank_cost
     :noindex:
 
+sum_cost
+---------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: sum_cost
+    :noindex:
+
 crf_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -486,12 +498,6 @@ hsigmoid
     :members: hsigmoid
     :noindex:
 
-sum_cost
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: sum_cost
-    :noindex:
-
 Check Layer 
 ============
 
diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/why_plain_c.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3f41ca7b93de8a55d927c88812802ef12246182
--- /dev/null
+++ b/doc/design/multi_language_interface/why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口，这个接口需要做到:
+
+* 有标准的，良好的文档
+    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档，golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+    * 例如Java与Python的错误处理是直接扔出来Exception，而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括一下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器，也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器，而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+    * 如果用户要把Paddle的静态库（libpaddle.a）链接到自己的程序里，得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数，来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言，例如C/C++使用静态库和动态库难度差不多。但是解释性语言，例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni)，只能调用Paddle的动态库，否则得把Paddle静态库链接到解释器里。
+    * 解释性语言实际运行的二进制是解释器本身，如果调用静态库只能将静态库与解释器链接。例如对于Java来说，便是将静态库加入JVM中。这对于通常的Java的开发者来说，是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析，数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是，Paddle内嵌的Python解释器和外部使用的Python如果版本不同，会直接报错退出。
+
+### Paddle动态库中，不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的，可以在任何机器上执行的。
+
+###  这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范，不同版本的编译器之间，对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库)，需要有稳定的导出符号。
+* C语言是有导出符号的标准的，并且在常见的平台上，都是ABI调用标准的。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89，是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是，[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性，且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写，直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t* width,
+                                 uint64_t* height);
+```
+而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/math/matrix.hpp"
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t *width,
+                                 uint64_t *height) {
+  auto m = (paddle::math::matrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+}
+```
+
+其中`paddle/math/matrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace math {  
+
+class Matrix {
+  //...
+};
+
+}  // namespace math
+}  // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器，而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码，SWIG直接读取C/C++的头文件，生成各种语言的绑定代码。
+    * 对于多语言接口，SWIG需要写一个interface文件。这个文件具有独特的语法，学习成本高。且增加一个第三方语言，就需要对这个第三方语言增加一些定义。有的时候，interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+    * SWIG暴露的接口保留了C++的接口样式，很难保证多语言代码风格的一致性。(函数命名，错误处理)
+        * 因为SWIG在第三方语言中暴露的函数名，类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里，将大量的`SomeCppClass`重命名成`some_python_class`，或者`SomeGoTypes`。
+        * 对于不同语言，错误处理的方式也不尽相同。例如对于Java或者Python，最常见的错误处理方式是Exception，而对于Golang，错误处理方式是返回值。而SWIG只能简单的暴露C++接口，无法做到对于各种语言错误处理方式的适配。
+    * 对于大多数语言，直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+    * SWIG支持的语言或者解释器有局限。例如对于Python，使用SWIG只支持CPython解释器，而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库，Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器，会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI，C99是目前C最广泛的使用标准，且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单，并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 简单实现
+
+TBD
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
index d01cdaaeb75ec7d02480eb9162cabaad2a947db9..428f58830e0b10c024f31238b7404c6df193eecd 100644
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     # 线性计算网络层: ȳ = wx + b
     ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
     # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = regression_cost(input= ȳ, label=y)
+    cost = mse_cost(input= ȳ, label=y)
     outputs(cost)
 
 
@@ -69,7 +69,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     
     - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
     - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+    - **回归误差代价层**：回归误差代价层 `mse_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
 
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index c10b897d4292d0c2b062b5c8e23466505afa408a..6775da20c2f51000f305b095d40abd27b8fa6c0e 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
         x = data_layer(name='x', size=1)
         y = data_layer(name='y', size=1)
         y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = regression_cost(input=y_predict, label=y)
+        cost = mse_cost(input=y_predict, label=y)
         outputs(cost)
 
 Some of the most fundamental usages of PaddlePaddle are demonstrated:
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
index 2a7a6c8c17882a6f2c95e933e051c4b8f1a8eeee..3121b3f59df650c0a22d0bd305a6f793b202d30e 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -213,7 +213,7 @@ I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
 I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
 [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
 [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
 I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
 I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
 I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index c009b05cdeeb9dbe2dc70048e6827a12445f677e..6d6a406cf61d467cb2701ca5e85e99648eea36eb 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -16,11 +16,25 @@ import paddle.trainer.PyDataProvider2 as dp2
 import collections
 import swig_paddle
 import numpy
+import itertools
 
 __all__ = ['DataProviderConverter']
 
 
 class IScanner(object):
+    """
+    The scanner will scan Python object two passes, then convert it to Paddle's
+    argument.
+
+    In the first pass, `pre_scan` will be invoked by every data instance, and
+    then invoke `finish_pre_scan` to arguments. And the second pass do the same
+    thing except the functions changed to `scan`, `finish_scan`.
+
+    During the first pass, a scanner may count the shape of input matrix and
+    allocate memory for this argument. Then fill the data into this  argument
+    in second pass.
+    """
+
     def __init__(self, input_type, pos):
         self.input_type = input_type
         if not isinstance(self.input_type, dp2.InputType):
@@ -36,10 +50,40 @@ class IScanner(object):
         self.data_in_gpu = swig_paddle.isUsingGpu(
         ) and swig_paddle.getTrainerCount() == 1
 
+    def pre_scan(self, dat):
+        """
+        First pass scan method. During this method, the scanner could count the
+        data number, and get the total memory size this batch would use.
+
+        :param dat: The python object.
+        """
+        pass
+
+    def finish_pre_scan(self, argument):
+        """
+        Finish first scan pass. Allocate the memory.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        :return:
+        """
+        pass
+
     def scan(self, dat):
+        """
+        Second pass scan method. Copy the data to arguments.
+
+        :param dat: The python object.
+        """
         pass
 
     def finish_scan(self, argument):
+        """
+        Finish second pass. Finalize the resources, etc.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        """
         pass
 
 
@@ -51,12 +95,19 @@ class DenseScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
+        self.__height__ = 0
+
+    def pre_scan(self, dat):
+        self.__height__ += 1
+
+    def finish_pre_scan(self, argument):
+        self.__mat__ = numpy.ndarray(
+            shape=(self.__height__, self.input_type.dim), dtype=numpy.float32)
+        self.__height__ = 0
 
     def scan(self, dat):
-        if self.__mat__ is None:
-            self.__mat__ = numpy.array([dat], dtype='float32')
-        else:
-            self.__mat__ = numpy.append(self.__mat__, [dat], axis=0)
+        self.__mat__[self.__height__] = dat
+        self.__height__ += 1
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
@@ -163,7 +214,14 @@ class DataProviderConverter(object):
         ]
 
         for each_sample in dat:
-            for each_step, scanner in zip(each_sample, scanners):
+            for each_step, scanner in itertools.izip(each_sample, scanners):
+                scanner.pre_scan(each_step)
+
+        for scanner in scanners:
+            scanner.finish_pre_scan(argument)
+
+        for each_sample in dat:
+            for each_step, scanner in itertools.izip(each_sample, scanners):
                 scanner.scan(each_step)
 
         for scanner in scanners:
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc38fd7fa6e715c5fc326bb491994045d41e47f5
--- /dev/null
+++ b/paddle/scripts/docker/README.md
@@ -0,0 +1,155 @@
+# Building PaddlePaddle
+
+## Goals
+
+We want the building procedure generates Docker images, so we can run PaddlePaddle applications on Kubernetes clusters.
+
+We want it generates .deb packages, so that enterprises without Docker support can run PaddlePaddle applications as well.
+
+We want to minimize the size of  generated Docker images and .deb packages so to ease the deployment cost.
+
+We want to encapsulate building tools and dependencies in a *development* Docker image so to ease the tools installation for developers.
+
+We want developers can use whatever editing tools (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers, instead of the container running the development Docker image.
+
+We want the procedure and tools work also with testing, continuous integration, and releasing.
+
+
+## Docker Images
+
+We want two Docker images for each version of PaddlePaddle:
+
+1. `paddle:<version>-dev`
+
+   This a development image contains only the development tools.  This standardizes the building tools and procedure.  Users include:
+
+   - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
+   - release engineers -- use this to build the official release from certain branch/tag on Github.com.
+   - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
+
+   Of course developers can install building tools on their development computers.  But different version of PaddlePaddle might require different set/version of building tools.  Also, it makes collaborative debugging eaiser if all developers use a unified development environment.
+
+  The development image should include the following tools:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   where `sshd` makes it easy for developers to have multiple terminals connecting into the container.  `docker exec` works too, but if the container is running on a remote machine, it would be easier to ssh directly into the container than ssh to the box and run `docker exec`.
+
+1. `paddle:<version>`
+
+   This is the production image, generated using the development image. This image might have multiple variants:
+
+   - GPU/AVX   `paddle:<version>-gpu`
+   - GPU/no-AVX  `paddle:<version>-gpu-noavx`
+   - no-GPU/AVX  `paddle:<version>`
+   - no-GPU/no-AVX  `paddle:<version>-noavx`
+
+   We'd like to give users the choice between GPU and no-GPU, because the GPU version image is much larger than then the no-GPU version.
+
+   We'd like to give users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
+
+
+## Development Environment
+
+Here we describe how to use above two images.  We start from considering our daily development environment.
+
+Developers work on a computer, which is usually a laptop or desktop:
+
+![](doc/paddle-development-environment.png)
+
+or, they might rely on a more sophisticated box (like with GPUs):
+
+![](doc/paddle-development-environment-gpu.png)
+
+A basic principle is that source code lies on the development computer (host), so that editing tools like Eclipse can parse the source code and support auto-completion.
+
+
+## Usages
+
+### Build the Development Docker Image
+
+The following commands check out the source code on the development computer (host) and build the development image `paddle:dev`:
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle paddle
+cd paddle
+docker build -t paddle:dev .
+```
+
+The `docker build` command assumes that `Dockerfile` is in the root source tree.  This is reasonable because this Dockerfile is this only on in our repo in this design.
+
+
+### Build PaddlePaddle from Source Code
+
+Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
+
+```bash
+docker run -v $PWD:/paddle -e "GPU=OFF" -e "AVX=ON" -e "TEST=ON" paddle:dev
+```
+
+This command mounts the source directory on the host into `/paddle` in the container, so  the default entrypoint of `paddle:dev`, `build.sh`, would build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host.
+
+`build.sh` builds the following:
+
+- PaddlePaddle binaries,
+- `$PWD/build/paddle-<version>.deb` for production installation, and
+- `$PWD/build/Dockerfile`, which builds the production Docker image.
+
+
+### Build the Production Docker Image
+
+The following command builds the production image:
+
+```bash
+docker build -t paddle -f build/Dockerfile .
+```
+
+This production image is minimal -- it includes binary `paddle`, the share library `libpaddle.so`, and Python runtime.
+
+### Run PaddlePaddle Applications
+
+Again the development happens on the host.  Suppoose that we have a simple application program in `a.py`, we can test and run it using the production image:
+
+```bash
+docker run -it -v $PWD:/work paddle /work/a.py
+```
+
+But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
+
+### Build and Run PaddlePaddle Appications
+
+We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, basing on the PaddlePaddle production image:
+
+```
+FROM paddlepaddle/paddle:<version>
+RUN pip install -U matplotlib jupyter ...
+COPY . /book
+EXPOSE 8080
+CMD ["jupyter"]
+```
+
+The book image is an example of PaddlePaddle application image.  We can build it
+
+```bash
+git clone https://github.com/paddlepaddle/book
+cd book
+docker build -t book .
+```
+
+### Build and Run Distributed Applications
+
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docekr image as above, and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
+
+Of course, we can manually build an application image and launch the job using the kubectl tool:
+
+```bash
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push
+kubectl ...
+```
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4629f9b9da7ababdafa0b964db18a98a819c6a9e
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..61a96d7198d013f08f0f9c269cc352da5f7dd2e9
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/docker/doc/paddle-development-environment.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5b164c4832809de94ead7309af49c579135d7f48
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/docker/doc/paddle-development-environment.png
new file mode 100644
index 0000000000000000000000000000000000000000..707ed45a335a981c23b3533984045f53848b55e2
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.png differ
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b94f8f9a783552519ca73e7cfc0937b302d3445b..7cd3ce91312b86c96e46530e45ff9427db0a0a45 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -52,6 +52,7 @@ __all__ = [
     "cos_sim",
     "hsigmoid",
     "conv_projection",
+    "mse_cost",
     "regression_cost",
     'classification_cost',
     "LayerOutput",
@@ -3572,11 +3573,14 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def regression_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     """
-    Regression Layer.
+    mean squared error cost:
+
+    ..  math::
+
+       $\frac{1}{N}\sum_{i=1}^N(t _i- y_i)^2$
 
-    TODO(yuyang18): Complete this method.
 
     :param name: layer name.
     :type name: basestring
@@ -3602,6 +3606,9 @@ def regression_cost(input, label, weight=None, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
+regression_cost = mse_cost
+
+
 @wrap_name_default("cost")
 @layer_support()
 def classification_cost(input,
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 811b38ae4a51e8faedb59fea2b81a8be3cceeae6..3244181a63109335c4fba6ca4dd04ac8f0446313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -45,7 +45,7 @@ layers {
   coeff: 1.0
 }
 layers {
-  name: "__regression_cost_0__"
+  name: "__mse_cost_0__"
   type: "square_error"
   size: 1
   active_type: ""
@@ -84,7 +84,7 @@ input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
 output_layer_names: "__cost_0__"
-output_layer_names: "__regression_cost_0__"
+output_layer_names: "__mse_cost_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -99,12 +99,12 @@ sub_models {
   layer_names: "weight"
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
-  layer_names: "__regression_cost_0__"
+  layer_names: "__mse_cost_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
   output_layer_names: "__cost_0__"
-  output_layer_names: "__regression_cost_0__"
+  output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index d30f70a55c5b1834074966dfb3f378e01447c8ab..1c0aa7f9b9ee45b9eaf82dc46a2648d834dcd4ad 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -10,5 +10,5 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
 outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
-    regression_cost(
+    mse_cost(
         input=fc, label=lbl, weight=wt))
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 0055679a91801a2f9b6432797665ec17caf3beb1..5ccd3d6913e1755a37b4da7c4f182147b880d3cb 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -126,9 +126,8 @@ class CostLayerTest(unittest.TestCase):
         cost3 = layer.cross_entropy_cost(input=inference, label=label)
         cost4 = layer.cross_entropy_with_selfnorm_cost(
             input=inference, label=label)
-        cost5 = layer.regression_cost(input=inference, label=label)
-        cost6 = layer.regression_cost(
-            input=inference, label=label, weight=weight)
+        cost5 = layer.mse_cost(input=inference, label=label)
+        cost6 = layer.mse_cost(input=inference, label=label, weight=weight)
         cost7 = layer.multi_binary_label_cross_entropy_cost(
             input=inference, label=label)
         cost8 = layer.rank_cost(left=score, right=score, label=score)