diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7a5b6e940ec735314e8e47338b2c600..64816098a524f064ec12474a736cd4c721227a70 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -36,11 +36,41 @@
 - Trainer Count: 100
 - Metrics: mini-batch / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
 
 ### Measure the Performance for Different PServer Count
 
@@ -48,11 +78,41 @@
 - Batch Size: 64
 - Metrics: mini-batch / sec
 
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
+
+<table>
+<thead>
+<tr>
+<th>PServer Count  </th>
+<th>10</th>
+<th>20</th>
+<th>40 </th>
+<th>60</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
 
 ### Measure Parallel Efficiency By Increasing Trainer Count
 
@@ -67,11 +127,69 @@ The parallel efficiency is:
 
 $E = \div(S, N)$
 
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Counter  </th>
+<th>1</th>
+<th>10</th>
+<th>20 </th>
+<th>30</th>
+<th>40</th>
+<th>50</th>
+<th>60 </th>
+<th>70</th>
+<th>80</th>
+<th>90</th>
+<th>100 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td>-  </td>
+<td>- </td>
+<td>-  </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+<td>- </td>
+<td>-</td>
+<td>- </td>
+<td>- </td>
+</tr>
+</tbody>
+</table>
+
 
 ## Reproduce the benchmark
 
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..d56a912b9b03986e32693363f82df05a34b779e9 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 15.44 </td>
+<td> 16.32 </td>
+<td> 16.74 </td>
+<td> 16.79 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 15.97 </td>
+<td> 17.04 </td>
+<td> 17.60 </td>
+<td> 17.83 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> 9.09 </td>
+<td> 9.10 </td>
+<td> 9.24 </td>
+<td> 8.66 </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Different Batch Size
 
@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Trainer Count: 20
 - Metrics: samples / sec
 
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
+<table>
+<thead>
+<tr>
+<th>Batch Size </th>
+<th> 32</th>
+<th>64</th>
+<th>128 </th>
+<th>256</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 190.20 </td>
+<td> 222.15 </td>
+<td> 247.40 </td>
+<td> 258.18 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2  </td>
+<td> 170.96 </td>
+<td> 233.71 </td>
+<td> 256.14 </td>
+<td> 329.23 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
 
 ### Accelerate Rate
 
@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples / sec
 
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>Trainer Count </th>
+<th>20</th>
+<th>40</th>
+<th>80</th>
+<th>100</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid</td>
+<td> 263.29 (78.64%) </td>
+<td> 518.80 (77.47%) </td>
+<td> 836.26 (62.44%) </td>
+<td> 1019.29 (60.89%) </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)   </td>
+<td> 326.85 (92.85%) </td>
+<td> 534.58 (75.93%) </td>
+<td> 853.30 (60.60%) </td>
+<td> 1041.99 (59.20%) </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Different Pserver Count
 
@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
 - Batch Size: 128
 - Metrics: samples/ sec
 
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
+<table>
+<thead>
+<tr>
+<th>PServer Count </th>
+<th>3</th>
+<th>6</th>
+<th>10</th>
+<th>20</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> PaddlePaddle Fluid(should fix in next PR) </td>
+<td> 589.1 </td>
+<td> 592.6 </td>
+<td> 656.4 </td>
+<td> 655.8 </td>
+</tr>
+<tr>
+<td>PaddlePaddle v2 (need more tests)   </td>
+<td> 593.4 </td>
+<td> 791.3 </td>
+<td> 729.7 </td>
+<td> 821.7 </td>
+</tr>
+<tr>
+<td>TensorFlow </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+<td> - </td>
+</tr>
+</tbody>
+</table>
+
 
 *The performance gap between Fuild and v2 comes from the network interference.*
 
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..53d601d3a9a37e8adad519833bb6fa2dc48023a0 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -7,7 +7,7 @@ Polyak and Juditsky (1992) showed that the test performance of simple average of
 
 Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
 
-<img src="./images/asgd.gif" align="center"/><br/>
+![](./images/asgd.gif)
 
 We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
 
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
index ed3f5aab2882c16ca6ac1446b4c4d4d27a373af7..8ded0ad22f4013a521bf3bee260565dc5cf855ae 100644
--- a/doc/fluid/design/concepts/README.md
+++ b/doc/fluid/design/concepts/README.md
@@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!
 
 I think we need only the following few CMake functions to make a project description mean and clean:
 
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library </td>
+<td>nv_library </td>
+<td>go_library </td>
+</tr>
+<tr>
+<td>cc_binary </td>
+<td>nv_binary </td>
+<td>go_binary </td>
+</tr>
+<tr>
+<td> cc_test </td>
+<td> nv_test </td>
+<td> go_test </td>
+</tr>
+</tbody>
+</table>
+
 
 - The `_library` functions generate  .a files from source code.
 - The `_binary` functions generate executable binary files.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
index 907a2def557fd472ac4d679c73447bd9107d1190..3b626bd89cd83a9428997abccfeeebbbbdbb3d38 100644
--- a/doc/fluid/design/concepts/block.md
+++ b/doc/fluid/design/concepts/block.md
@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
 
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
 
-| programming languages | PaddlePaddle          |
-|-----------------------|-----------------------|
-| for, while loop       | RNN, WhileOp          |
-| if, if-else, switch   | IfElseOp, SwitchOp    |
-| sequential execution  | a sequence of layers  |
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop </td>
+<td>RNN, WhileOp </td>
+</tr>
+<tr>
+<td>if, if-else, switch </td>
+<td>IfElseOp, SwitchOp </td>
+</tr>
+<tr>
+<td>sequential execution </td>
+<td>a sequence of layers </td>
+</tr>
+</tbody>
+</table>
+
 
 A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
 
@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
 
 The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
 
-| programming languages | PaddlePaddle                    |
-|-----------------------|---------------------------------|
-| stack                 | scope hierarchy                 |
-| stack frame           | scope                           |
-| push at entering block| push at entering block          |
-| pop at leaving block  | destroy when minibatch completes|
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack </td>
+<td>scope hierarchy </td>
+</tr>
+<tr>
+<td>stack frame  </td>
+<td>scope </td>
+</tr>
+<tr>
+<td>push at entering block </td>
+<td>push at entering block </td>
+</tr>
+<tr>
+<td>pop at leaving block </td>
+<td>destroy when minibatch completes </td>
+</tr>
+</tbody>
+</table>
+
 
 1. In traditional programs:
 
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..30bc488a18a28d349645d9d2502aae6691a69931 100644
--- a/doc/fluid/design/concepts/functions_operators_layers.md
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -86,12 +86,40 @@ def layer.fc(X):
 
 We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
 
-
-| C++ functions/functors | mul          | add          |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class </td>
+<td>mulOp</td>
+<td>addOp </td>
+<td>FCOp </td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding  </td>
+<td>operator.mul</td>
+<td> operator.add </td>
+<td>operator.fc </td>
+<td></td>
+</tr>
+<tr>
+<td>Python function   </td>
+<td></td>
+<td></td>
+<td> </td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
 
 
 This is how we differentiate layer and operators in PaddlePaddle:
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..a88292e7888d0ebc64ee89ca315dfea38a12c71d 100644
--- a/doc/fluid/design/concepts/lod_tensor.md
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -2,12 +2,38 @@
 
 Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
 
-|                       | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN                   | Support    | Support      |
-| recursive RNN         | Support    | Support      |
-| padding zeros         | Must       | No need      |
-| blob data type        | Tensor     | LoDTensor    |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>recursive RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>padding zeros </td>
+<td> Must </td>
+<td>No need </td>
+</tr>
+<tr>
+<td> blob data type </td>
+<td> Tensor</td>
+<td> LoDTensor </td>
+</tr>
+</tbody>
+</table>
+
 
 PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
 
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index fcba08c07f40177d54a91048cb616198402a9d5d..6750323c0167bf1efbde6ef4fd670e88a5aa502a 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
 
 The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
 
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data </td>
+<td>VarDesc(proto) </td>
+<td>Variable(cpp) </td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>OpDesc(proto) </td>
+<td>Operator(cpp) </td>
+</tr>
+</tbody>
+</table>
 
 
 ## Definition of VarType
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..64602166065af28309d7a01fdeb7076a9b0a081a 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -10,12 +10,38 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
 
 The following table compares concepts in Fluid and Go
 
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Go</th>
+<th>Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+</tr>
+<tr>
+<td>goroutines, channels </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+</tr>
+<tr>
+<td>runtime </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+</tr>
+</tbody>
+</table>
+
 
 ## An Example Concurrent Program
 
@@ -77,11 +103,11 @@ message ProgramDesc {
       read(output = X)
       kube_get_workers_addrs(output = L)
       Y = tensor_array(len(L))
-      parallel_for(input = X, output = Y, 
+      parallel_for(input = X, output = Y,
                    attrs = {L, block_id(1)}) # referring to block 1
     ]
   }
-  
+
   block[1] = Block {
     parent = 0,
     vars = [x, y, index],
@@ -102,7 +128,7 @@ func main() {  //// block 0
   X = fluid.read(...)
   L = fluid.k8s.get_worker_addrs()
   Y = fluid.tensor_array(len(L))
-  fluid.parallel_for(X, L, 
+  fluid.parallel_for(X, L,
                      func(index int) {  //// block 1
                        x = X[index]
                        fluid.send(L[index], x)
@@ -116,7 +142,7 @@ An explanation of the above program:
 
 - `fluid.k8s` is a package that provides access to Kubernetes API.  
 - `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
 
   1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
   2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
index 10d936860fab7e09241e968a63526c7d86d3e568..66d19f44baf861c7847e81ca83f61024ec877faf 100644
--- a/doc/fluid/design/concurrent/csp.md
+++ b/doc/fluid/design/concurrent/csp.md
@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
 
 There were many concurrent programming models, implemented in various forms:
 
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex </td>
+<td>types and functions in standard libraries </td>
+</tr>
+<tr>
+<td>semaphore </td>
+<td> types and functions in standard libraries </td>
+</tr>
+<tr>
+<td> communicating sequential processes (CSP)  </td>
+<td> Go programming language </td>
+</tr>
+<tr>
+<td> actor model  </td>
+<td> Erlang programming language </td>
+</tr>
+<tr>
+<td> message passing  </td>
+<td> MPI </td>
+</tr>
+<tr>
+<td> bulk synchronous parallel (BSP)   </td>
+<td> Pregel distributed programming framework </td>
+</tr>
+</tbody>
+</table>
+
 
 Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
 
@@ -118,9 +145,9 @@ There are four types of actions with a channel:
    ```go
    close(ch)
    ```
-   
+
    Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-   
+
 There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
 
 1. A send to a nil channel blocks forever
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
index 73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b..f83ad3b6a4e8b4d82d8fe8d4154a2739a9b9628b 100644
--- a/doc/fluid/design/modules/python_api.md
+++ b/doc/fluid/design/modules/python_api.md
@@ -2,12 +2,33 @@
 
 Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
 
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program </td>
+<td>ProgramDesc </td>
+</tr>
+<tr>
+<td>Block  </td>
+<td>BlockDesc </td>
+</tr>
+<tr>
+<td>Operator </td>
+<td>OpDesc </td>
+</tr>
+<tr>
+<td>Variable </td>
+<td>VarDesc </td>
+</tr>
+</tbody>
+</table>
+
 
 Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
 
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 110b7d78bf12ac8328fb3a913e4386e75d63c995..5e147f8263e685a4665b5793f7127178cbc3cfdd 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -10,11 +10,37 @@ Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution
 
 Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
 
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013 </td>
+<td>Caffe, Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015 </td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td>   </td>
+<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+
 
 From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
 
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
index 7c39fabcc6df76afdb6a77b4cbc2edf0bf3ef780..f199cc892f5e84f0a12abe3b8e5cace9849e7fa8 100644
--- a/doc/fluid/design/motivation/refactorization.md
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
 
 At runtime, the C++ program realizes the graph and runs it.
 
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects) </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block </td>
+<td>BlockDesc </td>
+<td>Block </td>
+
+</tbody>
+</table>
+
 
 The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
 
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index af0c6ef36feba9e0239e7a5f81a8dc9108b2471a..7f5dcf55f9f2a0fd27ffde100510dd8fee305381 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -1,4 +1,4 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc 
+# DeepSpeech2 on PaddlePaddle: Design Doc
 
 We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
 
@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
 
 Tasks parallelizable within phases:
 
-Roadmap     | Description                               | Parallelizable Tasks 
------------ | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th> Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I </td>
+<td>Simplified model & components </td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II </td>
+<td> Standard model & benchmarking & profiling</td>
+<td>Task 9 ~ Task 12 </td>
+</tr>
+<tr>
+<td>Phase III </td>
+<td> Documentations</td>
+<td> Task13 ~ Task14 </td>
+</tr>
+</tbody>
+</table>
+
 
 Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
 
@@ -102,37 +124,82 @@ We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar
 
 Key ingredients about the layers:
 
-- **Data Layers**: 
+- **Data Layers**:
    - Frame sequences data of audio **spectrogram** (with FFT).
-   - Token sequences data of **transcription** text (labels). 
+   - Token sequences data of **transcription** text (labels).
    - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
-- **2D Convolution Layers**: 
+- **2D Convolution Layers**:
    - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
    - With striding for only the first convlution layer.
    - No pooling for all convolution layers.
-- **Uni-directional RNNs** 
+- **Uni-directional RNNs**
 	- Uni-directional + row convolution: for low-latency inference.
 	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
 - **Row convolution**:
 	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-	- Not nessesary if with bi-direcitional RNNs. 
+	- Not nessesary if with bi-direcitional RNNs.
 	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
 - **Batch Normalization Layers**:
    - Added to all above layers (except for data and loss layer).
    - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
- 
-
-Required Components                     | PaddlePaddle Support                      | Need to Develop
-:-------------------------------------  | :--------------------------------------   | :-----------------------
-Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
-Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
-DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
-Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
-Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
-CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
-Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
-CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
+   
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th> PaddlePaddle Support</th>
+<th> Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram) </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)  </td>
+<td> paddle.data_type.integer_value_sequence</td>
+<td> - </td>
+</tr>
+<tr>
+<td>2D Convolution Layer </td>
+<td> paddle.layer.image_conv_layer</td>
+<td> - </td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td> paddle.layer.block_expand</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs </td>
+<td>paddle.layer.recurrent_group</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Row Convolution Layer </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer </td>
+<td>paddle.layer.warp_ctc</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Batch Normalization Layer </td>
+<td>paddle.layer.batch_norm</td>
+<td> - </td>
+</tr>
+<tr>
+<td>CTC-Beam search </td>
+<td>Not supported yet.</td>
+<td> TBD (Task 6) </td>
+</tr>
+</tbody>
+</table>
+
 
 ### Row Convolution
 
@@ -145,14 +212,14 @@ TODO by Assignees
 Figure 2. Algorithm for CTC Beam Search Decoder.
 </div>
 
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
-   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
    - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
 - An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
 - Such external scorer consists of language model, word count or any other custom scorers.
 - The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
- 
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
 
 ## Future Work
 
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index e70bf5dff3849f2ff82315f7eba4a92c93539843..f627437f354a12c79cad25c959409db29ecbd874 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,9 +4,9 @@
 .. toctree::
   :maxdepth: 1
 
-  new_op_en.md
-  new_op_kernel_en.md
-  use_eigen_en.md
+  new_op_cn.md
+  new_op_kernel.md
+  use_eigen_cn.md
   name_convention.md
   support_new_device.md
   releasing_process.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index f0e9afcfcc9edfb9a91f58375cd415ea414f8f82..0b65fed67ad45eb399b624184485a99a082d79e9 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -5,7 +5,7 @@ Development
   :maxdepth: 1
 
   new_op_en.md
-  new_op_kernel_en.md
+  new_op_kernel.md
   use_eigen_en.md
   name_convention.md
   support_new_device.md
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..0c3f88d9c31e05bec399c64bf6ade56e62e01f68 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,13 +26,32 @@
 
 依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
 
-
- 内容            | 定义位置
---------------  | :----------------------
-OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
-Op定义           | `.cc`文件
-Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake定义 </td>
+<td>`.cc`文件，Backward Op不需要定义OpProtoMake </td>
+</tr>
+<tr>
+<td>Op定义 </td>
+<td> `.cc`文件</td>
+</tr>
+<tr>
+<td>Kernel实现 </td>
+<td> CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。</td>
+</tr>
+<tr>
+<td>注册Op </td>
+<td> Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中</td>
+</tr>
+</tbody>
+</table>
 
 
 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..a566a09131f86251b70d5435d0a483aa2a705b35 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -33,6 +33,33 @@ Op definition           | `.cc` files
 Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
 Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
 
+<table>
+<thead>
+<tr>
+<th>Information</th>
+<th> Where is it defined</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake definition </td>
+<td> `.cc`files, Backward Op does not need an OpProtoMake interface. </td>
+</tr>
+<tr>
+<td>Op definition  </td>
+<td> `.cc` files</td>
+</tr>
+<tr>
+<td>Kernel implementation  </td>
+<td> The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.</td>
+</tr>
+<tr>
+<td>Registering the Op  </td>
+<td> Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.</td>
+</tr>
+</tbody>
+</table>
+
 
 New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
 
@@ -279,7 +306,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 
       def test_check_output(self):
           self.check_output()
-          
+
       def test_check_grad_normal(self):
           self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel.md
similarity index 100%
rename from doc/fluid/dev/new_op_kernel_en.md
rename to doc/fluid/dev/new_op_kernel.md
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index b9787261092f1f27377886152cb1596d9ff54188..0810765b85f73d9dba876e66fb43bb1ad476d6d2 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -66,7 +66,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
 	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
 	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
-		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。
 
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
 
@@ -78,13 +78,116 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
 
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
-| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+<table>
+<thead>
+<tr>
+<th></th>
+<th>新手入门章节 </th>
+<th> 识别数字</th>
+<th> 图像分类</th>
+<th>词向量</th>
+<th> 情感分析</th>
+<th>语意角色标注</th>
+<th> 机器翻译</th>
+<th>个性化推荐</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
index e29129fddf775939c9f7a8b49d850d523e6e5a45..1f12ba0497369eacc6a2db7984781b5672f45ea1 100644
--- a/doc/fluid/getstarted/concepts/save_model/model_format.md
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -4,30 +4,70 @@
 
 A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
 
-As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. 
+As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
 
 ## Implementation
 
-The topology is saved as a plain text in a detailed self-contain protobuf file. 
+The topology is saved as a plain text in a detailed self-contain protobuf file.
 
 The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
 
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
 
 The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
 
-|field name  | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
-| ... | ... | ... |
-
+<table>
+<thead>
+<tr>
+<th>field name</th>
+<th>type </th>
+<th>description </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> version</td>
+<td> uint32_t </td>
+<td> Version of saved file. Always 0 now.</td>
+</tr>
 
+<tr>
+<td> tensor desc length  </td>
+<td> uint32_t </td>
+<td> TensorDesc(Protobuf message) length in bytes. </td>
+</tr>
+<tr>
+<td>tensor desc </td>
+<td> void*</td>
+<td> TensorDesc protobuf binary message </td>
+</tr>
+<tr>
+<td> tensor data </td>
+<td> void* </td>
+<td> Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` </td>
+</tr>
+<tr>
+<td> lod_level</td>
+<td> uint64_t </td>
+<td> Level of LoD </td>
+</tr>
+<tr>
+<td> length of lod[0] </td>
+<td> uint64_t </td>
+<td> [Optional] length of lod[0] in bytes. </td>
+</tr>
+<tr>
+<td> data of lod[0] </td>
+<td> uint64_t*   </td>
+<td> [Optional] lod[0].data() </td>
+</tr>
+<tr>
+<td>... </td>
+<td> ... </td>
+<td> ... </td>
+</tr>
+</tbody>
+</table>
 
 ## Summary
 
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
index 1b6f767869aaa800c122c8e7a06a1413e48e10e0..b99b90056b0a2e51f2668a6d27d94857bdc09c37 100644
--- a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -65,10 +65,10 @@ exit(1)
 
 **因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
 
-### 分布式训练 
+### 分布式训练
 Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
 ```python
-optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) 
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
 ```
 将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下：
 ```python
@@ -99,15 +99,51 @@ for pass_id in range(100):
 ### 分布式训练脚本运行说明
 分布式任务的运行需要将表格中说明的多个参数进行赋值:
 
-| 参数名 | 值类型 | 说明 | 示例 |
-|:-------------|:------|:---------------------------------------|:-------------|
-| trainer_id | int | 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 | 0/1/2/3 |
-| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
-| trainers | int | 训练节点的总个数，>0的数字 | 4 |
-| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
-| training_role | str | 节点角色， TRAINER/PSERVER | PSERVER |
-
-**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下： 
+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th> 值类型</th>
+<th>说明</th>
+<th> 示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id </td>
+<td> int</td>
+<td> 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 </td>
+<td> 0/1/2/3  </td>
+</tr>
+<tr>
+<td>pservers </td>
+<td> str</td>
+<td> parameter server 列表 </td>
+<td> 127.0.0.1:6710,127.0.0.1:6711 </td>
+</tr>
+<tr>
+<td>trainers </td>
+<td>int </td>
+<td> 训练节点的总个数，>0的数字 </td>
+<td> 4 </td>
+</tr>
+<tr>
+<td> server_endpoint</td>
+<td> str </td>
+<td> 当前所起的服务节点的IP:PORT </td>
+<td> 127.0.0.1:8789 </td>
+</tr>
+<tr>
+<td> training_role</td>
+<td>str </td>
+<td> 节点角色， TRAINER/PSERVER </td>
+<td> PSERVER </td>
+</tr>
+</tbody>
+</table>
+
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下：
 
 ```python
 t = fluid.DistributeTranspiler()
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index 17f895573a65731db34b2addddaa22e7f32157ec..8266dec3c6125a09b90ac0ccd4aa5464f5c7db31 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 
 每一列的含义是:
 
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> 函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> 函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> 文件名, 行号，函数名 </td>
+</tr>
+</tbody>
+</table>
 
 
 ### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index abe4493c175fb4ee57f1acf45931e2890620d9c1..e95556dd608b7ff0a3eb18873df0015a2da94e7c 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -57,14 +57,40 @@ port, we will see the output like the following:
 where each line corresponds to Python function, and the meaning of
 each column is as follows:
 
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime divided by ncalls</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> where the function is define </td>
+</tr>
+</tbody>
+</table>
 
 ### Identify Performance Bottlenecks
 

Trainer Counter	1	10	20	30	40	50	60	70	80	90	100
PaddlePaddle Fluid	-	-	-	-	-	-	-	-	-	-	-
PaddlePaddle v2	-	-	-	-	-	-	-	-	-	-	-
TensorFlow	-	-	-	-	-	-	-	-	-	-	-
Batch Size	32	64	128	256
PaddlePaddle Fluid	15.44	16.32	16.74	16.79
PaddlePaddle v2	15.97	17.04	17.60	17.83
TensorFlow	9.09	9.10	9.24	8.66
Batch Size	32	64	128	256
PaddlePaddle Fluid	190.20	222.15	247.40	258.18
PaddlePaddle v2	170.96	233.71	256.14	329.23
TensorFlow	-	-	-	-
Trainer Count	20	40	80	100
PaddlePaddle Fluid	263.29 (78.64%)	518.80 (77.47%)	836.26 (62.44%)	1019.29 (60.89%)
PaddlePaddle v2 (need more tests)	326.85 (92.85%)	534.58 (75.93%)	853.30 (60.60%)	1041.99 (59.20%)
TensorFlow	-	-	-	-
PServer Count	3	6	10	20
PaddlePaddle Fluid(should fix in next PR)	589.1	592.6	656.4	655.8
PaddlePaddle v2 (need more tests)	593.4	791.3	729.7	821.7
TensorFlow	-	-	-	-
C++	CUDA C++	Go
cc_library	nv_library	go_library
cc_binary	nv_binary	go_binary
cc_test	nv_test	go_test
programming languages	PaddlePaddle
for, while loop	RNN, WhileOp
if, if-else, switch	IfElseOp, SwitchOp
sequential execution	a sequence of layers
programming languages	PaddlePaddle
stack	scope hierarchy
stack frame	scope
push at entering block	push at entering block
pop at leaving block	destroy when minibatch completes
C++ functions/functors	mul	add
C++ operator class	mulOp	addOp	FCOp
Python binding	operator.mul	operator.add	operator.fc
Python function				layer.fc
	TensorFlow	PaddlePaddle
RNN	Support	Support
recursive RNN	Support	Support
padding zeros	Must	No need
blob data type	Tensor	LoDTensor