diff --git a/.gitignore b/.gitignore
index 2b30f7938c8a1672acd0a14b7051af12c37889fb..275173b9677bffe028152fe8eadb3384329aeb5a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,6 @@ third_party/
 *~
 bazel-*
 third_party/
+
+# clion workspace.
+cmake-build-*
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
index 8116f235d535917c03deb646ff4ec083a0cdadc7..62eea42692b4191e53d0bbb0805786fd15ac7944 100644
--- a/cmake/external/any.cmake
+++ b/cmake/external/any.cmake
@@ -18,3 +18,4 @@ ExternalProject_Add(
 )
 
 add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
+LIST(APPEND external_project_dependencies linb_any)
\ No newline at end of file
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index b35e6839cdc2ee062a9066585f0c83948d87e385..7340394b1e1fad9e1893ac87d62febb8dd72751c 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,6 +14,34 @@
 
 INCLUDE(ExternalProject)
 
+macro(PROMPT_PROTOBUF_LIB)
+    MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
+    MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
+    MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
+    INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+    RETURN()
+endmacro()
+macro(SET_PROTOBUF_VERSION)
+    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
+    STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+endmacro()
+
+set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
+if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
+        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
+        SET_PROTOBUF_VERSION()
+        PROMPT_PROTOBUF_LIB()
+    else()
+        message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}.")
+    endif()
+endif()
+
 FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
     SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})
@@ -78,8 +106,7 @@ IF(NOT CMAKE_CROSSCOMPILING)
     FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
 
     IF(PROTOBUF_FOUND)
-        EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
+        SET_PROTOBUF_VERSION()
         IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
             SET(PROTOBUF_FOUND OFF)
         ENDIF()
@@ -107,6 +134,4 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
 ENDIF(NOT PROTOBUF_FOUND)
 
-MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
-MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
-INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+PROMPT_PROTOBUF_LIB()
\ No newline at end of file
diff --git a/doc/design/cluster_train/master_server.md b/doc/design/cluster_train/master_server.md
index bb8307652587b4dc56cd668a3a5e64722734d194..4bf3c506f101361875043f8bfd97972b8c981a22 100644
--- a/doc/design/cluster_train/master_server.md
+++ b/doc/design/cluster_train/master_server.md
@@ -10,7 +10,7 @@ A dataset is a list of files in *RecordIO* format. A RecordIO file consists of c
 
 ## Task Queue
 
-As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *blocks* from one or multiple files. The master server maintains *task queues* to track the training progress.
+As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress.
 
 ### Task Queue Creation
 
@@ -21,23 +21,23 @@ As mentioned in [distributed training design doc](./README.md), a *task* is a da
 	func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error {
 	}
 	```
-1. The master server will scan through each RecordIO file to generate the *block index* and know how many blocks does each file have. A block can be referenced by the file path and the index of the block within the file. The block index is in memory data structure that enables fast access to each block, and the index of the block with the file is an integer start from 0, representing the n-th block within the file.
+1. The master server will scan through each RecordIO file to generate the *chunk index* and know how many chunks does each file have. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is in memory data structure that enables fast access to each chunk, and the index of the chunk with the file is an integer start from 0, representing the n-th chunk within the file.
 
-	The definition of the block is:
+	The definition of the chunk is:
 	```go
-	type Block struct {
-		Idx   int // index of the block within the file
+	type Chunk struct {
+		Idx   int // index of the chunk within the file
 		Path  string
-		Index recordio.Index // block index
+		Index recordio.Index // chunk index
 	}
 	```
-1. Blocks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
+1. Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
 
 	The definition of the task is:
 	```go
 	type Task struct {
 		Index  int
-		Blocks []Block
+		Chunks []Chunk
 	}
 	```
 
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
index 392bab25e9de6bf5aa7cc1b0ad345ef12f1d9e5d..007285640e9f11c55715291774826620419cec66 100644
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -55,7 +55,7 @@ The trainer select process is encapsulated in the C API function:
 ```c
 int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto);
 ```
-The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will block until initialization is done, and return 0. As illustrated below:
+The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed. As illustrated below:
 
  @@ -89,16 +89,13 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
  *
  * paddle_begin_init_params will be called from multiple trainers,
  * only one trainer will be selected to initialize the parameters on
- * parameter servers. Other trainers will be blocked until the
- * initialization is done, and they need to get the initialized
+ * parameter servers. Other trainers need to get the initialized
  * parameters from parameter servers using @paddle_get_params.
  *
- * @param pserver_config_proto serialized parameter server configuration in
- * Protocol Buffers format.
  * @return 1 if the trainer is selected to initialize parameter
  * servers, otherwise 0.
  */
-int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_config_proto);
+int paddle_begin_init_params(paddle_pserver_client* client);
 
 /**
  * @brief paddle_init_param initializes the parameter on parameter
@@ -106,12 +103,13 @@ int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_
  *
  * @param param the parameter to initialize.
  * @param param_config_proto the configuration for the parameter.
+ * @param config_len the length of param_config_proto
  * @return 0 if successful, otherwise -1. On failure, the trainer
  * needs to restart the entire initialization process (starting from
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter params, const char* param_config_proto);
+int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 
 /**
  * @brief paddle_finish_init_params tells parameter servers client has
diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle
index 730d3a561ffdc19e723b3cf6612471440951826a..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 100644
Binary files a/doc/design/cluster_train/src/pserver_init.graffle and b/doc/design/cluster_train/src/pserver_init.graffle differ
diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png
index 4d502226d82ba271c50ae1bec5efaaaac4cc4434..dfe491ff98dd7db1c336093c80964a260df2cd90 100644
Binary files a/doc/design/cluster_train/src/pserver_init.png and b/doc/design/cluster_train/src/pserver_init.png differ
diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6f99bc7d9d6fafacb0a4bcff806b65d9aef98cc
--- /dev/null
+++ b/doc/design/parameters_in_cpp.md
@@ -0,0 +1,41 @@
+# Design Doc: The C++ Class `Parameters`
+
+`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
+
+We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
+* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
+
+It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
+
+1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
+It is evident that we should use `paddle::Parameter` when developing `Parameters`.
+However, the `Parameter` class contains many functions and does not have a clear interface.
+It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
+When we developing `Parameters`, we only use `create/store Parameter` functionality.
+We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+
+2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
+We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
+Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs.
+`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device.
+
+3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. 
+So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
+
+
+The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+
+1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implementation a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` uses `Parameters` as a class member.
+
+3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need share `Parameters` between topologies, it is `Parameters`'s response to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` only used to train one topology and we need to support train many topologies in Paddle, i.e., there could be many GradientMachines use one `Parameters`.
+   * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs.
+   * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. In the end of this code refactoring, we could change `ParameterUpdater` directly uses `Parameters` to make `ParameterUpdater`'s implementation clear.
diff --git a/doc/design/speech/README.MD b/doc/design/speech/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..7304650e628dba210488cd2dc4836318b5383b2a
--- /dev/null
+++ b/doc/design/speech/README.MD
@@ -0,0 +1,155 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc 
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create infererence interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+Roadmap     | Description                               | Parallelizable Tasks 
+----------- | :------------------------------------     | :--------------------
+Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
+Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
+Phase III   | Documentations                            | *Task13* ~ *Task14*
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layers
+- **One** fully-connected layers
+- **One** CTC-loss layer
+
+
 
@@ -89,16 +89,13 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
  *
  * paddle_begin_init_params will be called from multiple trainers,
  * only one trainer will be selected to initialize the parameters on
- * parameter servers. Other trainers will be blocked until the
- * initialization is done, and they need to get the initialized
+ * parameter servers. Other trainers need to get the initialized
  * parameters from parameter servers using @paddle_get_params.
  *
- * @param pserver_config_proto serialized parameter server configuration in
- * Protocol Buffers format.
  * @return 1 if the trainer is selected to initialize parameter
  * servers, otherwise 0.
  */
-int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_config_proto);
+int paddle_begin_init_params(paddle_pserver_client* client);
 
 /**
  * @brief paddle_init_param initializes the parameter on parameter
@@ -106,12 +103,13 @@ int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_
  *
  * @param param the parameter to initialize.
  * @param param_config_proto the configuration for the parameter.
+ * @param config_len the length of param_config_proto
  * @return 0 if successful, otherwise -1. On failure, the trainer
  * needs to restart the entire initialization process (starting from
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter params, const char* param_config_proto);
+int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 
 /**
  * @brief paddle_finish_init_params tells parameter servers client has
diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle
index 730d3a561ffdc19e723b3cf6612471440951826a..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676 100644
Binary files a/doc/design/cluster_train/src/pserver_init.graffle and b/doc/design/cluster_train/src/pserver_init.graffle differ
diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png
index 4d502226d82ba271c50ae1bec5efaaaac4cc4434..dfe491ff98dd7db1c336093c80964a260df2cd90 100644
Binary files a/doc/design/cluster_train/src/pserver_init.png and b/doc/design/cluster_train/src/pserver_init.png differ
diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6f99bc7d9d6fafacb0a4bcff806b65d9aef98cc
--- /dev/null
+++ b/doc/design/parameters_in_cpp.md
@@ -0,0 +1,41 @@
+# Design Doc: The C++ Class `Parameters`
+
+`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
+
+We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
+* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
+
+It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
+
+1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
+It is evident that we should use `paddle::Parameter` when developing `Parameters`.
+However, the `Parameter` class contains many functions and does not have a clear interface.
+It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
+When we developing `Parameters`, we only use `create/store Parameter` functionality.
+We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+
+2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
+We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
+Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs.
+`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device.
+
+3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. 
+So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
+
+
+The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+
+1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
+
+2. Implementation a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` uses `Parameters` as a class member.
+
+3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies.
+Because we need share `Parameters` between topologies, it is `Parameters`'s response to exchange Parameters between GPUs.
+`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` only used to train one topology and we need to support train many topologies in Paddle, i.e., there could be many GradientMachines use one `Parameters`.
+   * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs.
+   * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler.
+
+4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies.
+
+5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. In the end of this code refactoring, we could change `ParameterUpdater` directly uses `Parameters` to make `ParameterUpdater`'s implementation clear.
diff --git a/doc/design/speech/README.MD b/doc/design/speech/README.MD
new file mode 100644
index 0000000000000000000000000000000000000000..7304650e628dba210488cd2dc4836318b5383b2a
--- /dev/null
+++ b/doc/design/speech/README.MD
@@ -0,0 +1,155 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc 
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create infererence interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+Roadmap     | Description                               | Parallelizable Tasks 
+----------- | :------------------------------------     | :--------------------
+Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
+Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
+Phase III   | Documentations                            | *Task13* ~ *Task14*
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layers
+- **One** fully-connected layers
+- **One** CTC-loss layer
+
+
+

+Figure 1. Archetecture of Deep Speech 2 Network.
+