diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb5b513441c4c8e339e1862de9092517..58ce5d61c950d12630cfe1de354ffc2a2ba1fd59 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e0bc1d5b8e799ef86cb92a0dda348b0be4e299a
--- /dev/null
+++ b/doc/api/CMakeLists.txt
@@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
index f9991541bc51c6e13ffce4e9cec60f73dc800121..773b7b6a767541f28c27f247c1ad8c9a8a2d0ccf 100644
--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
@@ -1,23 +1,23 @@
-## Auto Gradient Checker Design
+## Auto Gradient Check Design
 
-## Backgraound：
-- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
-  1. you should get the right backpropagation formula according to the forward computation.
-  2. you should implement it right in CPP.
-  3. it's difficult to prepare test data.
+## Background：
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right because of the following challenges:
+  1. The formula for backpropagation formula should be correct according to the forward computation.
+  2. The Implementation of the above shoule be correct in CPP.
+  3. It is difficult to prepare an unbiased test data.
 
-- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
-  1. numerical gradient checker only need forward operator.
-  2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient using forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. Numerical gradient checker only needs the forward operator.
+  2. The user only needs to prepare the input data for forward Operator and not worry about the backward Operator.
 
 ## Mathematical Theory
-The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
+The following documents from Stanford have a detailed explanation of how to compute the numerical gradient and why it is useful.
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
 
 
-## Numeric Gradient Implementation
+## Numerical Gradient Implementation
 ### Python Interface
 ```python
 def get_numerical_gradient(op,
@@ -27,73 +27,76 @@ def get_numerical_gradient(op,
                          delta=0.005,
                          local_scope=None):
     """
-    Get Numeric Gradient for an operator's input.
+    Get Numerical Gradient for the input of an operator.
 
-    :param op: C++ operator instance, could be an network
+    :param op: C++ operator instance, could be an network.
     :param input_values: The input variables. Should be an dictionary, whose key is
-    variable name, and value is numpy array.
+    variable name, and value is a numpy array.
     :param output_name: The final output variable name.
-    :param input_to_check: The input variable with respect to which to compute the gradient.
-    :param delta: The perturbation value for numeric gradient method. The
-    smaller delta is, the more accurate result will get. But if that delta is
-     too small, it will suffer from numerical stability problem.
+    :param input_to_check: The input variable with respect to which the gradient has to be computed.
+    :param delta: The perturbation value for numerical gradient method. The
+    smaller the delta, the more accurate the result. But if the delta is too
+    small, it will suffer from the numerical stability problem.
     :param local_scope: The local scope used for get_numeric_gradient.
     :return: The gradient array in numpy format.
     """
 ```
 
-### Explaination:
+### Explanation:
 
-- Why need `output_name`
-  - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
+- Why do we need an `output_name`
+  - An Operator may have multiple Outputs, one can compute an independent gradient from each Output. So the caller should specify the name of the output variable.
 
-- Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+- Why do we need `input_to_check`
+  - One operator can have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numerical Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times each with a different input.
 
 
 ### Core Algorithm Implementation
 
 
 ```python
-    # we only compute gradient of one element a time.
+    # we only compute the gradient of one element a time.
     # we use a for loop to compute the gradient of each element.
     for i in xrange(tensor_size):
-        # get one input element by its index i.
-        origin = tensor_to_check.get_float_element(i)
+        # get one input element using the index i.
+        original = tensor_to_check.get_float_element(i)
 
-        # add delta to it, run op and then get the new value of the result tensor.
-        x_pos = origin + delta
+        # add delta to it, run the forward op and then
+        # get the new value of the result tensor.
+        x_pos = original + delta
         tensor_to_check.set_float_element(i, x_pos)
         y_pos = get_output()
 
-        # plus delta to this element, run op and get the new value of the result tensor.
-        x_neg = origin - delta
+        # Subtract delta from this element, run the op again
+        # and get the new value of the result tensor.
+        x_neg = original - delta
         tensor_to_check.set_float_element(i, x_neg)
         y_neg = get_output()
 
         # restore old value
-        tensor_to_check.set_float_element(i, origin)
+        tensor_to_check.set_float_element(i, original)
 
-        # compute the gradient of this element and store it into a numpy array.
+        # compute the gradient of this element and store
+        # it into a numpy array.
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
     # reshape the gradient result to the shape of the source tensor.
     return gradient_flat.reshape(tensor_to_check.get_dims())
 ```
 
-## Auto Graident Checker Framework
+## Auto Gradient Check Framework
 
 Each Operator Kernel has three kinds of Gradient:
 
 1. Numerical gradient
 2. CPU kernel gradient
-3. GPU kernel gradient (if supported)
+3. GPU kernel gradient (if supported by the device)
 
-The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
+The numerical gradient only relies on the forward Operator, so we use the numerical gradient as the reference value. The gradient checking is performed in the following three steps:
 
-1. calculate the numerical gradient
-2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
-3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
+1. Calculate the numerical gradient
+2. Calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient. (if supported)
 
 #### Python Interface
 
@@ -109,26 +112,27 @@ The numerical gradient only relies on forward Operator. So we use the numerical
         """
         :param forward_op: used to create backward_op
         :param input_vars: numpy value of input variable. The following
-            computation will use these variables.
-        :param inputs_to_check: the input variable with respect to which to compute the gradient.
+          computation will use these variables.
+        :param inputs_to_check: the input variable with respect to which the
+          gradient will be computed.
         :param output_name: The final output variable name.
         :param max_relative_error: The relative tolerance parameter.
-        :param no_grad_set: used when create backward ops
+        :param no_grad_set: used to create backward ops
         :param only_cpu: only compute and check gradient on cpu kernel.
         :return:
         """
 ```
 
-### How to check if two numpy array is close enough?
-if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
+### How to check if two numpy arrays are close enough?
+if `abs_numerical_grad` is nearly zero, then use absolute error for numerical_grad.
 
 ```python
 numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
 
 abs_numerical_grad = numpy.abs(numerical_grad)
-# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
-# error.
+# if abs_numerical_grad is nearly zero, then use abs error for
+# numeric_grad, instead of relative error.
 abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
 
 diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
@@ -137,10 +141,10 @@ max_diff = numpy.max(diff_mat)
 
 
 #### Notes：
-The Input data for auto gradient checker should be reasonable to avoid numerical  stability problem.
+The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
 
 
-#### Refs:
+#### References:
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/csp.md b/doc/design/csp.md
index ba9cacfdea7dcf7c6499b562dfc58400d082f2c8..10d936860fab7e09241e968a63526c7d86d3e568 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i
 
 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
 
-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
 
 ### Type Channel
 
@@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:
 
 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```
 
 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
 
 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```
 
 or Tensors of Tensors of float16 etc.
@@ -87,8 +87,136 @@ The point here is that we need a consistent way to compose types, like in C++ we
 
 ### Send and Recv
 
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok <- ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
 ### Select
 
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs
 
 ### 1. RPC between Trainers and Parameter Servers
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 85e693434af863bfc3bde29989dbbfc69678d3b7..f52a51519fceffd96a4e8db240b23d15ed399fff 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
   std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
   PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
@@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
 
   auto var = root_block->Var(fill_one_op_out);
   var->SetDataType(target.GetDataType());
-  var->SetShape(target.Shape());
+  var->SetShape(target.GetShape());
   auto& target_grad = retv[target.Name()];
   target_grad.name_ = fill_one_op_out;
   target_grad.block_idx_ = root_block_idx;
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 444d68498c9676fe0e246167dfacbe999a41d1a7..df9e15e22b890347a03d6816e8549c99b010bb38 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -60,6 +60,47 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
   delete ch;
 }
 
+TEST(Channel, SendOnClosedChannelPanics) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t i = 5;
+  EXPECT_EQ(ch->Send(&i), true);  // should not block or panic
+  CloseChannel(ch);
+  EXPECT_EQ(ch->Send(&i), false);  // should panic
+  delete ch;
+}
+
+TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size / 2; ++i) {
+    EXPECT_EQ(ch->Receive(&out), true);  // receiving should not block
+    EXPECT_EQ(out, i);
+  }
+
+  CloseChannel(ch);
+
+  for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              true);  // receving should return residual values.
+    EXPECT_EQ(out, i);
+  }
+
+  for (size_t i = 0; i < buffer_size; ++i) {
+    EXPECT_EQ(ch->Receive(&out),
+              false);  // after receiving residual values, return zeros.
+    // Note: we cannot check EXPECT_EQ(out, 0), because C++ doesn't
+    // define zero values like Go does.
+  }
+  delete ch;
+}
+
 TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 7ac234b8d42bae0661c3256c78311455c0fbc77c..00b63da4da7844b41168c03f55e2faa84ff44154 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <atomic>
 #include <condition_variable>
 #include <deque>
 #include <mutex>
@@ -42,7 +43,7 @@ class Buffered : public paddle::framework::Channel<T> {
   std::condition_variable empty_cond_var_;
   std::condition_variable full_cond_var_;
   std::deque<T> channel_;
-  bool closed_;
+  std::atomic<bool> closed_{false};
 
   Buffered(size_t cap) : cap_(cap), closed_(false) {
     PADDLE_ENFORCE_GT(cap, 0);
@@ -53,10 +54,13 @@ class Buffered : public paddle::framework::Channel<T> {
 
 template <typename T>
 bool Buffered<T>::Send(T* item) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
   std::unique_lock<std::mutex> lock(mu_);
   full_cond_var_.wait(lock,
                       [this]() { return channel_.size() < cap_ || closed_; });
-  bool ret = false;
   if (!closed_) {
     channel_.push_back(std::move(*item));
     lock.unlock();
@@ -71,7 +75,7 @@ bool Buffered<T>::Receive(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
   empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
   bool ret = false;
-  if (!closed_) {
+  if (!channel_.empty()) {
     *item = std::move(channel_.front());
     channel_.pop_front();
     full_cond_var_.notify_one();
@@ -82,6 +86,9 @@ bool Buffered<T>::Receive(T* item) {
 
 template <typename T>
 void Buffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
   std::unique_lock<std::mutex> lock(mu_);
   closed_ = true;
   NotifyAllParticipants(&lock);
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index f86a894bb4a42e45edf6964e30620b68183faaa8..815cebad2d8c08aa31bb566bc6c51250870383d8 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -58,6 +58,10 @@ class UnBuffered : public paddle::framework::Channel<T> {
 // be sent from a writer to a reader.
 template <typename T>
 bool UnBuffered<T>::Send(T* data) {
+  bool ret = false;
+  if (closed_) {
+    return ret;
+  }
   // Prevent other writers from entering
   std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
   writer_found_ = true;
@@ -66,7 +70,6 @@ bool UnBuffered<T>::Send(T* data) {
   cv_writer_.wait(cv_lock,
                   [this]() { return reader_found_ == true || closed_; });
   cv_reader_.notify_one();
-  bool ret = false;
   if (!closed_) {
     std::unique_lock<std::mutex> channel_lock(mu_ch_);
     item = data;
@@ -114,6 +117,9 @@ bool UnBuffered<T>::Receive(T* data) {
 // that take place once the channel is closed.
 template <typename T>
 void UnBuffered<T>::Close() {
+  if (closed_) {
+    return;
+  }
   std::unique_lock<std::mutex> lock(mu_ch_);
   item = nullptr;
   closed_ = true;
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 5b6ef03f610926578d2c02dcf06f399f106a30a1..f65ccae6e6a4df4490d49128e871efa55800b505 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message Reader { repeated LoDTensorDesc lod_tensor = 1; }
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -126,13 +128,15 @@ message VarDesc {
     LOD_RANK_TABLE = 6;
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
+    READER = 9;
   }
   required string name = 1;
   required VarType type = 2;
-  optional LoDTensorDesc lod_tensor = 3;
-  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 3 [ default = false ];
+  optional LoDTensorDesc lod_tensor = 4;
+  optional TensorDesc selected_rows = 5;
   optional LoDTensorArrayDesc tensor_array = 6;
-  optional bool persistable = 5 [ default = false ];
+  optional Reader reader = 7;
 }
 
 message BlockDesc {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index f554c77845087453f8c6e4d04522a8555e583ae6..ad361852ec9f2bb35b24209179f96b15300ca8d0 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -458,11 +458,11 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
   try {
-    auto shape = var->Shape();
+    auto shape = var->GetShape();
     if (shape.empty()) {
       return framework::make_ddim({0UL});
     } else {
-      return framework::make_ddim(var->Shape());
+      return framework::make_ddim(var->GetShape());
     }
   } catch (...) {
     VLOG(5) << "GetDim of variable " << name << " error";
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 59947c9f2189348226b7ff6c2b9315196bbf55fa..9945aee31b647a6243971c7e64c8391c0b1c09c5 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_NE(copy, var_before);
     ASSERT_EQ(copy->Name(), var_before->Name());
     ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
     ASSERT_EQ(copy->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
@@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
     ASSERT_NE(restored, var_before);
     ASSERT_EQ(restored->Name(), var_before->Name());
     ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
     ASSERT_EQ(restored->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 4e9b58679d9e7c84adf76b6245b397c7a8872483..77d31a1176d5947655b57c70846294093a7bb5ef 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -21,7 +21,8 @@ limitations under the License. */
 #include <queue>
 #include <thread>
 #include <vector>
-
+#include "glog/logging.h"
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
@@ -31,7 +32,7 @@ namespace framework {
 // number of threads.
 class ThreadPool {
  public:
-  typedef std::packaged_task<void()> Task;
+  using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
 
   // Returns the singleton of ThreadPool.
   static ThreadPool* GetInstance();
@@ -52,9 +53,28 @@ class ThreadPool {
   // std::future::wait().
   template <typename Callback>
   std::future<void> Run(Callback fn) {
+    auto f = this->RunAndGetException(fn);
+    return std::async(std::launch::deferred, ExceptionHandler(std::move(f)));
+  }
+
+  template <typename Callback>
+  std::future<std::unique_ptr<platform::EnforceNotMet>> RunAndGetException(
+      Callback fn) {
     std::unique_lock<std::mutex> lock(mutex_);
-    Task task(std::bind(fn));
-    std::future<void> f = task.get_future();
+    Task task([fn]() -> std::unique_ptr<platform::EnforceNotMet> {
+      try {
+        fn();
+        return nullptr;
+      } catch (platform::EnforceNotMet ex) {
+        return std::unique_ptr<platform::EnforceNotMet>(
+            new platform::EnforceNotMet(ex));
+      } catch (...) {
+        LOG(FATAL)
+            << "Unexpected exception is catched in thread pool. All "
+               "throwable exception in Fluid should be an EnforceNotMet.";
+      }
+    });
+    std::future<std::unique_ptr<platform::EnforceNotMet>> f = task.get_future();
     tasks_.push(std::move(task));
     lock.unlock();
     scheduled_.notify_one();
@@ -65,6 +85,22 @@ class ThreadPool {
   void Wait();
 
  private:
+  struct ExceptionHandler {
+    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+    explicit ExceptionHandler(
+        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+        : future_(std::move(f)) {}
+    void operator()() const {
+      auto ex = this->future_.get();
+      if (ex != nullptr) {
+        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                      "should use RunAndGetException to handle the exception.\n"
+                      "The default exception handler is LOG(FATAL)."
+                   << ex->what();
+      }
+    }
+  };
+
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
   explicit ThreadPool(int num_threads);
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 62ab6593ef23c195e3caa2336574796ecaf35bc8..6d83e2e41126db0b3ac6fc1c86c9ed7fc4dfb39b 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -26,18 +26,91 @@ void VarDesc::SetShape(const std::vector<int64_t> &dims) {
   VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }
 
+void VarDesc::SetTensorDescNum(size_t num) {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+      lod_tensors_ptr->Clear();
+      for (size_t i = 0; i < num; ++i) {
+        lod_tensors_ptr->Add();
+      }
+      return;
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      return desc_.reader().lod_tensor_size();
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetShapes(
+    const std::vector<std::vector<int64_t>> &multiple_dims) {
+  PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
+                    "The number of given shapes(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_dims.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_dims.size(); ++i) {
+    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+  }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<std::vector<int64_t>> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(RepeatedToVector(tensor_desc.dims()));
+  }
+  return res;
+}
+
 void VarDesc::SetDataType(proto::DataType data_type) {
   mutable_tensor_desc()->set_data_type(data_type);
 }
 
-std::vector<int64_t> VarDesc::Shape() const {
-  return RepeatedToVector(tensor_desc().dims());
+void VarDesc::SetDataTypes(
+    const std::vector<proto::DataType> &multiple_data_type) {
+  PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(),
+                    "The number of given data types(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_data_type.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+    tensor_descs[i]->set_data_type(multiple_data_type[i]);
+  }
 }
 
 proto::DataType VarDesc::GetDataType() const {
   return tensor_desc().data_type();
 }
 
+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<proto::DataType> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(tensor_desc.data_type());
+  }
+  return res;
+}
+
 void VarDesc::SetLoDLevel(int32_t lod_level) {
   switch (desc_.type()) {
     case proto::VarDesc::LOD_TENSOR:
@@ -47,8 +120,28 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Setting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+  PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(),
+                    "The number of given data types(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_lod_level.size(), GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      size_t i = 0;
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        lod_tensor.set_lod_level(multiple_lod_level[i++]);
+      }
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
@@ -59,13 +152,31 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Getting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+  std::vector<int32_t> res;
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      res.reserve(desc_.reader().lod_tensor_size());
+      for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.lod_level());
+      }
+      return res;
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
 const proto::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.selected_rows();
@@ -74,13 +185,32 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+      PADDLE_THROW(
+          "Getting 'tensor_desc' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
 
 proto::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(),
-                 "invoke MutableTensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.mutable_selected_rows();
@@ -89,8 +219,30 @@ proto::TensorDesc *VarDesc::mutable_tensor_desc() {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.mutable_tensor_array()->mutable_tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_desc' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc *> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        res.push_back(lod_tensor.mutable_tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 9316b14bb695c185efd6db4296d422ef0c476d57..72da2fbb0a66ec7ca8c0c274dc4273bfbfcf303e 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -68,18 +68,34 @@ class VarDesc {
 
   void SetName(std::string name) { desc_.set_name(name); }
 
+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
   void SetShape(const std::vector<int64_t> &dims);
 
+  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
+
+  std::vector<int64_t> GetShape() const;
+
+  std::vector<std::vector<int64_t>> GetShapes() const;
+
   void SetDataType(proto::DataType data_type);
 
-  std::vector<int64_t> Shape() const;
+  void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);
 
   proto::DataType GetDataType() const;
 
+  std::vector<proto::DataType> GetDataTypes() const;
+
   void SetLoDLevel(int32_t lod_level);
 
+  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
   int32_t GetLoDLevel() const;
 
+  std::vector<int32_t> GetLoDLevels() const;
+
   proto::VarDesc::VarType GetType() const;
 
   void SetType(proto::VarDesc::VarType type);
@@ -90,7 +106,9 @@ class VarDesc {
 
  private:
   const proto::TensorDesc &tensor_desc() const;
+  std::vector<proto::TensorDesc> tensor_descs() const;
   proto::TensorDesc *mutable_tensor_desc();
+  std::vector<proto::TensorDesc *> mutable_tensor_descs();
 
   proto::VarDesc desc_;
 };
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
index 60ad7af1c0a469beb6a07bf057a8647fcb98cca8..1ed14b69c83a7a0fb5a55db9c179df133407440c 100644
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -55,7 +55,7 @@ void LoadPersistables(framework::Executor& executor,
       VLOG(3) << "parameter's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
+      new_var->SetShape(var->GetShape());
       new_var->SetDataType(var->GetDataType());
       new_var->SetType(var->GetType());
       new_var->SetLoDLevel(var->GetLoDLevel());
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
index 0e987eb0240301c58cfb74c9e995d3b525130125..4c71517dc987456c6ccfa85d4c84ab40193d15c3 100644
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -3,5 +3,17 @@ cc_test(test_inference_recognize_digits_mlp
     SRCS test_inference_recognize_digits.cc
     DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
     ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+cc_test(test_inference_image_classification_vgg
+    SRCS test_inference_image_classification.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_vgg.inference.model)
+cc_test(test_inference_image_classification_resnet
+    SRCS test_inference_image_classification.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/image_classification_resnet.inference.model)
 set_tests_properties(test_inference_recognize_digits_mlp
     PROPERTIES DEPENDS test_recognize_digits)
+set_tests_properties(test_inference_image_classification_vgg
+    PROPERTIES DEPENDS test_image_classification_train)
+set_tests_properties(test_inference_image_classification_resnet
+    PROPERTIES DEPENDS test_image_classification_train)
diff --git a/paddle/inference/tests/book/test_inference_image_classification.cc b/paddle/inference/tests/book/test_inference_image_classification.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e01f5b312a097ce3d7b20ce74e3803c79d942e51
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_image_classification.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <time.h>
+#include <sstream>
+#include "gflags/gflags.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+template <typename Place, typename T>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 1. Define place, executor and scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load all parameters from file
+  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+  delete scope;
+}
+
+TEST(inference, image_classification) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 3, 32, 32}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 3072; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  float err = 1E-3;
+  int count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
+}
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
index 26dc2aee04261d9a1fd29b4d75bfacc7870c09d8..ce8772587f371991591b65cd61a95b177ca509ec 100644
--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -58,6 +58,47 @@ void TestInference(const std::string& dirname,
   delete scope;
 }
 
+template <typename T>
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 T lower,
+                 T upper) {
+  srand(time(0));
+  float* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int i = 0; i < input.numel(); ++i) {
+    input_ptr[i] =
+        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
+        lower;
+  }
+}
+
+template <typename T>
+void CheckError(paddle::framework::LoDTensor& output1,
+                paddle::framework::LoDTensor& output2) {
+  // Check lod information
+  EXPECT_EQ(output1.lod(), output2.lod());
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  T err = static_cast<T>(0);
+  if (typeid(T) == typeid(float)) {
+    err = 1E-3;
+  } else if (typeid(T) == typeid(double)) {
+    err = 1E-6;
+  } else {
+    err = 0;
+  }
+
+  size_t count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
 TEST(inference, recognize_digits) {
   if (FLAGS_dirname.empty()) {
     LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
@@ -70,12 +111,10 @@ TEST(inference, recognize_digits) {
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
   paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
+  // Use normilized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -98,16 +137,6 @@ TEST(inference, recognize_digits) {
       dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  CheckError<float>(output1, output2);
 #endif
 }
diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 8a65fe69f1534112923c72e17969f808b9310735..41b9335fb8fc9ef5f5336bd8c63dc68bb94ff4f6 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -85,7 +85,7 @@ static inline void GetMaxScoreIndex(
   std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
                    SortScorePairDescend<int>);
   // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < sorted_indices->size()) {
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
     sorted_indices->resize(top_k);
   }
 }
@@ -151,7 +151,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     while (sorted_indices.size() != 0) {
       const int idx = sorted_indices.front().second;
       bool keep = true;
-      for (int k = 0; k < selected_indices->size(); ++k) {
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
         if (keep) {
           const int kept_idx = (*selected_indices)[k];
           T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
@@ -201,7 +201,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
-        for (int j = 0; j < label_indices.size(); ++j) {
+        for (size_t j = 0; j < label_indices.size(); ++j) {
           int idx = label_indices[j];
           PADDLE_ENFORCE_LT(idx, predict_dim);
           score_index_pairs.push_back(
@@ -215,7 +215,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
       // Store the new indices.
       std::map<int, std::vector<int>> new_indices;
-      for (int j = 0; j < score_index_pairs.size(); ++j) {
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
         int label = score_index_pairs[j].second.first;
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
@@ -238,7 +238,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       int label = it.first;
       const T* sdata = scores_data + label * predict_dim;
       const std::vector<int>& indices = it.second;
-      for (int j = 0; j < indices.size(); ++j) {
+      for (size_t j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
         const T* bdata = bboxes_data + idx * kBBoxSize;
         odata[count * kOutputDim] = label;           // label
diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
index 105ff4ac3e3ba889aad880f4204af15829c6da47..1dc4b288559d0294e5f58cc923ffc78b80604af9 100644
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
     bool flip = ctx->Attrs().Get<bool>("flip");
 
-    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                      "Size of min_sizes must be at least 1.");
-    for (size_t i = 0; i < min_sizes.size(); ++i) {
-      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
-    }
-
     std::vector<float> aspect_ratios_vec;
     ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
 
@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
-    for (size_t i = 0; i < variances.size(); ++i) {
-      PADDLE_ENFORCE_GT(variances[i], 0.0,
-                        "variance[%d] must be greater than 0.", i);
-    }
-
-    const float step_h = ctx->Attrs().Get<float>("step_h");
-    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
-    const float step_w = ctx->Attrs().Get<float>("step_w");
-    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
-
     std::vector<int64_t> dim_vec(4);
     dim_vec[0] = input_dims[2];
     dim_vec[1] = input_dims[3];
@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
               "H is the height of input, W is the width of input, num_priors "
               "is the box count of each position.");
-    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
-                              "List of min sizes of generated prior boxes.");
-    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
-                              "List of max sizes of generated prior boxes.");
+
+    AddAttr<std::vector<int>>("min_sizes",
+                              "(vector<int>) List of min sizes "
+                              "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<int>& min_sizes) {
+          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                            "Size of min_sizes must be at least 1.");
+          for (size_t i = 0; i < min_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(min_sizes[i], 0,
+                              "min_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<int>>(
+        "max_sizes",
+        "(vector<int>) List of max sizes of generated prior boxes.");
     AddAttr<std::vector<float>>(
-        "aspect_ratios", "(vector<float>) ",
-        "List of aspect ratios of generated prior boxes.");
+        "aspect_ratios",
+        "(vector<float>) List of aspect ratios of generated prior boxes.");
+
     AddAttr<std::vector<float>>(
-        "variances", "(vector<float>) ",
-        "List of variances to be encoded in prior boxes.");
-    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        "variances",
+        "(vector<float>) List of variances to be encoded in prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
         .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
         .SetDefault(true);
+
     AddAttr<float>("step_w",
                    "Prior boxes step across width, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+        });
     AddAttr<float>("step_h",
                    "Prior boxes step across height, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
     AddAttr<float>("offset",
                    "(float) "
                    "Prior boxes center offset.")
diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
index e0a663ace8f38c2d08fd4714c1247d3313ffae3e..6b221cb74ebb306b99533f49c824ed6c60144ff2 100644
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                std::vector<float>& output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
   output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.);
+  output_aspect_ratior.push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
     if (!already_exist) {
       output_aspect_ratior.push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1. / ar);
+        output_aspect_ratior.push_back(1.0f / ar);
       }
     }
   }
@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
 
 template <typename T>
 struct ClipFunctor {
-  HOSTDEVICE T operator()(T in) const {
+  HOSTDEVICE inline T operator()(T in) const {
     return std::min<T>(std::max<T>(in, 0.), 1.);
   }
 };
@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
+    T inv_img_width = 1.0 / img_width;
+    T inv_img_height = 1.0 / img_height;
+
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
           // first prior: aspect_ratio = 1, size = min_size
           box_width = box_height = min_size;
           // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
           // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 1) =
+              (center_y - box_height * 0.5) * inv_img_height;
           // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
           // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 3) =
+              (center_y + box_height * 0.5) * inv_img_height;
 
           idx++;
           if (max_sizes.size() > 0) {
@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             // size = sqrt(min_size * max_size)
             box_width = box_height = sqrt(min_size * max_size);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
 
@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             box_width = min_size * sqrt(ar);
             box_height = min_size / sqrt(ar);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
         }
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 371d6119d4ab73e683821d0dc5db5194f44a64ce..0f1953abe0864c70671c5bb32e15b495ecf993d0 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -214,11 +214,20 @@ void BindVarDsec(py::module &m) {
            py::return_value_policy::reference)
       .def("set_name", &VarDesc::SetName)
       .def("set_shape", &VarDesc::SetShape)
+      .def("set_shapes", &VarDesc::SetShapes)
       .def("set_dtype", &VarDesc::SetDataType)
-      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("set_dtypes", &VarDesc::SetDataTypes)
+      .def("set_tensor_num", &VarDesc::SetTensorDescNum)
+      .def("tensor_num", &VarDesc::GetTensorDescNum)
+      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
+      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
       .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
       .def("lod_level", &VarDesc::GetLoDLevel)
+      .def("lod_levels", &VarDesc::GetLoDLevels,
+           py::return_value_policy::reference)
       .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &VarDesc::SetLoDLevels)
       .def("type", &VarDesc::GetType)
       .def("set_type", &VarDesc::SetType)
       .def("serialize_to_string", SerializeMessage<VarDesc>)
@@ -233,7 +242,8 @@ void BindVarDsec(py::module &m) {
       .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
-      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
+      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
+      .value("READER", proto::VarDesc::READER);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 59f3af03986793a4185bc5933d26888e89d555f7..ba496db5f834efe767bfe446a46877932faa81a0 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -117,7 +117,7 @@ EOF
             -DWITH_STYLE_CHECK=OFF
         make -j `nproc` gen_proto_py
         make -j `nproc` paddle_python
-        make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
         make -j `nproc` print_operators_doc
         paddle/pybind/print_operators_doc > doc/en/html/operators.json
         popd
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 0db8d33bbcb5278ed0dd5584b5822502b719ede9..4af4ac4f5e43543449ae922d7eb2a5740372f68f 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -9,13 +9,14 @@ cd $TRAVIS_BUILD_DIR/build
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_python
-make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
 linkchecker doc/cn/html/index.html
+linkchecker doc/api/en/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
@@ -54,10 +55,11 @@ function deploy_docs() {
   mkdir -p ${DIR}
   # remove old docs. mv new docs.
   set +e
-  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
   set -e
   cp -r ../doc/cn/html ${DIR}/doc_cn
   cp -r ../doc/en/html ${DIR}/doc
+  cp -r ../doc/api/en/html ${DIR}/api_doc
   git add .
 }
 
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 9f48815b8b84426c7d539af4e7d45ea47e69d4d9..0eddcc3a5ab6f71aa5500c3b98b63c0937c7ddfc 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -17,7 +17,9 @@ import contextlib
 from framework import Program, default_main_program
 from . import core
 
-__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
+__all__ = [
+    'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
+]
 
 g_scope = core.Scope()
 
@@ -80,12 +82,12 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         feed_targets: a dictionary of {feed_target_name: feed_target_data}
-        feed_holder_name: the name of the variable that holds the data of 
-            all feed targets. The type of this feed_holder variable is 
+        feed_holder_name: the name of the variable that holds the data of
+            all feed targets. The type of this feed_holder variable is
             FEED_MINIBATCH, which is essentially vector<LoDTensor>.
 
     Returns:
-        A boolean value that indicates whether a block has feed operators 
+        A boolean value that indicates whether a block has feed operators
         that match the info contained in feed_targets and feed_holder_name.
     """
 
@@ -108,7 +110,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
 
 def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     """ Check whether the block already has fetch operators.
-    
+
     Return false if the block does not have any fetch operators.
     If some fetch operators have been appended to the block, check that
     the info contained in these fetch operators matches the fetch_targets
@@ -118,13 +120,13 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
-        fetch_holder_name: the name of the variable that holds the data of 
-            all fetch targets. The type of this fetch_holder variable is 
-            FETCH_LIST, which is essentially vector<LoDTensor>.    
+        fetch_holder_name: the name of the variable that holds the data of
+            all fetch targets. The type of this fetch_holder variable is
+            FETCH_LIST, which is essentially vector<LoDTensor>.
 
-    Return:    
-        A boolean value that indicates whether a block has fetch operators 
-        that match the info contained in fetch_targets and fetch_holder_name.     
+    Return:
+        A boolean value that indicates whether a block has fetch operators
+        that match the info contained in fetch_targets and fetch_holder_name.
     """
 
     fetch_count = 0
@@ -146,6 +148,35 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     return fetch_count > 0
 
 
+def fetch_var(name, scope=None, return_numpy=True):
+    """
+    Fetch the value of the variable with the given name from the given scope
+    Args:
+        name(str): name of the variable. Typically, only persistable variables
+            can be found in the scope used for running the program.
+        scope(core.Scope|None): scope object. It should be the scope where
+            you pass to Executor.run() when running your program.
+            If None, global_scope() will be used.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray
+    Returns:
+       LodTensor|numpy.ndarray
+    """
+    assert isinstance(name, str)
+    if scope is None:
+        scope = global_scope()
+    assert isinstance(scope, core.Scope)
+
+    var = global_scope().find_var(name)
+    assert var is not None, (
+        "Cannot find " + name + " in scope. Perhaps you need to make the"
+        " variable persistable by using var.persistable = True in your"
+        " program.")
+    tensor = var.get_tensor()
+    if return_numpy:
+        tensor = as_numpy(tensor)
+    return tensor
+
+
 class Executor(object):
     def __init__(self, places):
         if not isinstance(places, list) and not isinstance(places, tuple):
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 69cbebe41e57309bb0993148833836b715e417ce..a12427258e9d3142abcb84249a10dabd8e96b792 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -31,6 +31,7 @@ __all__ = [
     'program_guard',
     'switch_startup_program',
     'switch_main_program',
+    'get_var',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -1123,3 +1124,22 @@ def program_guard(main_program, startup_program=None):
     switch_main_program(main_program)
     if startup_program is not None:
         switch_startup_program(startup_program)
+
+
+def get_var(name, program=None):
+    """
+    Get a variable by name from the global block of a program
+    Args:
+        name(str): name of the variable
+        program(Program|None): program object.
+             If None, default_global_program() will be used.
+
+    Returns:
+        Variable
+    """
+    if program is None:
+        program = default_main_program()
+    assert isinstance(name, str)
+    assert isinstance(name, Program)
+
+    return program.global_block().var(name)
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 8460af2a08a54c5f241553757296ed2bf02f0167..704e040b9f478ef61991cfbe175f1cdeaf102763 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -35,13 +35,15 @@ __all__ = [
 ]
 
 
-def create_tensor(dtype, name=None):
+def create_tensor(dtype, name=None, persistable=False):
     helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
+    return helper.create_variable(
+        name=helper.name, dtype=dtype, persistable=persistable)
 
 
 def create_parameter(shape,
                      dtype,
+                     name=None,
                      attr=None,
                      is_bias=False,
                      default_initializer=None):
@@ -62,7 +64,7 @@ def create_parameter(shape,
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
-        attr = ParamAttr()
+        attr = ParamAttr(name=name)
     return helper.create_parameter(attr, shape, dtype, is_bias,
                                    default_initializer)
 
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index a4168d16db06f904faed811fdda3f0fe52f0b27b..03b009ebb0714a91329a1c56ff3939beecb03435 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -16,8 +16,9 @@ from __future__ import print_function
 
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import unittest
 import contextlib
+import numpy
+import unittest
 
 
 def resnet_cifar10(input, depth=32):
@@ -89,10 +90,7 @@ def vgg16_bn_drop(input):
     return fc2
 
 
-def main(net_type, use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
+def train(net_type, use_cuda, save_dirname):
     classdim = 10
     data_shape = [3, 32, 32]
 
@@ -111,12 +109,14 @@ def main(net_type, use_cuda):
     predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # Test program 
+    test_program = fluid.default_main_program().clone()
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_cost)
 
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
     BATCH_SIZE = 128
     PASS_NUM = 1
 
@@ -125,6 +125,9 @@ def main(net_type, use_cuda):
             paddle.dataset.cifar.train10(), buf_size=128 * 10),
         batch_size=BATCH_SIZE)
 
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
+
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
@@ -132,18 +135,68 @@ def main(net_type, use_cuda):
 
     loss = 0.0
     for pass_id in range(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_reader():
-            loss, acc = exe.run(fluid.default_main_program(),
-                                feed=feeder.feed(data),
-                                fetch_list=[avg_cost] + accuracy.metrics)
-            pass_acc = accuracy.eval(exe)
-            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-                pass_acc))
-            return
-
-    raise AssertionError(
-        "Image classification loss is too large, {0:2.2}".format(loss))
+        for batch_id, data in enumerate(train_reader()):
+            exe.run(feed=feeder.feed(data))
+
+            if (batch_id % 10) == 0:
+                acc_list = []
+                avg_loss_list = []
+                for tid, test_data in enumerate(test_reader()):
+                    loss_t, acc_t = exe.run(program=test_program,
+                                            feed=feeder.feed(test_data),
+                                            fetch_list=[avg_cost, acc])
+                    acc_list.append(float(acc_t))
+                    avg_loss_list.append(float(loss_t))
+                    break  # Use 1 segment for speeding up CI
+
+                acc_value = numpy.array(acc_list).mean()
+                avg_loss_value = numpy.array(avg_loss_list).mean()
+
+                print(
+                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                    format(pass_id, batch_id + 1,
+                           float(avg_loss_value), float(acc_value)))
+
+                if acc_value > 0.01:  # Low threshold for speeding up CI
+                    fluid.io.save_inference_model(save_dirname, ["pixel"],
+                                                  [predict], exe)
+                    return
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
+def main(net_type, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "image_classification_" + net_type + ".inference.model"
+
+    train(net_type, use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
 
 
 class TestImageClassification(unittest.TestCase):
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index b8f55c813b6984a5a1d266acc1c159c45c23b665..fb6b1f7192d51dcd654543e4c4ae5ee0c6fe060f 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -166,7 +166,9 @@ def infer(use_cuda, save_dirname=None):
      fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
     # The input's dimension of conv should be 4-D or 5-D.
-    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+    # Use normilized image pixels as input data, which should be in the range [-1.0, 1.0].
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [1, 1, 28, 28]).astype("float32")
 
     # Construct feed as a dictionary of {feed_target_name: feed_target_data}
     # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/v2/fluid/tests/test_fetch_var.py b/python/paddle/v2/fluid/tests/test_fetch_var.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed75a350b0bcb220c8435d60e1978c27da84a24c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fetch_var.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+
+
+class TestFetchVar(op_test.OpTest):
+    def test_fetch_var(self):
+        val = numpy.array([1, 3, 5]).astype(numpy.int32)
+        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
+        fetched_x = fluid.fetch_var("x")
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 9034b2f4ef1c983ef224b14b8f602f87e6ce94b0..8f335d13db3ddb999058a58cadc57ff23ff1fbc5 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -115,6 +115,18 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
+    def test_multiple_shape(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
+        var.set_shapes(src_shapes)
+        res_shapes = var.shapes()
+        self.assertEqual(src_shapes, res_shapes)
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
     def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
@@ -124,6 +136,30 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
+    def test_multiple_dtype(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_types = [
+            core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
+        ]
+        var.set_dtypes(src_types)
+        self.assertEqual(src_types, var.dtypes())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
+    def test_multiple_lod_level(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_types = [3, 1, 2]
+        var.set_lod_levels(src_types)
+        self.assertEqual(src_types, var.lod_levels())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):