diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 0ee7291f04a57bb843ffe4c62e489ea9b575f7d0..2a4a611511138377b46102dbe9d956c8beecd1bd 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -314,7 +314,6 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
 template
 void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
-                            const framework::Tensor* x,
                             const framework::Tensor* y,
                             const framework::Tensor* out,
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index a76ba796fe4ed20ebc09d34fcebe564d70c267a5..7fa2b060afd2d3effa4136bd6e6bb376600bdb7e 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -46,7 +46,7 @@ struct Formater {
   }
  private:
-  void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; }
+  void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message << "\t"; }
   void PrintName() {
     if (!name.empty()) {
       CLOG << "Tensor[" << name << "]" << std::endl;
     }
@@ -85,15 +85,16 @@ struct Formater {
     // print float
     if (dtype.hash_code() == typeid(float).hash_code()) {
       Display<float>(size);
-    }
-    if (dtype.hash_code() == typeid(double).hash_code()) {
+    } else if (dtype.hash_code() == typeid(double).hash_code()) {
       Display<double>(size);
-    }
-    if (dtype.hash_code() == typeid(int).hash_code()) {
+    } else if (dtype.hash_code() == typeid(int).hash_code()) {
       Display<int>(size);
-    }
-    if (dtype.hash_code() == typeid(int64_t).hash_code()) {
+    } else if (dtype.hash_code() == typeid(int64_t).hash_code()) {
       Display<int64_t>(size);
+    } else if (dtype.hash_code() == typeid(bool).hash_code()) {
+      Display<bool>(size);
+    } else {
+      CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl;
     }
   }
@@ -182,6 +183,7 @@ class TensorPrintOp : public framework::OperatorBase {
     }

     Formater formater;
+    formater.message = Attr<std::string>("message");
     if (Attr<bool>("print_tensor_name")) {
       formater.name = printed_var_name;
     }
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 800c11a53b83bb902276cb2eb5213ba000e403c7..1ca11bb35b0e39d1bc97dbd531c0ebcf62e18e74 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -174,7 +174,7 @@ def Print(input,
         print_tensor_type (bool): Print the tensor type.
         print_tensor_shape (bool): Print the tensor shape.
         print_tensor_lod (bool): Print the tensor lod.
-        print_phase (bool): Which phase to displace, including 'forward',
+        print_phase (str): Which phase to displace, including 'forward',
             'backward' and 'both'. If set to 'backward' or 'both', will
             print the gradients of input tensor.
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index f5b64fee1dc82d0b2088191338cb3cb70f6b6b52..5f1842f5fb95e09d2874caa9e9de4ebeb7a99403 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1579,7 +1579,7 @@ def layer_norm(input,
     """
     **Layer Normalization**

-    Assume feature vectors exist on dimensions 
+    Assume feature vectors exist on dimensions
     :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
     along these dimensions for each feature vector :math:`a` with size
     :math:`H`, then normalize each feature vector using the corresponding
@@ -1600,13 +1600,13 @@ def layer_norm(input,

     Args:
         input(Variable): The input tensor variable.
-        scale(bool): Whether to learn the adaptive gain :math:`g` after 
+        scale(bool): Whether to learn the adaptive gain :math:`g` after
            normalization.
-        shift(bool): Whether to learn the adaptive bias :math:`b` after 
+        shift(bool): Whether to learn the adaptive bias :math:`b` after
            normalization.
-        begin_norm_axis(bool): The normalization will be performed along 
+        begin_norm_axis(bool): The normalization will be performed along
            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-        epsilon(float): The small value added to the variance to prevent 
+        epsilon(float): The small value added to the variance to prevent
            division by zero.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
            gain :math:`g`.
@@ -2070,7 +2070,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             Tensor variable with a single element, otherwise must be in the
             range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
             the dimension to reduce is :math:`rank + dim`.
-        keep_dim (bool): Whether to reserve the reduced dimension in the
+        keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
         name(str|None): A name for this layer(optional). If set None, the layer
@@ -3098,33 +3098,33 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits, label, soft_label=False):
     """
     **Softmax With Cross Entropy Operator.**
-    
+
     Cross entropy loss with softmax is used as the output layer extensively.
     This operator computes the softmax normalized values for each row of the
     input tensor, after which cross-entropy loss is computed. This provides
     a more numerically stable gradient.
-    
+
     Because this operator performs a softmax on logits internally, it expects
     unscaled logits. This operator should not be used with the output of
     softmax operator since that would produce incorrect results.
-    
+
     When the attribute soft_label is set false, this operators expects mutually
     exclusive hard labels, each sample in a batch is in exactly one class with a
     probability of 1.0. Each sample in the batch will have a single label.
-    
+
     The equation is as follows:
-    
+
     1) Hard label (one-hot label, so every sample has exactly one class)
-    
+
     .. math::

         loss_j = -\\text{logit}_{label_j} +
        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logit}_i)\\right), j = 1,..., K
-    
+
     2) Soft label (each sample can have a distribution over all classes)

     .. math::
-    
+
         loss_j = -\\sum_{i=0}^{K}\\text{label}_i
        \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
        \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
@@ -3169,7 +3169,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     The operator takes the first dimension of X and Y as batch size.
     For each instance, it computes the smooth l1 loss element by element first
     and then sums all the losses. So the shape of Out is [batch_size, 1].
-    
+
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
             l1 loss op with shape [batch_size, dim1, ..., dimN].
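For reference, a minimal usage sketch of the behaviour this patch touches (an illustration, not part of the diff): the `message` attribute that `TensorPrintOp` now copies into `Formater` is set from the Python `Print` layer in `control_flow.py`, and `print_phase` is the string documented above. The layer and argument names below follow the old `paddle.v2.fluid` API; the surrounding network (a `data` layer feeding an `fc` layer) is assumed for the example.

    import paddle.v2.fluid as fluid

    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    # The message given here is what the patched Formater::PrintMessage()
    # emits (now followed by a tab); print_phase is a string, per the
    # corrected docstring, and 'forward' prints only the forward-pass values.
    fluid.layers.Print(x, message='after data layer', print_phase='forward')
    y = fluid.layers.fc(input=x, size=2)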