diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6b47666aa50052d77cee600092d39f0ceea1da3d..7f7542b0348e66db64ea522b4e32f11a16990a74 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -220,7 +220,7 @@ paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func',
 paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1546136806fef5c08f6918544bd9151d'))
 paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99'))
 paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
-paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '74112f07e2329448f9f583cabd9d681e'))
+paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
index c120d77451c24e01aa11439218d341cbb49a1091..a43f22c0496f89943d2fd5110446f1aae6a99315 100644
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ b/paddle/fluid/operators/kldiv_loss_op.cc
@@ -65,11 +65,11 @@ class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "The input tensor of KL divergence loss operator, "
-             "This is a tensor with shape of [N, *], where N is the"
+             "The input tensor of KL divergence loss operator. "
+             "This is a tensor with shape of [N, *], where N is the "
              "batch size, * means any number of additional dimensions.");
     AddInput("Target",
-             "The  tensor of KL divergence loss operator, "
+             "The target tensor of KL divergence loss operator. "
              "This is a tensor with shape of Input(X).");
     AddOutput(
         "Loss",
@@ -82,7 +82,7 @@ class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
         "The reduction type to apply to the output, available types "
         "are 'none' | 'batchmean' | 'mean' | 'sum', 'none' for no "
         "reduction, 'batchmean' for the sum of output divided by "
-        "batch size, 'mean' for the average valud of all output, "
+        "batch size, 'mean' for the average value of all output, "
         "'sum' for the sum of the output.")
         .SetDefault("mean");
 
@@ -90,21 +90,23 @@ class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
          This operator calculates the Kullback-Leibler divergence loss
          between Input(X) and Input(Target).
 
-         KL divergence loss calculates as follows:
+         KL divergence loss is calculated as follows:
 
-         $$l(x, y) = y * (\log y - x)$$
+         $$l(x, y) = y * (\log(y) - x)$$
+
+         Here, :math:`x` is Input(X) and :math:`y` is Input(Target).
 
          While :attr:`reduction` is :attr:`none`, output loss is in
-         same shape with Input(X), loss in each point is calculated 
-         seperately and no reduction applied.
+         the same shape as Input(X), loss in each point is calculated 
+         separately and no reduction is applied.
          
-         While :attr:`reduction` is :attr:`mean`, output loss in in
+         While :attr:`reduction` is :attr:`mean`, output loss is in
          shape of [1] and loss value is the mean value of all losses.
          
-         While :attr:`reduction` is :attr:`sum`, output loss in in
+         While :attr:`reduction` is :attr:`sum`, output loss is in
          shape of [1] and loss value is the sum value of all losses.
          
-         While :attr:`reduction` is :attr:`batchmean`, output loss in 
+         While :attr:`reduction` is :attr:`batchmean`, output loss is 
          in shape of [1] and loss value is the sum value of all losses
          divided by batch size.