diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu
index 01c61fd8053aa5f2377a02b9904fb2b8846583cd..ca21f9db88c2b3ceb96114f6ce8f717c7284018e 100644
--- a/paddle/fluid/operators/math/sample_prob.cu
+++ b/paddle/fluid/operators/math/sample_prob.cu
@@ -112,33 +112,6 @@ int UniqSampler(const Sampler& sampler, const std::size_t num_samples,
   }
   return num_tries;
 }
-/*
-template <typename T>
-void Print(Tensor & t, std::string name) {
-  if (!FLAGS_debug_print) {
-    return;
-  }
-  VLOG(1) << "qxz print "<< name;
-  VLOG(1) << name << "size = " << t.numel();
-  size_t size = t.numel();
-  type *d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-  VLOG(1) << name << " data_ptr = " << static_cast<void*>(d);
-  std::string out;
-  for (size_t i = 0; i < size; i++) {
-       out += std::to_string(d[i]);
-       out += ",";
-  }
-  VLOG(1) << out;
-}*/
 
 template <typename T>
 void GPUSampleWithProb<T>::operator()(
diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc
index 160eb066eabf467133d19d691b6590d08dd3e07d..22286ae87f9ecac3d0f9abd8c89a30b42da1b17b 100644
--- a/paddle/fluid/operators/sample_logits_op.cc
+++ b/paddle/fluid/operators/sample_logits_op.cc
@@ -64,12 +64,13 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput("SampledLogits",
               "(Tensor, default: Tensor<float>), A 2-D tensor with shape"
-              "[N x S+NT]. The outputs value of sampled softmax, which will be"
+              " [N x S+NT]. The output values of sample logits, which will be "
               "used in backward calculation.")
         .AsIntermediate();
-    AddOutput("SampledLabel",
-              "(Tensor, default: Tensor<int64>), A 2-D tensor. The cross "
-              "entropy loss with shape [N x NT].");
+    AddOutput(
+        "SampledLabel",
+        "(Tensor, default: Tensor<int64>), A 2-D tensor. The sampled label "
+        "with shape [N x S + NT].");
     AddAttr<bool>(
         "use_custom_samples",
         "An indicator whether to use custom samples with probabilities, if True"
@@ -81,7 +82,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
         "An indicator whether to sample non-repetitive negtive labels, if True"
         "the operator will sample negtive labels without replacement."
         "otherwise, the operator will sample negtive labels with replacement.")
-        .SetDefault(false);
+        .SetDefault(true);
     AddAttr<bool>(
         "remove_accidental_hits",
         "An indicator whether to remove accidental hits when samples hits true"
@@ -92,35 +93,11 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("seed", "Random seed for generating samples").SetDefault(0);
 
     AddComment(R"DOC(
-TODO(chenfeiyu): Write documentation for this Operator.
-Sampled Softmax With Cross Entropy Operator.
-
-Cross entropy loss with sampled softmax is used as the output layer extensively.
-This operator computes the softmax normalized values for each row of the input
-tensor, after which cross-entropy loss is computed. This provides a more
-numerically stable gradient.
-
-Because this operator performs a softmax on logits internally, it expects
-unscaled logits. This operator should not be used with the output of
-softmax operator since that would produce incorrect results.
-
-When the attribute soft_label is set false, this operators expects mutually
-exclusive hard labels, each sample in a batch is in exactly one class with a
-probability of 1.0. Each sample in the batch will have a single label.
-
-The equation is as follows:
-
-1) Hard label (one-hot label, so every sample has exactly one class)
-
-$$Loss_j =  -\text{Logit}_{Label_j} +
-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right),
-j = 1,..., K$$
-
-2) Soft label (each sample can have a distribution over all classes)
+
+  Computes sampled output training logits and labels suitable for implementing
+  sampled softmax.
 
-$$Loss_j =  -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i -
-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right),
-j = 1,...,K$$
+
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
index 5b311bb67149b5d232137fe2d0d34298ff2fa1de..fe95542fd8f5d620094bf99d0e197480855f17e1 100644
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -248,8 +248,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
     if (!FLAGS_debug_print) {
       return;
     }
-    VLOG(1) << "qxz print " << name;
-    VLOG(1) << name << "size = " << t.numel();
+    VLOG(1) << name << " size = " << t.numel();
     size_t size = t.numel();
     const type* d = t.data<type>();
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
index 77d66a642e6dc2fed60ca6701d37df2e33a93001..139432178bdd90972a8cdd6934475854635cd645 100644
--- a/paddle/fluid/operators/sample_logits_op.h
+++ b/paddle/fluid/operators/sample_logits_op.h
@@ -207,37 +207,6 @@ class SampleLogitsKernel : public framework::OpKernel<T> {
                                         num_true);
     }
 
-    /* Debug
-    const auto num_sampled_classes = samples_dim[1];
-    std::cout << "Sampled Logits" << std::endl;
-    const auto sampled_logits_data = sampled_logits->data<T>();
-    for (int i = 0; i < sampled_logits->numel(); ++i) {
-      std::cout << sampled_logits_data[i] << ", ";
-      if ((i + 1) % num_sampled_classes == 0)
-        std::cout << std::endl;
-    }
-    std::cout << std::endl;
-    */
-    /* Debug
-    std::cout << "Samples" << std::endl;
-    const auto samples_data = samples->data<int64_t>();
-    for (int i = 0; i < samples->numel(); ++i) {
-      std::cout << samples_data[i] << ", ";
-      if ((i + 1) % num_sampled_classes == 0)
-        std::cout << std::endl;
-    }
-    std::cout << std::endl;
-    */
-    /* Debug
-    std::cout << "Probabilities" << std::endl;
-    const auto probabilities_data = probabilities->data<T>();
-    for (int i = 0; i < probabilities->numel(); ++i) {
-      std::cout << probabilities_data[i] << ", ";
-      if ((i + 1) % num_sampled_classes == 0)
-        std::cout << std::endl;
-    }
-    std::cout << std::endl;
-    */
     // subtracted sampled logits with logQ(y|x)
     auto probs = EigenMatrix<T>::From(*probabilities);
     auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
@@ -263,9 +232,6 @@ class SampleLogitsGradKernel : public framework::OpKernel<T> {
     math::SetConstant<platform::CPUDeviceContext, T> set_zero;
     set_zero(dev_ctx, logits_grad, static_cast<T>(0));
 
-    // const bool remove_accidental_hits =
-    //    context.Attr<bool>("remove_accidental_hits");
-
     // UNDERSTAND: scatter it back to logit_grad
     CPUPutAlongD1<T>(dev_ctx, logits_grad, *samples, *sampled_logits_grad);
   }