diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
index eb55c14ff9c8bfc8cb7dfe379095ab19b06943d0..f0529ea82cc5e6377d4187b6794dca8244829b9a 100644
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -27,8 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-DEFINE_bool(debug_print, true, "run debug mode");
-
 // UNDERSTAND: something like take_along_axis in numpy.
 template <typename T>
 __global__ void GPUTakeAlongD1(size_t size, const int batch_size,
@@ -108,32 +106,6 @@ template <typename T>
 class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
-
   void Compute(const framework::ExecutionContext& context) const override {
     // get necessary inputs
     const Tensor* logits = context.Input<Tensor>("Logits");
@@ -189,12 +161,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
       // UNDERSTAND: sampling
       const auto seed = context.Attr<int>("seed");
       auto sampler_with_prob = math::GPUSampleWithProb<T>();
-      Print(*samples, std::string("samples1"));
       sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
                         num_samples, label, samples, probabilities);
     }
-    Print(*samples, std::string("samples2"));
-    Print(*probabilities, std::string("probabilities"));
 
     // UNDERSTAND: gather sampled logits and remove accidental hits if needed
     const auto num_take = samples->dims()[1];
@@ -216,7 +185,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
         T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
         size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
         p_value);
-    Print(*sampled_logits, std::string("sampled_logits"));
 
     if (remove_accidental_hits) {
       const size_t size = batch_size * (num_true + num_samples);
@@ -224,8 +192,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
       gpu_compute_remove_accidental_hits<
          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, num_true, idx_slice_size, p_index, p_value);
-      Print(*sampled_logits,
-            std::string("sampled_logits_remove_accidental_hits"));
     }
 
     // subtracted sampled logits with logQ(y|x)
@@ -234,7 +200,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
     smp_logits.device(*dev_ctx.eigen_device()) =
         (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
             .unaryExpr(TolerableValue<T>());
-    Print(*sampled_logits, std::string("sampled_logits_res"));
   }
 };
 
@@ -242,32 +207,6 @@ template <typename T>
 class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
-
   void Compute(const framework::ExecutionContext& context) const override {
     auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
     const Tensor* samples = context.Input<Tensor>("Samples");
@@ -298,13 +237,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
     const size_t size = batch_size;
     int grid = (size + threads - 1) / threads;
 
-    Print(*sampled_logits_grad, std::string("sampled_logits_grad"));
-    Print(*samples, std::string("samples"));
     GPUPutAlongD1<
         T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
         size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
         p_value);
-    Print(*logits_grad, std::string("logits_grad"));
   }
 };
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6fa0de847c8d8cda85689714376304807edc4f26..396f36e188b27fe450cc19b3b8ccf967daf1456c 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -131,7 +131,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'enable_parallel_graph', 'debug_print'
+        'inner_op_parallelism', 'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')