remove debug print

test=develop

remove debug print
test=develop
9b24ac34 · xuezhong · 1de9b60a · 9b24ac34 · 9b24ac34
隐藏空白更改
内联并排

Showing 2 changed files with 1 addition and 65 deletions

paddle/fluid/operators/sample_logits_op.cu paddle/fluid/operators/sample_logits_op.cu +0 -64

python/paddle/fluid/__init__.py python/paddle/fluid/__init__.py +1 -1

未找到文件。
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -27,8 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-DEFINE_bool(debug_print, true, "run debug mode");
 // UNDERSTAND: something like take_along_axis in numpy.
 template <typename T>
 __global__ void GPUTakeAlongD1(size_t size, const int batch_size,
@@ -108,32 +106,6 @@ template <typename T>
 class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
  void Compute(const framework::ExecutionContext& context) const override {
    // get necessary inputs
    const Tensor* logits = context.Input<Tensor>("Logits");
@@ -189,12 +161,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
      // UNDERSTAND: sampling
      const auto seed = context.Attr<int>("seed");
      auto sampler_with_prob = math::GPUSampleWithProb<T>();
-      Print<int64_t>(*samples, std::string("samples1"));
      sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
                        num_samples, label, samples, probabilities);
    }
-    Print<int64_t>(*samples, std::string("samples2"));
-    Print<T>(*probabilities, std::string("probabilities"));
    // UNDERSTAND: gather sampled logits and remove accidental hits if needed
    const auto num_take = samples->dims()[1];
@@ -216,7 +185,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
        p_value);
-    Print<T>(*sampled_logits, std::string("sampled_logits"));
    if (remove_accidental_hits) {
      const size_t size = batch_size * (num_true + num_samples);
@@ -224,8 +192,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
      gpu_compute_remove_accidental_hits<
          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, num_true, idx_slice_size, p_index, p_value);
-      Print<T>(*sampled_logits,
-               std::string("sampled_logits_remove_accidental_hits"));
    }
    // subtracted sampled logits with logQ(y|x)
@@ -234,7 +200,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
    smp_logits.device(*dev_ctx.eigen_device()) =
        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
            .unaryExpr(TolerableValue<T>());
-    Print<T>(*sampled_logits, std::string("sampled_logits_res"));
  }
 };
@@ -242,32 +207,6 @@ template <typename T>
 class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
  void Compute(const framework::ExecutionContext& context) const override {
    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
    const Tensor* samples = context.Input<Tensor>("Samples");
@@ -298,13 +237,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
    const size_t size = batch_size;
    int grid = (size + threads - 1) / threads;
-    Print<T>(*sampled_logits_grad, std::string("sampled_logits_grad"));
-    Print<int64_t>(*samples, std::string("samples"));
    GPUPutAlongD1<
        T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
        p_value);
-    Print<T>(*logits_grad, std::string("logits_grad"));
  }
 };

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -131,7 +131,7 @@ def __bootstrap__():
        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
        'allocator_strategy', 'reader_queue_speed_test_mode',
        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'enable_parallel_graph', 'debug_print'
+        'inner_op_parallelism', 'enable_parallel_graph'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')