diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
index eb55c14ff9c8bfc8cb7dfe379095ab19b06943d0..f0529ea82cc5e6377d4187b6794dca8244829b9a 100644
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -27,8 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-DEFINE_bool(debug_print, true, "run debug mode");
-
 // UNDERSTAND: something like take_along_axis in numpy.
 template <typename T>
 __global__ void GPUTakeAlongD1(size_t size, const int batch_size,
@@ -108,32 +106,6 @@ template <typename T>
 class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
-
   void Compute(const framework::ExecutionContext& context) const override {
     // get necessary inputs
     const Tensor* logits = context.Input<Tensor>("Logits");
@@ -189,12 +161,9 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
       // UNDERSTAND: sampling
       const auto seed = context.Attr<int>("seed");
       auto sampler_with_prob = math::GPUSampleWithProb<T>();
-      Print(*samples, std::string("samples1"));
       sampler_with_prob(context.cuda_device_context(), seed, num_classes, uniq,
                         num_samples, label, samples, probabilities);
     }
-    Print(*samples, std::string("samples2"));
-    Print(*probabilities, std::string("probabilities"));
 
     // UNDERSTAND: gather sampled logits and remove accidental hits if needed
     const auto num_take = samples->dims()[1];
@@ -216,7 +185,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
         T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
         size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
         p_value);
-    Print(*sampled_logits, std::string("sampled_logits"));
 
     if (remove_accidental_hits) {
       const size_t size = batch_size * (num_true + num_samples);
@@ -224,8 +192,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
       gpu_compute_remove_accidental_hits<
          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, num_true, idx_slice_size, p_index, p_value);
-      Print(*sampled_logits,
-            std::string("sampled_logits_remove_accidental_hits"));
     }
 
     // subtracted sampled logits with logQ(y|x)
@@ -234,7 +200,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
     smp_logits.device(*dev_ctx.eigen_device()) =
         (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
             .unaryExpr(TolerableValue<T>());
-    Print(*sampled_logits, std::string("sampled_logits_res"));
   }
 };
 
@@ -242,32 +207,6 @@ template <typename T>
 class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
-  template <typename type>
-  void Print(const Tensor& t, std::string name) const {
-    if (!FLAGS_debug_print) {
-      return;
-    }
-    VLOG(1) << name << " size = " << t.numel();
-    size_t size = t.numel();
-    const type* d = t.data<type>();
-#ifdef PADDLE_WITH_CUDA
-    std::vector<type> vec;
-    platform::DeviceContextPool::Instance().Get(t.place())->Wait();
-    if (platform::is_gpu_place(t.place())) {
-      vec.resize(size);
-      cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
-      d = vec.data();
-    }
-#endif
-    VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
-    std::string out;
-    for (size_t i = 0; i < size; i++) {
-      out += std::to_string(d[i]);
-      out += ",";
-    }
-    VLOG(1) << out;
-  }
-
   void Compute(const framework::ExecutionContext& context) const override {
     auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
     const Tensor* samples = context.Input<Tensor>("Samples");
@@ -298,13 +237,10 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
     const size_t size = batch_size;
     int grid = (size + threads - 1) / threads;
 
-    Print(*sampled_logits_grad, std::string("sampled_logits_grad"));
-    Print(*samples, std::string("samples"));
     GPUPutAlongD1<
         T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
         size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
         p_value);
-    Print(*logits_grad, std::string("logits_grad"));
   }
 };
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6fa0de847c8d8cda85689714376304807edc4f26..396f36e188b27fe450cc19b3b8ccf967daf1456c 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -131,7 +131,7 @@ def __bootstrap__():
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
         'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
-        'inner_op_parallelism', 'enable_parallel_graph', 'debug_print'
+        'inner_op_parallelism', 'enable_parallel_graph'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')