diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index c8471fffa859809a64ad89395936e39bb9a7b4eb..1798faa759bed013c35f93bf0d2623f2a17da49a 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/renorm_op.h" + #include +#include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/renorm_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "stdio.h" namespace paddle { namespace operators { @@ -60,7 +60,7 @@ __global__ void RenormKernelFunc3(int64_t size, T* dim_value, float p, } template -__global__ void RenormKernelFunc4(T* x_data, T* out_data, int64_t size, +__global__ void RenormKernelFunc4(const T* x_data, T* out_data, int64_t size, T* dim_value, int64_t dimension_each, int64_t dim_divisor) { int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; @@ -74,8 +74,8 @@ __global__ void RenormKernelFunc4(T* x_data, T* out_data, int64_t size, } template -__global__ void RenormGradKernelFunc1(T* x_data, T* dout_data, T* pow_value, - T* mul_value, int64_t size, +__global__ void RenormGradKernelFunc1(const T* x_data, const T* dout_data, + T* pow_value, T* mul_value, int64_t size, int64_t dimension_each, float p, int64_t dim_divisor) { int64_t i = ((int64_t)blockIdx.x) * blockDim.x + threadIdx.x; @@ -87,8 +87,8 @@ __global__ void RenormGradKernelFunc1(T* x_data, T* dout_data, T* pow_value, } template -__global__ void RenormGradKernelFunc2(T* x_data, T* dout_data, T* dx_data, - int64_t size, T* dim_value, +__global__ void RenormGradKernelFunc2(const T* x_data, const T* dout_data, + T* dx_data, int64_t size, T* dim_value, T* dim_power_sum, T* weight_derivative, int64_t dimension_each, float p, float max_norm, int64_t dim_divisor) { @@ -100,8 +100,9 @@ __global__ void RenormGradKernelFunc2(T* x_data, T* dout_data, T* dx_data, if (temp > max_norm) { dim_power_sum[i] = pow(dim_value[i], (T)(-1.0 - 1.0 / p)) * -1 * max_norm; dim_value[i] = max_norm / temp; - } else + } else { dim_value[i] = 1.0; + } } __syncthreads(); if (i < size) { @@ -120,7 +121,7 @@ class CUDARenormKernel : public framework::OpKernel { const Tensor* x = context.Input("X"); Tensor* out = context.Output("Out"); auto numel = x->numel(); - T* x_data = (T*)x->data(); + const T* x_data = x->data(); auto input_dims = x->dims(); float max_norm = context.Attr("max_norm"); float p = context.Attr("p"); @@ -176,8 +177,8 @@ class CUDAGradRenormKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - T* dout_data = (T*)d_out->data(); - T* x_data = (T*)x->data(); + const T* dout_data = d_out->data(); + const T* x_data = x->data(); auto input_dims = x->dims(); float max_norm = ctx.Attr("max_norm"); float p = ctx.Attr("p"); @@ -234,4 +235,4 @@ REGISTER_OP_CUDA_KERNEL(renorm, ops::CUDARenormKernel, ops::CUDARenormKernel); REGISTER_OP_CUDA_KERNEL(renorm_grad, ops::CUDAGradRenormKernel, - ops::CUDAGradRenormKernel); \ No newline at end of file + ops::CUDAGradRenormKernel);