diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index daade439e5232f06be72bc5bb1e2285124f2c3a4..b29238432b05d81e984e1f4c269a00b01a4229cc 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar { return 0; \ } -#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) @@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar { #else #define USE_OP_KERNEL(op_type) \ USE_OP_DEVICE_KERNEL(op_type, CPU); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) + USE_OP_DEVICE_KERNEL(op_type, CUDA) #endif #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); @@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar { USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); -#define USE_GPU_ONLY_OP(op_type) \ - USE_OP_ITSELF(op_type); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) #define USE_OP(op_type) \ USE_OP_ITSELF(op_type); \ diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f1444eeee90a286b8ab1ed824c6469c08502c1c6..e83d7547831744333d6a9c36e842d840a2a0dc03 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,20 +22,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template <> -Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_.GetEigenDevice(); -} - -#ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice& -ExecutionContext::GetEigenDevice() const { - return *device_context_.GetEigenDevice(); -} -#endif - std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope, } OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.device_context()); + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 60861d92933dd100f877bec8d43f9b924f951e60..e60dbfc313f732120f6879fd6fd19ca8abc06813 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -276,17 +276,25 @@ class ExecutionContext { out_tensor->set_lod(in_tensor.lod()); } - template ::EigenDeviceType> - DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + const platform::DeviceContext& device_context() const { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + //! Get actual name vector for this input. const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); @@ -297,14 +305,6 @@ class ExecutionContext { return op_.Outputs(name); } -#ifdef PADDLE_WITH_CUDA - const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); - return *reinterpret_cast( - &device_context_); - } -#endif - private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 59ddbc77912decf56192e542916ab012c929ed0a..b678178454ff63e4217f0be7a9938a9ba183cda4 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.device_context()); + return OpKernelType(DataType::FP32, ctx.GetPlace()); } }; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 38b89b9eb108d73c3374360a81c6ed28502bfdc5..5aaaf993323c2d4dbef688d0977ec6374fde6512 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -138,7 +138,7 @@ function(op_library TARGET) if ("${TARGET}" STREQUAL "nccl_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") endif() # reduce_op contains several operators diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 2785a8c6fb62527db4d203788be88ebead068a19..76da21c4726a1245241c1cf61860f9c8b62ea452 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index d2dcab4e548b99c6beecfaa570ac31804fd07d82..539a93530206c93a37791a9ccb2fb104af17f940 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index d060e6edddb31ecc1a4d27836f80b8ac5fa7d36d..04104a695fac6a967ad94780e31ba3fdd2ca2eda 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 7f3118f17603d03903b072b248e033cbd3347623..63490f0ec9f4852a3ead574b9d52c807d8ba6d89 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ops::ActivationOpGrad); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 97737857ab25dfa92163b64a750fd7a7d9ea0ac3..856d3fc35dafe6b22c25c55dfda2dc4973072615 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -17,16 +17,17 @@ namespace ops = paddle::operators; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_GPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL); +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index ac0e0a3b01232940a18ff8b1ce6b6504df1299d1..75eefca8b8c7ba8831a2f90c83718d00b83fba30 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ActivationKernel : public framework::OpKernel { public: @@ -32,18 +32,19 @@ class ActivationKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y); + functor(*place, x, y); } }; -template +template class ActivationGradKernel : public framework::OpKernel { public: @@ -59,13 +60,14 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y, dy, dx); + functor(*place, x, y, dy, dx); } }; diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 29434a0ee221d16b1a8b1eb77d7fe373d4b943aa..507811e7b59b9426c599570ead9b42f8d02380fd 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -109,5 +109,5 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu index 9fb61852071f11670b8bc51321bb0881de196777..eee2d0a2f55f877bc5c87c72bca07bfd9485e517 100644 --- a/paddle/operators/adadelta_op.cu +++ b/paddle/operators/adadelta_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adadelta_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h index a8c5f0c8aa20ce506f5279fa696079ba64034bd5..819d0845dbdafab95d993a455013300fa71495e2 100644 --- a/paddle/operators/adadelta_op.h +++ b/paddle/operators/adadelta_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel { framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); auto avg_squared_update_out = framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); avg_squared_grad_out.device(place) = rho * avg_squared_grad + (1 - rho) * grad.square(); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d19602244bc015100f1162317ca046aa2157caf5..5d007163161cd4bf4a9fd46eda57f7984c6a414f 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -100,8 +100,8 @@ size_t FindPos(const std::vector& rows, int64_t value) { } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -120,7 +120,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -144,9 +144,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -164,13 +164,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_CPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 1c870214b29dbfcabb7414317b1214d6bef369cb..585b2d92894af65b8ed15a596f0377fdcf564cfa 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -92,7 +92,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -119,9 +119,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -139,13 +139,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 4d4a6434c7c472d8ceb01edfc4050fbb009d6c9f..0d77dbcbacd4efb6c1900e57b5c4ea9e9b136771 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,15 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param); }; -template +template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = moment + grad * grad; + moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); @@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel { auto* moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); - SparseAdagradFunctor functor; - functor(ctx.device_context(), *ctx.Input("Grad"), + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index a268d05484247628f46ce4c4941d5fb54d999d96..cf6ef6dd53979b23de125014b8d5150d8ce4c053 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -128,6 +128,6 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); -REGISTER_OP_CPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu index 6e34f7818ce20c75692fe21776721ce200b7a147..c135b3737899a1ae92041b4759698ddc30c20e12 100644 --- a/paddle/operators/adam_op.cu +++ b/paddle/operators/adam_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adam_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 7f7fa1da1c0d8d81d1bcb18a1bf542838eddccf7..45157842a6f92348909498f83d304d53b36c7d47 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); + moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; + moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); // All of these are tensors of 1 element auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); // Eigen does not support automatic broadcast // Get dimensions of moment vector to broadcast lr_t Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment1_out / (moment2_out.sqrt() + epsilon)); diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 9e7576c961bf72f9d9ed6e69f1f3ba121c243dbe..49ce497bb710de24b198fb4b5f56ff6d277c6f52 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -127,6 +127,6 @@ division by 0 error. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu index 057ef39025aa23704457ef7bbe54934d06cdc87f..2d143905c4819dbf5f94391bdcf093971849e7a3 100644 --- a/paddle/operators/adamax_op.cu +++ b/paddle/operators/adamax_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adamax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index bf36ed78604dd88c537db51fbeb38f43d0c46173..172c179c5fabf5ca106bf11479aff2d94a4e21d2 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel { auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); auto inf_norm_out = framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(place) = + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); auto lr_t = lr / (1 - beta1_pow); Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index e5ac57b038ac32ed35bce35e477ede0cdb5da813..b80509e2a99a2a255dff2a98d950257588a21d29 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -25,7 +25,7 @@ template using EigenVector = framework::EigenVector; -template +template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index ac97bd83ab7e7838871586cfe5acb832084b6cec..94a972b7ab56f41f8b6a203b6bf0330a69f84e54 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -135,7 +135,8 @@ The required data format for this layer is one of the following: }; template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const float epsilon = ctx.Attr("epsilon"); @@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.device_context()); + ctx.GetPlace()); } }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -436,8 +437,9 @@ class BatchNormGradKernel namespace ops = paddle::operators; REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, batch_norm_grad, ops::BatchNormGradOp); -REGISTER_OP_CPU_KERNEL(batch_norm, - ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 7b2f3187007fa2491afa75de1cde1910c6ce9bb8..c7adc3d80ed25d129cec41a0fd3d22fd42aba363 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims, } template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -121,11 +122,12 @@ class BatchNormKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - math::SetConstant functor; - functor(ctx.device_context(), saved_mean, 0); - functor(ctx.device_context(), saved_variance, 0); + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. if (is_test) { @@ -171,7 +173,7 @@ class BatchNormKernel : public framework::OpKernel { }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -244,11 +246,12 @@ class BatchNormGradKernel const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); + auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - ctx.cuda_device_context().cudnn_handle(), mode_, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x->template data(), data_desc_, d_y->template data(), data_desc_, + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), d_scale->template mutable_data(ctx.GetPlace()), @@ -266,8 +269,9 @@ class BatchNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(batch_norm, - ops::BatchNormKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 4e80134a1acf3b4d66154453dd0ed709133d1c7c..8d99b6864776e81b30e87c09028b336309cf2838 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) { } } -template +template class BatchNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; }; -template +template class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index c88b2c9beb4497b617078c8ac5582d2f246f43fd..217fd523667777f7d250295d2a036867dac94f04 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu index 858d2668d01379afe8082cd1eda32a2a5d09bd18..0f48010716f086a64c0b6a35b76e06a42430ab84 100644 --- a/paddle/operators/bilinear_tensor_product_op.cu +++ b/paddle/operators/bilinear_tensor_product_op.cu @@ -16,11 +16,15 @@ limitations under the License. */ #include "paddle/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_GPU_KERNEL( + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index 1113a4c6f357edb4f6b14b73c6eec9c6cca24ce5..ba9a2c5ce3c024a82e864a399ad90281d8dcdb20 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class BilinearTensorProductKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel { int out_dim = weight_dims[0]; auto x_dim = weight_dims[1]; auto y_dim = weight_dims[2]; - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: @@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { } }; -template +template class BilinearTensorProductGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto x_mat = EigenMatrix::From(*x); auto y_mat = EigenMatrix::From(*y); auto d_out_mat = EigenMatrix::From(*d_out); - auto place = ctx.GetEigenDevice(); - + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), @@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { ctx.GetPlace()); auto y_scale_mat = EigenMatrix::From(y_scale); - math::SetConstant set_zero; + math::SetConstant set_zero; // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_x, static_cast(0)); + set_zero(dev_ctx, d_x, static_cast(0)); } // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_y, static_cast(0)); + set_zero(dev_ctx, d_y, static_cast(0)); } // Caculate the Output(X@Grad) and Output(Y@Grad). @@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasTrans, - batch_size, x_dim, y_dim, 1, y_scale.data(), - weight_i.data(), 1, d_x->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x_scale.data(), - weight_i.data(), 1, d_y->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(ctx.device_context(), CblasTrans, CblasNoTrans, - x_dim, y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 3082a53ccfbe4f8666cfdfc2efed6b46ffdfede9..42bff69a1e1c051354296d2e2426e1658792a94d 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -68,7 +68,7 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUPlace; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index fb75ddbabfefd8d00420d8c96f958abcb8fdce62..4681deaa62fdcde18f39f4192841be66f49d7c08 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -16,7 +16,7 @@ template using CastOpKernel = - paddle::operators::CastOpKernel; + paddle::operators::CastOpKernel; -REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index 850dc8e3498351e54d41fcd2b6596c6fe668df14..a6773f13a8deb443b022c6045f1b3b976b3e6607 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -27,13 +27,13 @@ struct CastOpTransformFunctor { HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } }; -template +template struct CastOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::DeviceContext& ctx) + const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -42,13 +42,13 @@ struct CastOpFunctor { auto numel = in_->numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(ctx_.GetPlace()); - platform::Transform trans; + platform::Transform trans; trans(ctx_, in_begin, in_end, out_begin, CastOpTransformFunctor()); } }; -template +template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::VisitDataType( static_cast(context.Attr("out_dtype")), - CastOpFunctor(in, out, context.device_context())); + CastOpFunctor( + in, out, context.template device_context())); } }; diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index dd88f2553bb6a204f5c5932d0b2f0e24c39faea7..9cd758a8253914515437b480e17a94d5d6b21fd2 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class ChunkEvalKernel : public framework::OpKernel { public: struct Segment { diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index f73d55bbe3be5c14785fb3865eead97fbaa9f33d..0b7975a63f7d364bf9b0ce529e2dd72d9f3cd2e9 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -71,4 +71,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); REGISTER_OP_CPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu index 2593a24ebbf56ecd286a726e527d2414247576e8..acd75438230715420470b81f7a5e5953bd8b8abe 100644 --- a/paddle/operators/clip_by_norm_op.cu +++ b/paddle/operators/clip_by_norm_op.cu @@ -15,5 +15,6 @@ #include "paddle/operators/clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h index b26476cae9b5b2fa290bc9186b9a64c48ba703d6..d8db1566b0e8c9c351d3b6d6aca1d22d991fe76e 100644 --- a/paddle/operators/clip_by_norm_op.h +++ b/paddle/operators/clip_by_norm_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel { auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto temp = (x_norm <= max_norm).template cast().eval(); auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 4ddf24dea3363432988ae5460bb7b092501ca021..6092212de4635e2ada81f8383a0ccf64a8116158 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -83,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, ops::ClipOpGrad); -REGISTER_OP_CPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_CPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu index ca9701298fdae3fabe234925edaf9e4d775cc66e..bb7dcc671a46758a6bd09e8035cf8d3f5e464b3b 100644 --- a/paddle/operators/clip_op.cu +++ b/paddle/operators/clip_op.cu @@ -15,7 +15,7 @@ #include "paddle/operators/clip_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_GPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h index ac702e9935201ba5263a80ebeb1ab22fa0bd1340..0c40797410950641d3d509a4980d5c4bdbd75cff 100644 --- a/paddle/operators/clip_op.h +++ b/paddle/operators/clip_op.h @@ -55,7 +55,7 @@ class ClipGradFunctor { T max_; }; -template +template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); int64_t numel = x->numel(); - Transform trans; - trans(context.device_context(), x_data, x_data + numel, out_data, - ClipFunctor(min, max)); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); } }; -template +template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel { auto* d_x_data = d_x->mutable_data(context.GetPlace()); const T* d_out_data = d_out->data(); const T* x_data = x->data(); - Transform trans; - trans(context.device_context(), d_out_data, d_out_data + numel, x_data, - d_x_data, ClipGradFunctor(min, max)); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); } } }; diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu index 6ac8c124b9b2e7c808808ecc8802a2e5aeaa5b5d..596a878bcf9f5b81c87c3bd419a2f46c0a450635 100644 --- a/paddle/operators/compare_op.cu +++ b/paddle/operators/compare_op.cu @@ -14,10 +14,10 @@ #include "paddle/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(greater_than, GPU, +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor); -REGISTER_LOGICAL_KERNEL(greater_equal, GPU, +REGISTER_LOGICAL_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index afdf3ab3e098b4e7f4c996471617d97ec49264b1..a56536e155531ac9ea3d17256210bdb9f4212181 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -59,7 +59,7 @@ struct EqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -69,24 +69,23 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>); +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc index ede832ddcd486729db56bba016683b33875f8837..7b46452d3d5db58799923a3dc76bb9df3471d9e7 100644 --- a/paddle/operators/concat_op.cu.cc +++ b/paddle/operators/concat_op.cu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/operators/concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(concat, - ops::ConcatKernel); -REGISTER_OP_GPU_KERNEL( - concat_grad, ops::ConcatGradKernel); +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h index c113f19fb5cf806709bff845ee0f1078b34014bb..de4011585af81363368a096a5c361ff3f7aeecdb 100644 --- a/paddle/operators/concat_op.h +++ b/paddle/operators/concat_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel { } }; -template +template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc index 0dd8c13b2ad6ff206066ccb98a4c009e4c3b4fd0..008bf01885ecddd1fee76a33c43370d07a8988a2 100644 --- a/paddle/operators/conv_cudnn_op.cc +++ b/paddle/operators/conv_cudnn_op.cc @@ -57,18 +57,20 @@ REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker, REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker, conv3d_cudnn_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv2d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv3d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index bc265dcc4f28d3d3f848bebbfed3e0c3e8791b6a..3da0a9001aafbb5b2c4b9a91c4527d9437ac38a1 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -238,7 +239,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( @@ -313,16 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv2d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 462e6d9cbcbe61d9911efe8beff4446620e1e932..7ef805fd44bf94d3279ffa50f86993b3f2b64412 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -235,16 +235,18 @@ namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 546451234a1ed1a4d3119cb175c6d37ae3f0aac1..38615a8befab91633423b7cd8536253a0d049ac3 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,16 +16,18 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 09bff0a68db82aa723dc08aa83c775910e17c5b8..749258183ba058cf0ed8d91c4406813694314b85 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -72,7 +72,7 @@ class ConvOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -141,9 +141,10 @@ class GemmConvKernel : public framework::OpKernel { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + auto& dev_ctx = context.template device_context(); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); @@ -157,27 +158,26 @@ class GemmConvKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { // im2col - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { // vol2col - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), filter_slice, false, - col_matrix, false, T(1.0), &out_slice, T(0.0)); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } }; -template +template class GemmConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -256,14 +256,15 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = @@ -282,18 +283,17 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - math::matmul(context.device_context(), filter_slice, true, - out_grad_slice, false, T(1.0), &col_matrix, - T(0.0)); + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); if (is_expand && data_dim == 2U) { - col2im(context.device_context(), col, dilations, strides, + col2im(dev_ctx, col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &in_grad_slice); } else if (is_expand && data_dim == 3U) { - col2vol(context.device_context(), col, dilations, strides, paddings, - &in_grad_slice); + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); } } } @@ -303,9 +303,9 @@ class GemmConvGradKernel : public framework::OpKernel { filter_grad->mutable_data(context.GetPlace()); Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); - set_zero(context.device_context(), filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_matrix_shape); @@ -321,21 +321,20 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), out_grad_slice, - false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 95e13c38a8dd234f49393d2d4808607a447b0d4c..f7ca82ce2635f9ef9d7e9a062d148448e61c163c 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, } // namespace template -class ConvShiftKernel : public framework::OpKernel { +class ConvShiftKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -132,7 +133,8 @@ class ConvShiftKernel : public framework::OpKernel { dim3 grid_dim(num_x_blocks, batch_size); - auto stream = context.cuda_device_context().stream(); + auto stream = + context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -140,7 +142,7 @@ class ConvShiftKernel : public framework::OpKernel { }; template -class ConvShiftGradKernel +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -159,8 +161,9 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -186,8 +189,9 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( conv_shift_grad, - ops::ConvShiftGradKernel); + ops::ConvShiftGradKernel); diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h index 5a160b0f1696c70868fc48d219b38cde2018e8a3..1a70b38a0d8cb82ad1f818148306b7ec5f334744 100644 --- a/paddle/operators/conv_shift_op.h +++ b/paddle/operators/conv_shift_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 0192178ce3a0a47196232f0723baec8324bea60b..4cb6a2ccffc76066ea0868f76ba2a3bfb9e5e450 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -61,12 +61,13 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, @@ -74,9 +75,10 @@ REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index 494904fe524ae30a5032e489a0c5f20179d8e8ce..f0297f6c40c132c28b50184997d657451f26362b 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -83,7 +83,8 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionBwdDataAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); // Get the algorithm PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, @@ -165,7 +166,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( @@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 678b192dea78fc6b4a6b54c4bb09a55dfb8f9c38..ca063e94bbe64817567a298c3b1ad9306667536d 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -197,21 +197,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 4165eb0c7b048b83bbd94c57b971530043b66545..b91ebd7922f2e101df8d6ef5892a62ec5a10cf99 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -16,20 +16,24 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 1cacb770e6af3ad3c99ab81c5598ffcd228f59b2..80600b53614994ba0c740aed0d75c9944333fecc 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -52,7 +52,7 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvTransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -109,11 +109,12 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; - set_zero(context.device_context(), output, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; std::vector dilations({1, 1, 1}); // convolution transpose: gemm + col2im or col2vol (similar to conv-backward @@ -127,29 +128,27 @@ class GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - math::matmul(context.device_context(), filter, true, - input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(context.device_context(), col, - std::vector{dilations[0], dilations[1]}, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, + strides, std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); } else if (data_dim == 3U) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, paddings, - &output_batch); + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); } } } }; -template +template class GemmConvTransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -206,6 +205,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) // input need to compute gradient + auto& dev_ctx = context.template device_context(); if (input_grad || filter_grad) { Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -217,19 +217,19 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + math::SetConstant set_zero; - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; std::vector dilations({1, 1, 1}); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); } if (filter_grad) { // filter size (m, c, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_grad, static_cast(0)); + set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); } @@ -242,7 +242,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(context.device_context(), output_grad_batch, + im2col(dev_ctx, output_grad_batch, std::vector{dilations[0], dilations[1]}, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, @@ -250,8 +250,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } else if (data_dim == 3U) { // vol2col: dy -> col_matrix // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) - vol2col(context.device_context(), output_grad_batch, dilations, - strides, paddings, &col); + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); } if (input_grad) { @@ -263,9 +263,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - math::matmul(context.device_context(), filter, false, - col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -275,9 +275,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - math::matmul(context.device_context(), in_batch, false, - col_matrix, true, static_cast(1.0), - &filter_grad_, static_cast(1.0)); + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 312264ccd48d1405a247a2c864d9f5897c897bea..440c427cba9396ec6d0ebf7814d671e45f45412d 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL(cos_sim, - ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 0cb8fd26de47a4a464db98664263544e3e503d63..1cb01f5945f691747bac609ca4a93e2d15cde5bf 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cos_sim, - ops::CosSimKernel); -REGISTER_OP_GPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 62a4e484eceeabc4cc26e68ac54a50be1ac95df7..fecb5a79b2397dd73d991a1a87efcf84d60ef882 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class CosSimKernel : public framework::OpKernel { auto y_norm = EigenVector::Flatten(*out_y_norm); // compute - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto row_along = Eigen::array({{1}}); x_norm.device(place) = x.square().sum(row_along).sqrt(); y_norm.device(place) = y.square().sum(row_along).sqrt(); @@ -66,7 +67,7 @@ class CosSimKernel : public framework::OpKernel { } }; -template +template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -96,7 +97,8 @@ class CosSimGradKernel : public framework::OpKernel { auto z_bcast = z.broadcast(bcast_cols); auto dz_bcast = dz.broadcast(bcast_cols); auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); if (rows_x == rows_y) { auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 291b23ed1b857a288beca31956355706d987ea3c..1ce189fa6ebba3712467572c55d599975bbe7534 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -135,5 +135,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); REGISTER_OP_CPU_KERNEL( - crf_decoding, ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 57b5e21b3ab20ffb0f5589af299bf43a5ef5ccb1..f6827b7b1128251b2bb7e0a6a032389e5adc1371 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -24,7 +24,7 @@ using framework::LoDTensor; using framework::LoD; using framework::Tensor; -template +template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -44,8 +44,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t seq_num = lod[level].size() - 1; int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - decoded_path, 0); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 6752eb8c1c72150b0b1cf5595211ca1d01ef2bf4..7c2a0ac7a705e5aac3d181545f8dfc8881e811f2 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -133,5 +133,5 @@ class CropOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f8ee18a1d6e894cbb2d71dd4b6b459abeb076817..90fd83ca10b750896a9fe144d3c30fabb2f54e0a 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_GPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 2e72583d68d0acf0e2f5044637dba55de3b57209..d531a19c783d2768d24142bb7b974ccfc2b39350 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel { } }; -template +template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { @@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) { } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device(context.GetEigenDevice()) = + d_x_tensor.device( + *context.template device_context().eigen_device()) = d_out_tensor.pad(paddings, 0); } } -template +template class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - CropGradFunction(context); + CropGradFunction(context); break; case 2: - CropGradFunction(context); + CropGradFunction(context); break; case 3: - CropGradFunction(context); + CropGradFunction(context); break; case 4: - CropGradFunction(context); + CropGradFunction(context); break; case 5: - CropGradFunction(context); + CropGradFunction(context); break; case 6: - CropGradFunction(context); + CropGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 6212e39dfde33c5943958adbd1a0a052262e119e..05469645880fa466a2a3324ad1b7a8b9d681c440 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, label, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); } }; @@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (batch_size * class_num + block - 1) / block; - auto stream = ctx.cuda_device_context().stream(); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, batch_size, class_num); } else { - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<>>( @@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 37db0a930a6aea0ba333395ca9c5b9d231c07b32..5623d2ded16daaf51dd26c9d9a8c04a0ae5be5ec 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, labels, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); } }; @@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*label); auto dx_mat = EigenMatrix::From(*dx); - dx_mat.device(ctx.GetEigenDevice()) = + dx_mat.device(*ctx.template device_context() + .eigen_device()) = -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { @@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { const T* x_data = x->data(); const int64_t* label_data = label->data(); - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index 640b4e77448d1b64bcf7375f26c07ff1d2bdeaa3..fd29c7270b0442da740a74f83fdfeed8f47f830d 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -99,4 +99,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); REGISTER_OP_CPU_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu index 6fce77fe4ec6b76cb7b0259aab6a3d55d2edb36c..282b90f275ad1542d5941e001dbf646348fc01b6 100644 --- a/paddle/operators/decayed_adagrad_op.cu +++ b/paddle/operators/decayed_adagrad_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h index 0fe0fc5acd66c9824a864618b69097c5c063ea3f..fec9705cfc1e14e5423e23d6afb218c6c051f5a1 100644 --- a/paddle/operators/decayed_adagrad_op.h +++ b/paddle/operators/decayed_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 932c0bf8fbf6ffdc466516bb7c8578abf0f57209..acd526ae8047292ce6c6756f174c80053dca0d9f 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -100,6 +100,8 @@ namespace ops = paddle::operators; REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index db3578b9bf4c081e431f202f0828ec6392c924b2..10c670751d026ef92e01aad7da31a8f59b8514c0 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -58,7 +58,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -80,7 +80,9 @@ class GPUDropoutKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - dropout, ops::GPUDropoutKernel); -REGISTER_OP_GPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index d9a130fdc040f745b058c39221f0bb9661473388..84ad39f0bb639975365d427aa205411ef79ecd46 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -25,7 +25,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CPUDropoutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -55,13 +55,14 @@ class CPUDropoutKernel : public framework::OpKernel { } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Y.device(place) = X * dropout_prob; } } }; -template +template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel { auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); dX.device(place) = dY * M; } }; diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 432b9ba6f72f8dd11c666d5473c570bde60de995..a62eeeeb95fef77c00258403ca1cae11c2db7173 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, elementwise_add_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu index 7591428ac7c2f74f25f0f7d818eafcf59c8e4a4f..78642bb4246e7328dd3e2d902aca88615d598ddf 100644 --- a/paddle/operators/elementwise_add_op.cu +++ b/paddle/operators/elementwise_add_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index 921dc5f6a69a01f40de66f77680803cadf7ef537..069bdaf0ab7469b0a814ca5f68b444b9ce4904f1 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -24,7 +24,7 @@ struct AddFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } }; -template +template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -34,8 +34,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - TransformFunctor, T, Place> functor( - x, y, z, ctx.device_context(), AddFunctor()); + TransformFunctor, T, DeviceContext> functor( + x, y, z, ctx.template device_context(), AddFunctor()); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -137,11 +137,11 @@ struct ElementwiseAddBroadCast2GradFunctor { } }; -template +template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseAddOneGradFunctor, ElementwiseAddBroadCastGradFunctor, ElementwiseAddBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 7a325199bd07e44042a4e8b3aae0ab93fae1c351..1c3e9e70eef0c1adfb89cf1a58437092f8d536d7 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -35,13 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu index de4d0c33442a1fcfe0dd4c16df7ceeec737fbc6d..502c52893667e246a19bb04c8bf3ed3df3265f2d 100644 --- a/paddle/operators/elementwise_div_op.cu +++ b/paddle/operators/elementwise_div_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 8946ff3d25c2aff3dc3aa69368f0083371cd2fef..d91313db4225d8fe051856345367a15867bdf215 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -19,11 +19,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -102,11 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor { } }; -template +template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseDivGradFunctor, ElementwiseDivBroadCastGradFunctor, ElementwiseDivBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 8851267a524f51773a9f86ff83943cea4cb042aa..aadb95cbe35fe565cf1009f0f9765def921d0906 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -36,13 +36,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index b0dfdee1ccef56c6cda06ae6759017294fa5115c..089451b3e1288b3adc689a3c7d9fea2bc5243407 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4469b07eaa08a3b011a88e58f1d645dd30b10ced..16fa5ec4b3a369805acb401bae5407072101af8d 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor { } }; -template +template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseMulGradFunctor, ElementwiseMulBroadCastGradFunctor, ElementwiseMulBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index ca3542e7838219090d2b6634a043856daae83e9c..7ebfc7df8c117edd7bcf14cc5ae6ba3dc1302c03 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -59,17 +59,17 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -template +template class RowwiseTransformIterator; -template +template class MidWiseTransformIterator; template -class RowwiseTransformIterator { +class RowwiseTransformIterator { public: RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - RowwiseTransformIterator& operator++() { + RowwiseTransformIterator& operator++() { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -77,13 +77,13 @@ class RowwiseTransformIterator { return *this; } - bool operator==( - const RowwiseTransformIterator& rhs) const { + bool operator==(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const RowwiseTransformIterator& rhs) const { + bool operator!=(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -96,12 +96,12 @@ class RowwiseTransformIterator { }; template -class MidWiseTransformIterator { +class MidWiseTransformIterator { public: MidWiseTransformIterator(const T* ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - MidWiseTransformIterator& operator++() { + MidWiseTransformIterator& operator++() { ++j_; i_ = j_ / post_; if (UNLIKELY(i_ == n_)) { @@ -111,13 +111,13 @@ class MidWiseTransformIterator { return *this; } - bool operator==( - const MidWiseTransformIterator& rhs) const { + bool operator==(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const MidWiseTransformIterator& rhs) const { + bool operator!=(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -133,12 +133,12 @@ class MidWiseTransformIterator { #ifdef __NVCC__ template -class RowwiseTransformIterator +class RowwiseTransformIterator : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> { + RowwiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> + RowwiseTransformIterator, const T*> super_t; HOSTDEVICE RowwiseTransformIterator(const T* x, int n) : super_t(x), begin_(x), n_(n){}; @@ -153,12 +153,12 @@ class RowwiseTransformIterator }; template -class MidWiseTransformIterator +class MidWiseTransformIterator : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> { + MidWiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> + MidWiseTransformIterator, const T*> super_t; HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post){}; @@ -174,12 +174,11 @@ class MidWiseTransformIterator }; #endif -template +template class TransformFunctor { public: TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const platform::DeviceContext& ctx, - Functor func) + framework::Tensor* z, const DeviceContext& ctx, Functor func) : x_(x->data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), @@ -188,20 +187,20 @@ class TransformFunctor { func_(func) {} inline void Run() const { - platform::Transform trans; + platform::Transform trans; trans(ctx_, x_, x_ + nx_, y_, z_, func_); } inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), z_, - func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); } inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator(y_, n, post), - z_, func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); } private: @@ -209,22 +208,24 @@ class TransformFunctor { const T* y_; T* z_; int64_t nx_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; Functor func_; }; #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ - template \ + template \ inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ framework::Tensor* z, \ const framework::ExecutionContext& ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ } \ - template \ + template \ inline void RunBroadCast(const framework::Tensor* x, \ const framework::Tensor* y, framework::Tensor* z, \ const framework::ExecutionContext& ctx, int pre, \ @@ -235,9 +236,11 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ .broadcast(Eigen::DSizes(pre, 1)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ - template \ + template \ inline void RunBroadCast2(const framework::Tensor* x, \ const framework::Tensor* y, \ framework::Tensor* z, \ @@ -249,11 +252,13 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ .broadcast(Eigen::DSizes(pre, 1, post)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ } -template +template void ElementwiseCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -269,7 +274,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { if (x_dims == y_dims) { functor f; - f.template Run(x, y, z, ctx); + f.template Run(x, y, z, ctx); return; } @@ -282,11 +287,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { get_mid_dims(x_dims, y_dims, axis, pre, n, post); if (post == 1) { functor f; - f.template RunBroadCast(x, y, z, ctx, pre, n); + f.template RunBroadCast(x, y, z, ctx, pre, n); return; } else { functor f; - f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); return; } } @@ -303,8 +308,9 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) EIGEN_FUNCTOR(Div, EIGEN_DIV); -template +template void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -313,7 +319,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 95d7979e39bfe7b484acb7771d1bd078014293a2..3e4d19361ead0100e45e50880d402e3d2b8557ff 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, elementwise_sub_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu index ec23bec35feae26f5463c575b1ab6f58d417e100..0b2f0f7d4d98f1336087f9fc3fc485ed8d805b5f 100644 --- a/paddle/operators/elementwise_sub_op.cu +++ b/paddle/operators/elementwise_sub_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index 3f40c1c5bcea5e8473765b039de4ee2a16054f0c..731a30c5e30d3f9bbdbabd62e5d9a77559500b06 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseSubBroadCast2GradFunctor { } }; -template +template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseSubOneGradFunctor, ElementwiseSubBroadCastGradFunctor, ElementwiseSubBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 282775fcda45fe3bbd72bf04a7ae828f2c840ab7..8b3cddbb944de250d5754a2be64dd8e7ec53003a 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -130,7 +130,8 @@ class ExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, ops::ExpandGradOp); -REGISTER_OP_CPU_KERNEL(expand, - ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( - expand_grad, ops::ExpandGradKernel); + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu index 6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5..99ee584d0859f9bf688899cc9b346d221415518c 100644 --- a/paddle/operators/expand_op.cu +++ b/paddle/operators/expand_op.cu @@ -17,7 +17,8 @@ #include "paddle/operators/expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(expand, - ops::ExpandKernel); -REGISTER_OP_GPU_KERNEL( - expand_grad, ops::ExpandGradKernel); +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 4d7996ad1e744fead1329c35ce6ea43bf0683ce6..14ef8b0912860f7ec39535997c39d6d4c4970650 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -56,7 +56,7 @@ template using EigenTensor = framework::EigenTensor; -template +template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel { auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); } }; -template +template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -164,7 +165,8 @@ class ExpandGradKernel : public framework::OpKernel { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device(context.GetEigenDevice()) = + x_grad.device( + *context.template device_context().eigen_device()) = out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); } }; diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 892922cd3aaec8bf8194320c5c3a0dd0365bb589..7fb74e2b950338fbd05515f844959862504eddce 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -100,8 +100,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 9e7a1eeab863c962ca72908e561e12a04d5021c5..2e0e15f36bb2e0ffd33dc6d1d25965d0cbe33186 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -16,10 +16,13 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h index 339d97a30a5819ab488e83990651ba99212239ec..66da9d0307e36db3726f30518c8c57a923e54388 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.h +++ b/paddle/operators/fill_constant_batch_size_like_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto value = ctx.Attr("value"); - math::SetConstant setter; - setter(ctx.device_context(), out, static_cast(value)); + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); } }; diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 95fb5932b8b555e1357adc9fdfb7b6e6db7da71d..720c11f5f12a8dea971fe82db6afe8f6b0d9ee1a 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,8 +54,9 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 1501a17441072223ba0e8cf5b6c8cdd5e903a467..9f412306bb5f08497990f0e0385f695d838c2400 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -16,9 +16,10 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 7e7d78eea2bce427d6ad4dfb77bcb4ace35cd287..a6e2941f52150de7886717303d2cb2f10b7eef7b 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -19,15 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* out = context.Output("Y"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.device_context(), out, static_cast(0)); + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc index cb7ae6919623f10a6c4ec98c0e942c1590ac9a7a..b14913ff213c84051b5a945f4a470cea4039a289 100644 --- a/paddle/operators/ftrl_op.cc +++ b/paddle/operators/ftrl_op.cc @@ -135,5 +135,5 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu index 97b36dade6f531df49615ae2d44d565eadba7154..abbbe7adbe6bd14f55f7f941c5e6740fada24910 100644 --- a/paddle/operators/ftrl_op.cu +++ b/paddle/operators/ftrl_op.cu @@ -15,5 +15,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h index b040162f8d1d8998aa13021c10a25fe57135c1e9..4eea04cd8d61bb34fc612e0ca1765a664e329ca9 100644 --- a/paddle/operators/ftrl_op.h +++ b/paddle/operators/ftrl_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -53,7 +53,7 @@ class FTRLOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto s_acc_out = EigenVector::Flatten(*sq_accum_out); auto l_acc_out = EigenVector::Flatten(*lin_accum_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h index 8d04ecd284226c7b4c6cdd5531915fee2d94ce61..c806aa5f05ad214abb3484935d82b67880a1db7a 100644 --- a/paddle/operators/gather.cu.h +++ b/paddle/operators/gather.cu.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using platform::Place; +using platform::DeviceContext; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu index 92219d6a433e6db0bb9886ed8670cbafaa843ff8..b37f0576e276b2aa995f01de635ec153a0db36aa 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/gather_op.cu @@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); @@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h index 8276ed0d3d8b676aafab45fae70942e78b72b8e6..1a1ba0c41aef95d3dc8cc929db72770a7bd08b18 100644 --- a/paddle/operators/gather_op.h +++ b/paddle/operators/gather_op.h @@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); ScatterAssign(ctx.device_context(), *dO, *Index, dX); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 315560bf1ba8a66b9a3b7d79510d202885e845d6..ffce6f713816abe7d1f207f141a1b0933574e2ff 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 5aa03f8916a67222fb0ca5781533766063e52683..311e7edcf1519bc706a51e4d9242a1ebee5168ca 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -213,8 +213,9 @@ class GRUGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc index 0ceff94ec3ddaadbd5f0ca4f5a4eebe6cb8ee3a9..458630ca6187ec89638046d8eea63c31eca518f2 100644 --- a/paddle/operators/gru_op.cu.cc +++ b/paddle/operators/gru_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_GPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 564489d3a98b59e3e527be5613a73d23d6dbbf31..6d02dff578846904beeb58c5161d27c7c2ed5d70 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -27,16 +27,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -60,12 +60,12 @@ class GRUKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } @@ -80,8 +80,9 @@ class GRUKernel : public framework::OpKernel { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); gru_value.prev_out_value = ordered_h0.data(); } else { gru_value.prev_out_value = nullptr; @@ -99,14 +100,14 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, *hidden); } @@ -116,7 +117,7 @@ class GRUKernel : public framework::OpKernel { } }; -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -141,14 +142,14 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; - auto& dev_ctx = context.device_context(); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); @@ -156,12 +157,13 @@ class GRUGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_h0_grad; const size_t* order = batch_gate->lod()[2].data(); if (h0) { - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); } if (h0_grad) { ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); - zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); } bool is_reverse = context.Attr("is_reverse"); @@ -216,25 +218,25 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - math::GRUUnitGradFunctor::compute( + math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { - ReorderInitState(context.device_context(), ordered_h0_grad, - order, h0_grad, false); + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); } } diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 877c969103cfc17e1b170449d1922d9c7db2a58b..705de87be5b67fbc343a89eeba2282941b264c8a 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -201,9 +201,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, ops::GRUUnitGradOp); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); REGISTER_OP_CPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu index 821c8c6421771bd99474b0b2f8aa2acf04697779..7c752db494b59c3ec2af093332777ce6655fb477 100644 --- a/paddle/operators/gru_unit_op.cu +++ b/paddle/operators/gru_unit_op.cu @@ -16,9 +16,10 @@ #include "paddle/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_GPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 3398c0934e250cfc292776d08773204bb9b4d87e..8fe60c750da0a42089dc38190d2dda3d08e5ba06 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -34,7 +34,7 @@ using EigenVector = framework::EigenVector; enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -71,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel { auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); // calculate unactivated gate outputs if (bias) { @@ -86,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - math::gemm(context.device_context(), false, false, batch_size, - 2 * frame_size, frame_size, 1, hidden_prev_data, - frame_size, weight_data, frame_size * 2, 1, gate_data, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -102,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm(context.device_context(), false, false, batch_size, - frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, - frame_size, 1, gate_data + frame_size * 2, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -118,7 +119,7 @@ class GRUUnitKernel : public framework::OpKernel { } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template @@ -166,7 +167,8 @@ class GRUUnitGradKernel : public framework::OpKernel { auto d_h = EigenMatrix::From(*hidden_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; @@ -186,11 +188,11 @@ class GRUUnitGradKernel : public framework::OpKernel { ActGradCompute(context.Attr("activation"), place, c, c, d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size, 1, - gate_grad_data + frame_size * 2, frame_size * 3, - weight_data + frame_size * frame_size * 2, frame_size, - 0, reset_hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); @@ -198,17 +200,18 @@ class GRUUnitGradKernel : public framework::OpKernel { if (weight_grad) { T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); // backward for state_weight - math::gemm( - context.device_context(), true, false, frame_size, frame_size, - batch_size, 1, reset_hidden_prev_data, frame_size, - gate_grad_data + frame_size * 2, frame_size * 3, 0, + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, weight_grad_data + frame_size * frame_size * 2, frame_size); // backward for update_gate_weight and reset_gate_weight - math::gemm(context.device_context(), true, false, frame_size, - frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, - weight_grad_data, frame_size * 2); + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); } // backward for hidden_prev if (hidden_prev_grad) { @@ -216,10 +219,11 @@ class GRUUnitGradKernel : public framework::OpKernel { hidden_prev_grad->mutable_data(context.GetPlace()); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, - hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); } // backward for input if (input_grad) { diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc index 1e13897bb6990ad78ec8a752700e3c43137393ae..373b4d99b47f2a8ab06c7584a25acee59b6f3e3b 100644 --- a/paddle/operators/hinge_loss_op.cc +++ b/paddle/operators/hinge_loss_op.cc @@ -106,8 +106,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL(hinge_loss, - ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); REGISTER_OP_CPU_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu index ec20b08e30d80acb376c6b5f4c076b43963216cf..31a5bde292ebcab899ad05a813c685963dd5bc25 100644 --- a/paddle/operators/hinge_loss_op.cu +++ b/paddle/operators/hinge_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/hinge_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h index c0be496f9cde956aaefa1d1909dfd8319fc6c1fe..91369cfb8a5d4f40be9e6249b50079ba2b550003 100644 --- a/paddle/operators/hinge_loss_op.h +++ b/paddle/operators/hinge_loss_op.h @@ -19,14 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class HingeLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* pred = context.Input("Logits"); auto* label = context.Input("Labels"); auto* loss = context.Output("Loss"); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); @@ -38,7 +39,7 @@ class HingeLossKernel : public framework::OpKernel { } }; -template +template class HingeLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -48,7 +49,8 @@ class HingeLossGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss")); auto* dpred = context.Output(framework::GradVarName("Logits")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 938803d5b36177c782fe40bc34fd92504e5bbf7b..11828d083a55f0a38cf3b8513b7395bbb5592581 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL(huber_loss, - ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu index 317321dc6c495f6e9a8808d841c71bfa26b754d0..d49a4d9d4236c402f2559c5a0a5de097c2edc61f 100644 --- a/paddle/operators/huber_loss_op.cu +++ b/paddle/operators/huber_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/huber_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(huber_loss, - ops::HuberLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h index 4e7bc5543226e19fe0d6190171cdd9c2b3d2d985..4dd20e8b080ab8bd2e61830241d64ee8546a80ec 100644 --- a/paddle/operators/huber_loss_op.h +++ b/paddle/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel { auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); auto delta = static_cast(context.Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); @@ -85,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); auto delta = static_cast(context.op().Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto residual = EigenVector::Flatten(*in0); auto out_grad = EigenVector::Flatten(*in1); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 02ebf022968e95d0b20598d3c935fb51177c8841..c0b51202c6bb708a682568175c56583394961535 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$ namespace ops = paddle::operators; REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, - ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu index 1c206e04ccbb5f4c2cb9d45aef7bac17c62d55c5..fd725f86f6c98c5aff844546361d8599ea3527ab 100644 --- a/paddle/operators/l1_norm_op.cu +++ b/paddle/operators/l1_norm_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/l1_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(l1_norm, - ops::L1NormKernel); -REGISTER_OP_GPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h index 3c60dc3dc7415f34ed9d238e6f41b197ec404883..ae3878f2b7b079027a9e9145cefa9eae6b22ffbc 100644 --- a/paddle/operators/l1_norm_op.h +++ b/paddle/operators/l1_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); out.device(place) = x.abs().sum(); } }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel { auto x_eigen = framework::EigenVector::Flatten(*x); auto d_out_eigen = framework::EigenVector::Flatten(*d_out); auto dx_eigen = framework::EigenVector::Flatten(*dx); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 8e079a14e0a15e8ff803b6087e6b0b02083479ef..896e3657d4406c5a1fe07f1712abb2ff0370fd3c 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -261,9 +261,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu index 6fc8995f4c2ce05f89ffb58129695113f89159fa..3b105ec3414b5d63946331319d0f47a38e7908cc 100644 --- a/paddle/operators/linear_chain_crf_op.cu +++ b/paddle/operators/linear_chain_crf_op.cu @@ -16,11 +16,12 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 014bbfa7580011e38a2f546e30d1e584965a7815..694584e79c3a1e818814a4a2145f52d8db7cf10a 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -50,7 +50,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context() + .eigen_device(); auto x = EigenMatrix::From(*emission_weights); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { } }; -template +template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - transition_grad, 0.); + math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. @@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - BackwardOneSequence(ctx.device_context(), ll_grad[i], - one_seq_emission_exps, *transition_exps, - one_seq_alpha, one_seq_label, &one_seq_beta, - transition_grad, &one_seq_emission_grad); + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); } if (platform::is_gpu_place(ctx.GetPlace())) { @@ -441,8 +441,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { copyTensor(ctx, transition_grad_src, transition_grad_dst); } - void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad, - const Tensor& emission_exps, + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha, const Tensor& label, Tensor* beta, Tensor* transition_grad, @@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto alpha_mat = EigenMatrix::From(alpha); auto beta_mat = EigenMatrix::From(*beta); - auto* place = ctx.GetEigenDevice(); + auto* place = ctx.eigen_device(); auto prob = alpha_mat * beta_mat; auto row_sum = prob.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(seq_length, 1)) diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu index 5244a17c3aad01909e3b8cf5f4d5abf8a44edc7f..f7c235898096ffb3d6ba039cb3f01d5bc9ef5364 100644 --- a/paddle/operators/lod_reset_op.cu +++ b/paddle/operators/lod_reset_op.cu @@ -16,9 +16,10 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_GPU_KERNEL( - lod_reset_grad, ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index cbcbf80adc3cf68f9eb28bbe2a69168cc8798347..b86f8b13135fa809ade3b001434eda5d88375c2c 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class LoDResetKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -65,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel { } }; -template +template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc index 257e5c8a49e935dcbdc33e5060118ef1804fa8d7..4524229a330a0ceddca673e2b2a6d836a15a2e3f 100644 --- a/paddle/operators/log_loss_op.cc +++ b/paddle/operators/log_loss_op.cc @@ -109,7 +109,8 @@ class LogLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL(log_loss, - ops::LogLossKernel); REGISTER_OP_CPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu index 6c189ef3412d7a56205502c7913e93218a03b929..e87ac7d12a2b730085b4e9a33457612c4eba2655 100644 --- a/paddle/operators/log_loss_op.cu +++ b/paddle/operators/log_loss_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/log_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(log_loss, - ops::LogLossKernel); -REGISTER_OP_GPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h index 73404fce9157fa750a51451fa93646bc4059481a..743eddb74004b5e87ed9b8a6ccb1b8496b8548dc 100644 --- a/paddle/operators/log_loss_op.h +++ b/paddle/operators/log_loss_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class LogLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -38,7 +38,7 @@ class LogLossKernel : public framework::OpKernel { auto label = EigenVector::Flatten(*ctx.Input("Labels")); auto loss = EigenVector::Flatten(*loss_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); loss.device(place) = (-(label * (prediction + epsilon).log()) - ((static_cast(1) - label) * @@ -46,7 +46,7 @@ class LogLossKernel : public framework::OpKernel { } }; -template +template class LogLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -59,7 +59,7 @@ class LogLossGradKernel : public framework::OpKernel { auto* dpred = ctx.Output(framework::GradVarName("Predicted")); auto dl = EigenVector::Flatten(*dloss); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); if (dpred) { dpred->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu index d41239b2ca43e7145ea56afcb0af69948838cc48..7fef60e0c9e957f28118e54d23c6043752d2f52f 100644 --- a/paddle/operators/logical_op.cu +++ b/paddle/operators/logical_op.cu @@ -14,11 +14,11 @@ #include "paddle/operators/logical_op.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU, +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h index 6e78a7d6ed87ba950886e6bc667f82118ff78904..629388cac81e60c8b84197238018384ffc59a08f 100644 --- a/paddle/operators/logical_op.h +++ b/paddle/operators/logical_op.h @@ -47,7 +47,7 @@ struct LogicalXorFunctor { } }; -template +template class BinaryLogicalOpKernel : public framework::OpKernel { public: @@ -57,14 +57,14 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; -template +template class UnaryLogicalOpKernel : public framework::OpKernel { public: @@ -73,8 +73,9 @@ class UnaryLogicalOpKernel auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), unary_func); } }; @@ -85,9 +86,9 @@ class UnaryLogicalOpKernel #define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 84b044184a36a0d3a72a4105d6baf401b4774cf7..9431030a53975acafe9bcb22dc9164492929b07a 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -85,6 +85,8 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); bool is_sparse = context.Attr("is_sparse"); if (is_sparse) { auto* ids = context.Input("Ids"); @@ -95,7 +97,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto* ids_data = ids->data(); auto ids_dim = ids->dims(); - auto stream = context.cuda_device_context().stream(); + auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); @@ -129,14 +131,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { T* d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad< - T, 128, 8, - 8><<>>( + LookupTableGrad<<>>( d_table, d_output, ids, N, K, D); } } @@ -146,7 +145,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index e20340e77bf03a940634d28369fd61a1d7881c8b..b5b7bc940a85ac2bbb6c6b303284777df714b7d6 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -20,7 +20,7 @@ namespace operators { using framework::Tensor; template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -55,11 +55,11 @@ struct LRNFunctor { out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -113,8 +113,8 @@ struct LRNGradFunctor { } } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; class LRNOp : public framework::OperatorWithKernel { public: @@ -204,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represents an element in an image. C is the number of feature maps of one image. n is a hyper-parameter configured when operator is initialized. The sum in the denominator is the sum of the same positions in the neighboring maps. - + )DOC"); } }; @@ -230,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index e9a86712333c614ff558c539058f7d586bcbcdc8..c6857c2b6d0a9011ef83d115e6edd81bf2f8a0ca 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -69,19 +69,18 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormFillScale< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha); int input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; - KeCMRNormOutput< - T><<>>( + KeCMRNormOutput<<>>( input_size, inputs, mid, -beta, outputs); } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -92,8 +91,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, @@ -148,14 +147,14 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormDiff< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, 2.0f * alpha * beta); } template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -167,12 +166,13 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_GPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index aa7539db4a28dbce3c37fae011bb26c596239e70..44063d3e036809eb236bbe7c46aa0cce06b46df0 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -29,7 +29,7 @@ struct LRNFunctor { T k, T alpha, T beta); }; -template +template class LRNKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -65,12 +65,12 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - LRNFunctor f; + LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; -template +template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -98,7 +98,7 @@ struct LRNGradFunctor { * The upper and lower is the same as forward. The logic of the sum * is also the same as forward. */ -template +template class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -121,7 +121,7 @@ class LRNGradKernel : public framework::OpKernel { T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - LRNGradFunctor f; + LRNGradFunctor f; f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index fa8e5f2da8d3ba7fc218f3c836270139fc6c4882..2db7da30db416e03cf473c8e65b023d9265e9193 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -273,8 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc index 610cbb03e890203407b1489800bc17f1a196d12c..48519bed6f7d927b40d02683a7e9f2acfb8b85e5 100644 --- a/paddle/operators/lstm_op.cu.cc +++ b/paddle/operators/lstm_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_GPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index a78f548aafbcdb3834a1ed56f9b31143fae37386..14abd4bf0a6e73a9c0f000f53a5e1e380f01d1c0 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -24,16 +24,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,8 +52,8 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, *batch_gate, true, is_reverse); auto in_dims = input->dims(); @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -88,8 +88,8 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, - true); + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); lstm_value.prev_state_value = ordered_c0.data(); } @@ -121,9 +121,9 @@ class LSTMKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden_t, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } else if (hidden_t0) { // If n == 0 and there is no initialized hidden state, that is to say // the H0 is zeros, the calculation W_h * H0 will be skiped. @@ -133,24 +133,24 @@ class LSTMKernel : public framework::OpKernel { // according to their length. The initialized hidden state also needs // to reorder. Tensor ordered_h0; - ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, - true); - math::matmul(device_ctx, ordered_h0, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } lstm_value.gate_value = gate_t.data(); lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute(device_ctx, lstm_value, - frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, *hidden_out); @@ -161,7 +161,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -187,8 +187,8 @@ class LSTMGradKernel : public framework::OpKernel { auto* h0_g = ctx.Output(framework::GradVarName("H0")); auto* c0_g = ctx.Output(framework::GradVarName("C0")); - auto& device_ctx = ctx.device_context(); - math::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -200,7 +200,8 @@ class LSTMGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; const size_t* order = batch_gate->lod()[2].data(); if (c0) { - ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); } if (c0 && c0_g) { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); @@ -240,10 +241,10 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( - const platform::DeviceContext& ctx, const framework::LoDTensor& src, + const DeviceContext& ctx, const framework::LoDTensor& src, const framework::DDim& dims, framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); @@ -299,7 +300,7 @@ class LSTMGradKernel : public framework::OpKernel { } int cur_batch_size = bend - bstart; - math::LstmUnitGradFunctor::compute( + math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, gate_act, cell_act, cand_act); @@ -307,33 +308,34 @@ class LSTMGradKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &pre_hidden_g, - static_cast(1.0)); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); if (weight_g) { /* backward weight */ auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } } else { if (h0 && weight_g) { - ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); - math::matmul(device_ctx, ordered_h0, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } if (h0 && h0_g) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &ordered_h0_g, - static_cast(0.0)); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); } } } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); @@ -344,15 +346,17 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { - ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, false); + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); } if (c0 && c0_g) { - ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, false); + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); } } }; diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index e192283aa0afac49e8e467506f3703d1ce60d2a6..291f2c295e78288c01c6575df936ceedceba7ce8 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 38cb298f92a21bb5c7508761fec701d28279a85f..61705675d930369ea8d491229caa1b4046f3e16a 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -35,7 +35,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index d7e8a0ea7632650203106b01531d724cf0b8e085..42e8961c0ea57650a823ee4b58516f66a455b385 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -117,7 +117,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); + ops::MarginRankLossKernel); REGISTER_OP_CPU_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu index 3a639f25d478a712c1030d57c57d7e55de1488b5..1c2afccc5b32e22c939a275d8c69ad774d3ebdad 100644 --- a/paddle/operators/margin_rank_loss_op.cu +++ b/paddle/operators/margin_rank_loss_op.cu @@ -16,9 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_GPU_KERNEL( + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h index 8d0830147ecc465909e8988e90125929829f6f34..9c1f96cac13f1bdb8c5dfd3e771157d1d1c60e15 100644 --- a/paddle/operators/margin_rank_loss_op.h +++ b/paddle/operators/margin_rank_loss_op.h @@ -34,7 +34,7 @@ struct Heaviside { } }; -template +template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel { auto x1 = framework::EigenVector::Flatten(*x1_t); auto x2 = framework::EigenVector::Flatten(*x2_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); act.device(dev) = out.unaryExpr(Heaviside()); } }; -template +template class MarginRankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); // compute d_x1 if (d_x1_t) { diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc index f82ea5d7bee81fd1578c46f79477bb23939e627a..980dd90df8710cdbcb760e1ca84f1492a76fdb70 100644 --- a/paddle/operators/math/context_project.cc +++ b/paddle/operators/math/context_project.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu index 04eeed543cb165fe449d3578a951cf74b0422252..934e3df645916013b4d1fe5eb4a19be924c914d5 100644 --- a/paddle/operators/math/context_project.cu +++ b/paddle/operators/math/context_project.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index d853507188cf8c80aede1e7646736036e30c9678..4036614086e1eb724a4a647db6ef13b6fe7aaaa0 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor; * */ -template +template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, const Tensor& padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + math::Im2ColFunctor im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -188,17 +188,17 @@ class ContextProjectFunctor { } }; -template +template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, bool pad_grad, bool input_grad, Tensor* padding_data, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + math::Col2ImFunctor col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -258,8 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -290,8 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cf238a58e0a0b930077b0376a71dc02c5b31efe5..6011a196d446854877e162019f6745deb501ee9d 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -24,9 +24,9 @@ template ; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel) { const int batch_size = prob->dims()[0]; @@ -35,7 +35,7 @@ class CrossEntropyFunctor { auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.GetEigenDevice()) = + loss.device(*ctx.eigen_device()) = -((lbl * in.log().unaryExpr(math::TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); @@ -53,8 +53,8 @@ class CrossEntropyFunctor { } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 651c08f740c2991b11c210c9bf012e505adc1835..2132d49c937a85afeed0e0cee0a74a7e30c6a3ca 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, using Tensor = framework::Tensor; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, bool softLabel) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -118,16 +118,14 @@ class CrossEntropyFunctor { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( + CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num); } } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 70ed9ddd551bb8cb7989727c02fea870186c9f2e..677adb5adaf4041fe7acfd29be354073535fd5fc 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -33,11 +33,11 @@ struct TolerableValue { } }; -template +template class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::Tensor* out, const framework::Tensor* prob, + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel); }; } // namespace math diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index ae4e47b014a9cd1f656dd9332086aa4d1b7cbb52..d570c68cd458914c8951c4ce50a02e3c5b1acab0 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -19,14 +19,14 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -36,7 +36,7 @@ struct GRUUnitFunctor { frame_size, batch_size, active_gate); if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -49,8 +49,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, @@ -60,13 +60,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_node); if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -78,13 +78,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_gate); if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -94,10 +94,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 0252bdbdb63fef2e4754057fc5b6d415cef0c29f..dd518cd1e4bea52f0d463150114feed3ceea0ccb 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -19,13 +19,12 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -39,7 +38,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -62,7 +61,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -87,14 +86,13 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -124,13 +122,13 @@ struct GRUUnitGradFunctor { } if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -155,13 +153,13 @@ struct GRUUnitGradFunctor { } if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -170,10 +168,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 58ea59f68e91c647a6b29ce3e8bc7e5d25db9b9b..ca1343cb2c5c1eb8da92c2f06b25902c1c2fe8b3 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -40,19 +40,18 @@ struct hl_gru_grad { T *prev_out_grad; }; -template +template struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; -template +template struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index c10c44c52076c8ee56eee3a0d82c31df70a1c9c7..707ebf05962fb65892c2adbbf41a0a3449763d31 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -25,9 +25,9 @@ namespace math { */ template class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -90,9 +90,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -149,13 +149,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; /* * im = [input_channels, input_height, input_width] @@ -164,9 +164,9 @@ template class Col2ImFunctor class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -235,9 +235,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -300,13 +300,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index bf7894243919571c2ab15d53690b1ef05bfcc6ee..a88e837b030f286cce272f99ad7991c70336e4a9 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -58,9 +58,9 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -96,9 +96,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2col<<>>( im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -203,9 +201,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2im<<>>( num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -213,13 +209,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; template __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, @@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -310,9 +306,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2colOCF<<>>( im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -409,9 +403,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2imOCF<<>>( col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); @@ -419,13 +411,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 24fd9a06e9f5fbd50483429379cf3f46ff88bcaa..38f2c9fe0adf80a2a4355a45bebb9ba0f341d1ab 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -79,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; * \note The caller needs to ensure that imShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Im2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col); }; -template +template class Col2ImFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index ae197a97ed8aa089b51be77a59a8ba6a98ac70ec..256f3bc9bd487d11b0f139ef057f5a98556b4db1 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -template +template void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; @@ -59,18 +59,7 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -83,10 +72,10 @@ void testIm2col() { // Im2Col paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> im2col; paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -119,10 +108,10 @@ void testIm2col() { // Col2Im: kCFO paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> col2im; paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -168,8 +157,8 @@ void testIm2col() { } TEST(math, im2col) { - testIm2col(); + testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index ad3a59bcdbe65a4107db4e56a0870794914a0fd8..2c2e8bb82e6f51e21a00de53bbfce5f0b4868e27 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -42,8 +42,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -72,10 +72,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index b2122f2a5c08a6d9d53293833177f0ba2c3ab860..92b1f4228b49709d2903fab518e7649133932fad 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -33,8 +33,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -45,10 +45,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 9652399d4c149c5c186a0780f2b8bf2294bad978..5f74e273585aea5184281bf294df694235150e30 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -67,21 +67,20 @@ inline activation_mode_t ActiveType(const std::string &type) { } } -template +template class LstmUnitFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; -template +template class LstmUnitGradFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index e099a6a43917f2c1213ecb1f07965ee97ff195db..2b35e4532a9c9f72f473020d472244234af24248 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -21,13 +21,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -36,13 +34,11 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -51,35 +47,32 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -99,15 +92,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -127,7 +121,7 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -135,8 +129,8 @@ void matmul( #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -157,8 +151,8 @@ void batched_gemm( } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -183,8 +177,8 @@ void batched_gemm( // functions of Intel MKL are not available. In the future, this computation // should be parallelized. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -192,14 +186,14 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, Ak, - Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -207,55 +201,53 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, - Ak, Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -310,10 +302,10 @@ void set_constant(const platform::DeviceContext& context, #endif } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 3018e50a4f54592123df6b9cadd45ce525d7b3e1..1b560a7e2d29c1b63a25d4ec9bbd82d5960a279d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -22,13 +22,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -39,19 +37,16 @@ void gemm(const platform::DeviceContext& context, (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -61,51 +56,45 @@ void gemm(const platform::DeviceContext& context, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -125,15 +114,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -153,14 +143,14 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -176,15 +166,13 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -200,68 +188,58 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -277,8 +255,9 @@ struct TensorSetConstantGPU { template void operator()() const { - SetConstant functor; - functor(context_, tensor_, static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); } const platform::DeviceContext& context_; @@ -294,27 +273,27 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug mode, +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, // and only failed for this case. So reimplemented it. template <> -void ColwiseSum::operator()( - const platform::DeviceContext& context, const framework::Tensor& input, +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - gemv(context, true, static_cast(in_dims[0]), - static_cast(in_dims[1]), 1.0, - input.data(), one.data(), - 0.0, vector->data()); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); } } // namespace math diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index f2b025b78b68decee43fa1f80378e51eaf4c90de..8cc03c2ba0facae691a0d2b8a4f2ea768cfa5491 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -62,53 +62,51 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html -template -void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const T alpha, const T* A, const T* B, const T beta, T* C); // gemm wrapper with stride args for matrix uncontinuous in memory -template -void gemm(const platform::DeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const T alpha, const T* A, const int lda, const T* B, const int ldb, - const T beta, T* C, const int ldc); +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); // matrix multiply with continuous memory -template -void matmul(const platform::DeviceContext& context, - const framework::Tensor& matrix_a, bool trans_a, - const framework::Tensor& matrix_b, bool trans_b, T alpha, - framework::Tensor* matrix_out, T beta); +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); // Batched gemm -template -void batched_gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, - const T* A, const T* B, const T beta, T* C, - const int batchCount, const int strideA, const int strideB); - -template -void gemv(const platform::DeviceContext& context, const bool trans_a, - const int M, const int N, const T alpha, const T* A, const T* B, - const T beta, T* C); - -template -void axpy(const platform::DeviceContext& context, const int n, const T alpha, - const T* x, T* y); - -template +template +void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template struct Transpose { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis); + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); }; -template +template struct SetConstant { - void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num); + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); }; template @@ -118,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); -template +template struct RowwiseAdd { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& vec, - framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); }; -template +template struct ColwiseSum { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); }; } // namespace math diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e525c52b8f696277274a7ad00a6b00a08..3e6d83386589a02c7d8f62394c1c2becb606504c 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -20,16 +20,17 @@ namespace paddle { namespace operators { namespace math { -template -void SetConstant::operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num) { +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); } -template -void Transpose::operator()( - const platform::DeviceContext& context, const framework::Tensor& in, +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -40,15 +41,15 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); - auto* dev = context.GetEigenDevice(); + auto* dev = context.eigen_device(); eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { +template +void RowwiseAdd::operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, + framework::Tensor* output) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); @@ -59,14 +60,14 @@ void RowwiseAdd::operator()(const platform::DeviceContext& context, auto out = framework::EigenMatrix::From(*output); Eigen::array shape({{1, static_cast(size)}}); Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.GetEigenDevice()) = + out.device(*context.eigen_device()) = in + vec.reshape(shape).broadcast(bcast); } -template -void ColwiseSum::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* vector) { +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); @@ -74,7 +75,7 @@ void ColwiseSum::operator()(const platform::DeviceContext& context, auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.GetEigenDevice()) = + vec.reshape(shape).device(*context.eigen_device()) = in.sum(Eigen::array({{0}})).reshape(shape); } diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 983c9fdcffb0a67da1bc0b5b4af9420a68bd2ac1..7c6f098ca9065ded1644420a3ab47911bf7bc3b3 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -21,7 +21,7 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, input3_ptr + 1, 4); @@ -55,7 +55,7 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); @@ -74,7 +74,8 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant + paddle::operators::math::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); @@ -110,7 +111,7 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., data_a, data_b, 0., data_c); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index d5d6f0c73bc6bce7a74db2c98fa9f884a0bcd9a2..32e96d948714a8fd1fa2c089057603fdaed85c16 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -21,7 +21,7 @@ TEST(math_function, notrans_mul_trans) { out_gpu.mutable_data({2, 2}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -55,7 +55,7 @@ TEST(math_function, trans_mul_notrans) { out_gpu.mutable_data({3, 3}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -106,7 +106,7 @@ TEST(math_function, gemm_notrans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -161,7 +161,7 @@ TEST(math_function, gemm_trans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -208,7 +208,7 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h index 6ba9a0ba9a70bd938f9362179990ab68fa3186ba..7048e11e6f27a075892c28681a3c4913a27b3f3e 100644 --- a/paddle/operators/math/matmul.h +++ b/paddle/operators/math/matmul.h @@ -26,13 +26,12 @@ namespace math { // // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported // yet. -template +template class MatMulFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, T alpha, - framework::Tensor* out, T beta) { + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { auto dim_a = a.dims(); auto dim_b = b.dims(); @@ -108,13 +107,13 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, a.data(), - b.data(), beta, out->data()); + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); } else { // batched matrix multiplication - batched_gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data(), - batchCount, strideA, strideB); + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); } } }; diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index c9003962d33b70b8e21a0d6b78bf5a77981df409..fea86675f75dad99a336d795d4561ae32d58c30a 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -20,9 +20,9 @@ namespace math { // All tensors are in NCHW format, and the groups must be greater than 1 template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -54,9 +54,9 @@ class MaxOutFunctor { }; template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -91,10 +91,10 @@ class MaxOutGradFunctor { } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index c3fabcae081e24d92d50d0e2a2cad4a2e9872125..6056ad251c12976fe9032f03aaaeb52727da1f42 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -78,9 +78,9 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, * All tensors are in NCHW format. */ template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -98,20 +98,18 @@ class MaxOutFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut< - T><<(context) - .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, output_data); + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); } }; /* * All tensors are in NCHW format. */ template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -132,20 +130,17 @@ class MaxOutGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad< - T><<(context) - .stream()>>>(nthreads, input_data, output_data, - output_grad_data, input_grad_data, input_channels, - input_height, input_width, groups); + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index 2d9069b0b3ca3e7bad3b21a46985c52ef00f50e6..68f4743db07af0f369eb18f1a7cb6e326d469e85 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -23,20 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template - +template class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); }; -template +template class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups); }; diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc index 135984586a67f666425f81456148c3623ed7ef25..150de6fd59ef3ac0c4cb9160bf5afb1ce1064577 100644 --- a/paddle/operators/math/pooling.cc +++ b/paddle/operators/math/pooling.cc @@ -24,9 +24,9 @@ namespace math { * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -84,9 +84,9 @@ class Pool2dFunctor { * and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -152,9 +152,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -213,25 +213,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; /* * All tensors are in NCDHW format. @@ -239,9 +243,9 @@ template class Pool2dGradFunctor< * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -314,9 +318,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -398,9 +402,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -473,25 +477,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; /* * All tensors are in NCHW format. @@ -499,9 +507,9 @@ template class Pool3dGradFunctor< * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. @@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor; * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu index ca3560f264b59057fd655084f3d43adc617c6606..0243cf8316a2a83bfc4c091f64419574c1be2f5c 100644 --- a/paddle/operators/math/pooling.cu +++ b/paddle/operators/math/pooling.cu @@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad( * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -183,11 +183,7 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -200,9 +196,9 @@ class Pool2dFunctor { * height and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -231,11 +227,7 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -249,9 +241,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -281,10 +273,7 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -292,25 +281,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; template __global__ void KernelPool3D(const int nthreads, const T* input_data, @@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad( * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -512,11 +505,7 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3D<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -531,9 +520,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -569,11 +558,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -588,9 +573,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -626,10 +611,7 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -638,25 +620,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; template __global__ void KernelMaxPool2dWithIdx( @@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx( @@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdx<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, @@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h index 19fbd8b4bb2469d3ce8a139ce30a48641dbd6e0f..2759f06cb6a51f7ceb6b8010d792030eb6ad5d3e 100644 --- a/paddle/operators/math/pooling.h +++ b/paddle/operators/math/pooling.h @@ -84,62 +84,58 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -template +template class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -153,38 +149,38 @@ class MaxPool3dGradFunctor { * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in * NCDHW format. */ -template +template class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 514f2adef284c8877e2e74b943b4e6419c6ae721..ab758d1e7fd8ab361948b28e8cb735b9a742a339 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -67,12 +67,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -88,7 +88,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -103,17 +103,16 @@ struct SelectedRowsAddTensor { auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -143,14 +142,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -175,10 +174,10 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c1dd323ba29e03e3ab4a3e4d7248388b408fb9d6..c44577e00af5f362ae7e168495e496d60d05de95 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -64,16 +64,15 @@ struct SelectedRowsAdd { reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy( - boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, - in2_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -96,8 +95,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -117,30 +116,28 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -163,18 +160,17 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy( - boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -197,8 +193,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -216,17 +212,16 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), in2_data, in1_row_numel); } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d6dc6c03c941f965394d952574d309c51eb82a62..1149075abf16547a120ac8928c45b4972409fc72 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -21,33 +21,33 @@ namespace math { // SelectedRows + SelectedRows will simplely concat value and rows. // The real computation happens in dealing with LoDTensor. -template +template struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output); }; -template +template struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2); }; diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc index a3649b6875aca61ee3ceb1ca83c7f9b38dc06c42..8c74cab0a1e817f9e98fa682fe4122db7837aec9 100644 --- a/paddle/operators/math/selected_rows_functor_test.cc +++ b/paddle/operators/math/selected_rows_functor_test.cc @@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 7de9291c17d3f09a3c6076f00f2457f240e6f0af..777caf5635647d11e8fde05a68fdf7e2c32f48df 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -24,7 +24,7 @@ TEST(selected_rows_functor, gpu_add) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,7 +90,7 @@ TEST(selected_rows_functor, gpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; @@ -122,7 +122,7 @@ TEST(selected_rows_functor, gpu_add_to) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -187,7 +187,7 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 5b3bde02fbf981772759caa3d0054fac4a8520f9..88977be1f8c030741c3a3a8f07a4feeb1d8bb4d9 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -19,9 +19,9 @@ namespace operators { namespace math { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -48,13 +48,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index c5d968aeb216bbb3e0e17f138b9e891494d99f75..452ae8951000872b706f7e4227a62dbf98109e7e 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); CopyMatrixRowsKernel<<>>( src_data, dst_data, index, height, width, is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 73295ddbcb73fe80be08e732790f0ec75e94b415..a5c43a2c7d4d729c35a20a27de2a23141e6019bc 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -26,7 +26,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CopyMatrixRowsFunctor { public: // If is_src_index is true, @@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. - void operator()(const platform::DeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index); + void operator()(const DeviceContext& context, const framework::Tensor& src, + const size_t* index, framework::Tensor& dst, + bool is_src_index); }; -template +template class LoDTensor2BatchFunctor { // Calculate the length of each sequence and // sort sequence index by the length. @@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor { }; public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& lod_tensor, framework::LoDTensor& batch, bool is_cal_batch_lod, bool is_reverse = false) const { @@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_GT(lods.size(), 2UL); PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1].data(), batch, true); return; } @@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor { } batch.set_lod(batch_lods); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, seq2batch_idx, batch, true); } }; -template +template class Batch2LoDTensorFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& batch, framework::LoDTensor& lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT(in_lod.size(), 2UL); PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_seq; + CopyMatrixRowsFunctor to_seq; size_t* index = in_lod[1].data(); to_seq(context, batch, index, lod_tensor, false); } diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc index 5913c99fdb01100d0de44ab317124550fa626528..8fb92b1a130b8f25163d856f3f596136072180cf 100644 --- a/paddle/operators/math/sequence_pooling.cc +++ b/paddle/operators/math/sequence_pooling.cc @@ -20,9 +20,9 @@ namespace operators { namespace math { template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -60,9 +60,9 @@ class MaxSeqPoolFunctor { }; template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 5ed951402fecba66a8960f4d024bf3785dac51c7..4c9e6b375ce7251747b9cd443d86cca0858c84ef 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts, } template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -71,8 +71,7 @@ class MaxSeqPoolFunctor { dim3 threads(256, 1); dim3 grid(num_seq, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePool<<>>( in_data, starts.data(), out_data, max_index, num_seq, dim); } @@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, } template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor { unsigned int blocks = (num_seq * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePoolGrad<<>>( og_data, max_index, ig_data, num_seq, dim); } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h index 35dfe26de1a87a064410401244914d4e2a94176e..13ffb2ebef3a683b5e5fe64433a90237b944002e 100644 --- a/paddle/operators/math/sequence_pooling.h +++ b/paddle/operators/math/sequence_pooling.h @@ -23,18 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template +template class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index); }; -template +template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad); diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 3e2f15d6c27f58818128f32fab0bd4c5f36b0050..72f10f35f4ef39b41fbc5e900313eafd7ba669e9 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 4dbab51d46bdaaa506a6c242d0958c73687f4eb9..9e73f6a371c950ed6f81ee90216f7fd3899f73ce 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -21,10 +21,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index fe1074650234c5beb5889e7efd713164769ad740..471f44d340cfd0d6305a9127c34289ef1663accb 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -19,19 +19,18 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h index 05793eeb3eeafaf36c301236197555b7b35e5803..82f597ff792decb1760f59e693026cd453432d05 100644 --- a/paddle/operators/math/softmax_impl.h +++ b/paddle/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -56,19 +56,18 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)) .unaryExpr(ValueClip()); - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } -template -void SoftmaxGradFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* y, +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -89,8 +88,7 @@ void SoftmaxGradFunctor::operator()( .eval() .reshape(batch_by_one) .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } } // namespace math diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index b57d3dc1414cff492db8d7d503a7fce370a3f151..ecd3a647e00655a57d11c2f082bd1f81822cf92b 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,9 +17,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -48,9 +48,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -82,10 +82,10 @@ class Unpool2dMaxGradFunctor { } } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 37c3c8b689f9a69b68ddffd23813fa9ad8ced0e7..ecbde0f6a798ba817c28714b37af8187d2e9555e 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -67,9 +67,9 @@ __global__ void KernelUnpool2dMaxGrad( * All tensors are in NCHW format. */ template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -83,21 +83,18 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); } }; /* * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -116,19 +113,16 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, output_height, - output_width, input_grad_data); + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 7077d7c2274fd9e02b69ef343f310f4ffbbcff1a..0f0ff1371ebea8c7501aee1c7c45bc6a79de397e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,18 +18,16 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output); }; -template +template class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc index 99eb7fd46de42400a915d86706580d15b08a74a2..d574ed9234304d992a6e4a10fce0816aee7fa40a 100644 --- a/paddle/operators/math/vol2col.cc +++ b/paddle/operators/math/vol2col.cc @@ -25,9 +25,9 @@ namespace math { * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -111,9 +111,9 @@ class Vol2ColFunctor { * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -190,10 +190,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu index dae3be858e9f47d0133aa37e8a5f90a0addf1dfd..b029442fe48dd27232d322aadec5864760e1b9ff 100644 --- a/paddle/operators/math/vol2col.cu +++ b/paddle/operators/math/vol2col.cu @@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth, * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -117,9 +117,7 @@ class Vol2ColFunctor { const int threads = 1024; const int blocks = (num_outputs + 1024 - 1) / 1024; - vol2col<<(context) - .stream()>>>( + vol2col<<>>( num_outputs, vol.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth, * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -245,9 +243,7 @@ class Col2VolFunctor { const int threads = 1024; const int blocks = (num_kernels + 1024 - 1) / 1024; - col2vol<<(context) - .stream()>>>( + col2vol<<>>( num_kernels, col.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -256,10 +252,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index dc64d1d9776261541a380ed15207904d6b4e641c..dcd80370e8516d34b764b1ab3b0b98516e738bf6 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -63,22 +63,20 @@ namespace math { * \note The caller needs to ensure that volShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& vol, + void operator()(const DeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, framework::Tensor* col) const; }; -template +template class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 62c3152304ad7fe946c996be413e102f3dd92bb2..f46db3c56713399798a45854bf1613d07aee26e6 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -template +template void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; @@ -24,18 +24,7 @@ void testVol2col() { paddle::framework::Tensor output_tmp; auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); /** * input = [[0, 1, 2, @@ -88,7 +77,7 @@ void testVol2col() { output_depth, output_height, output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -113,7 +102,7 @@ void testVol2col() { CopyFrom(input_tmp, *place, *context, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -130,8 +119,9 @@ void testVol2col() { } TEST(math, vol2col) { - testVol2col(); + testVol2col(); #ifdef PADDLE_WITH_CUDA - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index 5a1a6154203d40186f1e41491194b19612931b1f..ee0bc0c3708ac20ad00e3222060244d42dbd6f2f 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, ops::MatMulOpGrad); -REGISTER_OP_CPU_KERNEL(matmul, - ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc index b7e66382f00445b087e14103e7a148d450b37405..6a3772c00457993dcc7b55a0f15493974633026c 100644 --- a/paddle/operators/matmul_op.cu.cc +++ b/paddle/operators/matmul_op.cu.cc @@ -15,7 +15,8 @@ #include "paddle/operators/matmul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(matmul, - ops::MatMulKernel); -REGISTER_OP_GPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 1e4aa48b7018d8e3d6f02591fbca2877ddbd3c5d..de9da487b3d627cc79962db3770632813e9cd9f5 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -27,7 +27,7 @@ using DDim = framework::DDim; using framework::make_ddim; using framework::vectorize; -template +template class MatMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel { bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - math::MatMulFunctor()(context.device_context(), x, transpose_x, y, - transpose_y, T(1), out, T(0)); + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); } }; @@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) { // Reshape a rank-3 tensor from P x M x N to M x (P * N). // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. -template -Tensor CombineBatchAndN(const framework::ExecutionContext& context, - const Tensor& input) { +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { Tensor output; auto in_dims = input.dims(); if (in_dims.size() == 3) { output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; - trans(context.device_context(), input, &output, axis); + math::Transpose trans; + trans(context, input, &output, axis); std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); } else { @@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, // // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N // to X: (P * M) x K, dOut: (P * M) x N. -template +template class MatMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel { Tensor Y = Reshape(y, make_ddim(y_dims)); Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + auto& dev_ctx = context.template device_context(); if (dx) { dx->mutable_data(context.GetPlace()); const Tensor& dOut_for_dX = (x_dims.size() == 2 && y_dims.size() == 3) - ? CombineBatchAndN(context, dOut) + ? CombineBatchAndN(dev_ctx, dOut) : dOut; if (x_dims.size() == 2 && y_dims.size() == 3) { Y = transpose_y ? CombineBatchAndM(Y) - : CombineBatchAndN(context, Y); + : CombineBatchAndN(dev_ctx, Y); } if (transpose_x) { - math::MatMulFunctor()(context.device_context(), Y, - transpose_y, dOut_for_dX, transpose_x, - T(1), dx, T(0)); + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); } else { - math::MatMulFunctor()(context.device_context(), dOut_for_dX, - transpose_x, Y, !transpose_y, T(1), dx, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); } } @@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel { ? CombineBatchAndM(dOut) : dOut; if (y_dims.size() == 2 && x_dims.size() == 3) { - X = transpose_x ? CombineBatchAndN(context, X) + X = transpose_x ? CombineBatchAndN(dev_ctx, X) : CombineBatchAndM(X); dOut = CombineBatchAndM(dOut); } if (transpose_y) { - math::MatMulFunctor()(context.device_context(), dOut_for_dY, - transpose_y, X, transpose_x, T(1), dy, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); } else { - math::MatMulFunctor()(context.device_context(), X, - !transpose_x, dOut_for_dY, transpose_y, - T(1), dy, T(0)); + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); } } } diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index 44bf402e9584474641e101155fcc35814639366f..011616e615a36efa0efe9ff15e678f1486c5177a 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -101,7 +101,8 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL(maxout, - ops::MaxOutKernel); REGISTER_OP_CPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel); + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc index decd43913d69d122330886e07178778d03f7fef5..2904f0ff96f06cefad29a65898cd82107d9bd600 100644 --- a/paddle/operators/maxout_op.cu.cc +++ b/paddle/operators/maxout_op.cu.cc @@ -15,9 +15,10 @@ #include "paddle/operators/maxout_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(maxout, - ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_GPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel, - ops::MaxOutGradKernel); +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h index 44a0d073dda642f6e261ce5760013f3e1055f43d..e8b12552b9ff39e23702de17abc9825a527f02aa 100644 --- a/paddle/operators/maxout_op.h +++ b/paddle/operators/maxout_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxOutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); - math::MaxOutFunctor maxout_forward; - maxout_forward(context.device_context(), *in_x, out, groups); + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); } }; -template +template class MaxOutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(context.device_context(), *in_x, in_x_grad, *out, - *out_grad, groups); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); } } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index dcc5b4286f4ac833268a779a9a7edd2ed119ffff..8932d700c2ae17eefe919eefae2282ae4a5a80a8 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); -REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_CPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index ca089938c048f7aa5bd561f57c093aa74cce4e11..93062bf540ad64350f7ee9a554c3c469aba46677 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/mean_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_GPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index c99286a5b928f1edcd845b01b21b95654c25db07..351b34595974b1771d9f4ae5232e0b3a33491104 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class MeanKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel { auto X = EigenVector::Flatten(*input); auto y = EigenScalar::From(*output); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = X.mean(); } }; -template +template class MeanGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel { T ig_size = static_cast(IG->numel()); Eigen::DSizes bcast(ig_size); - EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = (EigenVector::From(*OG) / ig_size).broadcast(bcast); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 4684c20208501a3239fd57b35428946bb52af4a0..27f0c8de2053064e65d9984ec9bd4242fee48e5f 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -102,5 +102,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { namespace ops = paddle::operators; REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL(minus, - ops::MinusKernel); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu index a8375cc6301b2c1a917299c3933b03226bb72907..3b202ea92ee8692f2441909083f559adff5fea8c 100644 --- a/paddle/operators/minus_op.cu +++ b/paddle/operators/minus_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/minus_op.h" -REGISTER_OP_GPU_KERNEL( - minus, paddle::operators::MinusKernel); +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h index bd9a2790aa2b208c2d3dfc792031283eb6c42397..78e1e1be6d622d504db9e664dcb5f35ca0c22b95 100644 --- a/paddle/operators/minus_op.h +++ b/paddle/operators/minus_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class MinusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -28,7 +28,8 @@ class MinusKernel : public framework::OpKernel { auto* out_tensor = context.Output("Out"); out_tensor->mutable_data(context.GetPlace()); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); framework::EigenVector::Flatten(*out_tensor).device(dev) = framework::EigenVector::Flatten(*left_tensor) - framework::EigenVector::Flatten(*right_tensor); diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index 28528848af1f467bf38be53f9d05fee6ca3f93cc..f0a42491bf04a5bbe2de10de2f702877c9a2f839 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -115,6 +115,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, REGISTER_OP_CPU_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu index 8854e166cd99ce914d7f9f9bcead3234b0649506..40a8447da4d9d4874af232f3408557c950b58482 100644 --- a/paddle/operators/modified_huber_loss_op.cu +++ b/paddle/operators/modified_huber_loss_op.cu @@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); -REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad, - ops::ModifiedHuberLossGradGPUKernel); + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h index aba75efad9c19e3e113b4f09bc1fbd4732f4e187..157ae0682e0cf4392dab003153d44f48209d00a1 100644 --- a/paddle/operators/modified_huber_loss_op.h +++ b/paddle/operators/modified_huber_loss_op.h @@ -46,7 +46,7 @@ struct ModifiedHuberLossForward { } }; -template +template class ModifiedHuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index be0c8ea071716b75eaeddab209a52b3d5f2f7e16..00f1253465d336e0fad580d0c6b898369e4783ca 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -74,5 +74,5 @@ class MomentumOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel, - ops::MomentumOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 3c39ae10dc50084cff284c307167c33c9208a3ce..bc4a5fdf0b37ce07b4c07bba9e1af5611d2be7e3 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, ops::MulOpShapeInference, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc index 66dc3d6d106a18640adad413d4e967fa101abcfc..6095de58d0c58be6b647771e9784348cbf8c4ad4 100644 --- a/paddle/operators/mul_op.cu.cc +++ b/paddle/operators/mul_op.cu.cc @@ -15,6 +15,7 @@ #include "paddle/operators/mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 0eb9df41e9415845f88af283de63856158b447f9..1b467dca8302c10fe08a157aac4586230e096dd0 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel { if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(context.device_context(), x_matrix, false, y_matrix, - false, 1, z, 0); + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); if (z_dim.size() != 2) { z->Resize(z_dim); } } }; -template +template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); if (dx) { dx->mutable_data(ctx.GetPlace()); Tensor dx_matrix = dx->dims().size() > 2 @@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel { : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), dout_mat, false, y_matrix, - true, 1, &dx_matrix, 0); + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, dout_mat, - false, 1, &dy_matrix, 0); + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 8e7f544e0d3c5f4afc2c63dd8335fadd4753d83d..b1ee8051c4c48f575690b38142ae082930fe2070 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -119,7 +119,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( - multiplex, ops::MultiplexCPUKernel); + multiplex, + ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10dff8d021d0394702cc8b92e779c012a4cf3eb2..47986e9ff86f2e08b0861cde35ac3a44b10caed1 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } @@ -72,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { @@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index ab3cafaa324a29d6f249cf1f73db92e1364eebc8..344315116122f7ad843af740be8a31313c8a0342 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -template +template class MultiplexCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,7 +35,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -47,7 +47,7 @@ class MultiplexCPUKernel : public framework::OpKernel { } }; -template +template class MultiplexGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -60,14 +60,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto* index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 4f0a2a79edb9f24c7758fc91483d374425b36853..6ca6db7253da0e59c742f115cd25a1b8203a3044 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index bb7ae20286dd8e52f72b79cbf353bd812a2cc092..d747cc0cf5f74b886bbd40549673e7d64de952e9 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -33,9 +33,9 @@ #include "paddle/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_GPU_ONLY_OP(ncclAllReduce); -USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcast); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 952da10434df01a10fc713a017084d315a2a59d3..5ad1610fde041ee934486ef98ba41dca42559100 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -67,7 +67,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; @@ -170,7 +170,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 0a8a95de5f402540af057d9d330c5b565ad39db0..6636dad06037f163252dc342200a99c756ed2a2e 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -28,7 +28,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); const int64_t* label_data = label->data(); @@ -67,11 +67,11 @@ void PrepareSamples(const framework::ExecutionContext& context) { } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PrepareSamples(context); + PrepareSamples(context); auto sample_labels = context.Output("SampleLabels"); const int64_t* sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); @@ -135,7 +135,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index adb75df6ef10c59fc6f3db4d36e1ffb1ae0b4b1e..936dde22c34a30c5a50e2ac8a76f0f91dfb328ab 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -134,6 +134,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); -REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_CPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu index 555a7dba23c6fa2659cabf4858b42ff70d74bf18..c309fb625cca203418db2599a59ea0144782efc2 100644 --- a/paddle/operators/pad_op.cu +++ b/paddle/operators/pad_op.cu @@ -16,6 +16,7 @@ #include "paddle/operators/pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_GPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h index 9534dbf54529e3b9ae2b6640d51fe291e9521927..1b95942af3b3711fcad965cdc3f2d2f99b2f32e8 100644 --- a/paddle/operators/pad_op.h +++ b/paddle/operators/pad_op.h @@ -26,7 +26,7 @@ template using EigenTensor = framework::EigenTensor; -template +template void PadFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); out_tensor.device(place) = x_tensor.pad(paddings, pad_value); } -template +template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { int rank = context.Input("X")->dims().size(); switch (rank) { case 1: - PadFunction(context); + PadFunction(context); break; case 2: - PadFunction(context); + PadFunction(context); break; case 3: - PadFunction(context); + PadFunction(context); break; case 4: - PadFunction(context); + PadFunction(context); break; case 5: - PadFunction(context); + PadFunction(context); break; case 6: - PadFunction(context); + PadFunction(context); break; default: PADDLE_THROW( @@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel { } }; -template +template void PadGradFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) { d_x->mutable_data(context.GetPlace()); auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); } } -template +template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - PadGradFunction(context); + PadGradFunction(context); break; case 2: - PadGradFunction(context); + PadGradFunction(context); break; case 3: - PadGradFunction(context); + PadGradFunction(context); break; case 4: - PadGradFunction(context); + PadGradFunction(context); break; case 5: - PadGradFunction(context); + PadGradFunction(context); break; case 6: - PadGradFunction(context); + PadGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc index be9fcc5661f420aadf908cf80cce6c963008b0e4..77407f5cdf7e4ef7b76c38ef8992517b4bd1c5fe 100644 --- a/paddle/operators/pool_cudnn_op.cc +++ b/paddle/operators/pool_cudnn_op.cc @@ -19,19 +19,21 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 66dd194ccd5ed629c5861552a7c124dc911362d7..fc2b37bd0fbac82005e709779b2939843b839596 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -162,12 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index e26ffd86e5b5645e361070ca9fd9d8dc49d1ed30..45fa20280c1ad20f63d6542d5199e002ff60495f 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -216,19 +216,19 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc index 1010cb762289dd39cd632c699f7528f4ba638278..39a9dfbf794b3dbaf81e2435f8609014dc27f3af 100644 --- a/paddle/operators/pool_op.cu.cc +++ b/paddle/operators/pool_op.cu.cc @@ -16,16 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); -REGISTER_OP_GPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 63492a89e8d4e44a036bc3c2b16cc54c7e77b534..ab85d587a3131237d7a9ec774a11193c70220c7c 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -50,7 +50,7 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker); }; -template +template class PoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -67,41 +67,41 @@ class PoolKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { if (pooling_type == "max") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; case 3: { if (pooling_type == "max") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -109,7 +109,7 @@ class PoolKernel : public framework::OpKernel { } }; -template +template class PoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -130,42 +130,43 @@ class PoolGradKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = + temp.device( + *context.template device_context().eigen_device()) = temp.constant(static_cast(0)); switch (ksize.size()) { case 2: { if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor + paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; case 3: { if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor + paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index b9c42a69128a26ff5942748e11fb87c57d3e3f58..1a2383f8b80357d2927c3b6a8c57c787ba7e366d 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -266,12 +266,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, @@ -279,9 +282,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc index 335064a7eea4ec15c529db5254cbb026ba575f3d..4c9804da639e3ad44f90963b53948cd8b755a6ac 100644 --- a/paddle/operators/pool_with_index_op.cu.cc +++ b/paddle/operators/pool_with_index_op.cu.cc @@ -16,20 +16,28 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 40766c7e821e8b85aeda9473798a1f696d0ad719..4f4087d1dd36d6e91cdd9a9253dd72a71735e136 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxPoolWithIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor + paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor + paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; -template +template class MaxPoolWithIndexGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.device_context(); + auto& device_ctx = context.template device_context(); math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor + paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor + paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h index 2efd3777e04c17b27c07bccde524de5785af35fe..977e59b7d2f771fc4c3412f0092f1eba92ef22da 100644 --- a/paddle/operators/positive_negative_pair_op.h +++ b/paddle/operators/positive_negative_pair_op.h @@ -22,7 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h index 4a871ce6741469cf9af409ec90215f721d52f36c..c0d55405a362809f414b8dc3b12ed692f96c24e9 100644 --- a/paddle/operators/precision_recall_op.h +++ b/paddle/operators/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 055c471b4561e5fd3c7a65c6f81d66cdce1a5578..317a2a40154f92f2e13a3012d2f7a63df9a69afb 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -85,7 +85,8 @@ namespace ops = paddle::operators; REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL(prelu, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL(prelu_grad, - ops::PReluGradKernel); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu index 9e391dabae735cc8a740b46b50d31d271f99b65d..12033dee0e1c190b08080023d6746fcad48db2fd 100644 --- a/paddle/operators/prelu_op.cu +++ b/paddle/operators/prelu_op.cu @@ -14,8 +14,9 @@ #include "paddle/operators/prelu_op.h" -REGISTER_OP_GPU_KERNEL( - prelu, paddle::operators::PReluKernel); -REGISTER_OP_GPU_KERNEL( - prelu_grad, - paddle::operators::PReluGradKernel); +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index 5ad31c2203ae6c9bf6f48bb9ecf9a714597e7da8..56f9a553ec12d5bfa745af63ec0570ad30910628 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -39,7 +39,7 @@ class PReluFunctor { const T* alpha_; }; -template +template class PReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel { int numel = x->numel(); - Transform trans; - trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, - PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); } }; @@ -76,7 +76,7 @@ class PReluGradFunctor { const T* alpha_; }; -template +template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - Transform trans; - trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index 36e460103ab46bf6f1408840a0699793e2be134d..cc350f6d26e6d8bd6e59f2fda74a3b734df55247 100644 --- a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -114,4 +114,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); REGISTER_OP_CPU_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu index d0ae0395184ae4f794565f2e28c57f960f0ccbeb..42a178f94b94c8e80ec8f9b5e6471b75878b65d1 100644 --- a/paddle/operators/proximal_adagrad_op.cu +++ b/paddle/operators/proximal_adagrad_op.cu @@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h index 7a1560e8cb339a306ab19513808aab165f82cc8a..523924d80e127d9ad2483e6b239fb948aa72200c 100644 --- a/paddle/operators/proximal_adagrad_op.h +++ b/paddle/operators/proximal_adagrad_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); - m_out.device(place) = m + g * g; + m_out.device(*place) = m + g * g; auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); if (l1 > static_cast(0)) { - p_out.device(place) = + p_out.device(*place) = prox_param.sign() * (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) .cwiseMax(static_cast(0.0))) / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); } else { - p_out.device(place) = + p_out.device(*place) = prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); } } diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 5693d0ec9ebf4c470dfa5141b6eeee431f24f2ea..0b26beb3ac3803c78f45cc2ce0a8f444bdc313b6 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -94,4 +94,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); REGISTER_OP_CPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu index 26f4ebaa0f43620fee7ece2d71755be94a0e01a5..b7dd840d19a13cd3329fb68563693a80d22291ca 100644 --- a/paddle/operators/proximal_gd_op.cu +++ b/paddle/operators/proximal_gd_op.cu @@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h index bebda0204173ec5c3ec9a7a9da6fb623171f4cea..64648b3ccaf9615c995d65464607105d87c04198 100644 --- a/paddle/operators/proximal_gd_op.h +++ b/paddle/operators/proximal_gd_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel { auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); auto p_out = EigenVector::Flatten(*param_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 912f88f455252effbdb12ecfc45e4afefa60e03e..b80b175792f3fc56d689c187b7182198542d7345 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL(rank_loss, - ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( - rank_loss_grad, ops::RankLossGradKernel); + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 5382e3a6296acd257211104d8ec6835c11b90bdd..5aee66443d60c8e20625880ba2ec9606b8a007a0 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/rank_loss_op.h" -REGISTER_OP_GPU_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); -REGISTER_OP_GPU_KERNEL( - rank_loss_grad, - paddle::operators::RankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index 703c77a0b21f2b2f0b0ae6fae86aae819ea824b5..ea24b61fd94b57950e79b7c1ddb13fa165953538 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel { auto left = framework::EigenVector::Flatten(*left_t); auto right = framework::EigenVector::Flatten(*right_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (1. + (left - right).exp()).log() - label * (left - right); } }; -template +template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel { auto* left_t = ctx.Input("Left"); auto* right_t = ctx.Input("Right"); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto label = framework::EigenVector::Flatten(*label_t); auto left = framework::EigenVector::Flatten(*left_t); diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 2589a54cfc7fc5bc11ae983797d480a134e0eb25..b754637bf29225615f129d7423d60518e053ca18 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -180,12 +180,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu index d306e1a24096d737438d71d4d4abc35328d160cb..a10ace5253b850db5855bef8384278edebc9e45f 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -17,12 +17,13 @@ namespace ops = paddle::operators; -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_GPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h index dd6547542d16b0fe336184a0c09a8498027db6ea..47ce910f2821467c701a7f5e22a8dbe5c8c95c92 100644 --- a/paddle/operators/reduce_op.h +++ b/paddle/operators/reduce_op.h @@ -32,55 +32,55 @@ template ; struct SumFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.sum(dim); } }; struct SumGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim); } }; struct MeanFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.mean(dim); } }; struct MeanGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim) / dx.constant(size); } }; struct MaxFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.maximum(dim); } }; struct MinFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.minimum(dim); } }; struct MaxOrMinGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { auto equals = x == y.broadcast(dim); auto ones = dx.constant(1); @@ -91,7 +91,7 @@ struct MaxOrMinGradFunctor { } }; -template +template class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -139,7 +139,8 @@ class ReduceKernel : public framework::OpKernel { dims = framework::make_ddim(dims_vector); } - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; if (D == 1) { @@ -152,7 +153,7 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -201,7 +202,8 @@ class ReduceGradKernel : public framework::OpKernel { Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; broadcast_dim[dim] = input0->dims()[dim]; - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, broadcast_dim[dim]); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index dca6c15007a64808248443af32141b4a677f95d7..b7329238c0ea8ebb374d35bd7cddced3dfee1a2c 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/reshape_op.h" -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape, paddle::operators::ReshapeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape_grad, paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 73fd1da6428f55976a397b7f6f92bb0c796bfe02..92d8cbbb56e224fe67e630bdfcb16d7df44f2846 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { } }; -template +template class ReshapeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index a9c45f639c6728ff2fd6de6fcdadfe5032a705d7..fc3f9b8988ec7fe0093ef6b09a105747b0025ec1 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -116,5 +116,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); -REGISTER_OP_CPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu index 52634a54816bcd5ad0ba82a56f1df95110112265..2a9fd6e1044e923b9ccffab834ff64df0f7cf5d7 100644 --- a/paddle/operators/rmsprop_op.cu +++ b/paddle/operators/rmsprop_op.cu @@ -16,5 +16,5 @@ #include "paddle/operators/rmsprop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h index 7bf2129010f994966d79ef11d5cec30159b47068..16a561835d02457cf2268f713289001773e63d6c 100644 --- a/paddle/operators/rmsprop_op.h +++ b/paddle/operators/rmsprop_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto mom_out = EigenVector::Flatten(*moment_out); auto ms_out = EigenVector::Flatten(*mean_square_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 2b5e66c96b726a3c1fdb2596a244c5395db85279..75fcea8401fbbc2943c0d6a50ca81288268823d8 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -157,9 +157,10 @@ namespace ops = paddle::operators; REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( - roi_pool, ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolOpKernel); + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu index 9a4c8ca752bb7abc4f44d4815743769bc989703a..a874befe4d12029afa9ce55230da22cb048000aa 100644 --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -177,7 +177,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); @@ -199,10 +199,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - roi_pool, ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h index 3812c66c65457b9d1337690d1a82759aab9a9732..09a9d3d870c1066f1c6f780c4b3203679e9e7505 100644 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CPUROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -126,7 +126,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -145,8 +145,9 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), in_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); auto in_stride = framework::stride(in->dims()); auto argmax_stride = framework::stride(argmax->dims()); diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc index ea0bb99f8d2e6b9a3b6eac7e298a89cd604ec49c..5203a5079c8b125f8dc156202f70ce76711a1e30 100644 --- a/paddle/operators/row_conv_op.cc +++ b/paddle/operators/row_conv_op.cc @@ -124,7 +124,8 @@ $$ }; template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -169,7 +170,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -251,7 +253,8 @@ class RowConvGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, ops::RowConvGradOp); -REGISTER_OP_CPU_KERNEL(row_conv, - ops::RowConvKernel); REGISTER_OP_CPU_KERNEL( - row_conv_grad, ops::RowConvGradKernel); + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index e0d7ebda7e76919f2d9be702a5399793278ccc43..3fc5eabcf51aea5c4237da865341c1d9e896dc3f 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -292,7 +292,8 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, } // namespace template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -327,7 +328,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -347,7 +349,7 @@ class RowConvGradKernel : public framework::OpKernel { size_t *idx = batch_indices.data(); auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + math::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -402,7 +404,8 @@ class RowConvGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(row_conv, - ops::RowConvKernel); -REGISTER_OP_GPU_KERNEL( - row_conv_grad, ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h index 525e83908d86fb6ff0c4ee48257b39a836ea9f97..80912ad8f73b3581efa9e263427e99304208d581 100644 --- a/paddle/operators/row_conv_op.h +++ b/paddle/operators/row_conv_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index e5c10fec4d840c58a74758a65ddfa93421ab4827..d848be823e602e595f66138f4b5dfd6e38dd85a1 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -75,8 +75,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); -REGISTER_OP_CPU_KERNEL(scale, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 0d707751598e65bc56bf73a435c10b4acd6d8ed0..0c7980430f31e2720c7af97aa14cf146c7dfc009 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -14,8 +14,10 @@ #include "paddle/operators/scale_op.h" -REGISTER_OP_GPU_KERNEL( - scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index 4931294c9d3661f4c53798bd0895a5cd38ae4501..02a8c97a83f5b6f95bbd4079c453dfdc7b7c1481 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 3b32ae2fb77a5d3d4c558742ec469c74d15eee07..6b43a1389f98bf268cb3b70d7e61409f361e0063 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index b862056ad400290a60e8a75a23dceeb1d4422ea4..ede9754697429a4d24c51cf494b0ea8f4e408b44 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -148,8 +148,9 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, seq_expand_grad, ops::SeqExpandOpGrad); -REGISTER_OP_CPU_KERNEL(seq_expand, - ops::SeqExpandKernel); +REGISTER_OP_CPU_KERNEL( + seq_expand, + ops::SeqExpandKernel); REGISTER_OP_CPU_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu index f1e4b82a76e628c4d9fb83bc93f3dcfd2f98ea5b..8e67ce9ccb29497a957508a9ecdc6b810a7de543 100644 --- a/paddle/operators/seq_expand_op.cu +++ b/paddle/operators/seq_expand_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/seq_expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(seq_expand, - ops::SeqExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + seq_expand, + ops::SeqExpandKernel); +REGISTER_OP_CUDA_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 4ef0d02cf85c43e95335660be65a67df66b4f55c..fbee0db454f9701e3f58a41008efd24e728d0600 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -23,7 +23,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template +template class SeqExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -50,7 +51,7 @@ class SeqExpandKernel : public framework::OpKernel { Eigen::TensorMap> out_t(out_data, scale, element_len); Eigen::array cast({{scale, 1}}); - out_t.device(place) = x_t.broadcast(cast); + out_t.device(*place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; } @@ -69,7 +70,7 @@ class SeqExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template +template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index d1de0b444712a8c304c33bd194e306dfe3c41f02..9c7e5456e8238af70f920aaaa9cc652d5d12d3e9 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -129,7 +129,7 @@ REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, sequence_concat_grad, ops::SequenceConcatGradOp); REGISTER_OP_CPU_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); + ops::SequenceConcatOpKernel); REGISTER_OP_CPU_KERNEL( sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatGradOpKernel); diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc index 9ca99c2258f547e6f9c23be0d394bc3ea2bb6678..144bdb5af635b0cb75bcd1f654700041186dae46 100644 --- a/paddle/operators/sequence_concat_op.cu.cc +++ b/paddle/operators/sequence_concat_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_GPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 09212070aa90b0f080f6140a312924229162aaec..8445224f46aba6110280783c9080ed4691266b8b 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector ins, const size_t level) { return out_lod; } -template +template class SequenceConcatOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel { } }; -template +template class SequenceConcatGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index c5533732d44737bb8cc71fd8ac46f3c36c72ada1..f5c4f1c13331f45183d2810a95f773ad52aca13b 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -179,9 +179,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); REGISTER_OP_CPU_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc index c8136dbcb35be4f1236dddc3d24546f9d91670c8..eacba79ace3e60a408d5f5e21a6fe2658da56ca7 100644 --- a/paddle/operators/sequence_conv_op.cu.cc +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -15,10 +15,11 @@ #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index b8fbe2647c4338a2fa16aa655ebab64dd8d5417d..bb584b7bfa5fb8f6eb0a452468d24ca034be6f1b 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - math::SetConstant set_zero; - set_zero(context.device_context(), &col, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); - math::ContextProjectFunctor seq_project_functor; + math::ContextProjectFunctor seq_project_functor; - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, false, filter, false, - static_cast(1.0), out, static_cast(0.0)); + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); } }; -template +template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); int sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); // use col_shape in the im2col calculation framework::DDim col_shape = {in->dims()[0], sequence_width * context_length}; @@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel { if (in_g || filter_g || (padding_trainable && padding_data_g)) { col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - set_zero(context.device_context(), &col, static_cast(0)); - math::matmul(context.device_context(), *out_g, false, *filter, - true, T(1.0), &col, T(1.0)); + set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); } - math::ContextProjectFunctor seq_project_functor; - math::ContextProjectGradFunctor seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - set_zero(context.device_context(), in_g, static_cast(0)); + set_zero(dev_ctx, in_g, static_cast(0)); - seq_project_grad_functor(context.device_context(), *in_g, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, true, - padding_data_g, &col); + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); } if (padding_trainable && padding_data_g) { padding_data_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), padding_data_g, static_cast(0)); + set_zero(dev_ctx, padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_grad_functor(context.device_context(), *input, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, - padding_data_g, &col); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); } if (filter_g) { filter_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_g, static_cast(0)); + set_zero(dev_ctx, filter_g, static_cast(0)); Tensor filter_grad = *filter_g; LoDTensor out_grad = *out_g; @@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, true, out_grad, - false, T(1.0), &filter_grad, T(1.0)); + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); } } }; diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index bfda8649cda7ae2da96211dc64aad20c77bf715e..3526e45a1b6565bc21413d381d15c02f08c587bd 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); + sequence_pool, + ops::SequencePoolKernel); REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu index 66850772d501f873cf754205c19e9d0c0090370a..fcd65084353744dc836ff1dc5a3aa4b03a205130 100644 --- a/paddle/operators/sequence_pool_op.cu +++ b/paddle/operators/sequence_pool_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index 7f136d8cf0e1eaae7b4de32988b60ae8a5034cc6..7519aa1d7208b9832f7a3d3afbc59a2eb4e8e13a 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -30,7 +30,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel { auto lod_level_0 = lod[0]; out->mutable_data(context.GetPlace()); - + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; + math::MaxSeqPoolFunctor max_pool; auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(context.device_context(), *in, out, index); + max_pool(dev_ctx, *in, out, index); return; } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { Tensor in_t = in->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel { } }; -template +template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel { int64_t w = in->numel() / dims[0]; in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; + math::MaxSeqPoolGradFunctor max_pool_grad; auto* index = context.Input("MaxIndex"); - max_pool_grad(context.device_context(), *out_g, *index, in_g); + max_pool_grad(dev_ctx, *out_g, *index, in_g); return; } if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context.device_context(), in_g, 0); + math::SetConstant functor; + functor(dev_ctx, in_g, 0); } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 255683a572c0e8d54791cb0c905d85239920d992..481db8f9e548de68c102210035d4ff037ab56261 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -125,7 +125,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu index a9f59dadba74d900fa5cc0601fb5b264ea19e34d..43a21d619f4116874c329eb968f09dc230975c05 100755 --- a/paddle/operators/sequence_slice_op.cu +++ b/paddle/operators/sequence_slice_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 428ef556daa248a918f58dde608dc024144e773c..14bcaebbb402cb47507f1bf60035bc2d37f9baf7 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -39,7 +39,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, return out_lod; } -template +template class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -108,7 +108,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { } }; -template +template class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -143,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); auto out_grad_stride = framework::stride(out_grad->dims()); diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 32c15025660ebf0baf317e269a33c047e6844219..37d5452e6ba59411f9ab2e1460fc8584583f0321 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -99,7 +99,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc index 7023795a3b5777c250a9323a304a54849d763e9e..5f65b4daf97cf025b975d2d95212375b5fca01f8 100644 --- a/paddle/operators/sequence_softmax_op.cu.cc +++ b/paddle/operators/sequence_softmax_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) -REGISTER_OP_GPU_KERNEL( + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 1b68dd0662ddfffc57b187945fe131e202c55174..e889e88cb34719b6648e3032754645fbb2807741 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceSoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel { framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxFunctor()(ctx.device_context(), &x_i, &out_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; -template +template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()(ctx.device_context(), &out_i, - &out_grad_i, &x_grad_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); } } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 5576d7b8be060a3c58cb18ed667041562cf853b8..121bf60b27c62c1b0dd4c34c12962b7098e29ae2 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -62,8 +62,8 @@ $$param\_out = param - learning\_rate * grad$$ }; template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -90,13 +90,14 @@ struct SparseSGDFunctor { } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 7b6c5ec30628b521b594ceaa3b7f1e0e03e497e4..a3c0db7e50ecaabd6d4b83c43e5436e6be491676 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -41,8 +41,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -62,21 +62,19 @@ struct SparseSGDFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<(context) - .stream()>>>(in_data, in_rows.data(), - learning_rate.data(), out_data, - in_row_numel); + SparseSGDFunctorKernel<<>>( + in_data, in_rows.data(), learning_rate.data(), out_data, + in_row_numel); } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 78b595fc6c63d775b627f23cafa9458f1dadd4e5..c920025a91cd0b68019bcb05558398093f31e206 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output); }; -template +template class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class SGDOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*grad); auto o = framework::EigenVector::Flatten(*param_out); auto lr = framework::EigenVector::Flatten(*learning_rate); - auto place = ctx.GetEigenDevice(); + auto& place = + *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); o.device(place) = p - lr.broadcast(grad_dsize) * g; @@ -56,8 +57,9 @@ class SGDOpKernel : public framework::OpKernel { // It's better to find a more elegant solution. PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); - SparseSGDFunctor functor; - functor(ctx.device_context(), *grad, *learning_rate, param_out); + SparseSGDFunctor functor; + functor(ctx.template device_context(), *grad, + *learning_rate, param_out); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index 782f4c79361b3255cc686ec3b1edf31ce37f5a2d..b8a1bf122a78df1e0d8291c77a61b3f917d40960 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -142,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu index 32a39956a14a206373b7b4c141dad19577d171f0..1b569c93ed9568a26824defef0d25bb1c3dadad4 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::GPUPlace, float>); -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::GPUPlace, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 2a9d9bbc77266c8ecfba82663c396bbd8e4dbe27..8fe7c5ba8224f8dac5de8d7ee772ebc71f987d69 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template +template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -32,7 +32,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto labels = framework::EigenVector::Flatten(*Labels); auto out = framework::EigenVector::Flatten(*Out); - auto place = context.GetEigenDevice(); + auto &place = *context.device_context().eigen_device(); // term1 = max(x, 0) auto term1 = x.cwiseMax(static_cast(0)); @@ -46,7 +46,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { }; // dX = sigmoid(X) - labels -template +template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -62,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { auto labels = framework::EigenVector::Flatten(*Labels); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); dx.device(place) = dout * (sigmoid_x - labels); diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index 08bf2e4e7cc101a3bcc907d3b40ee82347b39f80..d5a7ccb77e7d9ad3a93702861dbab295c4ab5bce 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -67,5 +67,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker); -REGISTER_OP_CPU_KERNEL(sign, - ops::SignKernel); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu index 4d0638cb97d84bf650fb23e4d2a201adc51a4b68..9bc1c65d214ba8f988dec3b7b11da9e1ec3a6581 100644 --- a/paddle/operators/sign_op.cu +++ b/paddle/operators/sign_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/sign_op.h" -REGISTER_OP_GPU_KERNEL( - sign, paddle::operators::SignKernel); +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h index ab5cd4bac019d602c63ea51629fb85fa7e206841..2e476ed6658491b3dcec3cf1388ccc4a0813449c 100644 --- a/paddle/operators/sign_op.h +++ b/paddle/operators/sign_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class SignKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); eigen_out.device(place) = eigen_in.sign(); } }; diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index 50543fcc148698c42e15259ba20bdacdd50ac1af..56e8d9058fcc035c28e74daff778c4e034f46b44 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -138,7 +138,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); + smooth_l1_loss, + ops::SmoothL1LossKernel); REGISTER_OP_CPU_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu index 1c3172f43867741cd1f26979a366b2425f326321..8e94ebac644d1047920827250c4313c657b22ea0 100644 --- a/paddle/operators/smooth_l1_loss_op.cu +++ b/paddle/operators/smooth_l1_loss_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h index 39d0070b6c8909b8f433de48038240e851d9d6cf..1a70c9c63c340d66b6bf0db97cc8ab35a663f816 100644 --- a/paddle/operators/smooth_l1_loss_op.h +++ b/paddle/operators/smooth_l1_loss_op.h @@ -44,7 +44,7 @@ struct SmoothL1LossForward { T sigma2; }; -template +template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; @@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel { auto y = EigenVector::Flatten(*in1); auto diff = EigenVector::Flatten(*out0); - diff.device(place) = x - y; + diff.device(*place) = x - y; // multiply inside weight if (has_weight) { auto inside_weight = EigenVector::Flatten(*in2); // cache diff, reused in bp - diff.device(place) = diff * inside_weight; + diff.device(*place) = diff * inside_weight; } auto in_counts = in0->numel(); @@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel { context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); // apply smooth l1 forward - errors.device(place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); // multiply outside weight if (has_weight) { auto outside_weight = EigenVector::Flatten(*in3); - errors.device(place) = errors * outside_weight; + errors.device(*place) = errors * outside_weight; } auto loss = EigenVector::Flatten(*out1); // first dimension of 'X' is the number of samples @@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel { framework::make_ddim({static_cast(in0->dims()[0]), static_cast(in_counts / in0->dims()[0])}); auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(place) = errors_mat_view.sum(Eigen::array({{1}})); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); } }; @@ -114,7 +115,7 @@ struct SmoothL1LossBackward { T sigma2; }; -template +template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto in_dims = in2->dims(); auto counts = in2->numel(); @@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); // apply smooth l1 backwoard - diff.device(place) = EigenVector::Flatten(*in2).unaryExpr( + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( SmoothL1LossBackward(sigma2)); // compute weights @@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel { ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 - weights.device(place) = weights.constant(static_cast(1.0)); + weights.device(*place) = weights.constant(static_cast(1.0)); if (has_weight) { auto inside_weight = EigenMatrix::From(*in0, mat_dims); auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(place) = inside_weight * outside_weight; + weights.device(*place) = inside_weight * outside_weight; } // compute gradients @@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(place) = gradients; + x_grad.device(*place) = gradients; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(place) = -1 * gradients; + y_grad.device(*place) = -1 * gradients; } } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 93e0525badc26808f0dca70cc1153ac728f1fe9c..0988c83d43535d7ee1bcef87bf506e5db1a3ecc0 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -89,7 +89,8 @@ namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc index 013ace19ae3d4a1af29b570ba33fea3e4595fe5b..7b9882cbcfe1a0381541386f76867c6bb0f1fe55 100644 --- a/paddle/operators/softmax_op.cu.cc +++ b/paddle/operators/softmax_op.cu.cc @@ -16,7 +16,8 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax, - ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 44d1e63f1bb4798144218cd1caf01f133825bcff..0f8998b99e93b5ed6c9b43ad7adabc2d515c1ff1 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,11 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Y->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), X, Y); + math::SoftmaxFunctor()( + context.template device_context(), X, Y); } }; -template +template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,7 +47,8 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); - math::SoftmaxGradFunctor()(context.device_context(), Y, dY, dX); + math::SoftmaxGradFunctor()( + context.template device_context(), Y, dY, dX); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index b1faddac3fd21aaf817caf9d3e57e664f4e0e2d5..6100c63f9aba006d9739173a8a5a2fb398187e55 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, context.Attr("soft_label")); } }; @@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } else { const int64_t* label_data = labels->data(); - CrossEntropyGrad<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } } }; @@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index c4ab3f74b4b07d13957d99e01aa4868fac719f61..9c3431605b2f2285b2e7d71c5ff2f4a53c6c6f30 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); } }; @@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; auto out_grad_mat = EigenMatrix::From(*out_grad); auto logit_grad_mat = EigenMatrix::From(*logit_grad); - + auto& place = *context.template device_context() + .eigen_device(); if (context.Attr("soft_label")) { auto lbl_mat = EigenMatrix::From(*labels); - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); } else { - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc index 93d1fc3c44cbc146c945c51af1abe6494572d1ae..dbad0bbf68d7924cfba80721bb3294b7e0cfac00 100644 --- a/paddle/operators/split_op.cu.cc +++ b/paddle/operators/split_op.cu.cc @@ -14,5 +14,5 @@ limitations under the License. */ #include "paddle/operators/split_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(split, - ops::SplitOpKernel); +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h index fa26e5f677b18c84b45dd583004d02cab4c1d375..a38c435d531d7da2a1a60eb2c455bc1782c1cd4c 100644 --- a/paddle/operators/split_op.h +++ b/paddle/operators/split_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SplitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index bec2a2c18ae8da892ee7d71f45afe53c887c0f57..50bc6da196e642e3860874cfb883390dd2e93215 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -115,7 +115,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu index 3fe62f1a9cb56722ea544b0fed052ac384e799aa..ecc82ed1e49501b05e0cf54e5b44114db150a427 100644 --- a/paddle/operators/squared_l2_distance_op.cu +++ b/paddle/operators/squared_l2_distance_op.cu @@ -17,9 +17,9 @@ #include "paddle/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_GPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h index 259ef4029646914f83a112b9c6d7fdf8401483f6..5bd5f4819a35966b73038f433d38c06031e18715 100644 --- a/paddle/operators/squared_l2_distance_op.h +++ b/paddle/operators/squared_l2_distance_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SquaredL2DistanceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel { auto sub_result = EigenMatrix::From(*out0); auto z = EigenVector::Flatten(*out1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x_dims = x.dimensions(); auto y_dims = y.dimensions(); // buffer the substraction result @@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel { } }; -template +template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result; // propagate back to input - auto eigen_place = context.GetEigenDevice(); + auto& eigen_place = + *context.template device_context().eigen_device(); if (x_g) { x_g->mutable_data(context.GetPlace()); // eigen matrix diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3c10e6159f44bc8c21b1e79aefaa962c7a2b64ed..3cff61a02f71fadf99f73787e2b2c179f7d441a8 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -72,7 +72,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); + ops::SquaredL2NormKernel); REGISTER_OP_CPU_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu index d384e9c28c9150fa901404478739ff809f29126f..2d6567d090a96a43cbda203fb8176041d719e55f 100644 --- a/paddle/operators/squared_l2_norm_op.cu +++ b/paddle/operators/squared_l2_norm_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); -REGISTER_OP_GPU_KERNEL( + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h index 48d7b1c2d56882f04330dbf27b0a92e37cb8874c..0ced7e7d70ab3627a337d70890db6842ba0f7768 100644 --- a/paddle/operators/squared_l2_norm_op.h +++ b/paddle/operators/squared_l2_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(square(X)) -template +template class SquaredL2NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); - out.device(place) = x.square().sum(); + out.device(*place) = x.square().sum(); } }; // dX = X -template +template class SquaredL2NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); Eigen::DSizes x_dsize(X->numel()); - dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 744b2fe3f297ad84d4e2d524ecf180c130c85658..cd52672f78e3e5826e8dfff92fb8e4668c5c6dcd 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -195,7 +195,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu index 5c30dd4d470c2e0acecef18524a4a81f9eb786a9..873155076c179d5280a418e25fd39fdaf4b0a2b2 100644 --- a/paddle/operators/sum_op.cu +++ b/paddle/operators/sum_op.cu @@ -13,7 +13,8 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index ed6c80ce60da44de1a749ce075cfbca7d53032e0..eaa36aa1aea53e0b37ef6c578d8bb1cda230ded0 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -43,12 +43,14 @@ class SumKernel : public framework::OpKernel { auto result = EigenVector::Flatten(*out); if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.device_context(), out, 0.0); + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); } - math::SelectedRowsAddToTensor functor; - auto place = context.GetEigenDevice(); + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); // If in_place, just skip the first tensor for (int i = in_place ? 1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { @@ -60,7 +62,7 @@ class SumKernel : public framework::OpKernel { result.device(place) = result + in; } else if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); - functor(context.device_context(), in_t, out); + functor(context.template device_context(), in_t, out); } else { PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); } @@ -82,14 +84,14 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->mutable_data(context.GetPlace()); - math::SelectedRowsAddTo functor; + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), in_vars[i]->Get().height()); - functor(context.device_context(), in_vars[i]->Get(), - offset, out); + functor(context.template device_context(), + in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); } } else if (out_var->IsType()) { @@ -112,7 +114,8 @@ class SumKernel : public framework::OpKernel { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); - result.device(context.GetEigenDevice()) = result + in; + result.device(*context.template device_context() + .eigen_device()) = result + in; } } } diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7851c71bbe9fe73402968ce14f6db0df523cd6d3..453bd07267e3a6e33211117368dd9aff10a9e23f 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index bc8563717a21bd5b3d8fc87f689657990066957b..e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 94de3d5069017a7ca818e246ad574c4db92d8006..de5ff561add6183828f6bb4c44e30f6bb13079fa 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -112,8 +112,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, ops::TransposeOpGrad); -REGISTER_OP_CPU_KERNEL(transpose, - ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc index af3f581462c919bbd2dd1067e536cc638f9c267d..7d23f1493ec2d548438aeb2493fda8a4ff8c6e80 100644 --- a/paddle/operators/transpose_op.cu.cc +++ b/paddle/operators/transpose_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(transpose, - ops::TransposeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index e296032f4147f9f8338148f9e4fef100c7cf816f..d995271a6be3266e05c742ab18c34636da384e66 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -20,33 +20,33 @@ namespace paddle { namespace operators { -template -inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + math::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + math::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + math::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + math::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + math::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + math::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: @@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, } } -template +template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = context.Attr>("axis"); int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; -template +template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel { } int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index fff1dc7ccddf1d8cee0c8311828fd38888283cd1..2a49ee471f67cda87415db0e1440a4992c0cd088 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 8b20bb8287807aca673817c503fee6db04b55753..cfe9d293cff2108cf25749d0e78e2e86e6e198a5 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 89c48e071cf351f7d7b9cf26a5d4989af291da57..49df2a530cd0c5c13f08db4b1e7db62679081e9b 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -135,9 +135,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 18aafb7dc74ed474ed3ec5e8a388ecdb71b9a8f5..9b002e35c434db561114dbbafce2f3f934daaf6a 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,9 +15,10 @@ limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 243eb7e532c5149db4fb1b381fd8664ae4bdd81a..ee18b118c957c7933890000bbe934e6ffdc9e56f 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -32,15 +32,16 @@ class UnpoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; - set_zero(context.device_context(), out, static_cast(0)); + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,15 +57,14 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, - *out_grad, in_x_grad); + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; } // namespace operators diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index ae4f0bf896dce013d301aa0bf9f732f0fd9cc6bf..2c7f96421621b9a34d1ec96c13d9c354a0d4012c 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,12 +15,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template <> -Eigen::DefaultDevice* DeviceContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return reinterpret_cast(this)->eigen_device(); -} - CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -37,12 +31,6 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice* -DeviceContext::GetEigenDevice() const { - return reinterpret_cast(this)->eigen_device(); -} - class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index ef5f19214d9ccb23b9c946bee28cb764122bd7cd..596d9d0bba420a47fc10cc9dd96a755daa35dbac 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -27,24 +27,11 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; - virtual void Wait() const {} }; @@ -62,10 +49,6 @@ class CPUDeviceContext : public DeviceContext { }; #ifdef PADDLE_WITH_CUDA -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; class EigenCudaStreamDevice; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 8bf5174c4a5579f6f5602dd38e5a87ed3ef444a7..4893cd92f6a74f7992c279ebd51232049f29e853 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -22,9 +22,8 @@ TEST(Device, Init) { int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); - Eigen::GpuDevice* gpu_device = - device_context->template GetEigenDevice(); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h index bb9d59ec0a18ce013632f128c9b5d230255f1ac4..148ebaed3d893cd03df8cf27b1309d07afd9aa4a 100644 --- a/paddle/platform/transform.h +++ b/paddle/platform/transform.h @@ -31,7 +31,7 @@ namespace paddle { namespace platform { // Transform on host or device. It provides the same API in std library. -template +template struct Transform { template void operator()(const DeviceContext& context, InputIter first, InputIter last, @@ -45,16 +45,16 @@ struct Transform { }; template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { std::transform(first, last, result, op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { std::transform(first1, last1, first2, result, op); @@ -63,27 +63,25 @@ struct Transform { #ifdef __NVCC__ template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first), details::DevPtrCast(last), details::DevPtrCast(result), op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first1), details::DevPtrCast(last1), details::DevPtrCast(first2), details::DevPtrCast(result), op); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index c76cab80e4b0e8df98a7be15f86699cfb6f93af2..d36eac8379ebedb284b36012a46186cd3ac43b91 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) { using namespace paddle::platform; CPUDeviceContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -54,7 +54,7 @@ TEST(Transform, GPUUnary) { float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); @@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - Transform trans; + Transform trans; CPUDeviceContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { @@ -84,7 +84,7 @@ TEST(Transform, GPUBinary) { CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));