diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 5aaaf993323c2d4dbef688d0977ec6374fde6512..68346001b1fa3e4ff5823f822fa41c9ef8150f2e 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -210,7 +210,8 @@ set(DEPS_OPS
     save_op
     load_op
     send_op
-    recv_op)
+    recv_op
+    detection_output_op)
 
 if(WITH_DISTRIBUTE)
   add_subdirectory(detail)
@@ -233,6 +234,7 @@ op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
+op_library(detection_output_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc
index 2bf0ef44141ee26dbfcf423609ef95aa0a49fcd6..109cf7d4c76438aacd07506ad69e08925ca87f9c 100644
--- a/paddle/operators/detection_output_op.cc
+++ b/paddle/operators/detection_output_op.cc
@@ -86,5 +86,5 @@ REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::Detection_output_Op,
                              ops::Detection_output_OpMaker);
 REGISTER_OP_CPU_KERNEL(
     detection_output,
-    ops::Detection_output_Kernel<paddle::platform::CPUPlace, float>,
-    ops::Detection_output_Kernel<paddle::platform::CPUPlace, double>);
+    ops::Detection_output_Kernel<paddle::platform::CPUDeviceContext, float>,
+    ops::Detection_output_Kernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc
index 8edcfc0be3547e26612fd72edefc99da2b0c6730..e65b2afd213f7ed10f5bc855b00e0b188df45c27 100644
--- a/paddle/operators/detection_output_op.cu.cc
+++ b/paddle/operators/detection_output_op.cu.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #include "paddle/operators/detection_output_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
+REGISTER_OP_CUDA_KERNEL(
     detection_output,
-    ops::Detection_output_Kernel<paddle::platform::GPUPlace, float>,
-    ops::Detection_output_Kernel<paddle::platform::GPUPlace, double>);
+    ops::Detection_output_Kernel<paddle::platform::CUDADeviceContext, float>,
+    ops::Detection_output_Kernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h
index 510d82251d8c96ad06b7bbbd301ec345d269ab50..733ec3b0ede2d3c344d877881197235089d5ab74 100644
--- a/paddle/operators/detection_output_op.h
+++ b/paddle/operators/detection_output_op.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/operators/strided_memcpy.h"
 namespace paddle {
 namespace operators {
-template <typename Place, typename T>
-inline void transpose_fun(const platform::DeviceContext& context,
+template <typename DeviceContext, typename T>
+inline void transpose_fun(const framework::ExecutionContext& context,
                           const framework::Tensor& src,
                           framework::Tensor* dst) {
   int input_nums = src.dims()[0];
@@ -36,17 +36,18 @@ inline void transpose_fun(const platform::DeviceContext& context,
     framework::Tensor in_p_tensor_transpose;
     in_p_tensor_transpose.mutable_data<T>(shape, context.GetPlace());
     std::vector<int> shape_axis({0, 1, 3, 4, 2});
-    math::Transpose<Place, T, 5> trans5;
-    trans5(context, in_p_tensor, &in_p_tensor_transpose, shape_axis);
+    math::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context.template device_context<DeviceContext>(), in_p_tensor,
+           &in_p_tensor_transpose, shape_axis);
     auto dst_stride = framework::stride(dst->dims());
     auto src_stride = framework::stride(in_p_tensor_transpose.dims());
-    StridedMemcpy<T>(context, in_p_tensor_transpose.data<T>(), src_stride,
-                     in_p_tensor_transpose.dims(), dst_stride,
+    StridedMemcpy<T>(context.device_context(), in_p_tensor_transpose.data<T>(),
+                     src_stride, in_p_tensor_transpose.dims(), dst_stride,
                      dst->data<T>() + offset);
     offset += in_p_tensor_transpose.dims()[4] * src_stride[4];
   }
 }
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class Detection_output_Kernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -87,11 +88,12 @@ class Detection_output_Kernel : public framework::OpKernel<T> {
     framework::Tensor conf_cpu;
     framework::Tensor priorbox_cpu;
     const T* priorbox_data = in_priorbox->data<T>();
-    transpose_fun<Place, T>(context.device_context(), *in_loc, &loc_tensor);
-    transpose_fun<Place, T>(context.device_context(), *in_conf, &conf_tensor);
+    transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
+    transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
     conf_tensor.Resize(conf_shape_softmax);
-    math::SoftmaxFunctor<Place, T>()(context.device_context(), &conf_tensor,
-                                     &conf_tensor);
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), &conf_tensor,
+        &conf_tensor);
     T* loc_data = loc_tensor.data<T>();
     T* conf_data = conf_tensor.data<T>();
     if (platform::is_gpu_place(context.GetPlace())) {
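Note on the migration this diff tracks: the kernel is now templated on a DeviceContext type instead of a Place, it pulls the concrete context out of the framework::ExecutionContext via context.template device_context<DeviceContext>(), and the GPU kernel registers through REGISTER_OP_CUDA_KERNEL rather than REGISTER_OP_GPU_KERNEL. The standalone sketch below only mirrors that shape with invented stand-in types (FakeExecutionContext, FakeCPUDeviceContext, FakeCUDADeviceContext, FakeKernel); it is not Paddle code and does not use Paddle's real API.

// Minimal standalone C++ sketch of the DeviceContext-templated kernel pattern.
// Every name below is a hypothetical stand-in, not a PaddlePaddle class.
#include <iostream>
#include <string>

struct FakeCPUDeviceContext {
  std::string name() const { return "CPU"; }
};

struct FakeCUDADeviceContext {
  std::string name() const { return "CUDA"; }
};

// Stand-in for framework::ExecutionContext: wraps an untyped device context
// and hands it back as whatever context type the kernel was compiled for.
class FakeExecutionContext {
 public:
  explicit FakeExecutionContext(const void* device_ctx) : device_ctx_(device_ctx) {}

  template <typename DeviceContext>
  const DeviceContext& device_context() const {
    return *static_cast<const DeviceContext*>(device_ctx_);
  }

 private:
  const void* device_ctx_;
};

// Stand-in for an op kernel: templated on the device-context type, so one
// kernel body can back both the CPU and the CUDA registration, as in the diff.
template <typename DeviceContext, typename T>
struct FakeKernel {
  void Compute(const FakeExecutionContext& context) const {
    // Mirrors `context.template device_context<DeviceContext>()` above.
    const DeviceContext& dev_ctx =
        context.template device_context<DeviceContext>();
    std::cout << "kernel on " << dev_ctx.name() << ", element size "
              << sizeof(T) << " bytes\n";
  }
};

int main() {
  FakeCPUDeviceContext cpu_ctx;
  FakeCUDADeviceContext cuda_ctx;
  FakeKernel<FakeCPUDeviceContext, float>().Compute(FakeExecutionContext(&cpu_ctx));
  FakeKernel<FakeCUDADeviceContext, double>().Compute(FakeExecutionContext(&cuda_ctx));
  return 0;
}

The point of the indirection is that the kernel body never names a concrete device: the registration site (REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL in the real code) chooses which DeviceContext instantiation gets built.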