From 738585476dfa3ad55a660e6c2090cb6e22d3aea7 Mon Sep 17 00:00:00 2001
From: Abhinav Arora
Date: Tue, 1 May 2018 13:05:49 -0700
Subject: [PATCH] Fix more CPPLint issues in fluid/operators/math (#10276)

* Fix CPPLint issues in lstm_cpu_kernel.h
* Fix CPPLint issues in math/math_function_test
* Fix CPPLint issues in math/math_function_test
* Fix CPPLint issues in math/concat.cc
* Fix CPPLint issues in math/concat.cc
* Fix CPPLint issues in math/concat.cc
* Fix CPPLint issues in math/gru_cpu_kernel
* Fix CPPLint issues in math/selected_rows_functor_test.cu
* Fix compile error
* Fix compile error
---
 paddle/fluid/operators/concat_op.h            |   2 +-
 paddle/fluid/operators/math/concat.cc         |  13 +-
 paddle/fluid/operators/math/concat.cu         |  14 +-
 paddle/fluid/operators/math/concat.h          |   3 +-
 .../operators/math/detail/gru_cpu_kernel.h    |  51 +--
 .../operators/math/detail/lstm_cpu_kernel.h   |  71 ++--
 .../operators/math/math_function_test.cu      | 321 +++++++++---------
 .../math/selected_rows_functor_test.cu        | 123 ++++---
 8 files changed, 313 insertions(+), 285 deletions(-)
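Most of the edits in this patch fall into two cpplint categories. `runtime/references` forbids mutable reference parameters, so output arguments such as `std::vector<framework::Tensor>& outputs` become pointers and call sites pass `&outputs`; `readability/casting` forbids C-style casts, which become `reinterpret_cast`. A minimal sketch of the first pattern, with an illustrative function that is not part of the patch:

    #include <cstddef>
    #include <vector>

    // Before (flagged by cpplint's runtime/references):
    //   void Scale(std::vector<float>& values);
    // After: the output parameter is a pointer, so the mutation is explicit
    // at the call site, which now reads Scale(&values).
    void Scale(std::vector<float>* values) {
      for (std::size_t i = 0; i < values->size(); ++i) {
        (*values)[i] *= 2.0f;
      }
    }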
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 92c8ab6d9ff..1b1b8bf5ed9 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -87,7 +87,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index bfce56f9fdc..cc69212466b 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/concat.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -70,20 +71,20 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
+    int num = outputs->size();
     int input_rows = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
     int input_cols = 0;
 
-    std::vector<int64_t> output_cols(outputs.size());
+    std::vector<int64_t> output_cols(outputs->size());
     for (int i = 0; i < num; ++i) {
-      int t_cols = outputs[i].numel() / input_rows;
+      int t_cols = outputs->at(i).numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
@@ -95,7 +96,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs[j].data<T>() + k * col_len;
+        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
         memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
                      sizeof(T) * col_len);
         col_idx += col_len;
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 226a879bce5..4285d38dcd6 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -202,16 +204,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  std::vector<framework::Tensor>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int o_num = outputs.size();
+    int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = outputs->at(0).dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
 
-    int out_col = outputs[0].numel() / out_row;
+    int out_col = outputs->at(0).numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;
@@ -221,13 +223,13 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs[i].numel() / out_row;
+      int t_col = outputs->at(i).numel() / out_row;
       if (sameShape) {
         if (t_col != out_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs[i].data<T>();
+      outputs_ptr[i] = outputs->at(i).data<T>();
     }
 
     T** dev_out_gpu_data =
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index c0e983e4aa7..041ce8bf8a2 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
 
@@ -56,7 +57,7 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>& outputs);
+                  const int axis, std::vector<framework::Tensor>* outputs);
 };
 
 }  // namespace math
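A note on the concat changes above: with `outputs` now a pointer, element access moves from `outputs[i]` to `outputs->at(i)`, which also adds a bounds check. The two spellings compared, in a hypothetical function that is not from the patch:

    #include <vector>

    void ReadFirst(std::vector<int>* outs) {
      int unchecked = (*outs)[0];  // same semantics as the old reference syntax
      int checked = outs->at(0);   // throws std::out_of_range if out of bounds
      (void)unchecked;
      (void)checked;
    }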
diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
index 1e5ff8ef46d..26e6adafdfc 100644
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
@@ -89,14 +89,14 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
   __m256 r_value_reset_gate;
   __m256 r_value_reset_output;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_reset_gate = reset_gate[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
 
     op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
@@ -104,7 +104,7 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
 
     update_gate[i] = r_value_update_gate;
     reset_gate[i] = r_value_reset_gate;
-    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+    (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output;
   }
 #endif
 }
@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   __m256 r_value_frame_state;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
   __m256 r_output;
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_frame_state = frame_state[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
 
     op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
                     r_output, active_node);
 
     frame_state[i] = r_value_frame_state;
-    ((__m256 *)output_value)[i] = r_output;
+    (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
   }
 #endif
 }
@@ -284,20 +284,22 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
   __m256 r_out_grad;
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
-  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *frame_state_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
+  __m256 *frame_state_grad =
+      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
     r_frame_state_value = frame_state_value[i];
-    r_out_grad = ((__m256 *)output_grad)[i];
+    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
    }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }
 
     op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
@@ -307,7 +309,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
   __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
-  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *reset_gate_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size);
+  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
@@ -338,13 +341,13 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     r_reset_gate_value = reset_gate_value[i];
 
     if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
     }
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }
 
     op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
@@ -354,7 +357,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     update_gate_grad[i] = r_update_gate_grad;
     reset_gate_grad[i] = r_reset_gate_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
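The GRU kernels view a flat `T*` gate buffer as packed `__m256` values, eight floats per register, which is why every pointer is cast before indexing. The change above only swaps the C-style cast for `reinterpret_cast`; behavior is identical. A minimal sketch of the access pattern, assuming an AVX build and 32-byte-aligned data (the function name is illustrative):

    #include <immintrin.h>

    // Walk a float buffer eight lanes at a time, as the GRU kernels do.
    void DoubleInPlace(float *data, int frame_size) {
      __m256 *lanes = reinterpret_cast<__m256 *>(data);  // was: (__m256 *)data
      for (int i = 0; i < frame_size / 8; i++) {
        lanes[i] = _mm256_mul_ps(lanes[i], _mm256_set1_ps(2.0f));
      }
    }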
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 6ad77830fd7..19f6b213aa3 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_state_atv;
   __m256 r_out;
 
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -175,13 +177,13 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
 
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
 
     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
@@ -192,9 +194,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     value_ig[i] = r_value_ig;
     value_fg[i] = r_value_fg;
     value_og[i] = r_value_og;
-    ((__m256 *)value.state_value)[i] = r_state;
-    ((__m256 *)value.state_active_value)[i] = r_state_atv;
-    ((__m256 *)value.output_value)[i] = r_out;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
   }
 #endif
 }
@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_checkFGrad;
   __m256 r_checkOGrad;
 
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  __m256 *grad_in = (__m256 *)grad.gate_grad;
-  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
-  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
-  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -242,16 +246,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
 
-    r_state = ((__m256 *)value.state_value)[i];
-    r_state_atv = ((__m256 *)value.state_active_value)[i];
-    r_output_grad = ((__m256 *)grad.output_grad)[i];
-    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
 
     op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
@@ -264,15 +268,18 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     grad_ig[i] = r_grad_ig;
     grad_fg[i] = r_grad_fg;
     grad_og[i] = r_grad_og;
-    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
 
     if (grad.prev_state_grad)
-      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
     if (value.prev_state_value) {
-      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
-      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
     }
-    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
   }
 #endif
 }
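One detail worth noting in the LSTM backward kernel: statements such as `(reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;` rely on the GCC/Clang vector extensions, which define arithmetic operators directly on `__m256`. The portable spelling goes through the intrinsic instead; a sketch under the same AVX assumption:

    #include <immintrin.h>

    // Equivalent of `lanes[i] += delta` without vector-extension operators.
    void Accumulate(__m256 *lanes, int i, __m256 delta) {
      lanes[i] = _mm256_add_ps(lanes[i], delta);
    }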
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index 7986326e96b..b84bb997493 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -23,32 +23,29 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
 }
 
 TEST(math_function, notrans_mul_trans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
 
-  paddle::operators::math::matmul<CUDADeviceContext, float>(
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -59,39 +56,38 @@ TEST(math_function, notrans_mul_trans_fp32) {
 }
 
 TEST(math_function, notrans_mul_trans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
 
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
-  out_gpu.mutable_data<float16>({2, 2}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);
 
-  paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
-      float16(0));
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, false, input2_gpu, true,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
 
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
@@ -100,32 +96,29 @@ TEST(math_function, notrans_mul_trans_fp16) {
 }
 
 TEST(math_function, trans_mul_notrans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
       context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -141,39 +134,38 @@ TEST(math_function, trans_mul_notrans_fp32) {
 }
 
 TEST(math_function, trans_mul_notrans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
 
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input1, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
-  out_gpu.mutable_data<float16>({3, 3}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);
 
-  paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
-      float16(0));
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>(
+      context, input1_gpu, true, input2_gpu, false,
+      paddle::platform::float16(1), &out_gpu, paddle::platform::float16(0));
 
-  TensorCopySync(out_gpu, cpu_place, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
@@ -187,19 +179,16 @@ TEST(math_function, trans_mul_notrans_fp16) {
 }
 
 TEST(math_function, gemm_notrans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
@@ -214,9 +203,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3,
8 * sizeof(float));
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -224,7 +213,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
 
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -244,19 +233,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
 }
 
 TEST(math_function, gemm_notrans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -266,26 +252,31 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
 
-  paddle::operators::math::gemm<CUDADeviceContext, float16>(
-      context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
-      c + 1, 4);
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
+                                paddle::platform::float16>(
+      context, false, false, m, n, k, paddle::platform::float16(1), a, 3, b + 1,
+      4, paddle::platform::float16(1), c + 1, 4);
 
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -305,19 +296,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
 }
 
 TEST(math_function, gemm_trans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
@@ -332,9 +320,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
@@ -342,7 +330,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
       context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
 
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
@@ -356,19 +344,16 @@ TEST(math_function, gemm_trans_cublas_fp32) {
 }
 
 TEST(math_function, gemm_trans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -378,26 +363,31 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopySync(input1, gpu_place, &input1_gpu);
-  TensorCopySync(input2, gpu_place, &input2_gpu);
-  TensorCopySync(input3, gpu_place, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
 
-  paddle::operators::math::gemm<CUDADeviceContext, float16>(
-      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
-      c + 1, 4);
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext,
+                                paddle::platform::float16>(
+      context, false, true, m, n, k, paddle::platform::float16(1), a, 3, b + 3,
+      3, paddle::platform::float16(1), c + 1, 4);
 
-  TensorCopySync(input3_gpu, cpu_place, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
@@ -412,24 +402,21 @@ TEST(math_function, gemm_trans_cublas_fp16) {
 }
 
 template <typename T>
 void GemvTest(int m, int n, bool trans) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor mat_a;
-  Tensor vec_b;
-  Tensor vec_c;
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor vec_b;
+  paddle::framework::Tensor vec_c;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
   T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
 
-  Tensor g_mat_a;
-  Tensor g_vec_b;
-  Tensor g_vec_c;
+  paddle::framework::Tensor g_mat_a;
+  paddle::framework::Tensor g_vec_b;
+  paddle::framework::Tensor g_vec_c;
   T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
   T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
   T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
@@ -441,14 +428,14 @@ void GemvTest(int m, int n, bool trans) {
     data_b[i] = static_cast<T>(i);
   }
 
-  TensorCopySync(mat_a, gpu_place, &g_mat_a);
-  TensorCopySync(vec_b, gpu_place, &g_vec_b);
+  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
+  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
 
-  paddle::operators::math::gemv<CUDADeviceContext, T>(
+  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
       context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
      g_data_b, 0., g_data_c);
 
-  TensorCopySync(g_vec_c, cpu_place, &vec_c);
+  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
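The test changes above remove function-scope `using namespace` directives (cpplint's `build/namespaces`), which is why every `Tensor`, `CPUPlace`, and `float16` now carries its full `paddle::framework::` or `paddle::platform::` prefix. Where the long names repeat heavily, a namespace alias or a using-declaration would be a lint-clean middle ground; a sketch (the alias name is illustrative, not from the patch):

    // Instead of: using namespace paddle::platform;   // flagged by cpplint
    namespace plat = paddle::platform;   // namespace alias: not flagged
    using paddle::platform::float16;     // using-declaration: narrower import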
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu
index 942d9b13fc1..e89b27855bd 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 TEST(selected_rows_functor, gpu_add) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   auto* out_value = output->mutable_value();
 
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
 
-  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
+  paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
+                                           float>
+      add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
 
   auto out_height = output->height();
@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
 
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  std::unique_ptr<Tensor> tensor2{new Tensor()};
-  tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor2{
+      new paddle::framework::Tensor()};
+  tensor2->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
 
-  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
+  paddle::operators::math::SelectedRowsAddTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
-  Tensor tensor2_cpu;
-  TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  paddle::framework::Tensor tensor2_cpu;
+  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
   ctx.Wait();
 
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
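A side note on the copy-back pattern in these tests: `TensorCopy(*out_value, cpu_place, ctx, &out_cpu)` enqueues the copy on the CUDA stream owned by `ctx`, so the `ctx.Wait()` that follows is what makes `out_cpu` safe to read on the host, whereas the `TensorCopySync` calls used elsewhere in this patch synchronize internally. The same idea in raw CUDA runtime terms, as a sketch rather than PaddlePaddle API:

    #include <cuda_runtime.h>

    // Async device-to-host copy: the host may read dst only after the stream
    // is synchronized, which is what ctx.Wait() does after TensorCopy.
    void CopyBack(float* dst, const float* src, size_t n, cudaStream_t stream) {
      cudaMemcpyAsync(dst, src, n * sizeof(float), cudaMemcpyDeviceToHost,
                      stream);
      cudaStreamSynchronize(stream);
    }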
@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) {
 }
 
 TEST(selected_rows_functor, gpu_add_to) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   output->set_height(height);
   auto* out_value = output->mutable_value();
 
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
 
-  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
+  paddle::operators::math::SelectedRowsAddTo<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
 
   auto out_height = output->height();
@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
 
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor;
+  paddle::operators::math::SelectedRowsAddToTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
-  Tensor tensor1_cpu;
-  TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
+  paddle::framework::Tensor tensor1_cpu;
+  paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
   ctx.Wait();
 
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
-- 
GitLab