提交 bcf260e1 编写于 作者: F fengjiayi

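Fix several unit tests: pass `true` as the trailing argument to the `TensorCopy` calls, drop the explicit `dev_ctx.Wait()` in `FetchOp`, and pass `nullptr` instead of the device stream to `memory::Copy` in the NCCL reduce test.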

上级 3cd99f4b
...@@ -59,8 +59,7 @@ class FetchOp : public framework::OperatorBase { ...@@ -59,8 +59,7 @@ class FetchOp : public framework::OperatorBase {
// CPU outputs? // CPU outputs?
auto &dev_ctx = *pool.Get(src_item.place()); auto &dev_ctx = *pool.Get(src_item.place());
TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item); TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item, true);
dev_ctx.Wait();
dst_item.set_lod(src_item.lod()); dst_item.set_lod(src_item.lod());
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
......
...@@ -72,8 +72,8 @@ void testConcat() { ...@@ -72,8 +72,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a); TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
TensorCopy(input_b_cpu, Place(), *context, &input_b); TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
} }
std::vector<Tensor> input; std::vector<Tensor> input;
...@@ -89,7 +89,7 @@ void testConcat() { ...@@ -89,7 +89,7 @@ void testConcat() {
int* out_ptr; int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu); TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -144,8 +144,8 @@ void testConcat() { ...@@ -144,8 +144,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a); TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
TensorCopy(input_b_cpu, Place(), *context, &input_b); TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
} }
input.clear(); input.clear();
...@@ -159,7 +159,7 @@ void testConcat() { ...@@ -159,7 +159,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu); TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -216,8 +216,8 @@ void testConcat() { ...@@ -216,8 +216,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a); TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
TensorCopy(input_b_cpu, Place(), *context, &input_b); TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
} }
input.clear(); input.clear();
...@@ -231,7 +231,7 @@ void testConcat() { ...@@ -231,7 +231,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu); TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
...@@ -290,8 +290,8 @@ void testConcat() { ...@@ -290,8 +290,8 @@ void testConcat() {
} }
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a); TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
TensorCopy(input_b_cpu, Place(), *context, &input_b); TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
} }
input.clear(); input.clear();
...@@ -305,7 +305,7 @@ void testConcat() { ...@@ -305,7 +305,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) { if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu); TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>(); out_ptr = out_cpu.data<int>();
} else { } else {
out_ptr = out.data<int>(); out_ptr = out.data<int>();
......
...@@ -62,7 +62,7 @@ void testIm2col() { ...@@ -62,7 +62,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
TensorCopy(input_tmp, *place, *context, &input); TensorCopy(input_tmp, *place, *context, &input, true);
} }
output_cfo.mutable_data<float>( output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place); {1, filter_size, filter_size, output_height, output_width}, *place);
...@@ -87,7 +87,8 @@ void testIm2col() { ...@@ -87,7 +87,8 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>(); out_cfo_ptr = output_cfo.data<float>();
} else { } else {
TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
out_cfo_ptr = output_tmp.data<float>(); out_cfo_ptr = output_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
...@@ -98,7 +99,8 @@ void testIm2col() { ...@@ -98,7 +99,8 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>(); out_ocf_ptr = output_ocf.data<float>();
} else { } else {
TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
out_ocf_ptr = output_tmp.data<float>(); out_ocf_ptr = output_tmp.data<float>();
} }
...@@ -119,7 +121,7 @@ void testIm2col() { ...@@ -119,7 +121,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
TensorCopy(input_tmp, *place, *context, &input); TensorCopy(input_tmp, *place, *context, &input, true);
} }
col2im(*context, output_cfo, dilation, stride, padding, &input); col2im(*context, output_cfo, dilation, stride, padding, &input);
...@@ -128,7 +130,7 @@ void testIm2col() { ...@@ -128,7 +130,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
} else { } else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>(); in_ptr = input_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
...@@ -140,7 +142,7 @@ void testIm2col() { ...@@ -140,7 +142,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
TensorCopy(input_tmp, *place, *context, &input); TensorCopy(input_tmp, *place, *context, &input, true);
} }
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
...@@ -148,7 +150,7 @@ void testIm2col() { ...@@ -148,7 +150,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
} else { } else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>(); in_ptr = input_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
......
...@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) { ...@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu); TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float>({2, 2}, gpu_place); out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float>( paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out, true);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) { ...@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu); TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float16>({2, 2}, gpu_place); out_gpu.mutable_data<float16>({2, 2}, gpu_place);
...@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) { ...@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0)); float16(0));
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out, true);
float16* out_ptr = out.data<float16>(); float16* out_ptr = out.data<float16>();
context.Wait(); context.Wait();
...@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) { ...@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu); TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float>({3, 3}, gpu_place); out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out, true);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) { ...@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input1, gpu_place, context, &input2_gpu); TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float16>({3, 3}, gpu_place); out_gpu.mutable_data<float16>({3, 3}, gpu_place);
...@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) { ...@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0)); float16(0));
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out, true);
float16* out_ptr = out.data<float16>(); float16* out_ptr = out.data<float16>();
context.Wait(); context.Wait();
...@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu); TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu); TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
...@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) { ...@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3, true);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place); float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu); TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu); TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float16* a = input1_gpu.data<float16>(); float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>(); float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); float16* c = input3_gpu.mutable_data<float16>(gpu_place);
...@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4); c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3, true);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu); TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu); TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
...@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { ...@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3, true);
context.Wait(); context.Wait();
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
...@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place); float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu); TensorCopy(input1, gpu_place, context, &input1_gpu, true);
TensorCopy(input2, gpu_place, context, &input2_gpu); TensorCopy(input2, gpu_place, context, &input2_gpu, true);
TensorCopy(input3, gpu_place, context, &input3_gpu); TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float16* a = input1_gpu.data<float16>(); float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>(); float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place); float16* c = input3_gpu.mutable_data<float16>(gpu_place);
...@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4); c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3, true);
context.Wait(); context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0); EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
...@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) { ...@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i); data_b[i] = static_cast<T>(i);
} }
TensorCopy(mat_a, gpu_place, context, &g_mat_a); TensorCopy(mat_a, gpu_place, context, &g_mat_a, true);
TensorCopy(vec_b, gpu_place, context, &g_vec_b); TensorCopy(vec_b, gpu_place, context, &g_vec_b, true);
paddle::operators::math::gemv<CUDADeviceContext, T>( paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a, context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c); g_data_b, 0., g_data_c);
TensorCopy(g_vec_c, cpu_place, context, &vec_c); TensorCopy(g_vec_c, cpu_place, context, &vec_c, true);
if (!trans) { if (!trans) {
for (int i = 0; i < m; ++i) { for (int i = 0; i < m; ++i) {
......
...@@ -71,7 +71,7 @@ void testVol2col() { ...@@ -71,7 +71,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
paddle::framework::TensorCopy(input_tmp, *place, *context, &input); paddle::framework::TensorCopy(input_tmp, *place, *context, &input, true);
} }
output.mutable_data<float>({1, filter_size, filter_size, filter_size, output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width}, output_depth, output_height, output_width},
...@@ -85,7 +85,8 @@ void testVol2col() { ...@@ -85,7 +85,8 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>(); out_cfo_ptr = output.data<float>();
} else { } else {
TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp); TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp,
true);
out_cfo_ptr = output_tmp.data<float>(); out_cfo_ptr = output_tmp.data<float>();
} }
...@@ -99,7 +100,7 @@ void testVol2col() { ...@@ -99,7 +100,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
TensorCopy(input_tmp, *place, *context, &input); TensorCopy(input_tmp, *place, *context, &input, true);
} }
paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol; paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
...@@ -109,7 +110,7 @@ void testVol2col() { ...@@ -109,7 +110,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
} else { } else {
TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp); TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>(); in_ptr = input_tmp.data<float>();
} }
......
...@@ -228,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -228,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
result_tensor->Resize(kDims); result_tensor->Resize(kDims);
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt, recv_tensor.numel() * sizeof(float), nullptr);
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) { for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], expected_result, 1e-5); ASSERT_NEAR(ct[j], expected_result, 1e-5);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册