Commit bcf260e1 authored by: F fengjiayi

fix several unit tests

Parent 3cd99f4b
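The recurring fix: these tests copy a tensor across devices and read the destination right away, so each such TensorCopy call now passes a trailing true to request a synchronous copy (the fetch_op hunk additionally drops the explicit dev_ctx.Wait() that the flag replaces). Below is a minimal sketch of the pattern, not code from this commit; it assumes the TensorCopy overload at this revision takes the sync flag as its last parameter, and the include paths are assumptions (they may sit under paddle/fluid/ depending on the tree).

// Minimal sketch (not part of this commit) of the synchronous-copy pattern.
// Assumed include paths; adjust if the tree uses paddle/fluid/... instead.
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"

// Copies a GPU tensor to the host and returns a pointer that is safe to read.
const float* FetchToHost(const paddle::framework::Tensor& gpu_tensor,
                         const paddle::platform::DeviceContext& dev_ctx,
                         paddle::framework::Tensor* cpu_tensor) {
  // Before this commit the tests did roughly:
  //   TensorCopy(gpu_tensor, platform::CPUPlace(), dev_ctx, cpu_tensor);
  //   dev_ctx.Wait();  // easy to forget, and missing in several tests
  //
  // After: the trailing `true` asks TensorCopy to block until the copy has
  // finished, so the host-side data is valid as soon as the call returns.
  paddle::framework::TensorCopy(gpu_tensor, paddle::platform::CPUPlace(),
                                dev_ctx, cpu_tensor, true);
  return cpu_tensor->data<float>();
}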
......
@@ -59,8 +59,7 @@ class FetchOp : public framework::OperatorBase {
// CPU outputs?
auto &dev_ctx = *pool.Get(src_item.place());
- TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
- dev_ctx.Wait();
+ TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item, true);
dst_item.set_lod(src_item.lod());
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
......
......
@@ -72,8 +72,8 @@ void testConcat() {
}
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(input_a_cpu, Place(), *context, &input_a);
- TensorCopy(input_b_cpu, Place(), *context, &input_b);
+ TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
+ TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
}
std::vector<Tensor> input;
......
@@ -89,7 +89,7 @@ void testConcat() {
int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(out, CPUPlace(), *context, &out_cpu);
+ TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
......
@@ -144,8 +144,8 @@ void testConcat() {
}
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(input_a_cpu, Place(), *context, &input_a);
- TensorCopy(input_b_cpu, Place(), *context, &input_b);
+ TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
+ TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
}
input.clear();
......
@@ -159,7 +159,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(out, CPUPlace(), *context, &out_cpu);
+ TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
......
@@ -216,8 +216,8 @@ void testConcat() {
}
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(input_a_cpu, Place(), *context, &input_a);
- TensorCopy(input_b_cpu, Place(), *context, &input_b);
+ TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
+ TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
}
input.clear();
......
@@ -231,7 +231,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(out, CPUPlace(), *context, &out_cpu);
+ TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
......
@@ -290,8 +290,8 @@ void testConcat() {
}
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(input_a_cpu, Place(), *context, &input_a);
- TensorCopy(input_b_cpu, Place(), *context, &input_b);
+ TensorCopy(input_a_cpu, Place(), *context, &input_a, true);
+ TensorCopy(input_b_cpu, Place(), *context, &input_b, true);
}
input.clear();
......
@@ -305,7 +305,7 @@ void testConcat() {
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
- TensorCopy(out, CPUPlace(), *context, &out_cpu);
+ TensorCopy(out, CPUPlace(), *context, &out_cpu, true);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
......
......
@@ -62,7 +62,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
- TensorCopy(input_tmp, *place, *context, &input);
+ TensorCopy(input_tmp, *place, *context, &input, true);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
......
@@ -87,7 +87,8 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
- TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
+ TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp,
+ true);
out_cfo_ptr = output_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......
@@ -98,7 +99,8 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
- TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
+ TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp,
+ true);
out_ocf_ptr = output_tmp.data<float>();
}
......
@@ -119,7 +121,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
- TensorCopy(input_tmp, *place, *context, &input);
+ TensorCopy(input_tmp, *place, *context, &input, true);
}
col2im(*context, output_cfo, dilation, stride, padding, &input);
......
@@ -128,7 +130,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
- TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+ TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......
@@ -140,7 +142,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
- TensorCopy(input_tmp, *place, *context, &input);
+ TensorCopy(input_tmp, *place, *context, &input, true);
}
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
......
@@ -148,7 +150,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
- TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+ TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
......
......
@@ -40,15 +40,15 @@ TEST(math_function, notrans_mul_trans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input1, gpu_place, context, &input2_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
- TensorCopy(out_gpu, cpu_place, context, &out);
+ TensorCopy(out_gpu, cpu_place, context, &out, true);
float* out_ptr = out.data<float>();
context.Wait();
......
@@ -80,8 +80,8 @@ TEST(math_function, notrans_mul_trans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input1, gpu_place, context, &input2_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float16>({2, 2}, gpu_place);
......
@@ -89,7 +89,7 @@ TEST(math_function, notrans_mul_trans_fp16) {
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0));
- TensorCopy(out_gpu, cpu_place, context, &out);
+ TensorCopy(out_gpu, cpu_place, context, &out, true);
float16* out_ptr = out.data<float16>();
context.Wait();
......
@@ -117,15 +117,15 @@ TEST(math_function, trans_mul_notrans_fp32) {
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input1, gpu_place, context, &input2_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
- TensorCopy(out_gpu, cpu_place, context, &out);
+ TensorCopy(out_gpu, cpu_place, context, &out, true);
float* out_ptr = out.data<float>();
context.Wait();
......
@@ -162,8 +162,8 @@ TEST(math_function, trans_mul_notrans_fp16) {
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input1, gpu_place, context, &input2_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input1, gpu_place, context, &input2_gpu, true);
out_gpu.mutable_data<float16>({3, 3}, gpu_place);
......
@@ -171,7 +171,7 @@ TEST(math_function, trans_mul_notrans_fp16) {
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0));
- TensorCopy(out_gpu, cpu_place, context, &out);
+ TensorCopy(out_gpu, cpu_place, context, &out, true);
float16* out_ptr = out.data<float16>();
context.Wait();
......
@@ -214,9 +214,9 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input2, gpu_place, context, &input2_gpu);
- TensorCopy(input3, gpu_place, context, &input3_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input2, gpu_place, context, &input2_gpu, true);
+ TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
......
@@ -224,7 +224,7 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
- TensorCopy(input3_gpu, cpu_place, context, &input3);
+ TensorCopy(input3_gpu, cpu_place, context, &input3, true);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......
@@ -274,9 +274,9 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input2, gpu_place, context, &input2_gpu);
- TensorCopy(input3, gpu_place, context, &input3_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input2, gpu_place, context, &input2_gpu, true);
+ TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
......
@@ -285,7 +285,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4);
- TensorCopy(input3_gpu, cpu_place, context, &input3);
+ TensorCopy(input3_gpu, cpu_place, context, &input3, true);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......
@@ -332,9 +332,9 @@ TEST(math_function, gemm_trans_cublas_fp32) {
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input2, gpu_place, context, &input2_gpu);
- TensorCopy(input3, gpu_place, context, &input3_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input2, gpu_place, context, &input2_gpu, true);
+ TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(gpu_place);
......
@@ -342,7 +342,7 @@ TEST(math_function, gemm_trans_cublas_fp32) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
- TensorCopy(input3_gpu, cpu_place, context, &input3);
+ TensorCopy(input3_gpu, cpu_place, context, &input3, true);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);
......
@@ -386,9 +386,9 @@ TEST(math_function, gemm_trans_cublas_fp16) {
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
- TensorCopy(input1, gpu_place, context, &input1_gpu);
- TensorCopy(input2, gpu_place, context, &input2_gpu);
- TensorCopy(input3, gpu_place, context, &input3_gpu);
+ TensorCopy(input1, gpu_place, context, &input1_gpu, true);
+ TensorCopy(input2, gpu_place, context, &input2_gpu, true);
+ TensorCopy(input3, gpu_place, context, &input3_gpu, true);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
......
@@ -397,7 +397,7 @@ TEST(math_function, gemm_trans_cublas_fp16) {
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4);
- TensorCopy(input3_gpu, cpu_place, context, &input3);
+ TensorCopy(input3_gpu, cpu_place, context, &input3, true);
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
......
@@ -441,14 +441,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i);
}
- TensorCopy(mat_a, gpu_place, context, &g_mat_a);
- TensorCopy(vec_b, gpu_place, context, &g_vec_b);
+ TensorCopy(mat_a, gpu_place, context, &g_mat_a, true);
+ TensorCopy(vec_b, gpu_place, context, &g_vec_b, true);
paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c);
- TensorCopy(g_vec_c, cpu_place, context, &vec_c);
+ TensorCopy(g_vec_c, cpu_place, context, &vec_c, true);
if (!trans) {
for (int i = 0; i < m; ++i) {
......
......
@@ -71,7 +71,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
- paddle::framework::TensorCopy(input_tmp, *place, *context, &input);
+ paddle::framework::TensorCopy(input_tmp, *place, *context, &input, true);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
......
@@ -85,7 +85,8 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
- TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
+ TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp,
+ true);
out_cfo_ptr = output_tmp.data<float>();
}
......
@@ -99,7 +100,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
- TensorCopy(input_tmp, *place, *context, &input);
+ TensorCopy(input_tmp, *place, *context, &input, true);
}
paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
......
@@ -109,7 +110,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
- TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+ TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp, true);
in_ptr = input_tmp.data<float>();
}
......
......
@@ -228,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
result_tensor->Resize(kDims);
auto *ct = result_tensor->mutable_data<float>(cpu_place);
- paddle::memory::Copy(
- cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
- recv_tensor.numel() * sizeof(float),
- static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
+ paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
+ recv_tensor.numel() * sizeof(float), nullptr);
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], expected_result, 1e-5);
......