From 910f377fa52d96531a2fd85a40020946036e6d6b Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Wed, 10 Mar 2021 11:11:23 +0800
Subject: [PATCH] Bugfix rocm (#31490)

* bugfix for test_cholesky_op

* bugfix for test_compare_op

* bugfix for lookup_table_op

* bugfix for affine_channel_op
---
 paddle/fluid/operators/affine_channel_op.cu   |  8 ++++++
 paddle/fluid/operators/lookup_table_op.cu     | 28 ++++++++++++++++++-
 .../fluid/tests/unittests/test_cholesky_op.py |  9 ++++--
 .../fluid/tests/unittests/test_compare_op.py  |  3 ++
 4 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index cddc288c24c..5fa1e18553b 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -71,7 +71,11 @@ class AffineChannelCUDAKernel : public framework::OpKernel<T> {
     const T* bias_d = bias->data<T>();
     T* y_d = y->data<T>();
 
+#ifdef PADDLE_WITH_HIP
+    int block = 256;
+#else
     int block = 1024;
+#endif  // PADDLE_WITH_HIP
     int grid = (num + block - 1) / block;
 
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
@@ -153,7 +157,11 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
     T* ds_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
     T* db_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
 
+#ifdef PADDLE_WITH_HIP
+    const int block = 256;
+#else
     const int block = 1024;
+#endif  // PADDLE_WITH_HIP
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     int grid1 = (num + block - 1) / block;
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 3e06e5caed3..6985b916757 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -105,9 +105,24 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     auto *table = table_t->data<T>();
     auto *output = output_t->mutable_data<T>(context.GetPlace());
 
+#ifdef PADDLE_WITH_HIP
+    dim3 threads(64, 4);
+#else
     dim3 threads(128, 8);
+#endif  // PADDLE_WITH_HIP
     dim3 grids(8, 1);
-
+#ifdef PADDLE_WITH_HIP
+    if (padding_idx == -1)
+      LookupTable<
+          T, 64, 4, 8,
+          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+    else
+      LookupTable<
+          T, 64, 4, 8,
+          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+#else
     if (padding_idx == -1)
       LookupTable<
           T, 128, 8, 8,
@@ -118,6 +133,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
           T, 128, 8, 8,
           true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
           output, table, ids, N, K, D, padding_idx);
+#endif  // PADDLE_WITH_HIP
   }
 };
 
@@ -185,10 +201,20 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       auto t = framework::EigenVector<T>::Flatten(*d_table_t);
       t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
 
+#ifdef PADDLE_WITH_HIP
+      dim3 threads(64, 4);
+#else
       dim3 threads(128, 8);
+#endif  // PADDLE_WITH_HIP
       dim3 grids(8, 1);
+
+#ifdef PADDLE_WITH_HIP
+      LookupTableGrad<T, 64, 4, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
+          d_table, d_output, ids, N, K, D);
+#else
       LookupTableGrad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
           d_table, d_output, ids, N, K, D);
+#endif  // PADDLE_WITH_HIP
     }
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
index 93f62b20f29..633aa2cd613 100644
--- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py
@@ -58,7 +58,7 @@ class TestCholeskyOp(OpTest):
 
     def test_check_grad(self):
         places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda() and (not core.is_compiled_with_rocm()):
             places.append(fluid.CUDAPlace(0))
         for p in places:
             self.func(p)
@@ -92,7 +92,10 @@ class TestCholeskyOp2D(TestCholeskyOp):
 
 class TestDygraph(unittest.TestCase):
     def test_dygraph(self):
-        paddle.disable_static()
+        if core.is_compiled_with_rocm():
+            paddle.disable_static(place=fluid.CPUPlace())
+        else:
+            paddle.disable_static()
         a = np.random.rand(3, 3)
         a_t = np.transpose(a, [1, 0])
         x_data = np.matmul(a, a_t) + 1e-03
@@ -103,7 +106,7 @@ class TestDygraph(unittest.TestCase):
 class TestCholeskySingularAPI(unittest.TestCase):
     def setUp(self):
         self.places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda() and (not core.is_compiled_with_rocm()):
             self.places.append(fluid.CUDAPlace(0))
 
     def check_static_result(self, place, with_out=False):
diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py
index 63a43432b4e..fbf7384b86b 100644
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
@@ -61,6 +61,9 @@ def create_test_class(op_type, typename, callback):
 
 
 for _type_name in {'float32', 'float64', 'int32', 'int64'}:
+    if _type_name == 'float64' and core.is_compiled_with_rocm():
+        _type_name = 'float32'
+
     create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
     create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
     create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
-- 
GitLab