Unverified · Commit 420527f0 authored by ronnywang, committed by GitHub

[ROCM] fix layer_norm, norm, p_norm, test_sequence_softmax_op, test_math_op_patch_var_base (#31709)

Parent 87852616
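The kernel-side hunks below all make the same change: when the file is compiled by the HIP compiler for ROCm (`__HIPCC__` defined), the maximum thread-block size used by these kernels is capped at 256 instead of the 512 used under CUDA. A minimal sketch of the pattern with a toy kernel and launch (only the `__HIPCC__` guard and the 256/512 cap come from the diff; the kernel and launcher names are illustrative, not Paddle's):

```cpp
// Compiles as CUDA with nvcc, or as HIP with hipcc (which defines __HIPCC__).
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
constexpr int kMaxBlockDim = 256;  // ROCm cap introduced by this commit
#else
#include <cuda_runtime.h>
constexpr int kMaxBlockDim = 512;  // unchanged CUDA default
#endif

// Toy row-wise kernel: one block per row, threads stride over the row,
// so any block size up to the cap produces the same result.
__global__ void ScaleRows(const float* x, float* y, int row_size) {
  const float* row_in = x + blockIdx.x * row_size;
  float* row_out = y + blockIdx.x * row_size;
  for (int i = threadIdx.x; i < row_size; i += blockDim.x) {
    row_out[i] = 2.0f * row_in[i];
  }
}

void LaunchScaleRows(const float* x, float* y, int rows, int row_size) {
  // Never request more threads per block than the backend-dependent cap.
  int block = row_size < kMaxBlockDim ? row_size : kMaxBlockDim;
  ScaleRows<<<rows, block>>>(x, y, row_size);
}
```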
@@ -43,7 +43,11 @@ template <typename T>
using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
inline static int GetDesiredBlockDim(int block_dim) {
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
  const int kMaxBlockDim = 512;
+#endif
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << (static_cast<int>(std::log2f(block_dim))));
@@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale,
                              const framework::ExecutionContext &ctx) {
  auto &dev_ctx = ctx.cuda_device_context();
  auto stream = dev_ctx.stream();
+#ifdef __HIPCC__
+  const int kMaxBlockDim = 256;
+#else
  const int kMaxBlockDim = 512;
+#endif
  const int kMaxBlockNum = 128;
  int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
                      ((d_scale != nullptr ? 1 : 0) << 1) |
......
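For context, the `gradient_flag` visible at the end of that hunk packs which gradients were requested into a small bitmask so the backward pass can dispatch once on a single integer. A standalone sketch of that packing (the third operand is presumably `d_bias` in the actual kernel; it is not shown in the hunk above):

```cpp
#include <cstdio>

// Pack the presence of each requested gradient into one 3-bit flag,
// mirroring the gradient_flag computation in LayerNormBackward above.
int PackGradientFlag(const float* d_x, const float* d_scale,
                     const float* d_bias) {
  return ((d_x != nullptr ? 1 : 0) << 2) |
         ((d_scale != nullptr ? 1 : 0) << 1) |
         ((d_bias != nullptr ? 1 : 0) << 0);
}

int main() {
  float dx[1], dscale[1];
  // d_x and d_scale requested, d_bias not: flag = 0b110 = 6.
  std::printf("flag = %d\n", PackGradientFlag(dx, dscale, nullptr));
  return 0;
}
```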
@@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel<T> {
    GetDims(xdim, axis, &pre, &n, &post);
    auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
    const int block = 512;
+#endif
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);
@@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel<T> {
    auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
    const int block = 512;
+#endif
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);
......
@@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
    auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
    const int block = 512;
+#endif
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);
@@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
    auto& dev_ctx = ctx.cuda_device_context();
+#ifdef __HIPCC__
+    const int block = 256;
+#else
    const int block = 512;
+#endif
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);
......
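In the norm and p_norm kernels above, the block size also feeds the grid bound: `max_blocks` is the device's maximum resident thread count divided by `block`, then clamped by the available parallel work `pre * post`, so lowering `block` to 256 under ROCm also raises the block budget. A small host-side sketch of that arithmetic with made-up numbers (`max_threads` stands in for `dev_ctx.GetMaxPhysicalThreadCount()`):

```cpp
#include <algorithm>
#include <cstdio>

// Reproduce the launch-shape arithmetic from the norm/p_norm kernels.
int ComputeGrid(int max_threads, int pre, int post) {
#ifdef __HIPCC__
  const int block = 256;  // ROCm cap
#else
  const int block = 512;  // CUDA default
#endif
  const int max_blocks = std::max(max_threads / block, 1);
  return std::min(max_blocks, pre * post);
}

int main() {
  // Example: a device reporting 81920 resident threads, reducing a
  // 64 x 128 (pre x post) problem -> grid limited by max_blocks.
  std::printf("grid = %d\n", ComputeGrid(81920, 64, 128));
  return 0;
}
```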
@@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest):
        self.op_type = "sequence_softmax"
        self.use_cudnn = False
        self.init_op_type()
+        self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
-        x = np.random.uniform(0.1, 1, (110, 1)).astype("float64")
+        x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype)
        self.init_lod()
-        out = np.zeros((110, 1)).astype("float64")
+        out = np.zeros((110, 1)).astype(self.dtype)
        offset = 0
        for i in range(len(self.lod[0])):
            if (self.lod[0][i] == 0):
......
@@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
                              [1.30058, 1.0688717, 1.4928783],
                              [1.0958099, 1.3724753, 1.8926544]])
        d = d.matmul(d.t())
-        self.assertTrue(
-            np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
+        # ROCM not support cholesky
+        if not fluid.core.is_compiled_with_rocm():
+            self.assertTrue(
+                np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
+                )))
        self.assertTrue(
            np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
......