Unverified commit 420527f0, authored by ronnywang, committed by GitHub

[ROCM] fix layer_norm, norm, p_norm, test_sequence_softmax_op, test_math_op_patch_var_base (#31709)
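All of the kernel-side changes below apply one pattern: when the file is built with HIP (`__HIPCC__` defined), the maximum threads-per-block constant drops from 512 to 256. A minimal standalone sketch of that pattern follows; the helper names are illustrative only and do not appear in the Paddle sources, which inline the `#ifdef` at each use site.

```cpp
// Sketch of the ROCm/CUDA split used throughout this commit: a smaller
// per-block thread cap is chosen when building with HIP.
#include <algorithm>

inline int MaxBlockDim() {
#ifdef __HIPCC__
  return 256;  // ROCm build: cap thread blocks at 256 threads
#else
  return 512;  // CUDA build: keep the original 512-thread cap
#endif
}

// Hypothetical convenience wrapper for illustration.
inline int ClampBlockDim(int requested) {
  return std::min(requested, MaxBlockDim());
}
```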

Parent 87852616
......@@ -43,7 +43,11 @@ template <typename T>
using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
inline static int GetDesiredBlockDim(int block_dim) {
#ifdef __HIPCC__
const int kMaxBlockDim = 256;
#else
const int kMaxBlockDim = 512;
#endif
return block_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << (static_cast<int>(std::log2f(block_dim))));
......@@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale,
const framework::ExecutionContext &ctx) {
auto &dev_ctx = ctx.cuda_device_context();
auto stream = dev_ctx.stream();
#ifdef __HIPCC__
const int kMaxBlockDim = 256;
#else
const int kMaxBlockDim = 512;
#endif
const int kMaxBlockNum = 128;
int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) |
((d_scale != nullptr ? 1 : 0) << 1) |
......
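For context, `GetDesiredBlockDim` in this file caps the requested size at `kMaxBlockDim` (now 256 under `__HIPCC__`) and otherwise rounds it down to a power of two. A self-contained sketch of that rounding, with made-up sample values:

```cpp
// Sketch of the rounding done by GetDesiredBlockDim: a request at or above
// the cap collapses to the cap; anything smaller rounds down to a power of two.
#include <cmath>
#include <cstdio>

static int DesiredBlockDim(int block_dim, int k_max_block_dim) {
  return block_dim >= k_max_block_dim
             ? k_max_block_dim
             : (1 << static_cast<int>(std::log2f(block_dim)));
}

int main() {
  // With the ROCm cap of 256: 300 -> 256, 200 -> 128, 64 -> 64.
  std::printf("%d %d %d\n", DesiredBlockDim(300, 256),
              DesiredBlockDim(200, 256), DesiredBlockDim(64, 256));
  return 0;
}
```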
......@@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel<T> {
GetDims(xdim, axis, &pre, &n, &post);
auto& dev_ctx = ctx.cuda_device_context();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post);
......@@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.cuda_device_context();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post);
......
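The launch-size arithmetic in `NormCUDAKernel` and `NormGradCUDAKernel` is unchanged apart from the smaller `block`: at most `max_threads / block` blocks are launched, never fewer than one, and never more than the `pre * post` slices to process. A standalone sketch of that arithmetic; the sample numbers are made up, only the variable names follow the diff.

```cpp
// Sketch of the launch-size arithmetic shared by the norm kernels above.
#include <algorithm>
#include <cstdio>

int main() {
  const int max_threads = 2048 * 30;  // hypothetical GetMaxPhysicalThreadCount()
  const int pre = 16, post = 128;     // hypothetical reduction partitioning
#ifdef __HIPCC__
  const int block = 256;
#else
  const int block = 512;
#endif
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(max_blocks, pre * post);
  std::printf("block=%d grid=%d\n", block, grid);
  return 0;
}
```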
......@@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.cuda_device_context();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post);
......@@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.cuda_device_context();
#ifdef __HIPCC__
const int block = 256;
#else
const int block = 512;
#endif
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid = std::min(max_blocks, pre * post);
......
......@@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest):
self.op_type = "sequence_softmax"
self.use_cudnn = False
self.init_op_type()
x = np.random.uniform(0.1, 1, (110, 1)).astype("float64")
self.dtype = "float32" if core.is_compiled_with_rocm() else "float64"
x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype)
self.init_lod()
out = np.zeros((110, 1)).astype("float64")
out = np.zeros((110, 1)).astype(self.dtype)
offset = 0
for i in range(len(self.lod[0])):
if (self.lod[0][i] == 0):
......
......@@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
[1.30058, 1.0688717, 1.4928783],
[1.0958099, 1.3724753, 1.8926544]])
d = d.matmul(d.t())
# ROCM not support cholesky
if not fluid.core.is_compiled_with_rocm():
self.assertTrue(
np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy(
)))
self.assertTrue(
np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
......