Unverified commit 3d5aa9d1, authored by Qi Li, committed by GitHub

[ROCM] fix conv2d and conv3d op, test=develop (#31553)

Parent f302bb4f
@@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     args.handle = handle;
 #ifdef PADDLE_WITH_HIP
+    // MIOPEN need to set groups in cdesc in miopen_desc.h
     args.cdesc.set(dtype, padding_common, strides, dilations,
                    platform::AllowTF32Cudnn(), groups);
 #else
@@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(),
                                                          groups));
     groups = 1;
+#endif
+#ifdef PADDLE_WITH_HIP
+    // MIOPEN do not set groups in wdesc after set groups in cdesc
+    groups = 1;
 #endif
     args.idesc.set(transformed_input, layout_format);
     args.wdesc.set(transformed_filter_channel, layout_format, groups);
@@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     miopenConvFwdAlgorithm_t algo{};
     using search = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+    workspace_size = search::GetWorkspaceSize(args);
+    algo = search::Find<T>(args, exhaustive_search, false, workspace_size, ctx);
 #else
     cudnnConvolutionFwdAlgo_t algo{};
     using search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, exhaustive_search, false, ctx);
     workspace_size = search::GetWorkspaceSize(args, algo);
+#endif
 #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
     // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\
@@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo = search1::Find<T>(args1, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -673,13 +684,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo = search2::Find<T>(args2, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -688,23 +703,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
     if (input_grad) {
       // When beta is 0, it is unnecessary to reset input_grad.
       // When beta is 1, the output cannot be reset since addt strategy used.
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardData(
-                    handle, &alpha, args1.odesc.desc(),
-                    output_grad_data + i * group_offset_out,
-                    args1.wdesc.desc(), filter_data + i * group_offset_filter,
-                    args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(),
-                    transformed_input_grad_data + i * group_offset_in,
-                    cudnn_workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args1.odesc.desc(), output_grad_data,
+                    args1.wdesc.desc(), filter_data, args1.cdesc.desc(),
+                    data_algo, &beta, args1.idesc.desc(),
+                    transformed_input_grad_data, cudnn_workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -717,9 +731,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     transformed_input_grad_data + i * group_offset_in));
           },
           workspace_size);
-#endif
       }
+#endif
     if (!is_sys_pad) {
       std::vector<int> starts(transformed_input_channel.dims().size(), 0);
       std::vector<int> axes(transformed_input_channel.dims().size(), 0);
@@ -751,23 +764,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       // Because beta is zero, it is unnecessary to reset filter_grad.
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args2.odesc.desc(),
-                    output_grad_data + i * group_offset_out,
-                    args2.idesc.desc(), input_data + i * group_offset_in,
-                    args2.cdesc.desc(), filter_algo, &beta,
-                    args2.wdesc.desc(),
-                    filter_grad_data + i * group_offset_filter,
-                    cudnn_workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args2.odesc.desc(), output_grad_data,
+                    args2.idesc.desc(), input_data, args2.cdesc.desc(),
+                    filter_algo, &beta, args2.wdesc.desc(), filter_grad_data,
+                    cudnn_workspace_ptr, workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* cudnn_workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -780,8 +790,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                     filter_grad_data + i * group_offset_filter));
           },
           workspace_size);
-#endif
       }
+#endif
     if (compute_format == DataLayout::kNHWC) {
       TransToChannelFirst<paddle::platform::CUDADeviceContext, T>(
@@ -1080,32 +1090,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1);
+#endif
     }
     if (ddW) {
       ddw = ddW->data<T>();
       args2.handle = handle;
       args2.idesc.set(transformed_X, iwo_group);
       args2.wdesc.set(*ddW, layout, iwo_group);
       args2.odesc.set(transformed_ddO_channel, iwo_group);
       args2.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false,
+                                   workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, fwd_algo2));
+#endif
     }
   }
@@ -1114,21 +1129,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.handle = handle;
       args3.idesc.set(transformed_ddX, iwo_group);
       args3.wdesc.set(*dW, layout, iwo_group);
       args3.odesc.set(transformed_dO_channel, iwo_group);
       args3.cdesc.set(dtype, padding_common, strides, dilations,
                       platform::AllowTF32Cudnn(), c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo = search3::Find<T>(args3, exhaustive_search, deterministic,
+                                     workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo =
           search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -1143,13 +1160,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo = search4::Find<T>(args4, exhaustive_search, deterministic,
+                                   workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       data_algo =
           search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
@@ -1176,21 +1197,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     if (ddO) {
       if (ddX) {
         ddx = transformed_ddX.data<T>();
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
                   platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args1.idesc.desc(),
-                      ddx + i * group_offset_in, args1.wdesc.desc(),
-                      w + i * group_offset_filter, args1.cdesc.desc(),
-                      fwd_algo1, &beta, args1.odesc.desc(),
-                      transformed_ddy_channel + i * group_offset_out,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+                      handle, &alpha, args1.idesc.desc(), ddx,
+                      args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1,
+                      &beta, args1.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1203,26 +1222,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                       transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
-#endif
         }
+#endif
       }
       if (ddW) {
-        for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
         // MIOPEN ONLY support beta to be 0.0f
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
                   platform::dynload::miopenConvolutionForward(
-                      handle, &alpha, args2.idesc.desc(),
-                      x + i * group_offset_in, args2.wdesc.desc(),
-                      ddw + i * group_offset_filter, args2.cdesc.desc(),
-                      fwd_algo2, &beta, args2.odesc.desc(),
-                      transformed_ddy_channel + i * group_offset_out,
-                      workspace_ptr, workspace_size));
-            },
-            workspace_size);
+                      handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(),
+                      ddw, args2.cdesc.desc(), fwd_algo2, &beta,
+                      args2.odesc.desc(), transformed_ddy_channel,
+                      workspace_ptr, workspace_size));
+            },
+            workspace_size);
 #else
+        for (int i = 0; i < groups; i++) {
         wkspace_handle.RunFunc(
             [&](void* workspace_ptr) {
               PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1235,8 +1252,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                      transformed_ddy_channel + i * group_offset_out));
             },
             workspace_size);
-#endif
         }
+#endif
       }
     if (channel_last) {
       TransToChannelLast<paddle::platform::CUDADeviceContext, T>(
@@ -1246,21 +1263,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     T* transformed_dy_channel = transformed_dO_channel.data<T>();
     if (dW && ddX) {
       ddx = transformed_ddX.data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardWeights(
-                    handle, &alpha, args3.odesc.desc(),
-                    transformed_dy_channel + i * group_offset_out,
-                    args3.idesc.desc(), ddx + i * group_offset_in,
-                    args3.cdesc.desc(), filter_algo, &beta,
-                    args3.wdesc.desc(), dw + i * group_offset_filter,
-                    workspace_ptr, workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args3.odesc.desc(), transformed_dy_channel,
+                    args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo,
+                    &beta, args3.wdesc.desc(), dw, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1273,27 +1288,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                     dw + i * group_offset_filter));
           },
           workspace_size);
-#endif
       }
+#endif
     }
     if (dX && ddW) {
       ddw = ddW->data<T>();
-      for (int i = 0; i < groups; i++) {
 #ifdef PADDLE_WITH_HIP
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
                 platform::dynload::miopenConvolutionBackwardData(
-                    handle, &alpha, args4.odesc.desc(),
-                    transformed_dy_channel + i * group_offset_out,
-                    args4.wdesc.desc(), ddw + i * group_offset_filter,
-                    args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(),
-                    transformed_dx + i * group_offset_in, workspace_ptr,
-                    workspace_size));
-          },
-          workspace_size);
+                    handle, &alpha, args4.odesc.desc(), transformed_dy_channel,
+                    args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo,
+                    &beta, args4.idesc.desc(), transformed_dx, workspace_ptr,
+                    workspace_size));
+          },
+          workspace_size);
 #else
+      for (int i = 0; i < groups; i++) {
       wkspace_handle.RunFunc(
           [&](void* workspace_ptr) {
             PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -1306,8 +1319,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
                     transformed_dx + i * group_offset_in));
           },
           workspace_size);
-#endif
       }
+#endif
     if (!is_sys_pad) {
       // reverse padded input
......
@@ -127,57 +127,52 @@ struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    bool has_got_workspace_size = true;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto& temp = ctx.cuda_device_context();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetForward());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionForwardAlgorithm(
-                    args.handle, args.idesc.desc(), args.x->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
-                    kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_stat.data(),
-                    cudnn_workspace_ptr, workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.fwd_algo;
-          }
-          return perf_stat[0].fwd_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionForwardAlgorithm(
+              args.handle, args.idesc.desc(), args.x->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
+              kNUM_CUDNN_FWD_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.fwd_algo;
+    } else {
+      auto& temp = ctx.cuda_device_context();
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetForward());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.fwd_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionForwardGetWorkSpaceSize(
@@ -194,58 +189,51 @@ struct SearchAlgorithm<miopenConvBwdDataAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
    algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardData());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
-                    args.handle, args.odesc.desc(), args.o->data<T>(),
-                    args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
-                    args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
-                    kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
-                    perf_stat.data(), cudnn_workspace_ptr, workspace_size_limit,
-                    false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_data_algo;
-          }
-          return perf_stat[0].bwd_data_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardDataAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
+              args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
+              kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_data_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardData());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_data_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize(
@@ -262,56 +250,51 @@ struct SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t> {
   template <typename T>
   static algo_t Find(const ConvArgs& args, bool exhaustive_search,
-                     bool deterministic,
+                     bool deterministic, size_t workspace_size,
                      const framework::ExecutionContext& ctx) {
-    auto dtype = platform::CudnnDataType<T>::type;
-    size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
-    size_t workspace_size = 0;
-    bool has_got_workspace_size = true;
     algo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    AlgorithmsCache<algo_t>& algo_cache =
-        *(framework::ConvSearchCache::Instance().GetBackwardFilter());
-    auto x_dims = framework::vectorize(args.x->dims());
-    auto w_dims = framework::vectorize(args.w->dims());
-    VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
-             << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
-             << args.s << ", args.p" << args.p << ", args.d" << args.d;
-    algo = algo_cache.GetAlgorithm(
-        x_dims, w_dims, args.s, args.p, args.d, 0,
-        static_cast<int64_t>(args.cudnn_dtype), [&]() {
-          int returned_algo_count;
-          std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
-          auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
-            PADDLE_ENFORCE_CUDA_SUCCESS(
-                platform::dynload::
-                    miopenFindConvolutionBackwardWeightsAlgorithm(
-                        args.handle, args.odesc.desc(), args.o->data<T>(),
-                        args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
-                        args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
-                        kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
-                        perf_stat.data(), cudnn_workspace_ptr,
-                        workspace_size_limit, false));
-          };
-          workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
-          VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)";
-          for (int i = 0; i < returned_algo_count; ++i) {
-            const auto& stat = perf_stat[i];
-            VLOG(3) << stat.bwd_weights_algo;
-          }
-          return perf_stat[0].bwd_weights_algo;
-        });
+    int find_count;
+    miopenConvAlgoPerf_t find_result;
+    auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm(
+              args.handle, args.odesc.desc(), args.o->data<T>(),
+              args.idesc.desc(), args.x->data<T>(), args.cdesc.desc(),
+              args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
+              kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result,
+              cudnn_workspace_ptr, workspace_size, false));
+    };
+    if (!exhaustive_search && !deterministic) {
+      workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+      algo = find_result.bwd_weights_algo;
+    } else {
+      AlgorithmsCache<algo_t>& algo_cache =
+          *(framework::ConvSearchCache::Instance().GetBackwardFilter());
+      auto x_dims = framework::vectorize(args.x->dims());
+      auto w_dims = framework::vectorize(args.w->dims());
+      VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:"
+               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
+               << args.s << ", args.p" << args.p << ", args.d" << args.d;
+      algo = algo_cache.GetAlgorithm(
+          x_dims, w_dims, args.s, args.p, args.d, 0,
+          static_cast<int64_t>(args.cudnn_dtype), [&]() {
+            workspace_handle.RunFuncSync(cudnn_find_func, workspace_size);
+            return find_result.bwd_weights_algo;
+          });
+    }
     VLOG(3) << "choose algo " << algo;
     return algo;
   }

-  static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
+  static size_t GetWorkspaceSize(const ConvArgs& args) {
     size_t workspace_size = 0;
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize(
......
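The three rewritten MIOpen Find() overloads above share one control-flow pattern: when neither exhaustive_search nor deterministic is requested, the result of the immediate miopenFind* call is used directly; otherwise the lookup is routed through the cached AlgorithmsCache path. The standalone toy program below sketches only that decision; every type and name in it (ToyAlgoCache, run_miopen_find, the shape key) is an illustrative stand-in, not Paddle or MIOpen code.

// algo_find_flow_demo.cc -- illustrative sketch only; not Paddle code.
#include <functional>
#include <iostream>
#include <map>
#include <string>

using Algo = int;

// Stand-in for the framework's AlgorithmsCache: memoizes by a shape key.
struct ToyAlgoCache {
  std::map<std::string, Algo> cache_;
  Algo GetAlgorithm(const std::string& key, const std::function<Algo()>& gen) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;
    return cache_[key] = gen();
  }
};

Algo Find(bool exhaustive_search, bool deterministic, ToyAlgoCache* cache) {
  // Stand-in for running the miopenFindConvolution*Algorithm call.
  auto run_miopen_find = [] { return Algo{3}; };
  if (!exhaustive_search && !deterministic) {
    return run_miopen_find();  // use the fresh find result directly
  }
  // exhaustive / deterministic requests go through the algorithm cache
  return cache->GetAlgorithm("x_dims/w_dims/s/p/d", run_miopen_find);
}

int main() {
  ToyAlgoCache cache;
  std::cout << Find(false, false, &cache) << "\n";  // immediate path
  std::cout << Find(true, false, &cache) << "\n";   // cached path
}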
@@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_HIP
     using search = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+    workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
+    algo = search::Find<T>(args, false, deterministic, workspace_size, ctx);
 #else
     using search = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
     algo = search::Find<T>(args, false, deterministic, ctx);
     workspace_size =
         std::max(workspace_size, search::GetWorkspaceSize(args, algo));
+#endif
     // ------------------- cudnn conv transpose forward ---------------------
     int input_offset =
@@ -504,12 +505,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search1::GetWorkspaceSize(args1));
+      data_algo =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo));
+#endif
     }
     if (filter_grad) {
@@ -522,12 +527,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
                       platform::AllowTF32Cudnn(), c_groups);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      filter_algo =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, filter_algo));
+#endif
     }
     // ------------------- cudnn conv backward data ---------------------
@@ -942,11 +951,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args1.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size = search1::GetWorkspaceSize(args1);
+      bwd_algo1 =
+          search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
 #else
       using search1 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx);
       workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1);
+#endif
     }
     if (ddW) {
@@ -958,12 +970,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args2.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search2::GetWorkspaceSize(args2));
+      bwd_algo2 =
+          search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
 #else
       using search2 = SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
-#endif
       bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search2::GetWorkspaceSize(args2, bwd_algo2));
+#endif
     }
   }
@@ -978,12 +994,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args3.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search3::GetWorkspaceSize(args3));
+      filter_algo =
+          search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
 #else
       using search3 = SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
-#endif
       filter_algo = search3::Find<T>(args3, false, deterministic, ctx);
       workspace_size = std::max(workspace_size,
                                 search3::GetWorkspaceSize(args3, filter_algo));
+#endif
     }
     if (ddW && dX) {
@@ -996,12 +1016,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
       args4.cdesc.set(dtype, padding_common, strides, dilations, c_group);
 #ifdef PADDLE_WITH_HIP
       using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
+      workspace_size =
+          std::max(workspace_size, search4::GetWorkspaceSize(args4));
+      data_algo =
+          search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
 #else
       using search4 = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
-#endif
       data_algo = search4::Find<T>(args4, false, deterministic, ctx);
       workspace_size =
           std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo));
+#endif
     }
     int i_n, i_c, i_d, i_h, i_w;
......
@@ -199,19 +199,24 @@ class FilterDescriptor {
   void set(const Tensor& tensor, const miopenTensorFormat_t format,
            const int groups = 1) {
-    auto dims = framework::vectorize<int>(tensor.dims());
-    std::vector<int> transformed_dims;
     PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
                       platform::errors::InvalidArgument(
                           "format should ONLY be NCHW in MIOPEN."));
-    transformed_dims = dims;
-    // if (groups > 1) {
-    //   transformed_dims[1] = transformed_dims[1] / groups;
-    // }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor(
-        (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()),
-        transformed_dims[0], transformed_dims[1], transformed_dims[2],
-        transformed_dims[3]));
+    auto dims = framework::vectorize<int>(tensor.dims());
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    std::vector<int> dims_with_group(dims.begin(), dims.end());
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
+        (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()),
+        static_cast<int>(dims_with_group.size()),
+        const_cast<int*>(dims_with_group.data()),
+        const_cast<int*>(strides.data())));
   }

  private:
......
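For reference, the stride and grouped-dims arithmetic that the reworked FilterDescriptor::set performs can be reproduced in isolation. The snippet below is a standalone sketch using a made-up 4-D filter shape {64, 6, 3, 3} and groups = 2; it simply mirrors the loop in the hunk above and is not Paddle code.

// filter_desc_dims_demo.cc -- illustrative sketch only; not Paddle code.
#include <iostream>
#include <vector>

int main() {
  std::vector<int> dims = {64, 6, 3, 3};  // hypothetical NCHW filter shape
  int groups = 2;

  // Packed (contiguous) strides, innermost dimension last.
  std::vector<int> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = dims[i + 1] * strides[i + 1];
  }

  // Dimension 1 is divided by groups before the descriptor is set,
  // as in the FilterDescriptor::set hunk above.
  std::vector<int> dims_with_group(dims.begin(), dims.end());
  if (groups > 1) {
    dims_with_group[1] = dims_with_group[1] / groups;
  }

  for (int d : dims_with_group) std::cout << d << ' ';  // prints: 64 3 3 3
  std::cout << '\n';
  for (int s : strides) std::cout << s << ' ';          // prints: 54 9 3 1
  std::cout << '\n';
}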
@@ -128,6 +128,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -185,6 +187,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NHWC"
@@ -264,6 +268,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -280,6 +286,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1]
@@ -299,8 +307,7 @@ class TestConv2DOp(OpTest):
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
         self.data_format = "AnyLayout"
-        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
-        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+        self.dtype = np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
@@ -693,6 +700,7 @@ class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 class TestConv2DOpError(unittest.TestCase):
@@ -734,8 +742,7 @@ class TestConv2DOp_v2(OpTest):
         self.use_cuda = False
         self.use_mkldnn = False
         self.fuse_relu_before_depthwise_conv = False
-        # explicilty use float32 for ROCm, as MIOpen does not yet support float64
-        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
+        self.dtype = np.float64
         self.init_kernel_type()
         self.init_group()
         self.init_dilation()
......
@@ -135,6 +135,8 @@ def create_test_cudnn_class(parent):
     class TestCUDNNCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

     cls_name = "{0}_{1}".format(parent.__name__, "CUDNN")
     TestCUDNNCase.__name__ = cls_name
@@ -169,6 +171,8 @@ def create_test_cudnn_padding_SAME_class(parent):
     class TestCUDNNPaddingSMAECase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -185,6 +189,8 @@ def create_test_cudnn_padding_VALID_class(parent):
     class TestCUDNNPaddingVALIDCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_paddings(self):
             self.pad = [1, 1, 1]
@@ -215,6 +221,8 @@ def create_test_cudnn_channel_last_class(parent):
     class TestCudnnChannelLastCase(parent):
         def init_kernel_type(self):
             self.use_cudnn = True
+            self.dtype = np.float32 if core.is_compiled_with_rocm(
+            ) else np.float64

         def init_data_format(self):
             self.data_format = "NDHWC"
@@ -410,6 +418,7 @@ class TestWithDilation(TestConv3DOp):
 class TestCUDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -431,6 +440,7 @@ class TestFP16CUDNN(TestConv3DOp):
 class TestWithGroup1CUDNN(TestWithGroup1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -452,6 +462,7 @@ class TestFP16WithGroup1CUDNN(TestWithGroup1):
 class TestWithGroup2CUDNN(TestWithGroup2):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -473,6 +484,7 @@ class TestFP16WithGroup2CUDNN(TestWithGroup2):
 class TestWith1x1CUDNN(TestWith1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -494,6 +506,7 @@ class TestFP16With1x1CUDNN(TestWith1x1):
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
     def init_kernel_type(self):
         self.use_cudnn = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -514,6 +527,7 @@ class TestCUDNNExhaustiveSearch(TestCUDNN):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64

 # ---- test asymmetric padding ----
......
@@ -50,7 +50,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
     def setUp(self):
         """Setup."""
         #self.dtype = np.float32
-        self.dtype = np.float64
+        self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64
         self.N = 8
         self.C = 16
         self.H = 32
@@ -92,7 +92,10 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
             moving_variance_name='bn_moving_variance',
             data_layout=layout,
             is_test=only_forward)
-        bn = fluid.layers.cast(bn, 'float64')
+        if core.is_compiled_with_rocm():
+            bn = fluid.layers.cast(bn, 'float32')
+        else:
+            bn = fluid.layers.cast(bn, 'float64')
         sigmoid = fluid.layers.sigmoid(bn)
         out = fluid.layers.reduce_sum(sigmoid)
         if not sync_bn:
......