Unverified commit c1eed1fa, authored by Double_V, committed by GitHub

error message opt for XPU, test=kunlun (#27972)

* add stack pool2d roi_align xpu op,test=kunlun

* error message opt, test=kunlun

* add xpu unittest,test=kunlun

* skip check grad,test=kunlun

* fix boostget , test=kunlun

* error message opt for XPU, test=kunlun
Parent 4c5b779a
@@ -43,12 +43,14 @@ class PoolXPUKernel : public framework::OpKernel<T> {
     bool exclusive = context.Attr<bool>("exclusive");
     bool is_test = context.Attr<bool>("is_test");
     bool adaptive = context.Attr<bool>("adaptive");
-    PADDLE_ENFORCE_EQ(!adaptive, true,
-                      platform::errors::InvalidArgument(
-                          "XPU does not support adaptive == true!"));
-    PADDLE_ENFORCE_EQ(ksize.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "XPU only support 2 dimension pooling!"));
+    PADDLE_ENFORCE_EQ(
+        !adaptive, true,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP only support 2 dimension pooling!"));
     int* index_data = nullptr;
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
@@ -80,7 +82,10 @@ class PoolXPUKernel : public framework::OpKernel<T> {
                       stride_w, out_h, out_w);
     PADDLE_ENFORCE_EQ(
         r, xpu::Error_t::SUCCESS,
-        platform::errors::InvalidArgument("pool2d XPU kernel error!"));
+        platform::errors::External(
+            "The pool2d XPU API return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
   }
 };
 template <typename DeviceContext, typename T>
@@ -99,12 +104,15 @@ class PoolGradXPUKernel : public framework::OpKernel<T> {
     bool exclusive = context.Attr<bool>("exclusive");
     bool adaptive = context.Attr<bool>("adaptive");
     const int* index_data = nullptr;
-    PADDLE_ENFORCE_EQ(!adaptive, true,
-                      platform::errors::InvalidArgument(
-                          "XPU does not support adaptive == true!"));
-    PADDLE_ENFORCE_EQ(ksize.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "XPU only support 2 dimension pooling!"));
+    PADDLE_ENFORCE_EQ(
+        !adaptive, true,
+        platform::errors::InvalidArgument(
+            "The Pool2d XPU OP does not support adaptive == true!"));
+    PADDLE_ENFORCE_EQ(ksize.size(), 2, platform::errors::InvalidArgument(
+                                           "The Pool2d XPU OP only support 2 "
+                                           "dimension pooling!, but received "
+                                           "%d-dimension pool kernel size",
+                                           ksize.size()));
     if (context.Attr<bool>("global_pooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         paddings[i] = 0;
@@ -139,16 +147,22 @@ class PoolGradXPUKernel : public framework::OpKernel<T> {
     int r =
         xpu::memset(dev_ctx.x_context(), reinterpret_cast<void**>(input_grad),
                     zero, in_x_grad->numel() * sizeof(float));
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are pool2d grad XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The Pool2d XPU OP return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
     r = xpu::pooling_backward(dev_ctx.x_context(), input, output, index_data,
                               output_grad, input_grad, pool_type, c, in_h, in_w,
                               pad_left, pad_right, pad_up, pad_down, win_h,
                               win_w, stride_h, stride_w, out_h, out_w);
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are pool2d grad XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The Pool2d XPU OP return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
   }
 };
...
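The recurring pattern in the pool2d hunks above is to propagate the raw status code returned by an XPU API call into a platform::errors::External message, instead of raising a generic InvalidArgument with no diagnostic detail. Below is a minimal, self-contained sketch of that pattern; kXpuSuccess, FakePool2dForward, and CheckXpuResult are hypothetical stand-ins for illustration, not Paddle or XPU runtime APIs.

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for xpu::Error_t::SUCCESS (the real enum lives in
// the XPU runtime headers).
constexpr int kXpuSuccess = 0;

// Hypothetical device call standing in for e.g. xpu::pooling_forward; it
// "fails" so the error path below is exercised.
int FakePool2dForward() { return -1; }

// Minimal analogue of PADDLE_ENFORCE_EQ(r, SUCCESS, errors::External(...)):
// surface the raw return value so a failure can be traced to the device call.
void CheckXpuResult(int r, const char* api_name) {
  if (r != kXpuSuccess) {
    std::fprintf(stderr,
                 "The %s XPU API return wrong value[%d], please check "
                 "whether the Baidu Kunlun Card is properly installed.\n",
                 api_name, r);
    std::abort();
  }
}

int main() {
  int r = FakePool2dForward();
  CheckXpuResult(r, "pool2d");  // aborts and prints the status code
  return 0;
}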
@@ -44,11 +44,16 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(
         rois_batch_size, batch_size,
         platform::errors::InvalidArgument(
-            "The rois_batch_size and imgs batch_size must be the same."));
+            "The rois_batch_size and imgs batch_size of roi_align_xpu OP must "
+            "be the same. But received rois_batch_size %d , batch_size %d",
+            rois_batch_size, batch_size));
     int rois_num_with_lod = rois_lod[rois_batch_size];
-    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      platform::errors::InvalidArgument(
-                          "The rois_num from input and lod must be the same."));
+    PADDLE_ENFORCE_EQ(
+        rois_num, rois_num_with_lod,
+        platform::errors::InvalidArgument(
+            "The rois_num from input and lod of roi_align_xpu OP must be the "
+            "same. But received input rois_num %d , input lod %d",
+            rois_num, rois_num_with_lod));
     T* output_data = out->mutable_data<T>(ctx.GetPlace());
     const T* rois_data = rois->data<T>();
     for (int n = 0; n < rois_batch_size; n++) {
@@ -62,7 +67,10 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
               rois_lod[n] * channels * pooled_height * pooled_width);
       PADDLE_ENFORCE_EQ(
           r, xpu::Error_t::SUCCESS,
-          platform::errors::InvalidArgument("roi_align XPU kernel error!"));
+          platform::errors::External(
+              "The roi_align XPU OP return wrong value[%d], please check "
+              "where Baidu Kunlun Card is properly installed.",
+              r));
     }
   }
 }
...
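The roi_align hunks follow a second convention: input-consistency failures stay InvalidArgument, but the message now echoes the received values so the user can see which inputs disagreed rather than merely that they disagreed. A standalone sketch of that style of check, assuming a plain std::runtime_error in place of Paddle's error types:

#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical analogue of platform::errors::InvalidArgument; Paddle's real
// error types also carry an error code and call-site information.
std::runtime_error InvalidArgument(const std::string& msg) {
  return std::runtime_error("InvalidArgumentError: " + msg);
}

// Mirrors the roi_align check above: report both received values in the
// message so the mismatch is immediately diagnosable.
void CheckBatchSizesMatch(int rois_batch_size, int batch_size) {
  if (rois_batch_size == batch_size) return;
  std::ostringstream os;
  os << "The rois_batch_size and imgs batch_size must be the same. "
     << "But received rois_batch_size " << rois_batch_size
     << ", batch_size " << batch_size;
  throw InvalidArgument(os.str());
}

int main() {
  CheckBatchSizesMatch(4, 4);  // consistent inputs: no error
  try {
    CheckBatchSizesMatch(4, 8);  // mismatched inputs: throws with both values
  } catch (const std::exception& e) {
    std::cerr << e.what() << "\n";
  }
  return 0;
}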
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/stack_op.h"
+#include <string>
 
 #ifdef PADDLE_WITH_XPU
 namespace paddle {
@@ -45,8 +46,15 @@ class StackXPUKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     void* x_datas_host = std::malloc(n * sizeof(void*));
     void* x_datas_device = nullptr;
-    PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
-                              n * sizeof(void*)) == XPU_SUCCESS);
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&x_datas_device),
+                                 n * sizeof(void*)),
+                      XPU_SUCCESS,
+                      platform::errors::ResourceExhausted(
+                          "\n\nOut of memory error on XPU, Cannot"
+                          "allocate %s memory on XPU. \n\nPlease "
+                          "check whether there is any other process "
+                          "using XPU.\n",
+                          string::HumanReadableSize(n * sizeof(void*))));
     for (auto i = 0; i < n; ++i) {
       ((const void**)x_datas_host)[i] = x[i]->data<T>();
     }
@@ -55,9 +63,12 @@ class StackXPUKernel : public framework::OpKernel<T> {
                n * sizeof(void*));
     int r = xpu::stack_forward<float>(dev_ctx.x_context(), pre, post, n,
                                       x_datas_device, y_data);
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::InvalidArgument(
-                          "There are stack XPU kernel error raised!"));
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "The stack XPU API return wrong value[%d], please check "
+            "where Baidu Kunlun Card is properly installed.",
+            r));
     dev_ctx.Wait();
     std::free(x_datas_host);
     xpu_free(x_datas_device);
...
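For context on the stack kernel above: it stages the n input data pointers in a host array, mirrors that array on the device with xpu_malloc (now checked with a ResourceExhausted error that reports the requested size), launches xpu::stack_forward, and frees both buffers after a device wait. A host-only sketch of that staging pattern, with FakeXpuMalloc/FakeXpuFree as hypothetical stand-ins for the real XPU allocator so it runs without a Kunlun card:

#include <cstdio>
#include <cstdlib>

// Hypothetical stand-ins for xpu_malloc/xpu_free; the real calls return
// XPU_SUCCESS (0) on success.
int FakeXpuMalloc(void** ptr, size_t size) {
  *ptr = std::malloc(size);
  return *ptr != nullptr ? 0 : -1;
}
void FakeXpuFree(void* ptr) { std::free(ptr); }

int main() {
  const size_t n = 8;  // number of input tensors being stacked
  // Stage the per-tensor data pointers in a host array first.
  void** x_datas_host = static_cast<void**>(std::malloc(n * sizeof(void*)));
  if (x_datas_host == nullptr) return 1;
  // Mirror the pointer array on the device, failing loudly with the requested
  // size (the kernel above raises ResourceExhausted at this point).
  void* x_datas_device = nullptr;
  if (FakeXpuMalloc(&x_datas_device, n * sizeof(void*)) != 0) {
    std::fprintf(stderr, "Out of memory on XPU: cannot allocate %zu bytes.\n",
                 n * sizeof(void*));
    std::free(x_datas_host);
    return 1;
  }
  // A real kernel would fill x_datas_host with each input's data pointer,
  // copy it to x_datas_device, call xpu::stack_forward, and wait on the
  // device context before releasing both buffers, as the diff shows.
  FakeXpuFree(x_datas_device);
  std::free(x_datas_host);
  return 0;
}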