Fix bugs in transformer prediction on XPU place (#30730)

* transformer predict
* trans bug fix

Fix bugs in transformer prediction on XPU place (#30730)
* transformer predict
* trans bug fix
caf3680b · taixiurong · GitHub · a87d78f1 · caf3680b · caf3680b
3 changed files
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -20,6 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 class ArrayOp : public framework::OperatorBase {
 public:
  ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
@@ -45,7 +46,8 @@ class ArrayOp : public framework::OperatorBase {
    auto &dev_ctx = *pool.Get(place);
    size_t offset;
-    if (platform::is_gpu_place(i_tensor.place())) {
+    if (platform::is_gpu_place(i_tensor.place()) ||
+        platform::is_xpu_place(i_tensor.place())) {
      // FIXME: Avoid copy from GPU to CPU
      framework::Tensor t;
      framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);

--- a/paddle/fluid/operators/concat_op_xpu.cc
+++ b/paddle/fluid/operators/concat_op_xpu.cc
@@ -47,19 +47,6 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
                          "size is %d.",
                          axis, ins[0]->dims().size()));
-    auto place = ctx.GetPlace();
-    out->mutable_data<T>(place);
-    std::vector<int> choose_idx;
-    int n = 0;
-    for (unsigned int i = 0; i < ins.size(); ++i) {
-      if (ins[i] && ins[i]->numel() > 0) {
-        choose_idx.push_back(i);
-        n++;
-      }
-    }
-    PADDLE_ENFORCE_GT(
-        n, 0, platform::errors::InvalidArgument("No tensor need concat?"));
    // If axis is 0, the lod of the output is not the same as inputs.
    if (axis == 0 && ins[0]->lod().size() > 0) {
      size_t lod_size_0 = ins[0]->lod().size();
@@ -87,30 +74,32 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
        }
      }
    }
+    auto place = ctx.GetPlace();
-    auto input_dims = ins[0]->dims();
+    out->mutable_data<T>(place);
-    std::vector<std::vector<int>> xdims_list(n);
+    std::vector<std::vector<int>> xdims_list;
-    for (int i = 0; i < n; ++i) {
+    std::vector<const T*> ptrs;
-      std::vector<int> tmp_dims(input_dims.size());
+    for (unsigned int i = 0; i < ins.size(); ++i) {
-      for (int j = 0; j < input_dims.size(); ++j) {
+      if (ins[i] && ins[i]->numel() > 0) {
-        tmp_dims[j] = ins[i]->dims()[j];
+        ptrs.push_back(ins[i]->data<T>());
+        int size = ins[i]->dims().size();
+        std::vector<int> tmp_dims(size);
+        for (int j = 0; j < size; ++j) {
+          tmp_dims[j] = ins[i]->dims()[j];
+        }
+        xdims_list.push_back(tmp_dims);
      }
-      xdims_list[i] = tmp_dims;
    }
+    PADDLE_ENFORCE_GT(xdims_list.size(), 0, platform::errors::InvalidArgument(
+                                                "No tensor need concat"));
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    std::vector<const T*> ptrs;
-    for (int i = 0; i < n; ++i) {
-      ptrs.push_back(ins[choose_idx[i]]->data<T>());
-    }
    int r = xpu::concat<T>(dev_ctx.x_context(), ptrs, out->data<T>(),
                           xdims_list, axis);
-    PADDLE_ENFORCE_EQ(
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-        r, XPU_SUCCESS,
+                      platform::errors::External(
-        platform::errors::External(
+                          "XPU concat kernel return wrong value[%d %s]", r,
-            "XPU API return wrong value[%d], please check whether "
+                          XPUAPIErrorMsg[r]));
-            "Baidu Kunlun Card is properly installed.",
-            r));
  }
 };

--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -380,11 +380,20 @@ class ReshapeKernel {
 #ifdef PADDLE_WITH_XPU
    if (platform::is_xpu_place(ctx.GetPlace())) {
-      auto &dev_ctx =
+      void *out_ptr = out->data<void>();
-          ctx.template device_context<paddle::platform::XPUDeviceContext>();
+      const void *in_ptr = in->data<void>();
-      xpu::memcpy_device(
+      if ((out_ptr != nullptr) && (in_ptr != nullptr) &&
-          dev_ctx.x_context(), out->data<void>(), in->data<void>(),
+          (paddle::framework::SizeOfType(in->type()) > 0)) {
-          in->numel() * paddle::framework::SizeOfType(in->type()));
+        auto &dev_ctx =
+            ctx.template device_context<paddle::platform::XPUDeviceContext>();
+        int r = xpu::memcpy_device(
+            dev_ctx.x_context(), out_ptr, in_ptr,
+            in->numel() * paddle::framework::SizeOfType(in->type()));
+        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                          platform::errors::External(
+                              "XPU memcpy_device return wrong value[%d %s]", r,
+                              XPUAPIErrorMsg[r]));
+      }
    } else {
 #endif
      framework::TensorCopy(