diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index 1beff472ecaf75e531e9ca8874d45b9379ce39d7..44063f233caf80455f6ef76c3939412bb2c4bd48 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -20,6 +20,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
+
 class ArrayOp : public framework::OperatorBase {
  public:
   ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
@@ -45,7 +46,8 @@ class ArrayOp : public framework::OperatorBase {
     auto &dev_ctx = *pool.Get(place);
 
     size_t offset;
-    if (platform::is_gpu_place(i_tensor.place())) {
+    if (platform::is_gpu_place(i_tensor.place()) ||
+        platform::is_xpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
       framework::Tensor t;
       framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc
index 4ebe92801e623aef0ca4e90927e2b2d0fce4d9e7..aa0002cc6d1777dab6e598fc7c123e5255d0f094 100644
--- a/paddle/fluid/operators/concat_op_xpu.cc
+++ b/paddle/fluid/operators/concat_op_xpu.cc
@@ -47,19 +47,6 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
                           "size is %d.",
                           axis, ins[0]->dims().size()));
 
-    auto place = ctx.GetPlace();
-    out->mutable_data<T>(place);
-    std::vector<int> choose_idx;
-    int n = 0;
-    for (unsigned int i = 0; i < ins.size(); ++i) {
-      if (ins[i] && ins[i]->numel() > 0) {
-        choose_idx.push_back(i);
-        n++;
-      }
-    }
-    PADDLE_ENFORCE_GT(
-        n, 0, platform::errors::InvalidArgument("No tensor need concat?"));
-
     // If axis is 0, the lod of the output is not the same as inputs.
     if (axis == 0 && ins[0]->lod().size() > 0) {
       size_t lod_size_0 = ins[0]->lod().size();
@@ -87,30 +74,32 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
         }
       }
     }
-
-    auto input_dims = ins[0]->dims();
-    std::vector<std::vector<int>> xdims_list(n);
-    for (int i = 0; i < n; ++i) {
-      std::vector<int> tmp_dims(input_dims.size());
-      for (int j = 0; j < input_dims.size(); ++j) {
-        tmp_dims[j] = ins[i]->dims()[j];
+    auto place = ctx.GetPlace();
+    out->mutable_data<T>(place);
+    std::vector<std::vector<int>> xdims_list;
+    std::vector<const T*> ptrs;
+    for (unsigned int i = 0; i < ins.size(); ++i) {
+      if (ins[i] && ins[i]->numel() > 0) {
+        ptrs.push_back(ins[i]->data<T>());
+        int size = ins[i]->dims().size();
+        std::vector<int> tmp_dims(size);
+        for (int j = 0; j < size; ++j) {
+          tmp_dims[j] = ins[i]->dims()[j];
+        }
+        xdims_list.push_back(tmp_dims);
       }
-      xdims_list[i] = tmp_dims;
     }
+    PADDLE_ENFORCE_GT(xdims_list.size(), 0, platform::errors::InvalidArgument(
+                                                "No tensor need concat"));
 
     auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
-    std::vector<const T*> ptrs;
-    for (int i = 0; i < n; ++i) {
-      ptrs.push_back(ins[choose_idx[i]]->data<T>());
-    }
+
     int r = xpu::concat<T>(dev_ctx.x_context(), ptrs, out->data<T>(),
                            xdims_list, axis);
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External(
-            "XPU API return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            r));
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU concat kernel return wrong value[%d %s]", r,
+                          XPUAPIErrorMsg[r]));
   }
 };
 
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 7b93ea15de3da3c0b5d6f0a93b1e96dbbc75fd2f..41f631f5547369a491e886434b243336fc57b0b4 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -380,11 +380,20 @@ class ReshapeKernel {
 #ifdef PADDLE_WITH_XPU
     if (platform::is_xpu_place(ctx.GetPlace())) {
-      auto &dev_ctx =
-          ctx.template device_context<paddle::platform::XPUDeviceContext>();
-      xpu::memcpy_device(
-          dev_ctx.x_context(), out->data<void>(), in->data<void>(),
-          in->numel() * paddle::framework::SizeOfType(in->type()));
+      void *out_ptr = out->data<void>();
+      const void *in_ptr = in->data<void>();
+      if ((out_ptr != nullptr) && (in_ptr != nullptr) &&
+          (paddle::framework::SizeOfType(in->type()) > 0)) {
+        auto &dev_ctx =
+            ctx.template device_context<paddle::platform::XPUDeviceContext>();
+        int r = xpu::memcpy_device(
+            dev_ctx.x_context(), out_ptr, in_ptr,
+            in->numel() * paddle::framework::SizeOfType(in->type()));
+        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                          platform::errors::External(
+                              "XPU memcpy_device return wrong value[%d %s]", r,
+                              XPUAPIErrorMsg[r]));
+      }
     } else {
 #endif
       framework::TensorCopy(