From 7335b679c1192c4faae84656cdd02692e7fe452c Mon Sep 17 00:00:00 2001
From: houj04 <35131887+houj04@users.noreply.github.com>
Date: Thu, 4 Aug 2022 10:35:56 +0800
Subject: [PATCH] [XPU] fleet dist_model support xpu (#44854)

* [XPU] fleet dist_model support xpu. test=kunlun

* [XPU] fleet dist_model support xpu. test=kunlun

* move unittest file location. test=kunlun
---
 .../distributed/fleet_executor/dist_model.cc  | 20 +++-
 .../xpu/test_fleet_exe_dist_model_run_xpu.py  | 93 +++++++++++++++++++
 2 files changed, 111 insertions(+), 2 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py

diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc
index 0b46369b970..b14bc4f7ed4 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -89,10 +89,23 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
 #else
     PADDLE_THROW(paddle::platform::errors::Fatal(
         "Paddle wasn't compiled with CUDA, but place is GPU."));
+#endif
+  } else if (platform::is_xpu_place(place)) {
+    VLOG(3) << "Loading data for XPU.";
+#if defined(PADDLE_WITH_XPU)
+    auto xpu_place = place;
+    memory::Copy(xpu_place,
+                 static_cast<void *>(input_tensor_ptr),
+                 platform::CPUPlace(),
+                 input_data.data.data(),
+                 input_data.data.length());
+#else
+    PADDLE_THROW(paddle::platform::errors::Fatal(
+        "Paddle wasn't compiled with XPU, but place is XPU."));
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "DistModel only supports CPU and GPU."));
+        "DistModel only supports CPU, GPU, and XPU."));
   }
 
   framework::LoD dst_lod;
@@ -189,9 +202,12 @@ bool DistModel::PreparePlace() {
     place_ = paddle::platform::CUDAPlace(config_.device_id);
   } else if (config_.place == "CPU") {
     place_ = paddle::platform::CPUPlace();
+  } else if (config_.place == "XPU") {
+    place_ = paddle::platform::XPUPlace(config_.device_id);
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Place must be choosen from GPU or CPU, but got %s.", config_.place));
+        "Place must be chosen from GPU, CPU, or XPU, but got %s.",
+        config_.place));
   }
   return true;
 }
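The PreparePlace() hunk above is the hook the new test below relies on when it sets config.place = 'XPU'. As orientation before the test file, here is a minimal sketch (not part of the patch) of that configuration path; it reuses the core.DistModelConfig / core.DistModel bindings the test itself exercises, while the device_id field is an assumption of the sketch (PreparePlace() reads config_.device_id, but the test leaves it at its default):

# Sketch only, not part of the patch: configuring DistModel for XPU.
import paddle
from paddle.fluid import core

paddle.enable_static()

config = core.DistModelConfig()
config.model_dir = '/path/to/inf'  # hypothetical save_inference_model prefix
config.place = 'XPU'               # any value other than 'GPU', 'CPU', or 'XPU'
                                   # now raises the InvalidArgument error above
config.device_id = 0               # assumed binding for config_.device_id

dist = core.DistModel(config)
dist.init()                        # place selection happens during init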
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py
new file mode 100644
index 00000000000..851a5b521e1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import numpy as np
+import os
+import tempfile
+from paddle.fluid import core
+
+paddle.enable_static()
+
+
+class TestDistModelRun(unittest.TestCase):
+
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        # step 6: clean up the env, delete the saved model and params
+        print('cleaned up the env')
+        self.temp_dir.cleanup()
+
+    def test_dist_model_run(self):
+        # step 0: declare the folder to save the model and params
+        path_prefix = os.path.join(self.temp_dir.name,
+                                   "dist_model_run_test/inf")
+
+        # step 1: save the inference model and params
+        x = paddle.static.data(name='x', shape=[28, 28], dtype='float32')
+        y = paddle.static.data(name='y', shape=[28, 1], dtype='int64')
+        predict = paddle.static.nn.fc(x, 10, activation='softmax')
+        loss = paddle.nn.functional.cross_entropy(predict, y)
+        avg_loss = paddle.tensor.stat.mean(loss)
+        exe = paddle.static.Executor(paddle.XPUPlace(0))
+        exe.run(paddle.static.default_startup_program())
+        x_data = np.random.randn(28, 28).astype('float32')
+        y_data = np.random.randint(0, 9, size=[28, 1]).astype('int64')
+        exe.run(paddle.static.default_main_program(),
+                feed={
+                    'x': x_data,
+                    'y': y_data
+                },
+                fetch_list=[avg_loss])
+        paddle.static.save_inference_model(path_prefix, [x, y], [avg_loss], exe)
+        print('save model to', path_prefix)
+
+        # step 2: prepare fake data for the inference
+        x_tensor = np.random.randn(28, 28).astype('float32')
+        y_tensor = np.random.randint(0, 9, size=[28, 1]).astype('int64')
+
+        # step 3: init the dist model and run inference with the fake data
+        config = core.DistModelConfig()
+        config.model_dir = path_prefix
+        config.place = 'XPU'
+        dist = core.DistModel(config)
+        dist.init()
+        dist_x = core.DistModelTensor(x_tensor, 'x')
+        dist_y = core.DistModelTensor(y_tensor, 'y')
+        input_data = [dist_x, dist_y]
+        output_rst = dist.run(input_data)
+        dist_model_rst = output_rst[0].as_ndarray().ravel().tolist()
+        print("dist model rst:", dist_model_rst)
+
+        # step 4: use the framework's api to run inference with the fake data
+        [inference_program, feed_target_names,
+         fetch_targets] = (paddle.static.load_inference_model(path_prefix, exe))
+        results = exe.run(inference_program,
+                          feed={
+                              'x': x_tensor,
+                              'y': y_tensor
+                          },
+                          fetch_list=fetch_targets)
+        load_inference_model_rst = results[0]
+        print("load inference model api rst:", load_inference_model_rst)
+
+        # step 5: compare the two results
+        self.assertTrue(np.allclose(dist_model_rst, load_inference_model_rst))
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
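To close the loop on the first C++ hunk: the memory::Copy branch added there is what runs for each input when the test above calls dist.run(). A compressed sketch of that feed/run/fetch path, reusing the names and the initialized dist handle from the test:

# Sketch only, condensed from steps 2-3 of the test above.
import numpy as np
from paddle.fluid import core

x_tensor = np.random.randn(28, 28).astype('float32')
y_tensor = np.random.randint(0, 9, size=[28, 1]).astype('int64')

# Each DistModelTensor wraps a host buffer; on dist.run() the new
# LoadDataFromDistModelTensor() branch copies it CPUPlace -> XPU card.
inputs = [core.DistModelTensor(x_tensor, 'x'),
          core.DistModelTensor(y_tensor, 'y')]

outputs = dist.run(inputs)  # `dist` as initialized in the test
result = outputs[0].as_ndarray().ravel().tolist()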