Unverified commit 78959a39, authored by Leo Chen, committed by GitHub

[NPU] fix cast op (#32121)

* fix NPU kernel of cast op to handle casting to the same dtype

* add comments
Parent 4638fe9a
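For context, the symptom behind this fix: on Ascend NPU, casting a tensor to its own dtype still went through the ACL Cast op, which could return wrong values. Below is a minimal repro sketch in the static-graph style of the tests further down, assuming a PaddlePaddle build with Ascend CL support and an attached NPU device (illustrative only, not part of this commit):

    import numpy as np
    import paddle

    paddle.enable_static()

    main = paddle.static.Program()
    startup = paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name='x', shape=[10, 10], dtype='int32')
        # Same-dtype cast: with this fix it is served by TensorCopy
        # instead of the ACL Cast op.
        y = paddle.cast(x, 'int32')

    exe = paddle.static.Executor(paddle.NPUPlace(0))
    exe.run(startup)

    inp = (np.random.random([10, 10]) * 100).astype('int32')
    out, = exe.run(main, feed={'x': inp}, fetch_list=[y])
    assert np.array_equal(out, inp)  # exact round-trip after the fix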
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
@@ -41,12 +40,22 @@ class CastNPUKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
int dtype = ctx.Attr<int>("out_dtype");
auto* out = ctx.Output<Tensor>("Out");
auto place = ctx.GetPlace();
-    auto iter = DTYPE_2_ACL_DTYPE.find(static_cast<framework::proto::VarType::Type>(dtype));
+    if (x->type() == dtype) {
+      // NOTE(zhiqiu): NPU cast op may result in wrong value, so
+      // add special case here.
+      VLOG(4) << "cast to same dtype:" << dtype;
+      out->mutable_data(place, x->type());
+      framework::TensorCopy(
+          *x, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), out);
+      return;
+    }
+    auto iter = DTYPE_2_ACL_DTYPE.find(
+        static_cast<framework::proto::VarType::Type>(dtype));
int aclDtype = iter->second;
if (dtype == framework::proto::VarType::FP32) {
@@ -69,18 +78,18 @@ class CastNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
auto runner = NpuOpRunner("Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
auto runner = NpuOpRunner("Cast", {*x}, {*out},
{{"dst_type", static_cast<int32_t>(aclDtype)}});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
-    cast,
-    ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int16_t>,
+    cast, ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int16_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int32_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, int>,
@@ -89,4 +98,3 @@ REGISTER_OP_NPU_KERNEL(
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
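In plain terms, the kernel's control flow after this change is: if the requested output dtype equals the input dtype, copy the tensor and return; otherwise map the framework dtype to an ACL dtype and run the ACL Cast op. A Python analogue of that logic, with NumPy standing in for the device ops (illustrative, not Paddle code):

    import numpy as np

    def npu_cast(x, out_dtype):
        """Python analogue of CastNPUKernel::Compute after this commit."""
        # Special case added by the commit: the ACL Cast op may return
        # wrong values when source and destination dtypes match, so an
        # identity cast is served by a plain copy (TensorCopy in C++).
        if x.dtype == np.dtype(out_dtype):
            return x.copy()
        # Otherwise look up the ACL dtype and dispatch the device Cast
        # op (modeled here by NumPy's astype).
        return x.astype(out_dtype)

    # Example: an int32 -> int32 "cast" now round-trips exactly.
    a = np.arange(6, dtype=np.int32)
    assert np.array_equal(npu_cast(a, np.int32), a)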
@@ -50,6 +50,7 @@ class TestCast1(OpTest):
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)

class TestCast2(OpTest):
def setUp(self):
self.set_npu()
@@ -71,5 +72,28 @@ class TestCast2(OpTest):
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3)
+
+
+class TestCast3(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "cast"
+        self.place = paddle.NPUPlace(0)
+
+        ipt = np.random.random(size=[10, 10]) + 1
+        self.inputs = {'X': ipt.astype('int32')}
+        self.outputs = {'Out': ipt.astype('int32')}
+
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.INT32),
+            'out_dtype': int(core.VarDesc.VarType.INT32)
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3)

if __name__ == '__main__':
unittest.main()