Unverified · Commit c66586b4 · authored by pangyoki · committed by GitHub

[NPU] fix accuracy npu op bug and change top_k's output to int64 (#32935)

* Change output indices of top_k npu op to int64

* fix accuracy npu bug

* fix errors

* change cast method to FillNpuTensorWithConstant

* change cast method to FillNpuTensorWithConstant
Parent 5d627488
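The core of the fix is which tensors the accuracy kernel compares. The old NPU kernel cast the raw predictions in `Out` to int and compared them elementwise with `Label`; the metric actually asks whether any of a sample's top-k `Indices` equals its `Label`. A minimal NumPy sketch of the difference (illustrative values only, not the Paddle API):

import numpy as np

out = np.array([[0.9], [0.3]], dtype=np.float32)  # raw top-1 scores
indices = np.array([[0], [1]], dtype=np.int64)    # top-1 class ids
label = np.array([[0], [1]], dtype=np.int64)      # ground-truth classes

# old (buggy): cast the float scores to int, compare with labels
buggy_acc = (out.astype(np.int32) == label).mean()   # 0.5, meaningless
# fixed: a row is correct iff one of its top-k indices hits the label
fixed_acc = (indices == label).any(axis=1).mean()    # 1.0
print(buggy_acc, fixed_acc)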
@@ -23,91 +23,82 @@ template <typename DeviceContext, typename T>
 class AccuracyNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* pred = ctx.Input<Tensor>("Out");
+    auto* inference = ctx.Input<Tensor>("Out");
     auto* label = ctx.Input<Tensor>("Label");
-    // auto* logits = ctx.Input<Tensor>("Indices");
-    auto* acc = ctx.Output<Tensor>("Accuracy");
+    auto* indices = ctx.Input<Tensor>("Indices");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
     auto* correct = ctx.Output<Tensor>("Correct");
     auto* total = ctx.Output<Tensor>("Total");
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    // cast pred
-    Tensor tmp_pred(pred->type());
-    tmp_pred.Resize(pred->dims());
-    tmp_pred.mutable_data<int>(ctx.GetPlace());
-    auto runner_cast_pred =
-        NpuOpRunner("Cast", {*pred}, {tmp_pred},
-                    {{"dst_type", static_cast<int>(ACL_INT32)}});
-    runner_cast_pred.Run(stream);
-
-    // cast label
-    Tensor tmp_label(label->type());
-    tmp_label.Resize(label->dims());
-    tmp_label.mutable_data<int>(ctx.GetPlace());
-    auto runner_cast_label =
-        NpuOpRunner("Cast", {*label}, {tmp_label},
-                    {{"dst_type", static_cast<int>(ACL_INT32)}});
-    runner_cast_label.Run(stream);
+    int num_samples = inference->dims()[0];
+    if (num_samples == 0) {
+      return;
+    }
 
     // equal
-    Tensor tmp_equal(label->type());
-    tmp_equal.Resize(label->dims());
+    Tensor tmp_equal(framework::proto::VarType::BOOL);
+    tmp_equal.Resize(inference->dims());
     tmp_equal.mutable_data<bool>(ctx.GetPlace());
     auto runner_equal =
-        NpuOpRunner("Equal", {tmp_pred, tmp_label}, {tmp_equal}, {});
+        NpuOpRunner("Equal", {*indices, *label}, {tmp_equal}, {});
     runner_equal.Run(stream);
 
     // cast equal
-    Tensor tmp_equal_cast(label->type());
-    tmp_equal_cast.Resize(label->dims());
+    Tensor tmp_equal_cast(framework::proto::VarType::FP32);
+    tmp_equal_cast.Resize(inference->dims());
     tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
-    auto runner_cast_equal =
-        NpuOpRunner("Cast", {tmp_equal}, {tmp_equal_cast},
-                    {{"dst_type", static_cast<float>(ACL_FLOAT)}});
+    auto runner_cast_equal = NpuOpRunner(
+        "Cast", {tmp_equal}, {tmp_equal_cast},
+        {{"dst_type",
+          static_cast<int>(ConvertToNpuDtype(tmp_equal_cast.type()))}});
     runner_cast_equal.Run(stream);
 
-    // acc
-    acc->mutable_data<float>(ctx.GetPlace());
-    std::vector<int> axes_vec_1;
-    auto runner_acc = NpuOpRunner("ReduceMeanD", {tmp_equal_cast}, {*acc},
-                                  {{"keep_dims", false}, {"axes", axes_vec_1}});
-    runner_acc.Run(stream);
-
-    // correct
-    correct->mutable_data<float>(ctx.GetPlace());
-    std::vector<int> axes_vec_2;
-    auto runner_correct =
-        NpuOpRunner("ReduceSumD", {tmp_equal_cast}, {*correct},
-                    {{"keep_dims", false}, {"axes", axes_vec_2}});
-    runner_correct.Run(stream);
-
-    // ones_tensor
-    Tensor ones_tensor(label->type());
-    ones_tensor.Resize(label->dims());
-    ones_tensor.mutable_data<int>(ctx.GetPlace());
-    auto runner_oneslike =
-        NpuOpRunner("OnesLike", {tmp_label}, {ones_tensor}, {});
-    runner_oneslike.Run(stream);
-
-    // ones_tensor_cast
-    Tensor ones_tensor_cast(label->type());
-    ones_tensor_cast.Resize(label->dims());
-    ones_tensor_cast.mutable_data<float>(ctx.GetPlace());
-    auto runner_ones_cast =
-        NpuOpRunner("Cast", {ones_tensor}, {ones_tensor_cast},
-                    {{"dst_type", static_cast<float>(ACL_FLOAT)}});
-    runner_ones_cast.Run(stream);
-
-    // total
-    total->mutable_data<float>(ctx.GetPlace());
-    std::vector<int> axes_vec_3;
-    auto runner_total =
-        NpuOpRunner("ReduceSumD", {ones_tensor_cast}, {*total},
-                    {{"keep_dims", false}, {"axes", axes_vec_3}});
-    runner_total.Run(stream);
+    // [correct]
+    // reduce_max
+    Tensor tmp_correct_max(framework::proto::VarType::FP32);
+    tmp_correct_max.Resize(framework::make_ddim({num_samples}));
+    tmp_correct_max.mutable_data<float>(ctx.GetPlace());
+    auto runner_reduce_max =
+        NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max},
+                    {{"axes", std::vector<int>{1}}, {"keep_dims", false}});
+    runner_reduce_max.Run(stream);
+
+    // reduce_sum
+    Tensor tmp_correct(framework::proto::VarType::FP32);
+    tmp_correct.Resize(correct->dims());
+    tmp_correct.mutable_data<float>(ctx.GetPlace());
+    auto runner_reduce_sum =
+        NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct},
+                    {{"axes", std::vector<int>{0}}, {"keep_dims", false}});
+    runner_reduce_sum.Run(stream);
+
+    // cast to int
+    correct->mutable_data<int>(ctx.GetPlace());
+    auto runner_cast_correct = NpuOpRunner(
+        "Cast", {tmp_correct}, {*correct},
+        {{"dst_type", static_cast<int>(ConvertToNpuDtype(correct->type()))}});
+    runner_cast_correct.Run(stream);
+
+    // [total]
+    total->mutable_data<int>(ctx.GetPlace());
+    FillNpuTensorWithConstant<int>(total, static_cast<int>(num_samples));
+
+    // use `total` of type `float32` for calculating accuracy
+    Tensor tmp_total(framework::proto::VarType::FP32);
+    tmp_total.Resize(total->dims());
+    tmp_total.mutable_data<float>(ctx.GetPlace());
+    FillNpuTensorWithConstant<float>(&tmp_total,
+                                     static_cast<float>(num_samples));
+
+    // [accuracy]
+    accuracy->mutable_data<float>(ctx.GetPlace());
+    auto runner_accuracy =
+        NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
+    runner_accuracy.Run(stream);
   }
 };
...
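For reference, the rewritten kernel's dataflow maps directly onto plain array operations: `Equal` over `Indices`/`Label`, a cast to fp32, `ReduceMaxD` over axis 1 for a per-sample hit flag, `ReduceSumD` to count hits, and `Div` by the sample count. A NumPy emulation of that pipeline (a sketch with a made-up 3x2 `Indices` tensor; variable names mirror the kernel's temporaries):

import numpy as np

indices = np.array([[1, 3], [2, 0], [5, 5]], dtype=np.int64)  # (n, k)
label = np.array([[3], [7], [5]], dtype=np.int64)             # (n, 1)

tmp_equal = indices == label                   # Equal      -> bool (n, k)
tmp_equal_cast = tmp_equal.astype(np.float32)  # Cast       -> fp32
tmp_correct_max = tmp_equal_cast.max(axis=1)   # ReduceMaxD, axes={1} -> (n,)
tmp_correct = tmp_correct_max.sum(axis=0)      # ReduceSumD, axes={0} -> scalar
correct = int(tmp_correct)                     # Cast to int
total = indices.shape[0]                       # FillNpuTensorWithConstant
accuracy = tmp_correct / np.float32(total)     # Div
print(correct, total, accuracy)                # 2 3 0.6666667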
@@ -48,7 +48,7 @@ class TopkNPUKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
     output->mutable_data<T>(ctx.GetPlace());
-    indices->mutable_data<int>(ctx.GetPlace());
+    indices->mutable_data<int64_t>(ctx.GetPlace());
 
     // prepare assist
     auto dim = input->dims().size();
@@ -62,15 +62,24 @@ class TopkNPUKernel : public framework::OpKernel<T> {
                               {"dim", -1},
                               {"largest", true}};
 
+    Tensor tmp_indices(framework::proto::VarType::INT32);
+    tmp_indices.Resize(indices->dims());
+    tmp_indices.mutable_data<int>(ctx.GetPlace());
+
     // run ascend
     auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor},
-                              {*output, *indices}, attr_input);
+                              {*output, tmp_indices}, attr_input);
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
     runner.Run(stream);
+
+    // cast indices from INT32 to INT64
+    auto dst_dtype = ConvertToNpuDtype(indices->type());
+    auto runner_cast_indices =
+        NpuOpRunner("Cast", {tmp_indices}, {*indices},
+                    {{"dst_type", static_cast<int>(dst_dtype)}});
+    runner_cast_indices.Run(stream);
   }
 };
...
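The pattern above works around the TopKD op emitting INT32 indices: the kernel now runs TopKD into a temporary INT32 tensor and then casts into the int64 `indices` output the framework expects. The same compute-in-int32, publish-as-int64 flow in NumPy terms (hypothetical helper `topk_int64`, not the Paddle API):

import numpy as np

def topk_int64(x, k):
    # argsort descending along the last axis; indices start life as int32,
    # mirroring what the TopKD op produces
    tmp_indices = np.argsort(-x, axis=-1)[..., :k].astype(np.int32)
    values = np.take_along_axis(x, tmp_indices, axis=-1)
    return values, tmp_indices.astype(np.int64)  # the final Cast to INT64

x = np.array([[0.1, 0.9, 0.4]], dtype=np.float32)
values, indices = topk_int64(x, 2)
print(values, indices, indices.dtype)  # [[0.9 0.4]] [[1 2]] int64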
@@ -35,21 +35,21 @@ class TestAccuracy(OpTest):
         self.set_npu()
         self.init_dtype()
         np.random.seed(SEED)
-        pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype)
-        label = pred.copy()
-        accuracy = np.array([1]).astype(self.dtype)
-        correct = np.array([11 * 1]).astype(self.dtype)
-        total = np.array([11 * 1]).astype(self.dtype)
-
-        self.inputs = {
-            "Out": OpTest.np_dtype_to_fluid_dtype(pred),
-            "Label": OpTest.np_dtype_to_fluid_dtype(label),
-            "Indices": OpTest.np_dtype_to_fluid_dtype(pred)
-        }
+        n = 8192
+        infer = np.random.random((n, 1)).astype(self.dtype)
+        indices = np.random.randint(0, 2, (n, 1)).astype('int64')
+        label = np.random.randint(0, 2, (n, 1)).astype('int64')
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+
         self.outputs = {
-            "Accuracy": accuracy,
-            "Correct": correct,
-            "Total": total
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
         }
 
     def set_npu(self):
@@ -69,54 +69,23 @@ class TestAccuracy2(TestAccuracy):
         self.set_npu()
         self.init_dtype()
         np.random.seed(SEED)
-        pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype)
-        label = np.random.uniform(4, 5, [11, 1]).astype(self.dtype)
-        accuracy = np.array([0]).astype(self.dtype)
-        correct = np.array([11 * 0]).astype(self.dtype)
-        total = np.array([11 * 1]).astype(self.dtype)
-
-        self.inputs = {
-            "Out": OpTest.np_dtype_to_fluid_dtype(pred),
-            "Label": OpTest.np_dtype_to_fluid_dtype(label),
-            "Indices": OpTest.np_dtype_to_fluid_dtype(pred)
-        }
-        self.outputs = {
-            "Accuracy": accuracy,
-            "Correct": correct,
-            "Total": total
-        }
-
-
-class TestAccuracy3(TestAccuracy):
-    def setUp(self):
-        self.op_type = "accuracy"
-        self.set_npu()
-        self.init_dtype()
-        np.random.seed(SEED)
-        a = np.random.randint(1, 2, [5, 1])
-        b = np.random.randint(0, 1, [5, 1])
-        pred = np.row_stack((a, b)).astype(self.dtype)
-        label = np.random.randint(1, 2, [10, 1]).astype(self.dtype)
-        accuracy = np.array([0.5]).astype(self.dtype)
-        correct = np.array([5]).astype(self.dtype)
-        total = np.array([10 * 1]).astype(self.dtype)
-
-        self.inputs = {
-            "Out": OpTest.np_dtype_to_fluid_dtype(pred),
-            "Label": OpTest.np_dtype_to_fluid_dtype(label),
-            "Indices": OpTest.np_dtype_to_fluid_dtype(pred)
-        }
+        n = 8192
+        infer = np.random.random((n, 100)).astype(self.dtype)
+        indices = np.random.randint(0, 1000, (n, 100)).astype('int64')
+        label = np.random.randint(0, 1000, (n, 1)).astype('int64')
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+
         self.outputs = {
-            "Accuracy": accuracy,
-            "Correct": correct,
-            "Total": total
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
         }
 
-
-class TestAccuracyInt(TestAccuracy):
-    def init_dtype(self):
-        self.dtype = np.int
-
 
 if __name__ == '__main__':
     unittest.main()
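As a side note, the per-row `num_correct` loop used in both tests can be collapsed into a single vectorized expression; assuming the same `indices` of shape (n, k) and `label` of shape (n, 1), the following is equivalent:

import numpy as np

n = 8192
indices = np.random.randint(0, 1000, (n, 100)).astype('int64')
label = np.random.randint(0, 1000, (n, 1)).astype('int64')

# a row counts once if any of its k indices matches the row's label,
# exactly what the for/break loop in the tests computes
num_correct = int((indices == label).any(axis=1).sum())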