Unverified commit ff2142f2, authored by zxcd, committed via GitHub

Add int32/int64 support for the outer/matmul kernels. (#55584)

* Add int32/int64 support for the outer/matmul kernels.

* Fix issues raised in review comments.

* Fix issues raised in review comments.
Parent: bd73a57d
......@@ -25,6 +25,8 @@ PD_REGISTER_KERNEL(matmul,
phi::MatmulKernel,
float,
double,
int32_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......
......@@ -25,6 +25,8 @@ PD_REGISTER_KERNEL(matmul,
phi::MatmulKernel,
float,
double,
int32_t,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16,
phi::dtype::complex<float>,
......
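The two hunks above add int32_t and int64_t to the dtype lists of the matmul kernel registrations; judging by the dtype lists (the second one includes float16/bfloat16), they are most likely the CPU and GPU registrations respectively. For orientation, a full PD_REGISTER_KERNEL invocation generally has the shape sketched below; the backend and layout arguments are assumptions, since the hunks cut off above them.

```cpp
// Sketch only: the general shape of a phi kernel registration.
// The backend (CPU) and layout (ALL_LAYOUT) arguments are assumed here;
// the hunks above do not show them.
PD_REGISTER_KERNEL(matmul,
                   CPU,          // assumed backend
                   ALL_LAYOUT,   // assumed layout
                   phi::MatmulKernel,
                   float,
                   double,
                   int32_t,      // newly registered by this commit
                   int64_t,      // newly registered by this commit
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
```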
......@@ -97,13 +97,14 @@ static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx,
}
template <typename Context, typename T>
void MatMul(const Context& dev_ctx,
const DenseTensor& a,
bool trans_a,
const DenseTensor& b,
bool trans_b,
DenseTensor* out,
bool flag = false) {
typename std::enable_if<!std::is_integral<T>::value>::type MatMul(
const Context& dev_ctx,
const DenseTensor& a,
bool trans_a,
const DenseTensor& b,
bool trans_b,
DenseTensor* out,
bool flag = false) {
dev_ctx.template Alloc<T>(out);
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
......
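The signature change above replaces the plain `void` return type with `typename std::enable_if<!std::is_integral<T>::value>::type`, so this BLAS-backed overload simply drops out of overload resolution whenever `T` is an integer type. A minimal, self-contained sketch of that SFINAE dispatch pattern (hypothetical function names, not Paddle code):

```cpp
#include <iostream>
#include <type_traits>

// Selected only for non-integral T (e.g. float, double).
template <typename T>
typename std::enable_if<!std::is_integral<T>::value>::type
Compute(T x) {
  std::cout << "floating-point path: " << x << "\n";
}

// Selected only for integral T (e.g. int32_t, int64_t).
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type
Compute(T x) {
  std::cout << "integer path: " << x << "\n";
}

int main() {
  Compute(1.5f);  // picks the floating-point overload
  Compute(42);    // picks the integer overload
  return 0;
}
```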
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/autotune/cache_base.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
......@@ -1078,6 +1079,38 @@ void MatMulInt8Function(const Context& ctx,
#endif
}
template <typename Context, typename T>
typename std::enable_if<std::is_integral<T>::value>::type
MatmulJudgeDtypeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& y,
const std::vector<std::int64_t>& x_dims,
const std::vector<std::int64_t>& y_dims,
DenseTensor* out,
bool transpose_x,
bool transpose_y) {
auto x_tmp = phi::Cast<T, Context>(ctx, x, phi::DataType::FLOAT32);
auto y_tmp = phi::Cast<T, Context>(ctx, y, phi::DataType::FLOAT32);
DenseTensor out_tmp;
MatMulFunction<Context, float>(
ctx, x_tmp, y_tmp, x_dims, y_dims, &out_tmp, transpose_x, transpose_y);
phi::CastKernel<float>(ctx, out_tmp, x.dtype(), out);
}
template <typename Context, typename T>
typename std::enable_if<!std::is_integral<T>::value>::type
MatmulJudgeDtypeKernel(const Context& ctx,
const DenseTensor& x,
const DenseTensor& y,
const std::vector<std::int64_t>& x_dims,
const std::vector<std::int64_t>& y_dims,
DenseTensor* out,
bool transpose_x,
bool transpose_y) {
MatMulFunction<Context, T>(
ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y);
}
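The integral overload above does not run an integer GEMM; it casts the inputs to float32 with `phi::Cast`, reuses the existing float32 `MatMulFunction`, and casts the result back to the input dtype with `phi::CastKernel`. Below is a minimal standalone sketch of that compute-in-float-and-cast-back idea, in plain C++ with hypothetical helpers; note that float32 represents integers exactly only up to 2^24, so very large int64 values could lose precision on this path.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical float "kernel": naive n x n row-major matmul.
std::vector<float> FloatMatmul(const std::vector<float>& a,
                               const std::vector<float>& b, int n) {
  std::vector<float> out(n * n, 0.0f);
  for (int i = 0; i < n; ++i)
    for (int k = 0; k < n; ++k)
      for (int j = 0; j < n; ++j)
        out[i * n + j] += a[i * n + k] * b[k * n + j];
  return out;
}

// Integer entry point: promote to float, reuse the float kernel,
// then cast the result back to the original integer dtype.
std::vector<int64_t> IntMatmul(const std::vector<int64_t>& a,
                               const std::vector<int64_t>& b, int n) {
  std::vector<float> af(a.begin(), a.end());   // cast inputs up
  std::vector<float> bf(b.begin(), b.end());
  std::vector<float> of = FloatMatmul(af, bf, n);
  // Exact only while values stay within float32's integer range (2^24).
  return std::vector<int64_t>(of.begin(), of.end());
}
```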
template <typename T, typename Context>
void MatmulKernel(const Context& ctx,
const DenseTensor& x,
......@@ -1097,7 +1130,7 @@ void MatmulKernel(const Context& ctx,
" but reviced dims size is 0. "));
const std::vector<std::int64_t> x_dims = vectorize(x.dims());
const std::vector<std::int64_t> y_dims = vectorize(y.dims());
MatMulFunction<Context, T>(
MatmulJudgeDtypeKernel<Context, T>(
ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y);
}
......
......@@ -2315,7 +2315,10 @@ def outer(x, y, name=None):
var_names = {'x': x, 'y': y}
for name, val in var_names.items():
check_variable_and_dtype(
val, name, ['float16', 'float32', 'float64'], 'inner'
val,
name,
['float16', 'float32', 'float64', 'int32', 'int64'],
'outer',
)
__check_input(nx, ny)
......
......@@ -712,6 +712,110 @@ class TestMatMulTypePromotion(TestComplexMatMulOp):
self.out = np.dot(self.x, self.y)
class TestInt32MatmulOp(OpTest):
def setUp(self):
self.op_type = "matmul_v2"
self.python_api = paddle.tensor.matmul
self.init_base_dtype()
self.init_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.int32
def init_input_output(self):
self.x = np.random.random((10, 10)).astype(self.dtype)
self.y = np.random.random((10, 10)).astype(self.dtype)
self.out = np.matmul(self.x, self.y)
def test_check_output(self):
self.check_output(check_cinn=False)
class TestInt32MatMulOpBroadcast(OpTest):
def setUp(self):
self.op_type = "matmul_v2"
self.python_api = paddle.tensor.matmul
self.init_base_dtype()
self.init_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.int32
def init_input_output(self):
self.x = np.random.random((10, 2, 5)).astype(self.dtype)
self.y = np.random.random((5, 20)).astype(self.dtype)
self.out = np.matmul(self.x, self.y)
def test_check_output(self):
self.check_output(check_cinn=False)
class TestInt64MatmulOp(OpTest):
def setUp(self):
self.op_type = "matmul_v2"
self.python_api = paddle.tensor.matmul
self.init_base_dtype()
self.init_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.int64
def init_input_output(self):
self.x = np.random.random((10, 10)).astype(self.dtype)
self.y = np.random.random((10, 10)).astype(self.dtype)
self.out = np.matmul(self.x, self.y)
def test_check_output(self):
self.check_output(check_cinn=False)
class TestInt64MatMulOpBroadcast(OpTest):
def setUp(self):
self.op_type = "matmul_v2"
self.python_api = paddle.tensor.matmul
self.init_base_dtype()
self.init_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y),
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.int64
def init_input_output(self):
self.x = np.random.random((10, 2, 5)).astype(self.dtype)
self.y = np.random.random((5, 20)).astype(self.dtype)
self.out = np.matmul(self.x, self.y)
def test_check_output(self):
self.check_output(check_cinn=False)
class TestMatmulop(unittest.TestCase):
def func_dygraph_matmul(self):
paddle.disable_static()
......
......@@ -74,6 +74,18 @@ class TestMultiplyApi(unittest.TestCase):
res = self._run_static_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
# test static computation graph: 1-d int32 array
x_data = np.random.rand(50).astype(np.int32)
y_data = np.random.rand(50).astype(np.int32)
res = self._run_static_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
# test static computation graph: 1-d int64 array
x_data = np.random.rand(50).astype(np.int64)
y_data = np.random.rand(50).astype(np.int64)
res = self._run_static_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
# test dynamic computation graph: 3-d array
x_data = np.random.rand(5, 10, 10).astype(np.float64)
y_data = np.random.rand(2, 10).astype(np.float64)
......@@ -112,6 +124,18 @@ class TestMultiplyApi(unittest.TestCase):
res = self._run_dynamic_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
# test dynamic computation graph: 3-d int32 array
x_data = np.random.rand(5, 10, 10).astype(np.int32)
y_data = np.random.rand(2, 10).astype(np.int32)
res = self._run_dynamic_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
# test dynamic computation graph: 3-d int64 array
x_data = np.random.rand(5, 10, 10).astype(np.int64)
y_data = np.random.rand(2, 10).astype(np.int64)
res = self._run_dynamic_graph_case(x_data, y_data)
np.testing.assert_allclose(res, np.outer(x_data, y_data), rtol=1e-05)
class TestMultiplyError(unittest.TestCase):
def test_errors(self):
......