fix jit_matmul bug according to paddle pr#20948 test=develop (#2392)

Fix a jit::matmul bug. The input x has shape (m, k) and the weight has shape (k, n). When k < 512, m == 1, and n is a multiple of 16, the weight pointer is not updated correctly in the per-group computation inside the jit::matmul implementation, so the computed result differs from the expected one.

fix jit_matmul bug according to paddle pr#20948 test=develop (#2392)
Fix a jit::matmul bug. The input x has shape (m, k) and the weight has shape (k, n). When k < 512, m == 1, and n is a multiple of 16, the weight pointer is not updated correctly in the per-group computation inside the jit::matmul implementation, so the computed result differs from the expected one.
f51c4891 · Wilber · GitHub · 19f8ac5c · f51c4891
隐藏空白更改
内联并排

Showing 7 additions and 1 deletion

lite/backends/x86/jit/gen/matmul.cc lite/backends/x86/jit/gen/matmul.cc +7 -1

未找到文件。
--- a/lite/backends/x86/jit/gen/matmul.cc
+++ b/lite/backends/x86/jit/gen/matmul.cc
@@ -39,7 +39,12 @@ void MatMulJitCode::genCode() {
  size_t wgt_offset = 0;
  for (size_t g = 0; g < groups.size(); ++g) {
    size_t x_offset = 0;
+    size_t wgt_offset_tmp = 0;
+    for (int i = 0; i < g; ++i) {
+      wgt_offset_tmp += groups[i] * block_len;
+    }
    for (int k = 0; k < k_; ++k) {
+      wgt_offset = wgt_offset_tmp;
      vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
      // clean
      if (k == 0) {
@@ -48,7 +53,8 @@ void MatMulJitCode::genCode() {
        }
      }
      for (int i = 0; i < groups[g]; ++i) {
-        vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]);
+        vmovups(zmm_t(w_reg_idx),
+                ptr[reg_ptr_wgt + wgt_offset + k * n_ * sizeof(float)]);
        vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
        wgt_offset += block_len;
      }