未验证 提交 eb0a08ea 编写于 作者: S shitouren1994 提交者: GitHub

1. fix fc gemv_1x2 bug 2. fix benchmark error 3. add int8 im2col+gemm (#478)


* 1.fix x86 benchmark bug 2.add int8 im2col+gemm
Co-authored-by: shitouren1994 <shihebing@bigo.sg>
上级 0a5a9637
......@@ -97,6 +97,15 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
}
}
#endif
/* int8 path: hand weight/buffer pre-processing to the int8 im2col+gemm
   kernel. int8_conv_hcl_prerun is a weak symbol (declared later in this
   change); a negative return is treated as a hard failure: log, set
   errno, abort node setup. */
else if (exec_graph->mode == TENGINE_MODE_INT8)
{
if (int8_conv_hcl_prerun(input_tensor,filter_tensor,output_tensor,conv_priv_info,conv_param) < 0)
{
TLOG_ERR("hcl conv hybrid int8 prerun failed\n");
set_tengine_errno(EFAULT);
return -1;
}
}
else
{
printf("Tengine work node not support %d\n", exec_graph->mode);
......@@ -151,6 +160,15 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex
}
}
#endif
/* int8 path: dispatch the forward pass to the int8 im2col+gemm kernel.
   Thread count and CPU affinity are forwarded so the kernel can
   parallelize; any negative return aborts the node run. */
else if (exec_graph->mode == TENGINE_MODE_INT8)
{
if (int8_conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, cpu_affinity) < 0)
{
TLOG_ERR("hcl conv int8 run failed\n");
set_tengine_errno(EFAULT);
return -1;
}
}
else
{
printf("Tengine work node not support %d\n", exec_graph->mode);
......@@ -191,6 +209,15 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc
}
}
#endif
/* int8 path: release resources acquired by int8_conv_hcl_prerun.
   A negative return is logged and reported as EFAULT, matching the
   error handling of the prerun/run branches. */
else if (exec_graph->mode == TENGINE_MODE_INT8)
{
    if (int8_conv_hcl_postrun(conv_priv_info) < 0)
    {
        /* BUGFIX: the message previously said "fp16" — a copy-paste
           left-over from the fp16 branch — inside the int8 branch. */
        TLOG_ERR("hcl conv int8 postrun failed\n");
        set_tengine_errno(EFAULT);
        return -1;
    }
}
else
{
printf("Tengine work node not support %d\n", exec_graph->mode);
......@@ -236,6 +263,10 @@ static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, str
exec_node->shared_mem_size = fp16_conv_hcl_get_shared_mem_size(input_tensor, output_tensor, conv_param);
}
#endif
/* int8 path: ask the int8 kernel how much shared scratch memory the
   im2col+gemm lowering needs for these tensor shapes, and record it on
   the exec node so the executor can provision it before run(). */
else if (exec_graph->mode == TENGINE_MODE_INT8)
{
exec_node->shared_mem_size = int8_conv_hcl_get_shared_mem_size(input_tensor,output_tensor,conv_param);
}
else
{
printf("Tengine work node not support %d\n", exec_graph->mode);
......@@ -266,6 +297,8 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
int kernel_w = param->kernel_w;
int in_c = input_tensor->dims[1] / group;
int out_c = output_tensor->dims[1] / group;
if (input_tensor->data_type == TENGINE_DT_INT8)
return OPS_SCORE_BEST;
/* todo support int8/fp16 */
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
......
......@@ -72,6 +72,17 @@ int conv_hcl_get_shared_pack4_mem_size(struct ir_tensor* input_tensor, struct ir
int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
/* int8 im2col+gemm convolution kernel entry points.
 * Declared __attribute__((weak)) so this translation unit still links
 * when the int8 implementation is not compiled in; callers are gated on
 * the int8 execution mode (see the TENGINE_MODE_INT8 branches).
 *   prerun  - one-time weight/buffer preparation before inference
 *   run     - the forward pass (num_thread / cpu_affinity forwarded)
 *   postrun - releases what prerun allocated
 *   get_shared_mem_size - scratch bytes needed for the given shapes
 * All return < 0 on failure. */
int int8_conv_hcl_prerun(struct ir_tensor* input_tensor,
struct ir_tensor* filter_tensor,
struct ir_tensor* output_tensor,
struct conv_priv_info* priv_info,
struct conv_param* param) __attribute__((weak));
int int8_conv_hcl_postrun(struct conv_priv_info* info) __attribute__((weak));
int int8_conv_hcl_run(struct ir_tensor* input_tensor , struct ir_tensor* filter_tensor ,struct ir_tensor* bias_tensor , struct ir_tensor* output_tensor , struct conv_priv_info* conv_info ,struct conv_param* param, int num_thread, int cpu_affinity) __attribute__((weak));
int int8_conv_hcl_get_shared_mem_size(struct ir_tensor* input_tensor ,struct ir_tensor* output_tensor , struct conv_param* param) __attribute__((weak)) ;
/* fp16 */
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int fp16_conv_hcl_prerun(struct ir_tensor* input_tensor,
......
此差异已折叠。
......@@ -109,7 +109,8 @@ void gemv_1x2_int8(const int32_t *biases, const float *scales, int8_t *inp, int8
int remainw = (kernel_size << 3) >> 3;
int8x8x2_t weight;
int8x8_t input;
int16x8_t out_16_0, out_16_1, out_32_0, out_32_1;
int16x8_t out_16_0, out_16_1;
int32x4_t out_32_0, out_32_1;
int32_t sum0 = 0, sum1 = 0;
for (int i = 0; i < remainw; i = i + 8) {
weight = vld2_s8(weight_ptr);
......
......@@ -146,9 +146,7 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc
{
if (pad_h0 == 0 && pad_h1 == 0 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2))
return OPS_SCORE_BEST;
if (pad_h0 == 1 && pad_h1 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S1))
return OPS_SCORE_BEST;
else if (pad_h0 == 0 && pad_h1 == 1 && (pool_size == POOL_K3S2))
if (pad_h0 == 1 && pad_h1 == 1 && (pool_size == POOL_K2S2 || pool_size == POOL_K3S2))
return OPS_SCORE_BEST;
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册