Fix some bugs in the rv64 CPU implementation (#613)

3bca4649 · BUG1989 · GitHub · c7989a4b · 3bca4649 · 3bca4649
7 changed files
--- a/include/compiler_fp16.h
+++ b/include/compiler_fp16.h
@@ -32,7 +32,7 @@
 extern "C" {
 #endif

-#ifdef __ARM_ARCH
+#if defined __ARM_ARCH || defined __riscv

 #define fp16_to_fp32(data) \
    ({                     \

--- a/src/dev/cpu/op/cast/compiler_fp16.h
+++ b/src/dev/cpu/op/cast/compiler_fp16.h
@@ -33,7 +33,7 @@ extern "C" {

 #else

-#ifdef __ARM_ARCH
+#if defined __ARM_ARCH || defined __riscv

 #define fp16_to_fp32(data) \
    ({                     \

--- a/src/dev/cpu/op/conv/conv_hcl_rv64.c
+++ b/src/dev/cpu/op/conv/conv_hcl_rv64.c
@@ -17,6 +17,10 @@
 * under the License.
 */

+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
+ */

 #include "sys_port.h"
 #include "module.h"
@@ -92,7 +96,6 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct

 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    fprintf(stderr, "conv hcl start\n");
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor;

--- a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c
+++ b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c
@@ -18,9 +18,10 @@
 */

 /*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: haoluo@openailab.com
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
 */
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <math.h>
@@ -381,6 +382,11 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
        {
            cur_col = ( float* )(col + col_line * kernel_size);
            sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
+            for (int i = 0; i < 4; i++)
+            {
+                for (int j = 0; j < (col_end3); j++)
+                    *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
+            }
        }
    }
    if (kernel_end3)

--- a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h
+++ b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h
@@ -18,8 +18,8 @@
 */

 /*
- * Copyright (c) 2020, Martin Han
- * Author: hansh-sz@hotmail.com
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
 */

 #ifndef _CONV_KERNEL_RV64_H_

--- a/src/dev/cpu/op/rv64/sgemm_4x16_a72.S
+++ b/src/dev/cpu/op/rv64/sgemm_4x16_a72.S
@@ -36,7 +36,6 @@
 //      input 4 x p             kernel p x 16            biases 4 x 16                 output 4 x 16           p = kernel size
 //
 //
-// optimised for Cortex-A72 pipeline  64 cycle per loop (4*16*4 dot product)
 // load 4 more input and 8 more kernel to improve loop performance
 //
 // input: 

--- a/src/dev/cpu/op/rv64/sgemm_4x4_a72.S
+++ b/src/dev/cpu/op/rv64/sgemm_4x4_a72.S
@@ -36,7 +36,6 @@
 //      input 4 x p             kernel p x 4             biases 4 x 4                 output 4 x 4         p = kernel size
 //
 //
-// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product)
 //
 // input:  
 //         x0 arg0  biases address {b0,b1,b2,b3}  nullptr means no biases