From 3bca4649a881a60a7665785fa25bc849c32a167f Mon Sep 17 00:00:00 2001
From: BUG1989 <248857878@qq.com>
Date: Fri, 9 Apr 2021 16:27:54 +0800
Subject: [PATCH] Fix some bugs in the rv64 CPU implementation (#613)

---
 include/compiler_fp16.h                                |  2 +-
 src/dev/cpu/op/cast/compiler_fp16.h                    |  2 +-
 src/dev/cpu/op/conv/conv_hcl_rv64.c                    |  5 ++++-
 src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c            | 10 ++++++++--
 src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h            |  4 ++--
 src/dev/cpu/op/rv64/{sgemm_4x16_a72.S => sgemm_4x16.S} |  1 -
 src/dev/cpu/op/rv64/{sgemm_4x4_a72.S => sgemm_4x4.S}   |  1 -
 7 files changed, 16 insertions(+), 9 deletions(-)
 rename src/dev/cpu/op/rv64/{sgemm_4x16_a72.S => sgemm_4x16.S} (99%)
 rename src/dev/cpu/op/rv64/{sgemm_4x4_a72.S => sgemm_4x4.S} (99%)

diff --git a/include/compiler_fp16.h b/include/compiler_fp16.h
index 53b0310a..1857d7ee 100644
--- a/include/compiler_fp16.h
+++ b/include/compiler_fp16.h
@@ -32,7 +32,7 @@
 extern "C" {
 #endif
 
-#ifdef __ARM_ARCH
+#if defined __ARM_ARCH || defined __riscv
 
 #define fp16_to_fp32(data) \
     ({                     \
diff --git a/src/dev/cpu/op/cast/compiler_fp16.h b/src/dev/cpu/op/cast/compiler_fp16.h
index f2905896..ebd19e14 100644
--- a/src/dev/cpu/op/cast/compiler_fp16.h
+++ b/src/dev/cpu/op/cast/compiler_fp16.h
@@ -33,7 +33,7 @@ extern "C" {
 
 #else
 
-#ifdef __ARM_ARCH
+#if defined __ARM_ARCH || defined __riscv
 
 #define fp16_to_fp32(data) \
     ({                     \
diff --git a/src/dev/cpu/op/conv/conv_hcl_rv64.c b/src/dev/cpu/op/conv/conv_hcl_rv64.c
index 86ee626c..770a9655 100644
--- a/src/dev/cpu/op/conv/conv_hcl_rv64.c
+++ b/src/dev/cpu/op/conv/conv_hcl_rv64.c
@@ -17,6 +17,10 @@
  * under the License.
  */
 
+/*
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
+ */
 
 #include "sys_port.h"
 #include "module.h"
@@ -92,7 +96,6 @@ static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct
 
 static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
 {
-    fprintf(stderr, "conv hcl start\n");
     struct ir_node* ir_node = exec_node->ir_node;
     struct ir_graph* ir_graph = ir_node->graph;
     struct ir_tensor* input_tensor;
diff --git a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c
index 0b6431c9..d889f3d3 100644
--- a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c
+++ b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.c
@@ -18,9 +18,10 @@
  */
 
 /*
- * Copyright (c) 2020, OPEN AI LAB
- * Author: haoluo@openailab.com
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
  */
+
 #include <stdint.h>
 #include <stdlib.h>
 #include <math.h>
@@ -381,6 +382,11 @@ static void sgemm4x4(float* col, float* kernel, float* biases, float* output, in
         {
             cur_col = ( float* )(col + col_line * kernel_size);
             sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0);
+            for (int i = 0; i < 4; i++)
+            {
+                for (int j = 0; j < (col_end3); j++)
+                    *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j];
+            }
         }
     }
     if (kernel_end3)
diff --git a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h
index 2957067e..d1ee7661 100644
--- a/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h
+++ b/src/dev/cpu/op/conv/rv64/conv_kernel_rv64.h
@@ -18,8 +18,8 @@
  */
 
 /*
- * Copyright (c) 2020, Martin Han
- * Author: hansh-sz@hotmail.com
+ * Copyright (c) 2021, OPEN AI LAB
+ * Author: ddzhao@openailab.com
  */
 
 #ifndef _CONV_KERNEL_RV64_H_
diff --git a/src/dev/cpu/op/rv64/sgemm_4x16_a72.S b/src/dev/cpu/op/rv64/sgemm_4x16.S
similarity index 99%
rename from src/dev/cpu/op/rv64/sgemm_4x16_a72.S
rename to src/dev/cpu/op/rv64/sgemm_4x16.S
index 6826d27e..b8b7431e 100644
--- a/src/dev/cpu/op/rv64/sgemm_4x16_a72.S
+++ b/src/dev/cpu/op/rv64/sgemm_4x16.S
@@ -36,7 +36,6 @@
 //      input 4 x p             kernel p x 16            biases 4 x 16                 output 4 x 16           p = kernel size
 //
 //
-// optimised for Cortex-A72 pipeline  64 cycle per loop (4*16*4 dot product)
 // load 4 more input and 8 more kernel to improve loop performance
 //
 // input: 
diff --git a/src/dev/cpu/op/rv64/sgemm_4x4_a72.S b/src/dev/cpu/op/rv64/sgemm_4x4.S
similarity index 99%
rename from src/dev/cpu/op/rv64/sgemm_4x4_a72.S
rename to src/dev/cpu/op/rv64/sgemm_4x4.S
index 22e1dedc..8fe4218b 100644
--- a/src/dev/cpu/op/rv64/sgemm_4x4_a72.S
+++ b/src/dev/cpu/op/rv64/sgemm_4x4.S
@@ -36,7 +36,6 @@
 //      input 4 x p             kernel p x 4             biases 4 x 4                 output 4 x 4         p = kernel size
 //
 //
-// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product)
 //
 // input:  
 //         x0 arg0  biases address {b0,b1,b2,b3}  nullptr means no biases 
-- 
GitLab