未验证 提交 129c5eeb 编写于 作者: D Daniel 提交者: GitHub

Dev/riscv (#605)

* update CMakeLists to install tengine cpp api header

* init rv64 im2col gemm

* update rv64 gemm assembly

* update rv64 im2col gemm

* update rv64 gemm

* Update CMakeLists.txt

* fix rv64 im2col gemm run mobilenet success
Co-authored-by: Ndongdong <ddzhao@openailab.com>
上级 2bc19316
......@@ -57,6 +57,10 @@ ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
SET (TENGINE_TARGET_PROCESSOR "MIPS" CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_32Bit TRUE CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_64Bit FALSE CACHE INTERNAL "" FORCE)
ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(rv64.*|RV64.*)")
SET (TENGINE_TARGET_PROCESSOR "lp64dv" CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_32Bit FALSE CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_64Bit TRUE CACHE INTERNAL "" FORCE)
ELSE()
IF (NOT TENGINE_SUPPRESS_TARGET_PROCESSOR_CHECK)
MESSAGE (WARNING "TENGINE: Unrecognized target processor configuration.")
......
......@@ -72,6 +72,11 @@ if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
endif()
endif()
# RV64
# Collect the hand-written RV64 vector assembly kernels (*.S) for the HCL backend.
# "lp64dv" is the ABI string this build uses as the RV64 processor tag (set above).
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    file(GLOB_RECURSE TENGINE_BACKEND_HCL_ASM_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/rv64/*.S")
endif()
# add operator files
if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmake")
# macro for adding include op dir
......@@ -104,6 +109,12 @@ if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmak
set (MIPS_OP_PATH "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/${name}/*mips.c")
list (APPEND HCL_SOURCE ${MIPS_OP_PATH})
endif()
# RV64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
set (RV64_OP_PATH "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/${name}/*rv64.c")
list (APPEND HCL_SOURCE ${RV64_OP_PATH})
endif()
endmacro()
include(${CMAKE_SOURCE_DIR}/cmake/operators.cmake)
......@@ -111,8 +122,8 @@ if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmak
file(GLOB_RECURSE TENGINE_BACKEND_REF_OPS ${REF_SOURCE})
# add hcl operator files
# arm or x86 or mips64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "X86" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
# arm or x86 or mips64 or rv64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "X86" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS ${HCL_SOURCE})
endif()
else()
......@@ -139,6 +150,11 @@ else()
if (${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*mips.c")
endif()
# RV64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*rv64.c")
endif()
endif()
# add cmsis operator files
......@@ -390,7 +406,26 @@ if (TENGINE_STANDALONE_HCL)
${TENGINE_STANDALONE_HCL_LIB_NAME} SHARED
${TENGINE_BACKEND_HCL_OPS}
)
# RV64 (T-Head C910): build the standalone HCL library from the C ops plus the
# vector assembly kernels collected into TENGINE_BACKEND_HCL_ASM_OPS.
elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    message (STATUS "TENGINE RV64 TENGINE_BACKEND_HCL_ASM_OPS.----------------------------------------------")
    # FIX: the options were written as "-rv64imafdcvxtheadc", missing the
    # "-march=" prefix, which GCC rejects; align with the -march value used by
    # the toolchain file and the non-standalone branch.
    list(APPEND TENGINE_COMPILE_OPTIONS_C_PRIVATE   -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    list(APPEND TENGINE_COMPILE_OPTIONS_CXX_PRIVATE -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    # NOTE(review): these options also get appended to the CUDA list as in the
    # original; they are RISC-V-only flags, so confirm the CUDA list is unused here.
    list(APPEND TENGINE_COMPILE_OPTIONS_CUDA_PRIVATE -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    add_library (
        ${TENGINE_STANDALONE_HCL_LIB_NAME} SHARED
        ${TENGINE_BACKEND_HCL_OPS}
        ${TENGINE_BACKEND_HCL_ASM_OPS}
    )
endif()
if (TENGINE_BACKEND_HCL_OPS)
unset(TENGINE_BACKEND_HCL_OPS)
......@@ -458,6 +493,18 @@ elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
${TENGINE_BACKEND_COMMON}
${TENGINE_BACKEND_REF_OPS}
${TENGINE_BACKEND_HCL_OPS})
# RV64 (T-Head C910): main shared library, including the assembly kernels.
elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    # FIX: add_definitions() is intended for -D preprocessor macros; compiler
    # code-generation flags belong in add_compile_options() so they are tracked
    # as compile options rather than definitions.
    add_compile_options(-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    add_library(${CMAKE_PROJECT_NAME} SHARED
        ${TENGINE_LIB_SRCS} ${TENGINE_FRONT_END_SRCS}
        ${TENGINE_SERIALIZER_SRCS}
        ${TENGINE_TINY_SERIALIZER_SRCS}
        ${TENGINE_BACKEND_COMMON}
        ${TENGINE_BACKEND_REF_OPS}
        ${TENGINE_BACKEND_HCL_OPS}
        ${TENGINE_BACKEND_HCL_ASM_OPS})
else()
add_library(${CMAKE_PROJECT_NAME} SHARED
${TENGINE_LIB_SRCS}
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "sys_port.h"
#include "module.h"
#include "tengine_errno.h"
#include "tengine_log.h"
#include "tengine_ir.h"
#include "../../cpu_node_ops.h"
#include "tengine_op.h"
#include "convolution_param.h"
#include "./rv64/conv_kernel_rv64.h"
/*
 * Prepare the fp32 convolution node for execution: bind the graph-level shared
 * scratch buffers into the node's private info and run the kernel's prerun
 * (weight interleave / im2col buffer setup) via conv_hcl_prerun.
 * Returns 0 on success, -1 (with tengine errno set) on failure.
 */
static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct ir_tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
    /* get cpu affinity */
    conv_priv_info->cpu_type = exec_graph->cpu_affinity;
    /* fp32 prerun */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        /* conv_hcl_set_shared_mem is declared weak in conv_kernel_rv64.h, so the
         * first operand guards against the kernel not providing it; the size
         * comparison only borrows the graph buffer when this node's need fits. */
        if (conv_hcl_set_shared_mem && exec_node->shared_mem_size < exec_graph->shared_mem_size)
        {
            if (conv_hcl_set_shared_mem(conv_priv_info, exec_graph->shared_mem, exec_graph->shared_mem_size) < 0)
            {
                TLOG_ERR("hcl conv: set shared memory failed\n");
                set_tengine_errno(EFAULT);
                return -1;
            }
        }
        /* same weak-symbol guard for the pack4 (interleave) scratch buffer */
        if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
        {
            if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
                                              exec_graph->shared_pack4_mem_size) < 0)
            {
                TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
                set_tengine_errno(EFAULT);
                return -1;
            }
        }
        int group = conv_param->group;
        int kernel_h = conv_param->kernel_h;
        int kernel_w = conv_param->kernel_w;
        /* NOTE(review): grouped 7x7 convolutions skip the external interleaved
         * pack4 buffer — presumably a kernel limitation; confirm against the
         * im2col/gemm implementation. */
        if (group > 1 && kernel_h == 7 && kernel_w == 7)
            conv_priv_info->external_interleave_pack4_mem = 0;
        else
            conv_priv_info->external_interleave_pack4_mem = 1;
        /* do prerun */
        if (conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, conv_priv_info, conv_param) < 0)
        {
            TLOG_ERR("hcl conv prerun failed\n");
            set_tengine_errno(EFAULT);
            return -1;
        }
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }
    return 0;
}
/*
 * Execute the fp32 convolution for one inference pass.
 * Tensor handles are re-fetched every run so reshape / dynamic-shape updates
 * made between runs are observed. Returns 0 on success, -1 on failure.
 */
static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    /* FIX: removed leftover debug trace `fprintf(stderr, "conv hcl start\n")`
     * that printed on every invocation of the node. */
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor;
    struct ir_tensor* weight_tensor;
    struct ir_tensor* output_tensor;
    struct ir_tensor* bias_tensor = NULL;
    int num_thread = exec_graph->num_thread;
    int cpu_affinity = exec_graph->cpu_affinity;
    /* set the input data and shape again, in case of reshape or dynamic shape */
    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    /* bias is the optional third input */
    if (ir_node->input_num > 2)
        bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
    /* fp32 run */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
                         cpu_affinity) < 0)
        {
            TLOG_ERR("hcl conv run failed\n");
            set_tengine_errno(EFAULT);
            return -1;
        }
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }
    return 0;
}
/* Shape propagation hook: nothing to do for this backend — run() re-fetches
 * tensor handles on every call, so reshape is a no-op that always succeeds. */
static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}
/*
 * Tear down kernel-side resources allocated by prerun (via conv_hcl_postrun).
 * Only the fp32 mode is supported; any other mode is reported and rejected.
 */
static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct conv_priv_info* priv_info = ( struct conv_priv_info* )exec_node->ops_priv;

    /* guard clause: unsupported execution modes bail out early */
    if (exec_graph->mode != TENGINE_MODE_FP32)
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }

    /* fp32 postrun */
    if (conv_hcl_postrun(priv_info) < 0)
    {
        TLOG_ERR("hcl conv postrun failed\n");
        set_tengine_errno(EFAULT);
        return -1;
    }

    return 0;
}
/*
 * Allocate and zero the node's private convolution state and record the
 * scratch-memory sizes this node will need (consumed later by prerun).
 * Returns 0 on success, -1 with tengine errno set on failure.
 */
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor;
    struct ir_tensor* filter_tensor;
    struct ir_tensor* output_tensor;
    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    /* init the private info data of convolution op */
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
    if (conv_priv_info == NULL)
    {
        set_tengine_errno(ENOMEM);
        return -1;
    }
    memset(conv_priv_info, 0, sizeof(struct conv_priv_info));
    exec_node->ops_priv = conv_priv_info;
    /* get shared memory size */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        exec_node->shared_mem_size = conv_hcl_get_shared_mem_size(input_tensor, output_tensor, conv_param);
        exec_node->shared_pack4_mem_size = conv_hcl_get_shared_pack4_mem_size(filter_tensor, output_tensor, conv_param);
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        /* FIX: release the private info on failure — release_node may not run
         * for a node whose init failed, which leaked conv_priv_info and left a
         * dangling pointer in ops_priv. */
        sys_free(conv_priv_info);
        exec_node->ops_priv = NULL;
        return -1;
    }
    return 0;
}
/* Free the private convolution state allocated in init_node and clear the
 * pointer so stale state cannot be reused. Always succeeds. */
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    sys_free(exec_node->ops_priv);
    exec_node->ops_priv = NULL;
    return 0;
}
/*
 * Selection score for this backend: prefer the HCL implementation only for
 * ungrouped fp32 convolutions; return 0 (not supported) otherwise.
 */
static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct ir_node* exec_node)
{
    struct ir_node* ir_node = exec_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
    /* FIX: dropped unused locals (kernel_h, kernel_w, in_c, out_c) and the
     * output-tensor fetch that existed only to compute them. */
    int group = param->group;
    /* only fp32 tensors are handled by this kernel */
    if (input_tensor->data_type != TENGINE_DT_FP32)
        return 0;
    /* grouped convolution is not implemented on RV64 yet */
    if (group != 1)
        return 0;
    return OPS_SCORE_PREFER;
}
/* Dispatch table binding the RV64 convolution callbacks to the CPU executor. */
static struct node_ops hcl_node_ops = {.prerun = prerun,
                                       .run = run,
                                       .reshape = reshape,
                                       .postrun = postrun,
                                       .init_node = init_node,
                                       .release_node = release_node,
                                       .score = score
};
/* Module-init hook: register this implementation for the Convolution op. */
static int reg_conv_hcl_ops(void* arg)
{
    return register_builtin_node_ops(OP_CONV, &hcl_node_ops);
}

/* Module-exit hook: remove the registration installed above. */
static int unreg_conv_hcl_ops(void* arg)
{
    unregister_builtin_node_ops(OP_CONV, &hcl_node_ops);
    return 0;
}

/* Hook registration/unregistration into Tengine's module auto-init machinery. */
AUTO_REGISTER_OPS(reg_conv_hcl_ops);
AUTO_UNREGISTER_OPS(unreg_conv_hcl_ops);
此差异已折叠。
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, Martin Han
* Author: hansh-sz@hotmail.com
*/
#ifndef _CONV_KERNEL_RV64_H_
#define _CONV_KERNEL_RV64_H_
#include "tengine_ir.h"
#include "convolution_param.h"
/* float32 */
/*
 * RV64 im2col+GEMM convolution kernel interface.
 * All entry points are declared weak so the node-ops layer can test for their
 * presence at runtime (see the `conv_hcl_set_shared_mem && ...` guards in the
 * caller) and degrade gracefully when a kernel is not linked in.
 * Every function returns 0 on success and a negative value on failure, except
 * the *_get_*_size queries, which return the required byte count.
 */
/* one-time setup: interleave weights and size internal buffers */
int conv_hcl_prerun(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* output_tensor,
                    struct conv_priv_info* info, struct conv_param* param) __attribute__((weak));
/* release buffers allocated by conv_hcl_prerun */
int conv_hcl_postrun(struct conv_priv_info* info) __attribute__((weak));
/* per-inference execution (im2col + sgemm, optional bias and activation) */
int conv_hcl_run(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* bias_tensor,
                 struct ir_tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param,
                 int num_thread, int cpu_affinity) __attribute__((weak));
/* scratch-buffer size queries used by init_node */
int conv_hcl_get_shared_mem_size(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor,
                                 struct conv_param* param) __attribute__((weak));
int conv_hcl_get_shared_pack4_mem_size(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor,
                                       struct conv_param* param) __attribute__((weak));
/* donate graph-level scratch buffers to this node (avoids per-node malloc) */
int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// im2col for kernel 1x1 s1p0d1
//
// input:
// x0 arg0 input address
// x1 arg1 input_xy
// x2 arg2 col address
// x3 arg3 col_cnt must be multiply of 4
// x4 arg4 input channel
//
// register definition
// x0 input address
// x1 input_xy x 4
// x2 col address
// x3 col_cnt
// x4 input channel
// x6 input start pointer t6
// x7 input pointer
// x9 channel cnt
// x11
// x12 = input_xy size * 2 // x12 -> t5
// im2col for a 1x1 kernel, stride 1, pad 0, dilation 1 (fp32, RVV 0.7-era
// T-Head vector ISA: vlw.v/vsw.v). Arguments per the header block above:
//   a0 = input base, a1 = input_xy (pixels per channel plane),
//   a2 = col output, a3 = col_cnt (multiple of 4), a4 = channel count.
.section .text,"ax"
.align 5
.type im2col_fp32_1x1 STT_FUNC
.global im2col_fp32_1x1
.hidden im2col_fp32_1x1
im2col_fp32_1x1:
    // t0-t6 are spilled although they are caller-saved temporaries — harmless,
    // presumably kept for symmetry with the other kernels.
    addi sp, sp, -56
    sd t0, 0(sp)
    sd t1, 8(sp)
    sd t2, 16(sp)
    sd t3, 24(sp)
    sd t4, 32(sp)
    sd t5, 40(sp)
    sd t6, 48(sp)
    // NOTE(review): the AVL operand is a0 (a pointer); the code assumes VL
    // settles at 4 x e32 — confirm against the target's vsetvli semantics.
    vsetvli t0, a0, e32
    li t0, 4
    blt a3, t0, col_end          // fewer than 4 columns: nothing to emit
    srli a3, a3, 2               // a3 = number of 4-column groups
    slli a1, a1, 2               // a1 = channel plane stride in bytes
    mv t6, a0                    // t6 = start of current 4-column group
    slli t5, a1, 1               // t5 = two channel planes in bytes
    // NOTE(review): this computes channel+1, so t4 is never zero and the
    // channel_last tail below always executes; the aarch64 counterpart uses
    // "and x10, x4, 1" (odd-channel remainder) here — verify "add" vs "andi".
    add t4, a4, 1 // x10 -> t4
// col loop
col_loop:
    mv t3, t6                    // t3 = read cursor, channel 0 of this group
    srli t2, a4, 1               // t2 = channel pairs to copy
    beqz t2, channel_last
    add t1, t3, a1               // t1 = second plane of the pair
// kernel size loop
channel_loop2:
    // copy 4 floats from two consecutive channel planes per iteration
    vlw.v v0,(t3)
    vlw.v v1,(t1)
    addi t2, t2, -1
    add t3, t3, t5
    add t1, t1, t5
    vsw.v v0, (a2)
    addi a2, a2, 16
    vsw.v v1, (a2)
    addi a2, a2, 16
    bnez t2, channel_loop2
channel_last:
    // tail: one remaining (odd) channel plane — see NOTE(review) above on t4
    beqz t4, channel_loop_end
    vlw.v v0,(t3)
    vsw.v v0, (a2)
    addi a2, a2, 16
channel_loop_end:
    addi t6, t6, 16              // advance to the next 4-column group
    addi a3, a3, -1
    bnez a3, col_loop
col_end:
    ld t0, 0(sp)
    ld t1, 8(sp)
    ld t2, 16(sp)
    ld t3, 24(sp)
    ld t4, 32(sp)
    ld t5, 40(sp)
    ld t6, 48(sp)
    addi sp, sp, 56
    ret
    .end
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// im2col fp16 for kernel 3x3 include 2 function stride 1 and stride 2
// ABCDABCD
//
// input:
// x0 arg0 input address
// x1 arg1 input_x
// x2 arg2 input_y
// x3 arg3 input channel cnt
// x4 arg4 col address
// x5 arg5 stride_x
//
// register definition
// x0 cl0 address q0 q1 d16 d17 d18
// x1 input_x x 4
// x2 input_xy x 4
// x3 input channel
// x4 col address
// x5 stride_x
// x11 cl1 address q2 q3 d19 d20 d21
// x12 cl2 address q4 q5 d22 d23 d24
// im2col for a 3x3 kernel, fp32, strides 1 and 2 (RVV 0.7-era T-Head ISA).
// Arguments per the header block above: a0 = input, a1 = input_x,
// a2 = input_y, a3 = channel count, a4 = col output, a5 = stride_x.
.section .text,"ax"
.align 5
.type im2col_fp32_3x3 STT_FUNC
.global im2col_fp32_3x3
.hidden im2col_fp32_3x3
.balign 16
// vmerge mask: selects the high lane (element 3) from the second operand —
// used to splice the cross-vector element in the stride-2 path.
mask_32b:
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
im2col_fp32_3x3:
    addi sp, sp, -56             // spill t0-t6
    sd t0, 0(sp)
    sd t1, 8(sp)
    sd t2, 16(sp)
    sd t3, 24(sp)
    sd t4, 32(sp)
    sd t5, 40(sp)
    sd t6, 48(sp)
    // NOTE(review): AVL from a0 (a pointer), as in the 1x1 kernel — confirm.
    vsetvli t0, a0, e32
// initial
    beqz a3, finish              // zero channels: nothing to do
    li t0, 2
    slli a1, a1, 2               // a1 = row stride in bytes
    mul a2, a2, a1               // a2 = channel plane stride in bytes
    add t5, a0, a1               // t5 = row 1 pointer
    slli t1, a1, 1
    add t6, a0, t1               // t6 = row 2 pointer
    li t2, 8                     // byte stride for the strided loads (2 floats)
    beq a5, t0, stride2_channel_loop
// ---- stride 1: emit 9 vectors per channel (3 rows x shifts 0/4/8 bytes) ----
stride1_channel_loop:
    vlw.v v0, (a0)
    addi t0, a0, 16
    vlw.v v1, (t0)               // NOTE(review): v1/v3/v5 loaded but not stored — confirm
    vlw.v v2, (t5)
    addi t0, t5, 16
    vlw.v v3, (t0)
    vlw.v v4, (t6)
    addi t0, t6, 16
    vlw.v v5, (t0)
    addi a3, a3, -1
    addi t0, a0, 4
    vlw.v v16, (t0)              // row 0 shifted by 1 element
    addi t0, a0, 8
    vlw.v v17, (t0)              // row 0 shifted by 2 elements
    add a0, a0, a2               // advance row 0 to the next channel plane
    addi t0, t5, 4
    vlw.v v19, (t0)
    addi t0, t5, 8
    vlw.v v20, (t0)
    add t5, t5, a2
    addi t0, t6, 4
    vlw.v v22, (t0)
    addi t0, t6, 8
    vlw.v v23, (t0)
    add t6, t6, a2
    // store the 9 column vectors for this channel
    vsw.v v0, (a4)
    addi a4, a4, 16
    vsw.v v16, (a4)
    addi a4, a4, 16
    vsw.v v17, (a4)
    addi a4, a4, 16
    vsw.v v2, (a4)
    addi a4, a4, 16
    vsw.v v19, (a4)
    addi a4, a4, 16
    vsw.v v20, (a4)
    addi a4, a4, 16
    vsw.v v4, (a4)
    addi a4, a4, 16
    vsw.v v22, (a4)
    addi a4, a4, 16
    vsw.v v23, (a4)
    addi a4, a4, 16
    bnez a3, stride1_channel_loop
    j finish
// ---- stride 2: gather even/odd lanes with strided loads, splice the
// ---- out-of-range element via slide + masked merge ----
stride2_channel_loop:
    la t0, mask_32b
    vlw.v v0, (t0)               // v0 = merge mask (high lane only)
    addi t0, a0, 0
    vlsw.v v16, (t0), t2         // even elements of row 0
    addi t0, a0, 0x4
    vlsw.v v17, (t0), t2         // odd elements of row 0
    addi t0, a0, 32
    vlw.v v18, (t0)
    // build {x2,x4,x6,x8}: shift evens down one lane, pull x8 from v18
    vslidedown.vi v1, v16, 1
    vslideup.vi v2, v18, 3
    vmerge.vvm v18, v1, v2, v0
    addi t0, t5, 0
    vlsw.v v19, (t0), t2         // row 1, same construction
    addi t0, t5, 0x4
    vlsw.v v20, (t0), t2
    addi t0, t5, 0x20
    vlw.v v21, (t0)
    vslidedown.vi v1, v19, 1
    vslideup.vi v2, v21, 3
    vmerge.vvm v21, v1, v2, v0
    addi t0, t6, 0
    vlsw.v v22, (t0), t2         // row 2, same construction
    addi t0, t6, 0x4
    vlsw.v v23, (t0), t2
    addi t0, t6, 0x20
    vlw.v v24, (t0)
    vslidedown.vi v1, v22, 1
    vslideup.vi v2, v24, 3
    vmerge.vvm v24, v1, v2, v0
    addi a3, a3, -1
    // store the 9 column vectors for this channel
    vsw.v v16, (a4)
    addi a4, a4, 0x10
    vsw.v v17, (a4)
    addi a4, a4, 0x10
    vsw.v v18, (a4)
    addi a4, a4, 0x10
    vsw.v v19, (a4)
    addi a4, a4, 0x10
    vsw.v v20, (a4)
    addi a4, a4, 0x10
    vsw.v v21, (a4)
    addi a4, a4, 0x10
    vsw.v v22, (a4)
    addi a4, a4, 0x10
    vsw.v v23, (a4)
    addi a4, a4, 0x10
    vsw.v v24, (a4)
    addi a4, a4, 0x10
    add a0, a0, a2               // advance all three row pointers one plane
    add t5, t5, a2
    add t6, t6, a2
    bnez a3, stride2_channel_loop
finish:
    ld t0, 0(sp)
    ld t1, 8(sp)
    ld t2, 16(sp)
    ld t3, 24(sp)
    ld t4, 32(sp)
    ld t5, 40(sp)
    ld t6, 48(sp)
    addi sp, sp, 56
    ret
    .end
此差异已折叠。
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// 4*4 single precise floating point matric multiplication
//
// -- -- -- -- -- -- -- --
// | i0 - - - - - - | | k0 k1 k2 k3 | | b0 b1 b2 b3 | | i0k0 i0k1 i0k2 i0k3 |
// | | | . . . . | | | | |
// | i1 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i1k0 i1k1 i1k2 i1k3 |
// | | x | . . . . | + | | = | |
// | i2 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i2k0 i2k1 i2k2 i2k3 |
// | | | . . . . | | | | |
// | i3 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i3k0 i3k1 i3k2 i3k3 |
// -- -- -- -- -- -- -- --
// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size
//
//
// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product)
//
// input:
// x0 arg0 biases address {b0,b1,b2,b3} nullptr means no biases
// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...}
// x2 arg2 kernel address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...}
// x3 arg3 kernel size
// x4 arg4 output address
// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]}
// direct save: output : {i0k0 i1k0 i2k0 i3k0}
// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1}
// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2}
// output + ouput_xy * 3 : {i0k3 i1k3 i2k3 i3k3}
// x5 arg5 output xy
// x6 arg6 activation flag relu layers is integrated after convolution
//
// output: no
//
// register definition
// x0 biases start address
// x1 input start address
// x2 kernel start address
// x3 kernal size
// x4 output start address
// x5 output_x * output_y
// x6 fused relu flag
// x9 ~ x10 temp loop counter
// x11~ x13 temp output save address
// x7~8 14~15 not used
//
// v0-3 4S data of input0 {i3 i2 i1 i0}
// v4-7 4S kernal data {k3 k2 k1 k0}
// v8~v15 not used
// v16 dot product for {i3k0, i2k0, i1k0, i0k0}
// v17 dot product for {i3k1, i2k1, i1k1, i0k1}
// v18 dot product for {i3k2, i2k2, i1k2, i0k2}
// v19 dot product for {i3k3, i2k3, i1k3, i0k3}
// v20~V31 not used
// 4x4 fp32 GEMM micro-kernel (RVV 0.7-era T-Head ISA); see the header block
// above for the full argument description. a0=bias, a1=input, a2=kernel,
// a3=kernel size, a4=output, a5=output_xy, a6=activation flag, a7=layout flag.
.section .text,"ax"
.align 5
.type sgemm_4x4_rv64 STT_FUNC
.global sgemm_4x4_rv64
.hidden sgemm_4x4_rv64
sgemm_4x4_rv64:
    slli a5, a5, 0x2             // a5 = output_xy in bytes
# // initial biases
    beqz a0, non_biases
    // NOTE(review): AVL from a0 (a pointer) — same pattern as the im2col kernels.
    vsetvli t0, a0, e32
    vlw.v v16, (a0)              // broadcast the bias row into all 4 accumulators
    vmv.v.v v17, v16
    vmv.v.v v18, v16
    vmv.v.v v19, v16
    j convoluation_start
non_biases:
    // no bias: zero the four accumulator vectors
    vmv.v.x v16, x0
    vmv.v.x v17, x0
    vmv.v.x v18, x0
    vmv.v.x v19, x0
convoluation_start:
    add t4, a4, a5               // t4 = output row 1 pointer
    andi t3, a3, 0x3             // t3 = kernel-size remainder (tail count)
    li t0, 4
    blt a3, t0, loop4_end
    srli t2, a3, 0x2             // t2 = number of unrolled 4-step iterations
// main loop: each loop generate dot prodcut for 4x4SFP
loop4:
    addi t2, t2, -1
    // load 4 input vectors and 4 kernel vectors (4 k-steps)
    vlw.v v0, (a1)
    addi a1, a1, 16
    vlw.v v1, (a1)
    addi a1, a1, 16
    vlw.v v2, (a1)
    addi a1, a1, 16
    vlw.v v3, (a1)
    addi a1, a1, 16
    vlw.v v4, (a2)
    addi a2, a2, 16
    vlw.v v5, (a2)
    addi a2, a2, 16
    vlw.v v6, (a2)
    addi a2, a2, 16
    vlw.v v7, (a2)
    addi a2, a2, 16
    // k-step 0: broadcast each kernel lane (vrgather) and accumulate
    vrgather.vi v20, v4, 0
    vrgather.vi v21, v4, 1
    vrgather.vi v22, v4, 2
    vrgather.vi v23, v4, 3
    vfmacc.vv v16, v20, v0
    vfmacc.vv v17, v21, v0
    vfmacc.vv v18, v22, v0
    vfmacc.vv v19, v23, v0
    // k-step 1
    vrgather.vi v20, v5, 0
    vrgather.vi v21, v5, 1
    vrgather.vi v22, v5, 2
    vrgather.vi v23, v5, 3
    vfmacc.vv v16, v20, v1
    vfmacc.vv v17, v21, v1
    vfmacc.vv v18, v22, v1
    vfmacc.vv v19, v23, v1
    // k-step 2
    vrgather.vi v20, v6, 0
    vrgather.vi v21, v6, 1
    vrgather.vi v22, v6, 2
    vrgather.vi v23, v6, 3
    vfmacc.vv v16, v20, v2
    vfmacc.vv v17, v21, v2
    vfmacc.vv v18, v22, v2
    vfmacc.vv v19, v23, v2
    // k-step 3
    vrgather.vi v20, v7, 0
    vrgather.vi v21, v7, 1
    vrgather.vi v22, v7, 2
    vrgather.vi v23, v7, 3
    vfmacc.vv v16, v20, v3
    vfmacc.vv v17, v21, v3
    vfmacc.vv v18, v22, v3
    vfmacc.vv v19, v23, v3
    bnez t2, loop4
loop4_end:
    slli t0, a5, 1
    add t5, a4, t0               // t5 = output row 2 pointer
    beqz t3, activation
// tail loop: one k-step per iteration for kernel_size % 4 leftovers
loop1:
    addi t3, t3, -1
    vlw.v v0, (a1)
    addi a1, a1, 16
    vlw.v v4, (a2)
    addi a2, a2, 16
    vrgather.vi v20, v4, 0
    vrgather.vi v21, v4, 1
    vrgather.vi v22, v4, 2
    vrgather.vi v23, v4, 3
    vfmacc.vv v16, v20, v0
    vfmacc.vv v17, v21, v0
    vfmacc.vv v18, v22, v0
    vfmacc.vv v19, v23, v0
    bnez t3, loop1
activation:
    slli t0, a5, 1
    add t6, t4, t0               // t6 = output row 3 pointer
    bltz a6, save_result         // negative flag: no activation
    vmv.v.i v0, 0
    // NOTE(review): moves the integer bits of a6 into a float vector used by
    // vfmin below (relu-N upper bound) — confirm an int->float conversion is
    // not required here.
    vmv.v.x v1, a6
    // relu: clamp below at 0
    vfmax.vv v16, v16, v0
    vfmax.vv v17, v17, v0
    vfmax.vv v18, v18, v0
    vfmax.vv v19, v19, v0
    beqz a6, save_result         // flag 0: plain relu, no upper clamp
    // relu-N: clamp above at the flag value
    vfmin.vv v16, v16, v1
    vfmin.vv v17, v17, v1
    vfmin.vv v18, v18, v1
    vfmin.vv v19, v19, v1
save_result:
# // store result
    beqz a7, save_result_nchw
    // scattered store: lane-by-lane extraction (vext.x.v), 4x4 transpose layout
    li t1, 0
    vext.x.v t0, v16, t1
    sw t0, 0(a4)
    vext.x.v t0, v17, t1
    sw t0, 4(a4)
    vext.x.v t0, v18, t1
    sw t0, 8(a4)
    vext.x.v t0, v19, t1
    sw t0, 12(a4)
    li t1, 1
    vext.x.v t0, v16, t1
    sw t0, 0(t4)
    vext.x.v t0, v17, t1
    sw t0, 4(t4)
    vext.x.v t0, v18, t1
    sw t0, 8(t4)
    vext.x.v t0, v19, t1
    sw t0, 12(t4)
    li t1, 2
    vext.x.v t0, v16, t1
    sw t0, 0(t5)
    vext.x.v t0, v17, t1
    sw t0, 4(t5)
    vext.x.v t0, v18, t1
    sw t0, 8(t5)
    vext.x.v t0, v19, t1
    sw t0, 12(t5)
    li t1, 3
    vext.x.v t0, v16, t1
    sw t0, 0(t6)
    vext.x.v t0, v17, t1
    sw t0, 4(t6)
    vext.x.v t0, v18, t1
    sw t0, 8(t6)
    vext.x.v t0, v19, t1
    sw t0, 12(t6)
    j end
save_result_nchw:
    // direct store: one accumulator vector per output row
    vsw.v v16, (a4)
    vsw.v v17, (t4)
    vsw.v v18, (t5)
    vsw.v v19, (t6)
end:
    ret
    .end
......@@ -9,6 +9,7 @@ endmacro()
# operator level test
tengine_test(test_op_prelu op/test_op_prelu.c)
tengine_test(test_op_conv op/test_op_conv.c)
if (TENGINE_ENABLE_TIM_VX)
tengine_test(test_op_prelu_timvx op/test_op_prelu_timvx.c)
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2019, Open AI Lab
* Author: haitao@openailab.com
*/
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include "tengine_c_api.h"
#include "tengine_c_api_ex.h"
/* Simple append-only registry of heap buffers, released by free_allocated_buf(). */
int allocated_num = 0;
void** record_ptr = NULL;

/* Track `buf` so it is freed at program exit. */
void record_allocated_buf(void* buf)
{
    /* FIX: the original assigned realloc() straight into record_ptr, so an
     * OOM return of NULL both leaked the old table and crashed on the
     * subsequent record_ptr[...] store. Grow into a temporary first. */
    void** grown = realloc(record_ptr, sizeof(void*) * (allocated_num + 1));
    if (grown == NULL)
    {
        fprintf(stderr, "record_allocated_buf: out of memory\n");
        return;
    }
    record_ptr = grown;
    record_ptr[allocated_num] = buf;
    allocated_num++;
}
/* Release every buffer tracked by record_allocated_buf(), then the table itself. */
void free_allocated_buf(void)
{
    for(int i = 0; i < allocated_num; i++)
        free(record_ptr[i]);
    if(record_ptr)
        free(record_ptr);
    /* FIX: reset the registry so a second call is a harmless no-op instead of
     * a double free of stale pointers. */
    record_ptr = NULL;
    allocated_num = 0;
}
/*
 * Fill `buf` with `elem_num` test values of width `elem_size` bytes
 * (4 = float, 2 = int16, 1 = char; other widths are ignored).
 * A non-negative `val` fills every element with that constant; a negative
 * `val` produces the repeating ramp 0..9 keyed on the element index.
 */
void init_buffer(void* buf, int elem_num, int elem_size, int val)
{
    for(int i = 0; i < elem_num; i++)
    {
        float fill = (val >= 0) ? (float)val : (float)(i % 10);

        if (elem_size == 4)
            (( float* )buf)[i] = fill;
        else if (elem_size == 2)
            (( int16_t* )buf)[i] = fill;
        else if (elem_size == 1)
            (( char* )buf)[i] = fill;
    }
}
/*
 * Add an input node named `node_name` to `graph` with an fp32 output tensor
 * of shape {1, c, h, w}. Returns 0 (the graph API calls are not checked,
 * matching the rest of this test program).
 */
int create_input_node(graph_t graph, const char* node_name, int c, int h, int w)
{
    int shape[4] = {1, c, h, w};

    node_t input = create_graph_node(graph, node_name, "InputOp");
    tensor_t out = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);

    set_node_output_tensor(input, 0, out, TENSOR_TYPE_INPUT);
    set_tensor_shape(out, shape, 4);

    release_graph_tensor(out);
    release_graph_node(input);
    return 0;
}
/*
 * Add a Convolution node `node_name` to `graph`, wired to `input_name`, with
 * const weight/bias inputs shaped for (k_size, stride, pad, in_c, out_c, group).
 * Returns 0 on success, -1 on failure.
 */
int create_conv_node(graph_t graph, const char* node_name, const char* input_name, int k_size, int stride, int pad,
                     int in_c, int out_c, int group)
{
    /* weight */
    char* weight_name = malloc(strlen(node_name) + 16);
    if (weight_name == NULL) /* FIX: malloc was unchecked */
        return -1;
    sprintf(weight_name, "%s/weight", node_name);
    node_t w_node = create_graph_node(graph, weight_name, "Const");
    tensor_t w_tensor = create_graph_tensor(graph, weight_name, TENGINE_DT_FP32);
    set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
    int w_dims[] = {out_c, in_c / group, k_size, k_size};
    set_tensor_shape(w_tensor, w_dims, 4);
    /* bias */
    char* bias_name = malloc(strlen(node_name) + 16);
    if (bias_name == NULL) /* FIX: malloc was unchecked */
    {
        free(weight_name);
        return -1;
    }
    sprintf(bias_name, "%s/bias", node_name);
    node_t b_node = create_graph_node(graph, bias_name, "Const");
    tensor_t b_tensor = create_graph_tensor(graph, bias_name, TENGINE_DT_FP32);
    set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
    int b_dims[] = {out_c};
    set_tensor_shape(b_tensor, b_dims, 1);
    /* conv */
    node_t conv_node = create_graph_node(graph, node_name, "Convolution");
    tensor_t input_tensor = get_graph_tensor(graph, input_name);
    if(input_tensor == NULL)
    {
        fprintf(stderr, "errno= %d\n", get_tengine_errno());
        /* FIX: the name buffers leaked on this error path */
        free(bias_name);
        free(weight_name);
        return -1;
    }
    set_node_input_tensor(conv_node, 2, b_tensor);
    set_node_input_tensor(conv_node, 1, w_tensor);
    set_node_input_tensor(conv_node, 0, input_tensor);
    tensor_t output_tensor = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
    set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
    release_graph_tensor(input_tensor);
    release_graph_tensor(output_tensor);
    release_graph_node(w_node);
    release_graph_tensor(w_tensor);
    release_graph_node(b_node);
    release_graph_tensor(b_tensor);
    free(bias_name);
    free(weight_name);
    /* attr */
    set_node_attr_int(conv_node, "kernel_h", &k_size);
    set_node_attr_int(conv_node, "kernel_w", &k_size);
    set_node_attr_int(conv_node, "stride_h", &stride);
    set_node_attr_int(conv_node, "stride_w", &stride);
    set_node_attr_int(conv_node, "pad_h0", &pad);
    set_node_attr_int(conv_node, "pad_h1", &pad);
    set_node_attr_int(conv_node, "pad_w0", &pad);
    set_node_attr_int(conv_node, "pad_w1", &pad);
    set_node_attr_int(conv_node, "output_channel", &out_c);
    set_node_attr_int(conv_node, "input_channel", &in_c);
    set_node_attr_int(conv_node, "group", &group);
    release_graph_node(conv_node);
    return 0;
}
/*
 * Add a Pooling node `node_name` to `graph`, consuming `input_name` and
 * producing an fp32 output tensor. Returns 0 on success, -1 if the input
 * tensor cannot be found.
 */
int create_pooling_node(graph_t graph, const char* node_name, const char* input_name)
{
    node_t pool = create_graph_node(graph, node_name, "Pooling");

    tensor_t in = get_graph_tensor(graph, input_name);
    if(in == NULL)
    {
        fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
        return -1;
    }
    set_node_input_tensor(pool, 0, in);
    release_graph_tensor(in);

    /* output */
    tensor_t out = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
    set_node_output_tensor(pool, 0, out, TENSOR_TYPE_VAR);
    release_graph_tensor(out);

    release_graph_node(pool);
    return 0;
}
/*
 * Build the test graph: data -> 1x1 conv (stride 1, pad 0, group 1).
 * The pooling tail is compiled out via #if 0 but kept for manual experiments.
 * Returns the graph handle, or NULL on any construction failure.
 */
graph_t create_test_graph(int c, int h, int w, int out_c)
{
    graph_t graph = create_graph(NULL, NULL, NULL);
    if(graph == NULL)
    {
        fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    const char* input_name = "data";
    const char* conv_name = "conv";
    if(create_input_node(graph, input_name, c, h, w) < 0)
    {
        fprintf(stderr, "create input failed\n");
        return NULL;
    }
    // int out_c = 4;
    //                                              k  s  p  in_c out_c group
    if(create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0)
    {
        fprintf(stderr, "create conv node failed\n");
        return NULL;
    }
#if 0
    const char* pool_name = "pooling";
    if(create_pooling_node(graph, pool_name, conv_name) < 0)
    {
        fprintf(stderr, "create pooling node failed\n");
        return NULL;
    }
    /* set input/output node */
    const char* inputs[] = {input_name};
    const char* outputs[] = {pool_name};
#else
    /* graph boundary: single input "data", single output at the conv node */
    const char* inputs[] = {input_name};
    const char* outputs[] = {conv_name};
#endif
    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
    {
        fprintf(stderr, "set inputs failed: ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
    {
        fprintf(stderr, "set outputs failed: ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    return graph;
}
/*
 * Allocate and attach data buffers to a Convolution node's weight (input 1,
 * filled with the 0..9 ramp) and optional bias (input 2, filled with the
 * constant 3). Buffers are registered with record_allocated_buf() so they
 * outlive the graph and are freed at exit.
 */
void fill_conv_node(node_t node)
{
    tensor_t filter = get_node_input_tensor(node, 1);
    int dims[4];
    get_tensor_shape(filter, dims, 4);
    int elem_num = dims[0] * dims[1] * dims[2] * dims[3];
    int elem_size = 4; /* fp32 */
    void* filter_buf = malloc(elem_num * elem_size);
    init_buffer(filter_buf, elem_num, elem_size, -1); /* ramp values */
    set_tensor_buffer(filter, filter_buf, elem_num * elem_size);
    record_allocated_buf(filter_buf);
    release_graph_tensor(filter);
    /* bias is optional: absent when the node has only two inputs */
    tensor_t bias = get_node_input_tensor(node, 2);
    if(bias == NULL)
        return;
    get_tensor_shape(bias, dims, 1);
    elem_num = dims[0];
    void* bias_buf = malloc(elem_num * elem_size);
    init_buffer(bias_buf, elem_num, elem_size, 3); /* constant 3 */
    set_tensor_buffer(bias, bias_buf, elem_num * elem_size);
    record_allocated_buf(bias_buf);
    release_graph_tensor(bias);
}
/* Walk every node of `graph` and populate weight/bias data for each
 * Convolution node via fill_conv_node(). */
void fill_graph_param(graph_t graph)
{
    const int total = get_graph_node_num(graph);

    for(int idx = 0; idx < total; idx++)
    {
        node_t cur = get_graph_node_by_idx(graph, idx);
        const char* op_name = get_node_op(cur);

        if(strcmp(op_name, "Convolution") == 0)
            fill_conv_node(cur);

        release_graph_node(cur);
    }
}
/*
 * End-to-end smoke test: build an 8->16 channel 1x1 conv graph on a 14x14
 * input, fill weights/bias/input with deterministic patterns, run one
 * inference, and print the output tensor. Returns 0 on success, 1 when the
 * graph cannot be built.
 */
int main(int argc, char* argv[])
{
    int c, h, w, out_c;
    c = 8;
    h = 14;
    w = 14;
    out_c = 16;
    init_tengine();
    graph_t graph = create_test_graph(c, h, w, out_c);
    if(graph == NULL)
        return 1;
    fill_graph_param(graph);
    /* fill input */
    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    int dims[4];
    int dim_num = get_tensor_shape(input_tensor, dims, 4);
    int elem_num = 1;
    int elem_size = 4; /* fp32 */
    for(int i = 0; i < dim_num; i++)
        elem_num *= dims[i];
    void* input_buf = malloc(elem_num * elem_size);
    init_buffer(input_buf, elem_num, elem_size, -1); /* 0..9 ramp */
    record_allocated_buf(input_buf);
    set_tensor_buffer(input_tensor, input_buf, elem_num * elem_size);
    release_graph_tensor(input_tensor);
    prerun_graph(graph);
    dump_graph(graph);
    run_graph(graph, 1);
    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
    dim_num = get_tensor_shape(output_tensor, dims, 4);
    elem_num = 1;
    printf("output shape: [");
    for(int i = 0; i < dim_num; i++)
    {
        elem_num *= dims[i];
        printf(" %d", dims[i]);
    }
    printf(" ]\n");
    /* NOTE(review): return values of prerun/run and get_tensor_buffer are not
     * checked, and dims[3] below assumes a 4-D output — acceptable for a
     * smoke test, but confirm if this is promoted to CI. */
    float* output = get_tensor_buffer(output_tensor);
    for(int i = 0; i < elem_num; i++)
    {
        int w = dims[3]; /* row width for line-wrapped printing */
        if((i % w) == 0)
            printf("\n%d:\t", i);
        printf(" %f", output[i]);
    }
    printf("\n");
    release_graph_tensor(output_tensor);
    postrun_graph(graph);
    destroy_graph(graph);
    release_tengine();
    free_allocated_buf();
    return 0;
}
# Cross toolchain for RISC-V 64 (T-Head C910, RVV 0.7-era vector extension).
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR rv64)

set(CMAKE_ASM_COMPILER "riscv64-unknown-linux-gnu-gcc")
set(CMAKE_C_COMPILER   "riscv64-unknown-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "riscv64-unknown-linux-gnu-g++")

# Search only the target sysroot for libraries/headers; host programs stay usable.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# C910 code-generation flags, cached so they persist across reconfigures.
# FIX: the flags were set twice (a plain set() followed by a cache re-set of
# the same value); merged into a single cached assignment that also respects
# a user-provided cache value instead of shadowing it.
# NOTE(review): "-lc" is a linker input rather than a compile flag — confirm
# the toolchain actually needs it here before removing.
set(CMAKE_C_FLAGS   "-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16 -lc" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16 -lc" CACHE STRING "c++ flags")
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册