未验证 提交 129c5eeb 编写于 作者: D Daniel 提交者: GitHub

Dev/riscv (#605)

* update CMakeLists to install tengine cpp api header

* init rv64 im2col gemm

* update rv64 gemm assembly

* update rv64 im2col gemm

* update rv64 gemm

* Update CMakeLists.txt

* fix rv64 im2col gemm run mobilenet success
Co-authored-by: Ndongdong <ddzhao@openailab.com>
上级 2bc19316
......@@ -57,6 +57,10 @@ ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips.*|MIPS.*)")
SET (TENGINE_TARGET_PROCESSOR "MIPS" CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_32Bit TRUE CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_64Bit FALSE CACHE INTERNAL "" FORCE)
ELSEIF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(rv64.*|RV64.*)")
SET (TENGINE_TARGET_PROCESSOR "lp64dv" CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_32Bit FALSE CACHE INTERNAL "" FORCE)
SET (TENGINE_TARGET_PROCESSOR_64Bit TRUE CACHE INTERNAL "" FORCE)
ELSE()
IF (NOT TENGINE_SUPPRESS_TARGET_PROCESSOR_CHECK)
MESSAGE (WARNING "TENGINE: Unrecognized target processor configuration.")
......
......@@ -72,6 +72,11 @@ if (${TENGINE_TARGET_PROCESSOR} MATCHES "X86")
endif()
endif()
# RV64
# Collect the hand-written RV64 vector assembly kernels (*.S) for the HCL backend.
# "lp64dv" is the ABI string this build uses as the RV64 processor tag (set above).
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    file(GLOB_RECURSE TENGINE_BACKEND_HCL_ASM_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/rv64/*.S")
endif()
# add operator files
if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmake")
# macro for adding include op dir
......@@ -104,6 +109,12 @@ if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmak
set (MIPS_OP_PATH "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/${name}/*mips.c")
list (APPEND HCL_SOURCE ${MIPS_OP_PATH})
endif()
# RV64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
set (RV64_OP_PATH "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/${name}/*rv64.c")
list (APPEND HCL_SOURCE ${RV64_OP_PATH})
endif()
endmacro()
include(${CMAKE_SOURCE_DIR}/cmake/operators.cmake)
......@@ -111,8 +122,8 @@ if (TENGINE_DYNAMIC_COMPILE AND EXISTS "${CMAKE_SOURCE_DIR}/cmake/operators.cmak
file(GLOB_RECURSE TENGINE_BACKEND_REF_OPS ${REF_SOURCE})
# add hcl operator files
# arm or x86 or mips64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "X86" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
# arm or x86 or mips64 or rv64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "X86" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS" OR ${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS ${HCL_SOURCE})
endif()
else()
......@@ -139,6 +150,11 @@ else()
if (${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*mips.c")
endif()
# RV64
if (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
file(GLOB_RECURSE TENGINE_BACKEND_HCL_OPS "${CMAKE_CURRENT_SOURCE_DIR}/dev/cpu/op/*rv64.c")
endif()
endif()
# add cmsis operator files
......@@ -390,7 +406,26 @@ if (TENGINE_STANDALONE_HCL)
${TENGINE_STANDALONE_HCL_LIB_NAME} SHARED
${TENGINE_BACKEND_HCL_OPS}
)
# RV64 (T-Head C910): build the standalone HCL library from the C ops plus the
# vector assembly kernels collected into TENGINE_BACKEND_HCL_ASM_OPS.
elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    message (STATUS "TENGINE RV64 TENGINE_BACKEND_HCL_ASM_OPS.----------------------------------------------")
    # FIX: the options were written as "-rv64imafdcvxtheadc", missing the
    # "-march=" prefix, which GCC rejects; align with the -march value used by
    # the toolchain file and the non-standalone branch.
    list(APPEND TENGINE_COMPILE_OPTIONS_C_PRIVATE   -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    list(APPEND TENGINE_COMPILE_OPTIONS_CXX_PRIVATE -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    # NOTE(review): these options also get appended to the CUDA list as in the
    # original; they are RISC-V-only flags, so confirm the CUDA list is unused here.
    list(APPEND TENGINE_COMPILE_OPTIONS_CUDA_PRIVATE -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    add_library (
        ${TENGINE_STANDALONE_HCL_LIB_NAME} SHARED
        ${TENGINE_BACKEND_HCL_OPS}
        ${TENGINE_BACKEND_HCL_ASM_OPS}
    )
endif()
if (TENGINE_BACKEND_HCL_OPS)
unset(TENGINE_BACKEND_HCL_OPS)
......@@ -458,6 +493,18 @@ elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "MIPS")
${TENGINE_BACKEND_COMMON}
${TENGINE_BACKEND_REF_OPS}
${TENGINE_BACKEND_HCL_OPS})
# RV64 (T-Head C910): main shared library, including the assembly kernels.
elseif (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
    # FIX: add_definitions() is intended for -D preprocessor macros; compiler
    # code-generation flags belong in add_compile_options() so they are tracked
    # as compile options rather than definitions.
    add_compile_options(-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910)
    add_library(${CMAKE_PROJECT_NAME} SHARED
        ${TENGINE_LIB_SRCS} ${TENGINE_FRONT_END_SRCS}
        ${TENGINE_SERIALIZER_SRCS}
        ${TENGINE_TINY_SERIALIZER_SRCS}
        ${TENGINE_BACKEND_COMMON}
        ${TENGINE_BACKEND_REF_OPS}
        ${TENGINE_BACKEND_HCL_OPS}
        ${TENGINE_BACKEND_HCL_ASM_OPS})
else()
add_library(${CMAKE_PROJECT_NAME} SHARED
${TENGINE_LIB_SRCS}
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "sys_port.h"
#include "module.h"
#include "tengine_errno.h"
#include "tengine_log.h"
#include "tengine_ir.h"
#include "../../cpu_node_ops.h"
#include "tengine_op.h"
#include "convolution_param.h"
#include "./rv64/conv_kernel_rv64.h"
/*
 * Prepare the fp32 convolution node for execution: bind the graph-level shared
 * scratch buffers into the node's private info and run the kernel's prerun
 * (weight interleave / im2col buffer setup) via conv_hcl_prerun.
 * Returns 0 on success, -1 (with tengine errno set) on failure.
 */
static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct ir_tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    struct ir_tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
    /* get cpu affinity */
    conv_priv_info->cpu_type = exec_graph->cpu_affinity;
    /* fp32 prerun */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        /* conv_hcl_set_shared_mem is declared weak in conv_kernel_rv64.h, so the
         * first operand guards against the kernel not providing it; the size
         * comparison only borrows the graph buffer when this node's need fits. */
        if (conv_hcl_set_shared_mem && exec_node->shared_mem_size < exec_graph->shared_mem_size)
        {
            if (conv_hcl_set_shared_mem(conv_priv_info, exec_graph->shared_mem, exec_graph->shared_mem_size) < 0)
            {
                TLOG_ERR("hcl conv: set shared memory failed\n");
                set_tengine_errno(EFAULT);
                return -1;
            }
        }
        /* same weak-symbol guard for the pack4 (interleave) scratch buffer */
        if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
        {
            if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem,
                                              exec_graph->shared_pack4_mem_size) < 0)
            {
                TLOG_ERR("hcl conv: set shared pack4 memory failed\n");
                set_tengine_errno(EFAULT);
                return -1;
            }
        }
        int group = conv_param->group;
        int kernel_h = conv_param->kernel_h;
        int kernel_w = conv_param->kernel_w;
        /* NOTE(review): grouped 7x7 convolutions skip the external interleaved
         * pack4 buffer — presumably a kernel limitation; confirm against the
         * im2col/gemm implementation. */
        if (group > 1 && kernel_h == 7 && kernel_w == 7)
            conv_priv_info->external_interleave_pack4_mem = 0;
        else
            conv_priv_info->external_interleave_pack4_mem = 1;
        /* do prerun */
        if (conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, conv_priv_info, conv_param) < 0)
        {
            TLOG_ERR("hcl conv prerun failed\n");
            set_tengine_errno(EFAULT);
            return -1;
        }
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }
    return 0;
}
/*
 * Execute the fp32 convolution for one inference pass.
 * Tensor handles are re-fetched every run so reshape / dynamic-shape updates
 * made between runs are observed. Returns 0 on success, -1 on failure.
 */
static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    /* FIX: removed leftover debug trace `fprintf(stderr, "conv hcl start\n")`
     * that printed on every invocation of the node. */
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor;
    struct ir_tensor* weight_tensor;
    struct ir_tensor* output_tensor;
    struct ir_tensor* bias_tensor = NULL;
    int num_thread = exec_graph->num_thread;
    int cpu_affinity = exec_graph->cpu_affinity;
    /* set the input data and shape again, in case of reshape or dynamic shape */
    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    /* bias is the optional third input */
    if (ir_node->input_num > 2)
        bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv;
    /* fp32 run */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread,
                         cpu_affinity) < 0)
        {
            TLOG_ERR("hcl conv run failed\n");
            set_tengine_errno(EFAULT);
            return -1;
        }
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }
    return 0;
}
/* Shape propagation hook: nothing to do for this backend — run() re-fetches
 * tensor handles on every call, so reshape is a no-op that always succeeds. */
static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    return 0;
}
/*
 * Tear down kernel-side resources allocated by prerun (via conv_hcl_postrun).
 * Only the fp32 mode is supported; any other mode is reported and rejected.
 */
static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct conv_priv_info* priv_info = ( struct conv_priv_info* )exec_node->ops_priv;

    /* guard clause: unsupported execution modes bail out early */
    if (exec_graph->mode != TENGINE_MODE_FP32)
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        return -1;
    }

    /* fp32 postrun */
    if (conv_hcl_postrun(priv_info) < 0)
    {
        TLOG_ERR("hcl conv postrun failed\n");
        set_tengine_errno(EFAULT);
        return -1;
    }

    return 0;
}
/*
 * Allocate and zero the node's private convolution state and record the
 * scratch-memory sizes this node will need (consumed later by prerun).
 * Returns 0 on success, -1 with tengine errno set on failure.
 */
static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    struct ir_node* ir_node = exec_node->ir_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor;
    struct ir_tensor* filter_tensor;
    struct ir_tensor* output_tensor;
    input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
    output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
    struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem;
    /* init the private info data of convolution op */
    struct conv_priv_info* conv_priv_info = ( struct conv_priv_info* )sys_malloc(sizeof(struct conv_priv_info));
    if (conv_priv_info == NULL)
    {
        set_tengine_errno(ENOMEM);
        return -1;
    }
    memset(conv_priv_info, 0, sizeof(struct conv_priv_info));
    exec_node->ops_priv = conv_priv_info;
    /* get shared memory size */
    if (exec_graph->mode == TENGINE_MODE_FP32)
    {
        exec_node->shared_mem_size = conv_hcl_get_shared_mem_size(input_tensor, output_tensor, conv_param);
        exec_node->shared_pack4_mem_size = conv_hcl_get_shared_pack4_mem_size(filter_tensor, output_tensor, conv_param);
    }
    else
    {
        printf("Tengine work node not support %d\n", exec_graph->mode);
        /* FIX: release the private info on failure — release_node may not run
         * for a node whose init failed, which leaked conv_priv_info and left a
         * dangling pointer in ops_priv. */
        sys_free(conv_priv_info);
        exec_node->ops_priv = NULL;
        return -1;
    }
    return 0;
}
/* Free the private convolution state allocated in init_node and clear the
 * pointer so stale state cannot be reused. Always succeeds. */
static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
    sys_free(exec_node->ops_priv);
    exec_node->ops_priv = NULL;
    return 0;
}
/*
 * Selection score for this backend: prefer the HCL implementation only for
 * ungrouped fp32 convolutions; return 0 (not supported) otherwise.
 */
static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct ir_node* exec_node)
{
    struct ir_node* ir_node = exec_node;
    struct ir_graph* ir_graph = ir_node->graph;
    struct ir_tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
    struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem;
    /* FIX: dropped unused locals (kernel_h, kernel_w, in_c, out_c) and the
     * output-tensor fetch that existed only to compute them. */
    int group = param->group;
    /* only fp32 tensors are handled by this kernel */
    if (input_tensor->data_type != TENGINE_DT_FP32)
        return 0;
    /* grouped convolution is not implemented on RV64 yet */
    if (group != 1)
        return 0;
    return OPS_SCORE_PREFER;
}
/* Dispatch table binding the RV64 convolution callbacks to the CPU executor. */
static struct node_ops hcl_node_ops = {.prerun = prerun,
                                       .run = run,
                                       .reshape = reshape,
                                       .postrun = postrun,
                                       .init_node = init_node,
                                       .release_node = release_node,
                                       .score = score
};
/* Module-init hook: register this implementation for the Convolution op. */
static int reg_conv_hcl_ops(void* arg)
{
    return register_builtin_node_ops(OP_CONV, &hcl_node_ops);
}

/* Module-exit hook: remove the registration installed above. */
static int unreg_conv_hcl_ops(void* arg)
{
    unregister_builtin_node_ops(OP_CONV, &hcl_node_ops);
    return 0;
}

/* Hook registration/unregistration into Tengine's module auto-init machinery. */
AUTO_REGISTER_OPS(reg_conv_hcl_ops);
AUTO_UNREGISTER_OPS(unreg_conv_hcl_ops);
此差异已折叠。
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, Martin Han
* Author: hansh-sz@hotmail.com
*/
#ifndef _CONV_KERNEL_RV64_H_
#define _CONV_KERNEL_RV64_H_
#include "tengine_ir.h"
#include "convolution_param.h"
/* float32 */
/*
 * RV64 im2col+GEMM convolution kernel interface.
 * All entry points are declared weak so the node-ops layer can test for their
 * presence at runtime (see the `conv_hcl_set_shared_mem && ...` guards in the
 * caller) and degrade gracefully when a kernel is not linked in.
 * Every function returns 0 on success and a negative value on failure, except
 * the *_get_*_size queries, which return the required byte count.
 */
/* one-time setup: interleave weights and size internal buffers */
int conv_hcl_prerun(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* output_tensor,
                    struct conv_priv_info* info, struct conv_param* param) __attribute__((weak));
/* release buffers allocated by conv_hcl_prerun */
int conv_hcl_postrun(struct conv_priv_info* info) __attribute__((weak));
/* per-inference execution (im2col + sgemm, optional bias and activation) */
int conv_hcl_run(struct ir_tensor* input_tensor, struct ir_tensor* filter_tensor, struct ir_tensor* bias_tensor,
                 struct ir_tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param,
                 int num_thread, int cpu_affinity) __attribute__((weak));
/* scratch-buffer size queries used by init_node */
int conv_hcl_get_shared_mem_size(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor,
                                 struct conv_param* param) __attribute__((weak));
int conv_hcl_get_shared_pack4_mem_size(struct ir_tensor* input_tensor, struct ir_tensor* output_tensor,
                                       struct conv_param* param) __attribute__((weak));
/* donate graph-level scratch buffers to this node (avoids per-node malloc) */
int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak));
#endif
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2021, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// im2col for kernel 1x1 s1p0d1
//
// input:
// x0 arg0 input address
// x1 arg1 input_xy
// x2 arg2 col address
// x3 arg3 col_cnt must be multiply of 4
// x4 arg4 input channel
//
// register definition
// x0 input address
// x1 input_xy x 4
// x2 col address
// x3 col_cnt
// x4 input channel
// x6 input start pointer t6
// x7 input pointer
// x9 channel cnt
// x11
// x12 = input_xy size * 2 // x12 -> t5
// im2col for a 1x1 kernel, stride 1, pad 0, dilation 1 (fp32, RVV 0.7-era
// T-Head vector ISA: vlw.v/vsw.v). Arguments per the header block above:
//   a0 = input base, a1 = input_xy (pixels per channel plane),
//   a2 = col output, a3 = col_cnt (multiple of 4), a4 = channel count.
.section .text,"ax"
.align 5
.type im2col_fp32_1x1 STT_FUNC
.global im2col_fp32_1x1
.hidden im2col_fp32_1x1
im2col_fp32_1x1:
    // t0-t6 are spilled although they are caller-saved temporaries — harmless,
    // presumably kept for symmetry with the other kernels.
    addi sp, sp, -56
    sd t0, 0(sp)
    sd t1, 8(sp)
    sd t2, 16(sp)
    sd t3, 24(sp)
    sd t4, 32(sp)
    sd t5, 40(sp)
    sd t6, 48(sp)
    // NOTE(review): the AVL operand is a0 (a pointer); the code assumes VL
    // settles at 4 x e32 — confirm against the target's vsetvli semantics.
    vsetvli t0, a0, e32
    li t0, 4
    blt a3, t0, col_end          // fewer than 4 columns: nothing to emit
    srli a3, a3, 2               // a3 = number of 4-column groups
    slli a1, a1, 2               // a1 = channel plane stride in bytes
    mv t6, a0                    // t6 = start of current 4-column group
    slli t5, a1, 1               // t5 = two channel planes in bytes
    // NOTE(review): this computes channel+1, so t4 is never zero and the
    // channel_last tail below always executes; the aarch64 counterpart uses
    // "and x10, x4, 1" (odd-channel remainder) here — verify "add" vs "andi".
    add t4, a4, 1 // x10 -> t4
// col loop
col_loop:
    mv t3, t6                    // t3 = read cursor, channel 0 of this group
    srli t2, a4, 1               // t2 = channel pairs to copy
    beqz t2, channel_last
    add t1, t3, a1               // t1 = second plane of the pair
// kernel size loop
channel_loop2:
    // copy 4 floats from two consecutive channel planes per iteration
    vlw.v v0,(t3)
    vlw.v v1,(t1)
    addi t2, t2, -1
    add t3, t3, t5
    add t1, t1, t5
    vsw.v v0, (a2)
    addi a2, a2, 16
    vsw.v v1, (a2)
    addi a2, a2, 16
    bnez t2, channel_loop2
channel_last:
    // tail: one remaining (odd) channel plane — see NOTE(review) above on t4
    beqz t4, channel_loop_end
    vlw.v v0,(t3)
    vsw.v v0, (a2)
    addi a2, a2, 16
channel_loop_end:
    addi t6, t6, 16              // advance to the next 4-column group
    addi a3, a3, -1
    bnez a3, col_loop
col_end:
    ld t0, 0(sp)
    ld t1, 8(sp)
    ld t2, 16(sp)
    ld t3, 24(sp)
    ld t4, 32(sp)
    ld t5, 40(sp)
    ld t6, 48(sp)
    addi sp, sp, 56
    ret
    .end
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// im2col fp16 for kernel 3x3 include 2 function stride 1 and stride 2
// ABCDABCD
//
// input:
// x0 arg0 input address
// x1 arg1 input_x
// x2 arg2 input_y
// x3 arg3 input channel cnt
// x4 arg4 col address
// x5 arg5 stride_x
//
// register definition
// x0 cl0 address q0 q1 d16 d17 d18
// x1 input_x x 4
// x2 input_xy x 4
// x3 input channel
// x4 col address
// x5 stride_x
// x11 cl1 address q2 q3 d19 d20 d21
// x12 cl2 address q4 q5 d22 d23 d24
// im2col for a 3x3 kernel, fp32, strides 1 and 2 (RVV 0.7-era T-Head ISA).
// Arguments per the header block above: a0 = input, a1 = input_x,
// a2 = input_y, a3 = channel count, a4 = col output, a5 = stride_x.
.section .text,"ax"
.align 5
.type im2col_fp32_3x3 STT_FUNC
.global im2col_fp32_3x3
.hidden im2col_fp32_3x3
.balign 16
// vmerge mask: selects the high lane (element 3) from the second operand —
// used to splice the cross-vector element in the stride-2 path.
mask_32b:
.byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
im2col_fp32_3x3:
    addi sp, sp, -56             // spill t0-t6
    sd t0, 0(sp)
    sd t1, 8(sp)
    sd t2, 16(sp)
    sd t3, 24(sp)
    sd t4, 32(sp)
    sd t5, 40(sp)
    sd t6, 48(sp)
    // NOTE(review): AVL from a0 (a pointer), as in the 1x1 kernel — confirm.
    vsetvli t0, a0, e32
// initial
    beqz a3, finish              // zero channels: nothing to do
    li t0, 2
    slli a1, a1, 2               // a1 = row stride in bytes
    mul a2, a2, a1               // a2 = channel plane stride in bytes
    add t5, a0, a1               // t5 = row 1 pointer
    slli t1, a1, 1
    add t6, a0, t1               // t6 = row 2 pointer
    li t2, 8                     // byte stride for the strided loads (2 floats)
    beq a5, t0, stride2_channel_loop
// ---- stride 1: emit 9 vectors per channel (3 rows x shifts 0/4/8 bytes) ----
stride1_channel_loop:
    vlw.v v0, (a0)
    addi t0, a0, 16
    vlw.v v1, (t0)               // NOTE(review): v1/v3/v5 loaded but not stored — confirm
    vlw.v v2, (t5)
    addi t0, t5, 16
    vlw.v v3, (t0)
    vlw.v v4, (t6)
    addi t0, t6, 16
    vlw.v v5, (t0)
    addi a3, a3, -1
    addi t0, a0, 4
    vlw.v v16, (t0)              // row 0 shifted by 1 element
    addi t0, a0, 8
    vlw.v v17, (t0)              // row 0 shifted by 2 elements
    add a0, a0, a2               // advance row 0 to the next channel plane
    addi t0, t5, 4
    vlw.v v19, (t0)
    addi t0, t5, 8
    vlw.v v20, (t0)
    add t5, t5, a2
    addi t0, t6, 4
    vlw.v v22, (t0)
    addi t0, t6, 8
    vlw.v v23, (t0)
    add t6, t6, a2
    // store the 9 column vectors for this channel
    vsw.v v0, (a4)
    addi a4, a4, 16
    vsw.v v16, (a4)
    addi a4, a4, 16
    vsw.v v17, (a4)
    addi a4, a4, 16
    vsw.v v2, (a4)
    addi a4, a4, 16
    vsw.v v19, (a4)
    addi a4, a4, 16
    vsw.v v20, (a4)
    addi a4, a4, 16
    vsw.v v4, (a4)
    addi a4, a4, 16
    vsw.v v22, (a4)
    addi a4, a4, 16
    vsw.v v23, (a4)
    addi a4, a4, 16
    bnez a3, stride1_channel_loop
    j finish
// ---- stride 2: gather even/odd lanes with strided loads, splice the
// ---- out-of-range element via slide + masked merge ----
stride2_channel_loop:
    la t0, mask_32b
    vlw.v v0, (t0)               // v0 = merge mask (high lane only)
    addi t0, a0, 0
    vlsw.v v16, (t0), t2         // even elements of row 0
    addi t0, a0, 0x4
    vlsw.v v17, (t0), t2         // odd elements of row 0
    addi t0, a0, 32
    vlw.v v18, (t0)
    // build {x2,x4,x6,x8}: shift evens down one lane, pull x8 from v18
    vslidedown.vi v1, v16, 1
    vslideup.vi v2, v18, 3
    vmerge.vvm v18, v1, v2, v0
    addi t0, t5, 0
    vlsw.v v19, (t0), t2         // row 1, same construction
    addi t0, t5, 0x4
    vlsw.v v20, (t0), t2
    addi t0, t5, 0x20
    vlw.v v21, (t0)
    vslidedown.vi v1, v19, 1
    vslideup.vi v2, v21, 3
    vmerge.vvm v21, v1, v2, v0
    addi t0, t6, 0
    vlsw.v v22, (t0), t2         // row 2, same construction
    addi t0, t6, 0x4
    vlsw.v v23, (t0), t2
    addi t0, t6, 0x20
    vlw.v v24, (t0)
    vslidedown.vi v1, v22, 1
    vslideup.vi v2, v24, 3
    vmerge.vvm v24, v1, v2, v0
    addi a3, a3, -1
    // store the 9 column vectors for this channel
    vsw.v v16, (a4)
    addi a4, a4, 0x10
    vsw.v v17, (a4)
    addi a4, a4, 0x10
    vsw.v v18, (a4)
    addi a4, a4, 0x10
    vsw.v v19, (a4)
    addi a4, a4, 0x10
    vsw.v v20, (a4)
    addi a4, a4, 0x10
    vsw.v v21, (a4)
    addi a4, a4, 0x10
    vsw.v v22, (a4)
    addi a4, a4, 0x10
    vsw.v v23, (a4)
    addi a4, a4, 0x10
    vsw.v v24, (a4)
    addi a4, a4, 0x10
    add a0, a0, a2               // advance all three row pointers one plane
    add t5, t5, a2
    add t6, t6, a2
    bnez a3, stride2_channel_loop
finish:
    ld t0, 0(sp)
    ld t1, 8(sp)
    ld t2, 16(sp)
    ld t3, 24(sp)
    ld t4, 32(sp)
    ld t5, 40(sp)
    ld t6, 48(sp)
    addi sp, sp, 56
    ret
    .end
此差异已折叠。
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: ddzhao@openailab.com
*/
//
// 4*4 single precise floating point matric multiplication
//
// -- -- -- -- -- -- -- --
// | i0 - - - - - - | | k0 k1 k2 k3 | | b0 b1 b2 b3 | | i0k0 i0k1 i0k2 i0k3 |
// | | | . . . . | | | | |
// | i1 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i1k0 i1k1 i1k2 i1k3 |
// | | x | . . . . | + | | = | |
// | i2 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i2k0 i2k1 i2k2 i2k3 |
// | | | . . . . | | | | |
// | i3 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i3k0 i3k1 i3k2 i3k3 |
// -- -- -- -- -- -- -- --
// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size
//
//
// optimised for Cortex-A72 pipeline 18 cycle per loop (4*4*4 dot product)
//
// input:
// x0 arg0 biases address {b0,b1,b2,b3} nullptr means no biases
// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...}
// x2 arg2 kernel address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...}
// x3 arg3 kernel size
// x4 arg4 output address
// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]}
// direct save: output : {i0k0 i1k0 i2k0 i3k0}
// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1}
// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2}
// output + ouput_xy * 3 : {i0k3 i1k3 i2k3 i3k3}
// x5 arg5 output xy
// x6 arg6 activation flag relu layers is integrated after convolution
//
// output: no
//
// register definition
// x0 biases start address
// x1 input start address
// x2 kernel start address
// x3 kernal size
// x4 output start address
// x5 output_x * output_y
// x6 fused relu flag
// x9 ~ x10 temp loop counter
// x11~ x13 temp output save address
// x7~8 14~15 not used
//
// v0-3 4S data of input0 {i3 i2 i1 i0}
// v4-7 4S kernal data {k3 k2 k1 k0}
// v8~v15 not used
// v16 dot product for {i3k0, i2k0, i1k0, i0k0}
// v17 dot product for {i3k1, i2k1, i1k1, i0k1}
// v18 dot product for {i3k2, i2k2, i1k2, i0k2}
// v19 dot product for {i3k3, i2k3, i1k3, i0k3}
// v20~V31 not used
// 4x4 fp32 GEMM micro-kernel (RVV 0.7-era T-Head ISA); see the header block
// above for the full argument description. a0=bias, a1=input, a2=kernel,
// a3=kernel size, a4=output, a5=output_xy, a6=activation flag, a7=layout flag.
.section .text,"ax"
.align 5
.type sgemm_4x4_rv64 STT_FUNC
.global sgemm_4x4_rv64
.hidden sgemm_4x4_rv64
sgemm_4x4_rv64:
    slli a5, a5, 0x2             // a5 = output_xy in bytes
# // initial biases
    beqz a0, non_biases
    // NOTE(review): AVL from a0 (a pointer) — same pattern as the im2col kernels.
    vsetvli t0, a0, e32
    vlw.v v16, (a0)              // broadcast the bias row into all 4 accumulators
    vmv.v.v v17, v16
    vmv.v.v v18, v16
    vmv.v.v v19, v16
    j convoluation_start
non_biases:
    // no bias: zero the four accumulator vectors
    vmv.v.x v16, x0
    vmv.v.x v17, x0
    vmv.v.x v18, x0
    vmv.v.x v19, x0
convoluation_start:
    add t4, a4, a5               // t4 = output row 1 pointer
    andi t3, a3, 0x3             // t3 = kernel-size remainder (tail count)
    li t0, 4
    blt a3, t0, loop4_end
    srli t2, a3, 0x2             // t2 = number of unrolled 4-step iterations
// main loop: each loop generate dot prodcut for 4x4SFP
loop4:
    addi t2, t2, -1
    // load 4 input vectors and 4 kernel vectors (4 k-steps)
    vlw.v v0, (a1)
    addi a1, a1, 16
    vlw.v v1, (a1)
    addi a1, a1, 16
    vlw.v v2, (a1)
    addi a1, a1, 16
    vlw.v v3, (a1)
    addi a1, a1, 16
    vlw.v v4, (a2)
    addi a2, a2, 16
    vlw.v v5, (a2)
    addi a2, a2, 16
    vlw.v v6, (a2)
    addi a2, a2, 16
    vlw.v v7, (a2)
    addi a2, a2, 16
    // k-step 0: broadcast each kernel lane (vrgather) and accumulate
    vrgather.vi v20, v4, 0
    vrgather.vi v21, v4, 1
    vrgather.vi v22, v4, 2
    vrgather.vi v23, v4, 3
    vfmacc.vv v16, v20, v0
    vfmacc.vv v17, v21, v0
    vfmacc.vv v18, v22, v0
    vfmacc.vv v19, v23, v0
    // k-step 1
    vrgather.vi v20, v5, 0
    vrgather.vi v21, v5, 1
    vrgather.vi v22, v5, 2
    vrgather.vi v23, v5, 3
    vfmacc.vv v16, v20, v1
    vfmacc.vv v17, v21, v1
    vfmacc.vv v18, v22, v1
    vfmacc.vv v19, v23, v1
    // k-step 2
    vrgather.vi v20, v6, 0
    vrgather.vi v21, v6, 1
    vrgather.vi v22, v6, 2
    vrgather.vi v23, v6, 3
    vfmacc.vv v16, v20, v2
    vfmacc.vv v17, v21, v2
    vfmacc.vv v18, v22, v2
    vfmacc.vv v19, v23, v2
    // k-step 3
    vrgather.vi v20, v7, 0
    vrgather.vi v21, v7, 1
    vrgather.vi v22, v7, 2
    vrgather.vi v23, v7, 3
    vfmacc.vv v16, v20, v3
    vfmacc.vv v17, v21, v3
    vfmacc.vv v18, v22, v3
    vfmacc.vv v19, v23, v3
    bnez t2, loop4
loop4_end:
    slli t0, a5, 1
    add t5, a4, t0               // t5 = output row 2 pointer
    beqz t3, activation
// tail loop: one k-step per iteration for kernel_size % 4 leftovers
loop1:
    addi t3, t3, -1
    vlw.v v0, (a1)
    addi a1, a1, 16
    vlw.v v4, (a2)
    addi a2, a2, 16
    vrgather.vi v20, v4, 0
    vrgather.vi v21, v4, 1
    vrgather.vi v22, v4, 2
    vrgather.vi v23, v4, 3
    vfmacc.vv v16, v20, v0
    vfmacc.vv v17, v21, v0
    vfmacc.vv v18, v22, v0
    vfmacc.vv v19, v23, v0
    bnez t3, loop1
activation:
    slli t0, a5, 1
    add t6, t4, t0               // t6 = output row 3 pointer
    bltz a6, save_result         // negative flag: no activation
    vmv.v.i v0, 0
    // NOTE(review): moves the integer bits of a6 into a float vector used by
    // vfmin below (relu-N upper bound) — confirm an int->float conversion is
    // not required here.
    vmv.v.x v1, a6
    // relu: clamp below at 0
    vfmax.vv v16, v16, v0
    vfmax.vv v17, v17, v0
    vfmax.vv v18, v18, v0
    vfmax.vv v19, v19, v0
    beqz a6, save_result         // flag 0: plain relu, no upper clamp
    // relu-N: clamp above at the flag value
    vfmin.vv v16, v16, v1
    vfmin.vv v17, v17, v1
    vfmin.vv v18, v18, v1
    vfmin.vv v19, v19, v1
save_result:
# // store result
    beqz a7, save_result_nchw
    // scattered store: lane-by-lane extraction (vext.x.v), 4x4 transpose layout
    li t1, 0
    vext.x.v t0, v16, t1
    sw t0, 0(a4)
    vext.x.v t0, v17, t1
    sw t0, 4(a4)
    vext.x.v t0, v18, t1
    sw t0, 8(a4)
    vext.x.v t0, v19, t1
    sw t0, 12(a4)
    li t1, 1
    vext.x.v t0, v16, t1
    sw t0, 0(t4)
    vext.x.v t0, v17, t1
    sw t0, 4(t4)
    vext.x.v t0, v18, t1
    sw t0, 8(t4)
    vext.x.v t0, v19, t1
    sw t0, 12(t4)
    li t1, 2
    vext.x.v t0, v16, t1
    sw t0, 0(t5)
    vext.x.v t0, v17, t1
    sw t0, 4(t5)
    vext.x.v t0, v18, t1
    sw t0, 8(t5)
    vext.x.v t0, v19, t1
    sw t0, 12(t5)
    li t1, 3
    vext.x.v t0, v16, t1
    sw t0, 0(t6)
    vext.x.v t0, v17, t1
    sw t0, 4(t6)
    vext.x.v t0, v18, t1
    sw t0, 8(t6)
    vext.x.v t0, v19, t1
    sw t0, 12(t6)
    j end
save_result_nchw:
    // direct store: one accumulator vector per output row
    vsw.v v16, (a4)
    vsw.v v17, (t4)
    vsw.v v18, (t5)
    vsw.v v19, (t6)
end:
    ret
    .end
......@@ -9,6 +9,7 @@ endmacro()
# operator level test
tengine_test(test_op_prelu op/test_op_prelu.c)
tengine_test(test_op_conv op/test_op_conv.c)
if (TENGINE_ENABLE_TIM_VX)
tengine_test(test_op_prelu_timvx op/test_op_prelu_timvx.c)
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2019, Open AI Lab
* Author: haitao@openailab.com
*/
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include "tengine_c_api.h"
#include "tengine_c_api_ex.h"
/* Simple append-only registry of heap buffers, released by free_allocated_buf(). */
int allocated_num = 0;
void** record_ptr = NULL;

/* Track `buf` so it is freed at program exit. */
void record_allocated_buf(void* buf)
{
    /* FIX: the original assigned realloc() straight into record_ptr, so an
     * OOM return of NULL both leaked the old table and crashed on the
     * subsequent record_ptr[...] store. Grow into a temporary first. */
    void** grown = realloc(record_ptr, sizeof(void*) * (allocated_num + 1));
    if (grown == NULL)
    {
        fprintf(stderr, "record_allocated_buf: out of memory\n");
        return;
    }
    record_ptr = grown;
    record_ptr[allocated_num] = buf;
    allocated_num++;
}
/* Release every buffer tracked by record_allocated_buf(), then the table itself. */
void free_allocated_buf(void)
{
    for(int i = 0; i < allocated_num; i++)
        free(record_ptr[i]);
    if(record_ptr)
        free(record_ptr);
    /* FIX: reset the registry so a second call is a harmless no-op instead of
     * a double free of stale pointers. */
    record_ptr = NULL;
    allocated_num = 0;
}
/*
 * Fill `buf` with `elem_num` test values of width `elem_size` bytes
 * (4 = float, 2 = int16, 1 = char; other widths are ignored).
 * A non-negative `val` fills every element with that constant; a negative
 * `val` produces the repeating ramp 0..9 keyed on the element index.
 */
void init_buffer(void* buf, int elem_num, int elem_size, int val)
{
    for(int i = 0; i < elem_num; i++)
    {
        float fill = (val >= 0) ? (float)val : (float)(i % 10);

        if (elem_size == 4)
            (( float* )buf)[i] = fill;
        else if (elem_size == 2)
            (( int16_t* )buf)[i] = fill;
        else if (elem_size == 1)
            (( char* )buf)[i] = fill;
    }
}
/*
 * Add an input node named `node_name` to `graph` with an fp32 output tensor
 * of shape {1, c, h, w}. Returns 0 (the graph API calls are not checked,
 * matching the rest of this test program).
 */
int create_input_node(graph_t graph, const char* node_name, int c, int h, int w)
{
    int shape[4] = {1, c, h, w};

    node_t input = create_graph_node(graph, node_name, "InputOp");
    tensor_t out = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);

    set_node_output_tensor(input, 0, out, TENSOR_TYPE_INPUT);
    set_tensor_shape(out, shape, 4);

    release_graph_tensor(out);
    release_graph_node(input);
    return 0;
}
/*
 * Add a Convolution node `node_name` to `graph`, wired to `input_name`, with
 * const weight/bias inputs shaped for (k_size, stride, pad, in_c, out_c, group).
 * Returns 0 on success, -1 on failure.
 */
int create_conv_node(graph_t graph, const char* node_name, const char* input_name, int k_size, int stride, int pad,
                     int in_c, int out_c, int group)
{
    /* weight */
    char* weight_name = malloc(strlen(node_name) + 16);
    if (weight_name == NULL) /* FIX: malloc was unchecked */
        return -1;
    sprintf(weight_name, "%s/weight", node_name);
    node_t w_node = create_graph_node(graph, weight_name, "Const");
    tensor_t w_tensor = create_graph_tensor(graph, weight_name, TENGINE_DT_FP32);
    set_node_output_tensor(w_node, 0, w_tensor, TENSOR_TYPE_CONST);
    int w_dims[] = {out_c, in_c / group, k_size, k_size};
    set_tensor_shape(w_tensor, w_dims, 4);
    /* bias */
    char* bias_name = malloc(strlen(node_name) + 16);
    if (bias_name == NULL) /* FIX: malloc was unchecked */
    {
        free(weight_name);
        return -1;
    }
    sprintf(bias_name, "%s/bias", node_name);
    node_t b_node = create_graph_node(graph, bias_name, "Const");
    tensor_t b_tensor = create_graph_tensor(graph, bias_name, TENGINE_DT_FP32);
    set_node_output_tensor(b_node, 0, b_tensor, TENSOR_TYPE_CONST);
    int b_dims[] = {out_c};
    set_tensor_shape(b_tensor, b_dims, 1);
    /* conv */
    node_t conv_node = create_graph_node(graph, node_name, "Convolution");
    tensor_t input_tensor = get_graph_tensor(graph, input_name);
    if(input_tensor == NULL)
    {
        fprintf(stderr, "errno= %d\n", get_tengine_errno());
        /* FIX: the name buffers leaked on this error path */
        free(bias_name);
        free(weight_name);
        return -1;
    }
    set_node_input_tensor(conv_node, 2, b_tensor);
    set_node_input_tensor(conv_node, 1, w_tensor);
    set_node_input_tensor(conv_node, 0, input_tensor);
    tensor_t output_tensor = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
    set_node_output_tensor(conv_node, 0, output_tensor, TENSOR_TYPE_VAR);
    release_graph_tensor(input_tensor);
    release_graph_tensor(output_tensor);
    release_graph_node(w_node);
    release_graph_tensor(w_tensor);
    release_graph_node(b_node);
    release_graph_tensor(b_tensor);
    free(bias_name);
    free(weight_name);
    /* attr */
    set_node_attr_int(conv_node, "kernel_h", &k_size);
    set_node_attr_int(conv_node, "kernel_w", &k_size);
    set_node_attr_int(conv_node, "stride_h", &stride);
    set_node_attr_int(conv_node, "stride_w", &stride);
    set_node_attr_int(conv_node, "pad_h0", &pad);
    set_node_attr_int(conv_node, "pad_h1", &pad);
    set_node_attr_int(conv_node, "pad_w0", &pad);
    set_node_attr_int(conv_node, "pad_w1", &pad);
    set_node_attr_int(conv_node, "output_channel", &out_c);
    set_node_attr_int(conv_node, "input_channel", &in_c);
    set_node_attr_int(conv_node, "group", &group);
    release_graph_node(conv_node);
    return 0;
}
/*
 * Add a Pooling node `node_name` to `graph`, consuming `input_name` and
 * producing an fp32 output tensor. Returns 0 on success, -1 if the input
 * tensor cannot be found.
 */
int create_pooling_node(graph_t graph, const char* node_name, const char* input_name)
{
    node_t pool = create_graph_node(graph, node_name, "Pooling");

    tensor_t in = get_graph_tensor(graph, input_name);
    if(in == NULL)
    {
        fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
        return -1;
    }
    set_node_input_tensor(pool, 0, in);
    release_graph_tensor(in);

    /* output */
    tensor_t out = create_graph_tensor(graph, node_name, TENGINE_DT_FP32);
    set_node_output_tensor(pool, 0, out, TENSOR_TYPE_VAR);
    release_graph_tensor(out);

    release_graph_node(pool);
    return 0;
}
/*
 * Build the test graph: data -> 1x1 conv (stride 1, pad 0, group 1).
 * The pooling tail is compiled out via #if 0 but kept for manual experiments.
 * Returns the graph handle, or NULL on any construction failure.
 */
graph_t create_test_graph(int c, int h, int w, int out_c)
{
    graph_t graph = create_graph(NULL, NULL, NULL);
    if(graph == NULL)
    {
        fprintf(stderr, "ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    const char* input_name = "data";
    const char* conv_name = "conv";
    if(create_input_node(graph, input_name, c, h, w) < 0)
    {
        fprintf(stderr, "create input failed\n");
        return NULL;
    }
    // int out_c = 4;
    //                                              k  s  p  in_c out_c group
    if(create_conv_node(graph, conv_name, input_name, 1, 1, 0, c, out_c, 1) < 0)
    {
        fprintf(stderr, "create conv node failed\n");
        return NULL;
    }
#if 0
    const char* pool_name = "pooling";
    if(create_pooling_node(graph, pool_name, conv_name) < 0)
    {
        fprintf(stderr, "create pooling node failed\n");
        return NULL;
    }
    /* set input/output node */
    const char* inputs[] = {input_name};
    const char* outputs[] = {pool_name};
#else
    /* graph boundary: single input "data", single output at the conv node */
    const char* inputs[] = {input_name};
    const char* outputs[] = {conv_name};
#endif
    if(set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0)
    {
        fprintf(stderr, "set inputs failed: ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    if(set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0)
    {
        fprintf(stderr, "set outputs failed: ERRNO: %d\n", get_tengine_errno());
        return NULL;
    }
    return graph;
}
/*
 * Allocate and attach data buffers to a Convolution node's weight (input 1,
 * filled with the 0..9 ramp) and optional bias (input 2, filled with the
 * constant 3). Buffers are registered with record_allocated_buf() so they
 * outlive the graph and are freed at exit.
 */
void fill_conv_node(node_t node)
{
    tensor_t filter = get_node_input_tensor(node, 1);
    int dims[4];
    get_tensor_shape(filter, dims, 4);
    int elem_num = dims[0] * dims[1] * dims[2] * dims[3];
    int elem_size = 4; /* fp32 */
    void* filter_buf = malloc(elem_num * elem_size);
    init_buffer(filter_buf, elem_num, elem_size, -1); /* ramp values */
    set_tensor_buffer(filter, filter_buf, elem_num * elem_size);
    record_allocated_buf(filter_buf);
    release_graph_tensor(filter);
    /* bias is optional: absent when the node has only two inputs */
    tensor_t bias = get_node_input_tensor(node, 2);
    if(bias == NULL)
        return;
    get_tensor_shape(bias, dims, 1);
    elem_num = dims[0];
    void* bias_buf = malloc(elem_num * elem_size);
    init_buffer(bias_buf, elem_num, elem_size, 3); /* constant 3 */
    set_tensor_buffer(bias, bias_buf, elem_num * elem_size);
    record_allocated_buf(bias_buf);
    release_graph_tensor(bias);
}
/* Walk every node of `graph` and populate weight/bias data for each
 * Convolution node via fill_conv_node(). */
void fill_graph_param(graph_t graph)
{
    const int total = get_graph_node_num(graph);

    for(int idx = 0; idx < total; idx++)
    {
        node_t cur = get_graph_node_by_idx(graph, idx);
        const char* op_name = get_node_op(cur);

        if(strcmp(op_name, "Convolution") == 0)
            fill_conv_node(cur);

        release_graph_node(cur);
    }
}
/*
 * End-to-end smoke test: build an 8->16 channel 1x1 conv graph on a 14x14
 * input, fill weights/bias/input with deterministic patterns, run one
 * inference, and print the output tensor. Returns 0 on success, 1 when the
 * graph cannot be built.
 */
int main(int argc, char* argv[])
{
    int c, h, w, out_c;
    c = 8;
    h = 14;
    w = 14;
    out_c = 16;
    init_tengine();
    graph_t graph = create_test_graph(c, h, w, out_c);
    if(graph == NULL)
        return 1;
    fill_graph_param(graph);
    /* fill input */
    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    int dims[4];
    int dim_num = get_tensor_shape(input_tensor, dims, 4);
    int elem_num = 1;
    int elem_size = 4; /* fp32 */
    for(int i = 0; i < dim_num; i++)
        elem_num *= dims[i];
    void* input_buf = malloc(elem_num * elem_size);
    init_buffer(input_buf, elem_num, elem_size, -1); /* 0..9 ramp */
    record_allocated_buf(input_buf);
    set_tensor_buffer(input_tensor, input_buf, elem_num * elem_size);
    release_graph_tensor(input_tensor);
    prerun_graph(graph);
    dump_graph(graph);
    run_graph(graph, 1);
    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
    dim_num = get_tensor_shape(output_tensor, dims, 4);
    elem_num = 1;
    printf("output shape: [");
    for(int i = 0; i < dim_num; i++)
    {
        elem_num *= dims[i];
        printf(" %d", dims[i]);
    }
    printf(" ]\n");
    /* NOTE(review): return values of prerun/run and get_tensor_buffer are not
     * checked, and dims[3] below assumes a 4-D output — acceptable for a
     * smoke test, but confirm if this is promoted to CI. */
    float* output = get_tensor_buffer(output_tensor);
    for(int i = 0; i < elem_num; i++)
    {
        int w = dims[3]; /* row width for line-wrapped printing */
        if((i % w) == 0)
            printf("\n%d:\t", i);
        printf(" %f", output[i]);
    }
    printf("\n");
    release_graph_tensor(output_tensor);
    postrun_graph(graph);
    destroy_graph(graph);
    release_tengine();
    free_allocated_buf();
    return 0;
}
# Cross toolchain for RISC-V 64 (T-Head C910, RVV 0.7-era vector extension).
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR rv64)

set(CMAKE_ASM_COMPILER "riscv64-unknown-linux-gnu-gcc")
set(CMAKE_C_COMPILER   "riscv64-unknown-linux-gnu-gcc")
set(CMAKE_CXX_COMPILER "riscv64-unknown-linux-gnu-g++")

# Search only the target sysroot for libraries/headers; host programs stay usable.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# C910 code-generation flags, cached so they persist across reconfigures.
# FIX: the flags were set twice (a plain set() followed by a cache re-set of
# the same value); merged into a single cached assignment that also respects
# a user-provided cache value instead of shadowing it.
# NOTE(review): "-lc" is a linker input rather than a compile flag — confirm
# the toolchain actually needs it here before removing.
set(CMAKE_C_FLAGS   "-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16 -lc" CACHE STRING "c flags")
set(CMAKE_CXX_FLAGS "-march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16 -lc" CACHE STRING "c++ flags")
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册