/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * License); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Copyright (c) 2021, Open AI Lab
 * Author: hhchen@openailab.com
 */

#include "acl_executor.hpp"
#include "acl_define.h"

extern "C" {
#include "operator/op.h"
#include "utility/sys_port.h"
#include "utility/utils.h"
#include "convolution_param.h"
}

#define USE_CPU_CONVERT
//#define ACL_EXTENSTION

#ifdef __ANDROID__
#define dynamic_cast static_cast
#endif

/*
 * Convert a run of fp32 values to fp16.
 *
 * f16      : destination buffer
 * f32      : source buffer
 * f32_size : size of the SOURCE in bytes; f32_size / sizeof(float) elements
 *            are converted. A non-positive size converts nothing (previously a
 *            negative size was turned into a huge unsigned loop bound).
 */
static inline void copy_fp32_to_fp16(__fp16* f16, const float* f32, const int f32_size)
{
    if (f32_size <= 0)
        return;

    const size_t count = static_cast<size_t>(f32_size) / sizeof(float);
    for (size_t idx = 0; idx < count; idx++)
        f16[idx] = f32[idx];
}

/*
 * Convert a run of fp16 values to fp32.
 *
 * f32      : destination buffer
 * f16      : source buffer
 * f16_size : size of the SOURCE in bytes; f16_size / sizeof(__fp16) elements
 *            are converted. A non-positive size converts nothing.
 */
static inline void copy_fp16_to_fp32(float* f32, const __fp16* f16, const int f16_size)
{
    if (f16_size <= 0)
        return;

    const size_t count = static_cast<size_t>(f16_size) / sizeof(__fp16);
    for (size_t idx = 0; idx < count; idx++)
        f32[idx] = f16[idx];
}

/*
 * Copy src_len bytes worth of source elements from src to dest, converting
 * between F32 and F16 when the two types differ.
 *
 * src_len is always measured in bytes of the SOURCE buffer. Identical types
 * are copied with memcpy. Any type pair other than F16<->F32 is not handled:
 * a message is written to stderr and nothing is copied.
 */
void copy_buffer(void* dest, const void* src, const int src_len, DataType dest_type, DataType src_type)
{
    if (dest_type == src_type)
    {
        // A negative length would be converted to a huge size_t by memcpy.
        if (src_len > 0)
            memcpy(dest, src, static_cast<size_t>(src_len));
    }
    else if (dest_type == DataType::F16 && src_type == DataType::F32)
    {
        copy_fp32_to_fp16(static_cast<__fp16*>(dest), static_cast<const float*>(src), src_len);
    }
    else if (dest_type == DataType::F32 && src_type == DataType::F16)
    {
        copy_fp16_to_fp32(static_cast<float*>(dest), static_cast<const __fp16*>(src), src_len);
    }
    else
    {
        fprintf(stderr, "copy_buffer may failed!!!");
    }
}

/*
 * Reorder a dense NCHW buffer into a dense NHWC buffer, element type T.
 *
 * pvData       : input, n * c * h * w elements in NCHW order
 * pvOutputData : output, same element count, written sequentially in NHWC order
 */
template <typename T>
inline void _PermuteDataLayoutNCHWToNHWCInter(T* pvData, int n, int c, int h, int w, T* pvOutputData)
{
    const T* src = pvData;
    T* dst = pvOutputData;
    const int plane = w * h;
    int out_idx = 0;

    for (int batch = 0; batch < n; batch++)
    {
        for (int row = 0; row < h; row++)
        {
            // First element of this row in channel 0 of the current batch.
            const T* row_start = src + w * row + batch * w * h * c;
            for (int col = 0; col < w; col++)
            {
                for (int ch = 0; ch < c; ch++)
                {
                    dst[out_idx] = *(row_start + ch * plane + col);
                    out_idx++;
                }
            }
        }
    }
}

/*
 * Type-erased NCHW -> NHWC permutation.
 *
 * DataEleSize selects the element width in bytes (1, 2 or 4); elements are
 * moved as opaque integers of that width, no value conversion takes place.
 * pvData and pvOutputData must be non-null (checked with assert).
 */
void _PermuteDataLayoutNCHWToNHWC(void* pvData, int n, int c, int h, int w, void* pvOutputData, int DataEleSize)
{
    assert(pvData != NULL);
    assert(pvOutputData != NULL);
    assert(DataEleSize == 1 || DataEleSize == 2 || DataEleSize == 4);

    if (DataEleSize == 4)
    {
        _PermuteDataLayoutNCHWToNHWCInter(static_cast<int*>(pvData), n, c, h, w, static_cast<int*>(pvOutputData));
    }
    else if (DataEleSize == 2)
    {
        _PermuteDataLayoutNCHWToNHWCInter(static_cast<short*>(pvData), n, c, h, w, static_cast<short*>(pvOutputData));
    }
    else
    {
        _PermuteDataLayoutNCHWToNHWCInter(static_cast<char*>(pvData), n, c, h, w, static_cast<char*>(pvOutputData));
    }
}

/*
 * Reorder a dense NHWC buffer into a dense NCHW buffer, element type T.
 *
 * pvData       : input, n * h * w * c elements, read sequentially in NHWC order
 * pvOutputData : output, same element count, written in NCHW order
 */
template <typename T>
inline void _PermuteDatalayoutNHWCToNCHWInter(T* pvData, int n, int c, int h, int w, T* pvOutputData)
{
    const T* src = pvData;
    T* dst = pvOutputData;
    const int plane = w * h;
    int in_idx = 0;

    for (int batch = 0; batch < n; batch++)
    {
        for (int row = 0; row < h; row++)
        {
            // First element of this row in channel 0 of the current batch.
            T* row_start = dst + w * row + batch * w * h * c;
            for (int col = 0; col < w; col++)
            {
                for (int ch = 0; ch < c; ch++)
                {
                    *(row_start + ch * plane + col) = src[in_idx];
                    in_idx++;
                }
            }
        }
    }
}

/*
 * Type-erased NHWC -> NCHW permutation.
 *
 * DataEleSize selects the element width in bytes (1, 2 or 4); elements are
 * moved as opaque integers of that width, no value conversion takes place.
 * pvData and pvOutputData must be non-null (checked with assert).
 */
inline void _PermuteDatalayoutNHWCToNCHW(void* pvData, int n, int c, int h, int w, void* pvOutputData, int DataEleSize)
{
    assert(pvData != NULL);
    assert(pvOutputData != NULL);
    assert(DataEleSize == 1 || DataEleSize == 2 || DataEleSize == 4);

    if (DataEleSize == 4)
    {
        _PermuteDatalayoutNHWCToNCHWInter(static_cast<int*>(pvData), n, c, h, w, static_cast<int*>(pvOutputData));
    }
    else if (DataEleSize == 2)
    {
        _PermuteDatalayoutNHWCToNCHWInter(static_cast<short*>(pvData), n, c, h, w, static_cast<short*>(pvOutputData));
    }
    else
    {
        _PermuteDatalayoutNHWCToNCHWInter(static_cast<char*>(pvData), n, c, h, w, static_cast<char*>(pvOutputData));
    }
}

/*
 * Copy between a mapped CLTensor and a dense fp32 host buffer, row by row,
 * skipping the tensor's padding.
 *
 * cl_tensor : tensor whose buffer() is accessed directly (callers in this file
 *             map the tensor before calling and unmap it afterwards)
 * buf       : dense host buffer, always treated as fp32
 * buf_size  : size of buf in bytes; currently NOT used for bounds checking
 * to_tensor : true copies buf -> tensor, false copies tensor -> buf
 * data_type : element type of the tensor; for F16 the host side advances twice
 *             as many bytes per row as the tensor side
 */
static void copy_itensor(CLTensor* cl_tensor, void* buf, int buf_size, bool to_tensor, DataType data_type)
{
    (void)buf_size;

    auto* cl_info = cl_tensor->info();
    const size_t slice_num = cl_info->tensor_shape().total_size_upper(2);
    const Strides strides = cl_info->strides_in_bytes();
    const PaddingSize padding = cl_info->padding();

    // Padded extent of one 2-D slice, in elements.
    int slice_w = cl_info->dimension(0) + padding.left + padding.right;
    int slice_h = cl_info->dimension(1) + padding.bottom + padding.top;

    uint8_t* slice_ptr = cl_tensor->buffer();
    uint8_t* buf_ptr = static_cast<uint8_t*>(buf);

    // Bytes of one unpadded row on the tensor side and on the fp32 host side.
    const int data_len = cl_info->dimension(0) * strides[0];
    const int buf_len = (data_type == DataType::F16) ? (data_len << 1) : data_len;

    for (unsigned int slice = 0; slice < slice_num; slice++)
    {
        // Skip the top/left padding of this slice.
        uint8_t* data_ptr = slice_ptr + padding.top * strides[1] + padding.left * strides[0];

        for (unsigned int row = 0; row < cl_info->dimension(1); row++)
        {
            if (to_tensor)
            {
                copy_buffer(data_ptr, buf_ptr, buf_len, data_type, DataType::F32);
            }
            else
            {
                copy_buffer(buf_ptr, data_ptr, data_len, DataType::F32, data_type);
            }

            buf_ptr = buf_ptr + buf_len;
            data_ptr += slice_w * strides[0];
        }

        slice_ptr += slice_h * slice_w * strides[0];
    }
}

/* Copy a dense fp32 host buffer into cl_tensor (see copy_itensor). */
static void copy_to_itensor(CLTensor* cl_tensor, const void* buf, int buf_size, DataType tensor_dt)
{
    // copy_itensor only reads from buf when to_tensor is true.
    copy_itensor(cl_tensor, const_cast<void*>(buf), buf_size, true, tensor_dt);
}

/* Copy cl_tensor into a dense fp32 host buffer (see copy_itensor). */
void copy_from_itensor(const CLTensor* cl_tensor, void* buf, int buf_size, DataType tensor_dt)
{
    copy_itensor(const_cast<CLTensor*>(cl_tensor), buf, buf_size, false, tensor_dt);
}

/*
 * Shared body of copy_from_itensor_with_permuteNHWCTONCHW for one source
 * element type (float or __fp16). Reads c consecutive SrcT values per
 * (batch, row, col) position from the tensor buffer and scatters them, as
 * float, into the dense output at [batch][ch][row][col].
 */
template <typename SrcT>
static void permute_tensor_nhwc_to_nchw_fp32(uint8_t* tensor_base, float* out, const Strides& strides, int slice_w,
                                             int slice_h, int offsetSize, int n, int c, int h, int w)
{
    const int hw = w * h;

    for (int batch = 0; batch < n; batch++)
    {
        uint8_t* batch_base = tensor_base + slice_h * slice_w * h * batch * strides[0];
        float* out_batch = out + batch * w * h * c;

        for (int row = 0; row < h; row++)
        {
            float* out_row = out_batch + w * row;
            uint8_t* plane_base = batch_base + row * slice_h * slice_w * strides[0];

            for (int col = 0; col < w; col++)
            {
                const SrcT* pixel = reinterpret_cast<const SrcT*>(plane_base + offsetSize + col * strides[1]);
                float* out_px = out_row + col;

                for (int ch = 0; ch < c; ch++)
                {
                    out_px[ch * hw] = pixel[ch];
                }
            }
        }
    }
}

/*
 * Copy a mapped CLTensor into a dense fp32 NCHW host buffer, permuting on the
 * fly.
 *
 * The tensor is read with dimension(0) as channels, dimension(1) as width,
 * dimension(2) as height and dimension(3) as batch.
 *
 * cl_tensor : source tensor, buffer() is accessed directly
 * buf       : destination, n * c * h * w floats
 * buf_size  : size of buf in bytes; asserted to equal n * h * w * c * 4
 * data_type : tensor element type, F32 or F16 (anything else trips an assert)
 */
void copy_from_itensor_with_permuteNHWCTONCHW(CLTensor* cl_tensor, void* buf, int buf_size, DataType data_type)
{
    auto* cl_info = cl_tensor->info();
    const Strides strides = cl_info->strides_in_bytes();
    const PaddingSize padding = cl_info->padding();

    int slice_w = cl_info->dimension(0) + padding.left + padding.right;
    int slice_h = cl_info->dimension(1) + padding.bottom + padding.top;

    uint8_t* slice_ptr = cl_tensor->buffer();
    float* pf32DataOutputBuf = static_cast<float*>(buf);

    int n = cl_info->dimension(3);
    int c = cl_info->dimension(0);
    int h = cl_info->dimension(2);
    int w = cl_info->dimension(1);

    // Byte offset that skips the top/left padding.
    int offsetSize = padding.top * strides[1] + padding.left * strides[0];

    assert(n * h * w * c * 4 == buf_size);
    (void)buf_size; // only read by the assert above

    if (data_type == DataType::F32)
    {
        permute_tensor_nhwc_to_nchw_fp32<float>(slice_ptr, pf32DataOutputBuf, strides, slice_w, slice_h, offsetSize,
                                                n, c, h, w);
    }
    else
    {
        assert(data_type == DataType::F16);
        permute_tensor_nhwc_to_nchw_fp32<__fp16>(slice_ptr, pf32DataOutputBuf, strides, slice_w, slice_h, offsetSize,
                                                 n, c, h, w);
    }
}

/*
 * Construct an empty graph: layout flags cleared, scratch size 0, and an
 * 8-byte scratch array allocated with new[] (the destructor releases it with
 * delete[]).
 */
CLGraph::CLGraph()
{
    bForcedNHWCMode_ = false;
    pcScratchMem_ = new char[8];
    l32ScratchMemSize_ = 0;
    l32AclNHWCOptimizeFlag_ = false;
}
/*
 * CLGraph lifecycle and execution (destructor, init, CreateACLGraph, prerun,
 * run, postrun, GetCLTensor).
 *
 * NOTE(review): this region has lost its line breaks. As written, every `//`
 * comment below extends to the end of its physical line and hides the code
 * that follows it; the original line structure has to be restored before this
 * can compile.
 *
 * ~CLGraph()
 *   Releases pcScratchMem_ with delete[].
 *
 * init(name, type)
 *   Stores name in name_ and type in data_type_.
 *
 * CreateACLGraph(subgraph, type, bDataLayoutOpFlag) -> bool
 *   - Calls CLScheduler::get().default_init() and init("acl_graph", type),
 *     then stores bDataLayoutOpFlag in bForcedNHWCMode_.
 *   - For every subgraph input tensor whose tensor_type is not
 *     TENSOR_TYPE_CONST: creates a CLTensor, initialises its allocator from
 *     the tensor's dims and registers it in tensors_map_ under the tensor
 *     name. In the 3-, 2- and 1-dim branches the TensorShape receives the dims
 *     in reverse order, padded with 1 up to four entries. Any other dim count
 *     logs "Bad shape dim" via TLOG_ERR; the tensor is still registered.
 *   - Walks subgraph->node_list, skips OP_CONST and dispatches on op type to
 *     the matching Add*Layer builder. OP_CLIP goes to AddReLu6Layer, OP_DECONV
 *     and OP_INTERP both go to AddInterpLayer, and OP_CONCAT with fewer than
 *     two inputs goes to AddDropoutLayer.
 *   - Returns false for an op type with no case, or when a builder returns
 *     false; returns true otherwise.
 *   NOTE(review): the text between `for (int j=0;j` and
 *     `layout == TENGINE_LAYOUT_NCHW)` is missing. It looks as if everything
 *     from a `<` up to the next `>` was stripped in extraction. Presumably
 *     the lost span held the loop condition and body and the head of the
 *     dim_size == 4 branch (the declarations of i_info and aclDataLayout) -
 *     restore it from the upstream file, do not guess.
 *   NOTE(review): the OP_BATCHNORM case reads node_list[++i] without checking
 *     that i + 1 < node_num, and consumes the next node even when it is not
 *     OP_SCALE.
 *   NOTE(review): the default case prints the loop index i, not op_type.
 *
 * prerun(subgraph, option) -> int
 *   - Prints "ACL initialized" to stderr. Picks F32 or F16 from
 *     option->precision (F32 when option is null); any other precision prints
 *     a message and returns -1.
 *   - Sets l32AclNHWCOptimizeFlag_ to true and calls CreateACLGraph.
 *   - Calls allocate() on every CLTensor in tensors_map_ whose info reports
 *     is_resizable().
 *   - For every subgraph output tensor with a null data pointer, allocates
 *     elem_size * elem_num bytes with sys_malloc.
 *   - Returns 0.
 *   NOTE(review): the return value of CreateACLGraph is ignored, so a failed
 *     graph build still returns 0 here.
 *
 * run(subgraph) -> int
 *   - For every subgraph input tensor: when l32AclNHWCOptimizeFlag_ is set and
 *     the tensor layout is TENGINE_LAYOUT_NCHW, allocates a scratch buffer
 *     with sys_malloc and permutes the tensor data into it with
 *     _PermuteDataLayoutNCHWToNHWC. Then maps the CL tensor, copies the
 *     scratch buffer (or the tensor data if no permute happened) in with
 *     copy_to_itensor, and unmaps. Both branches of the
 *     TENSOR_TYPE_INPUT check contain the same statements.
 *   - Calls functions_map_[i]->run() for i in [0, functions_map_.size()).
 *   - For every subgraph output tensor: maps the CL tensor and copies it into
 *     output->data. The NHWC -> NCHW permuting copy is used when the ACL
 *     layout is NHWC and differs from the tensor's layout; otherwise
 *     copy_from_itensor is used.
 *   - Returns 0.
 *   NOTE(review): scratch_mem is overwritten for each permuted input and freed
 *     only once after the loop, so every earlier allocation leaks when more
 *     than one input is permuted.
 *   NOTE(review): the locals data_type_ and l32ScratchMemSize_ shadow the
 *     members of the same name; l32ScratchMemSize_ is never read.
 *   NOTE(review): the result of GetCLTensor is dereferenced without a null
 *     check.
 *
 * postrun(subgraph) -> int
 *   Deletes every non-null CLTensor in tensors_map_, clears tensors_map_ and
 *   functions_map_, returns 0.
 *
 * GetCLTensor(name) -> CLTensor*
 *   Returns tensors_map_[name].
 *   NOTE(review): if tensors_map_ is a std::map / std::unordered_map (its
 *     declaration is not visible here - confirm), operator[] inserts a null
 *     entry for an unknown name.
 */
CLGraph::~CLGraph() { delete[] pcScratchMem_; } void CLGraph::init(std::string name, DataType type) { name_ = name; data_type_ = type; }; bool CLGraph::CreateACLGraph(struct subgraph* subgraph, DataType type, bool bDataLayoutOpFlag) { CLScheduler::get().default_init(); this->init("acl_graph", type); // tengine-lite's subgraph has not name. /* 1 Check Data Layout Work Mode*/ this->bForcedNHWCMode_ = bDataLayoutOpFlag; /* first, process input nodes' input tensor */ struct graph* ir_graph = subgraph->graph; int input_size = subgraph->input_num; for(int i = 0; i < input_size; i++) { struct tensor* tensor = get_ir_graph_tensor(ir_graph, subgraph->input_tensor_list[i]); if(tensor->tensor_type != TENSOR_TYPE_CONST) { CLTensor* itensor = new CLTensor(); int* dims = tensor->dims; const std::string& name = tensor->name; int dim_size = tensor->dim_num; for (int j=0;jlayout == TENGINE_LAYOUT_NCHW) ? DataLayout::NCHW : DataLayout::NHWC; i_info.set_data_layout(aclDataLayout); itensor->allocator()->init(i_info); } else if(dim_size == 3) { itensor->allocator()->init(TensorInfo(TensorShape(dims[2], dims[1], dims[0], 1), 1, type)); } else if(dim_size == 2) { itensor->allocator()->init(TensorInfo(TensorShape(dims[1], dims[0], 1, 1), 1, type)); } else if(dim_size == 1) { itensor->allocator()->init(TensorInfo(TensorShape(dims[0], 1, 1, 1), 1, type)); } else { TLOG_ERR("Bad shape dim: %d\n", dim_size); } this->tensors_map_[name] = itensor; } } /* now, let's scan all nodes! 
*/ int node_size = subgraph->node_num; for(int i = 0; i < node_size; i++) { bool ret = false; struct node* node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); uint16_t op_type = node->op.type; if(op_type == OP_CONST) continue; switch (op_type) { case OP_BATCHNORM: { struct node* node_next = get_ir_graph_node(ir_graph, subgraph->node_list[++i]); if(node_next->op.type != OP_SCALE) ret = false; else ret = this->AddBNLayer(node, node_next); break; } case OP_CAST: ret = this->AddCastLayer(node); break; case OP_CONCAT: { if (node->input_num < 2) ret = this->AddDropoutLayer(node); else ret = this->AddConcatLayer(node); break; } case OP_CLIP: ret = this->AddReLu6Layer(node); break; case OP_CONV: ret = this->AddConvolutionLayer(node); break; case OP_CROP: ret = this->AddCropLayer(node); break; case OP_DECONV: { /* deconv upsample */ ret = this->AddInterpLayer(node); break; } case OP_DROPOUT: ret = this->AddDropoutLayer(node); break; case OP_ELTWISE: ret = this->AddEltwiseLayer(node); break; case OP_FC: ret = this->AddFCLayer(node); break; case OP_INPUT: ret = this->AddInputLayer(node); break; case OP_POOL: ret = this->AddPoolingLayer(node); break; case OP_RELU: ret = this->AddReLuLayer(node); break; case OP_RESHAPE: ret = this->AddReshapeLayer(node); break; case OP_RESIZE: ret = this->AddResizeLayer(node); break; case OP_SOFTMAX: ret = this->AddSoftmaxLayer(node); break; case OP_INTERP: ret = this->AddInterpLayer(node); break; default: fprintf(stderr,"Fail to support this op(%d)!!!\n",i); return false; } if(!ret) { fprintf(stderr,"Create ACL for Op %s failed! 
\n", get_op_name_from_type(op_type)); return false; } } return true; } int CLGraph::prerun(struct subgraph *subgraph, struct acl_option* option) { fprintf(stderr, "ACL initialized\n"); DataType data_type = DataType::F32; if (nullptr != option) { switch(option->precision) { case TENGINE_DT_FP32: data_type = DataType::F32; fprintf(stderr, "ACL Backend set precision Float32\n"); break; case TENGINE_DT_FP16: data_type = DataType::F16; fprintf(stderr, "ACL Backend set precision Float16\n"); break; default: fprintf(stderr, "ACL Backend not support this %d data mode\n", option->precision); return -1; } } l32AclNHWCOptimizeFlag_ = true; this->CreateACLGraph(subgraph, data_type, l32AclNHWCOptimizeFlag_); auto ir_start = this->tensors_map_.begin(); auto ir_end = this->tensors_map_.end(); for(auto ir = ir_start; ir != ir_end; ir++) { CLTensor* tensor = ir->second; if(tensor->allocator()->info().is_resizable()) tensor->allocator()->allocate(); } struct graph* ir_graph = subgraph->graph; int output_node_size = subgraph->output_num; for (int i = 0; i < output_node_size; i++) { struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, subgraph->output_tensor_list[i]); void* mem_addr = output_tensor->data; if(mem_addr) continue; else output_tensor->data = (void*)sys_malloc(output_tensor->elem_size * output_tensor->elem_num); } return 0; } int CLGraph::run(struct subgraph *subgraph) { struct graph* ir_graph = subgraph->graph; int input_number = subgraph->input_num; DataType data_type_ = this->data_type_; int l32ScratchMemSize_ = this->l32ScratchMemSize_; void* scratch_mem = nullptr; for(int i = 0; i < input_number; i++) { struct tensor* tensor_input = get_ir_graph_tensor(ir_graph, subgraph->input_tensor_list[i]); uint8_t tensor_type = tensor_input->tensor_type; if(tensor_type == TENSOR_TYPE_INPUT) { bool bDataPermute = false; if(l32AclNHWCOptimizeFlag_ == 1) { int DataLayoutType = tensor_input->layout; if(DataLayoutType == TENGINE_LAYOUT_NCHW) { // need to permute data layout 
type to nhwc int* Dim = tensor_input->dims; int tengine_data_type = tensor_input->data_type; int DataEleSize = gs32TengineDataElemetSize[tengine_data_type]; int l32InputDataSize = tensor_input->elem_size * tensor_input->elem_num; // assert(l32InputDataSize == Dim[1] * Dim[2] * Dim[3] * 4 * Dim[0]); scratch_mem = sys_malloc(l32InputDataSize); assert(scratch_mem != NULL); void* pvTensorDataMem = tensor_input->data; // need to permute data to nhwc _PermuteDataLayoutNCHWToNHWC(pvTensorDataMem, Dim[0], Dim[1], Dim[2], Dim[3], scratch_mem, DataEleSize); bDataPermute = true; } } CLTensor* acl_input = this->GetCLTensor(tensor_input->name); void* buf = (bDataPermute == true) ? scratch_mem : tensor_input->data; int size = tensor_input->elem_size * tensor_input->elem_num; acl_input->map(); copy_to_itensor(acl_input, buf, size, data_type_); acl_input->unmap(); } else { /* normal Input Node */ bool bDataPermute = false; if(l32AclNHWCOptimizeFlag_ == 1) { int DataLayoutType = tensor_input->layout; if( DataLayoutType == TENGINE_LAYOUT_NCHW) { // need to permute data layout type to nhwc int* Dim = tensor_input->dims; int tengine_data_type = tensor_input->data_type; int DataEleSize = gs32TengineDataElemetSize[tengine_data_type]; int l32InputDataSize = tensor_input->elem_size * tensor_input->elem_num; scratch_mem = sys_malloc(l32InputDataSize); assert(scratch_mem != NULL); void* pvTensorDataMem = tensor_input->data; // need to permute data to nhwc _PermuteDataLayoutNCHWToNHWC(pvTensorDataMem, Dim[0], Dim[1], Dim[2], Dim[3], scratch_mem, DataEleSize); bDataPermute = true; } } CLTensor* acl_input = this->GetCLTensor(tensor_input->name); void* buf = (bDataPermute == true) ? 
scratch_mem : tensor_input->data; int size = tensor_input->elem_size * tensor_input->elem_num; acl_input->map(); copy_to_itensor(acl_input, buf, size, data_type_); acl_input->unmap(); } } if(scratch_mem) sys_free(scratch_mem); int size = functions_map_.size(); for(int i = 0; i < size; i++) { functions_map_[i]->run(); } int output_num = subgraph->output_num; for(int i = 0; i < output_num; i++) { struct tensor* output = get_ir_graph_tensor(ir_graph, subgraph->output_tensor_list[i]); std::string output_name = output->name; CLTensor* cltensor = this->GetCLTensor(output_name); TensorInfo* ptTensorInfo = cltensor->info(); int DataLayoutType = output->layout; DataLayout AclDataLayout = ptTensorInfo->data_layout(); int AclDataLayoutforTengine = (AclDataLayout == DataLayout::NHWC) ? TENGINE_LAYOUT_NHWC : TENGINE_LAYOUT_NCHW; void* output_buf = output->data; int out_size = output->elem_size * output->elem_num; // if we enable ACL_OP flag, we need to permute output data back if(DataLayoutType != AclDataLayoutforTengine) { if(AclDataLayoutforTengine == TENGINE_LAYOUT_NHWC) { cltensor->map(); copy_from_itensor_with_permuteNHWCTONCHW(cltensor, output_buf, out_size, data_type_); cltensor->unmap(); } else { cltensor->map(); copy_from_itensor(cltensor, output_buf, out_size, data_type_); cltensor->unmap(); } } else { cltensor->map(); copy_from_itensor(cltensor, output_buf, out_size, data_type_); cltensor->unmap(); } } return 0; } int CLGraph::postrun(struct subgraph *subgraph) { for (auto& var : tensors_map_) { if (nullptr != var.second) { delete var.second; var.second = nullptr; } } tensors_map_.clear(); functions_map_.clear(); return 0; } CLTensor* CLGraph::GetCLTensor(std::string name) { return tensors_map_[name]; }