diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2532ecf24367c0efd8cc6bda90209e77008a4a54..b0f8790b3cff653ea2351a029df21128ab81e940 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,21 +1,22 @@
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.6)
 project(paddle-mobile)
 
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" ON)
+option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
 option(CPU "armv7 with neon" ON)
 option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
+option(QUANTI "quantification" OFF)
 
 file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 
 if(IS_IOS)
-    set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
     set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()
@@ -43,7 +44,7 @@ if (LOG_PROFILE)
     add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()
 
-if(USE_OPENMP)
+if(USE_OPENMP AND NOT IS_IOS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
     add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
 endif()
@@ -104,12 +105,21 @@ else()
     foreach(f ${_tmp_list_h})
         list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
     endforeach()
-endif()
 
+    file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+endif()
+
 if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-    add_definitions(-DARMV7)
 else()
     list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
     list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
@@ -131,7 +141,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
 
 # NET default
 set(NET "defult" CACHE STRING "select net type")
-set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
+set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
 
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
 
@@ -153,3 +163,7 @@ if(DEBUGING)
     endif()
 endif()
 
+if (QUANTI)
+    add_subdirectory(tools/quantification)
+endif ()
+
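The new QUANTI switch gates the weight-quantification tool introduced later in this patch: configuring with `cmake .. -DQUANTI=ON` adds `tools/quantification` to the build, while the default OFF leaves the mobile library itself unchanged.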
diff --git a/README.md b/README.md
index 69362734116fd8af78442a07dd31600aa46b7935..c9d15d4960a6330ff6614b6dfc8fd20b81386c9c 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms.
 
 - **ARM CPU**
 
-![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
 
   The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning requires a large amount of hand-written CPU assembly, and we are coding at full speed to squeeze every bit of acceleration out of the hardware.
 
-  CPU optimization is still in progress and so far uses only conventional optimizations. On an ARM A73, paddle-mobile arm-v7 currently runs one single-core pass of MobileNet 1.0 in 120+ ms. This is clearly not our final goal; we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported at present; armv8 support will follow.
+  CPU optimization is still in progress and so far uses only conventional optimizations. On an ARM A73, paddle-mobile arm-v7 currently runs one single-core pass of MobileNet 1.0 in 110+ ms. This is clearly not our final goal; we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported at present; armv8 support will follow.
 
 - **Mali GPU**
 
diff --git a/src/common/types.cpp b/src/common/types.cpp
index cea42171f0205e0d40b2703d5c90f0b9fc253e68..e64d9c91a6faac2b7f2eaccac35b7592ab445efc 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -24,6 +24,8 @@ const std::string G_OP_TYPE_CONCAT = "concat";
 const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
 const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
 const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
 const std::string G_OP_TYPE_FC = "fusion_fc";
 const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
 const std::string G_OP_TYPE_LRN = "lrn";
@@ -42,11 +44,21 @@ const std::string G_OP_TYPE_FETCH = "fetch";
 const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
 const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
 const std::string G_OP_TYPE_DROPOUT = "dropout";
+const std::string G_OP_TYPE_FUSION_CONV_RELU = "fusion_conv_relu";
+const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE = "fusion_conv_bn_scale";
+const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU =
+    "fusion_conv_bn_scale_relu";
+const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+    "fusion_elementwise_add_relu";
+const std::string G_OP_TYPE_REGION = "region";
 
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key = {
         {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
         {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
@@ -70,6 +82,12 @@ std::unordered_map<
         {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
         {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
+        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_SCALE, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
 
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index ec428b9911f64d7ccc8c6f5dc4be7f970e855d3c..d34c76710ad5fd40fb9d0c4ba67757f7a97558ff 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace paddle_mobile {
@@ -81,6 +82,8 @@ extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
 extern const std::string G_OP_TYPE_FC;
 extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
 extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
 
 extern const std::string G_OP_TYPE_LRN;
 extern const std::string G_OP_TYPE_MUL;
@@ -99,6 +102,13 @@ extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
 extern const std::string G_OP_TYPE_IM2SEQUENCE;
 extern const std::string G_OP_TYPE_DROPOUT;
 
+extern const std::string G_OP_TYPE_FUSION_CONV_RELU;
+extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE;
+extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU;
+extern const std::string G_OP_TYPE_FUSION_POOL_BN;
+extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const std::string G_OP_TYPE_REGION;
+
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key;
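Each fused operator registered above also records the variable names under which its inputs and outputs appear in the program desc. A minimal sketch of how such a table is queried (the map type mirrors the declaration in src/common/types.h; the helper name is hypothetical):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using IOKeys = std::pair<std::vector<std::string>, std::vector<std::string>>;

// Returns the output-key list for an op type, or empty if unregistered —
// the same lookup pattern GetOutKeys() uses later in this patch.
std::vector<std::string> out_keys(
    const std::unordered_map<std::string, IOKeys> &table,
    const std::string &type) {
  auto it = table.find(type);
  return it == table.end() ? std::vector<std::string>{} : it->second.second;
}

int main() {
  std::unordered_map<std::string, IOKeys> table = {
      {"fusion_conv_bn_relu", {{"Input"}, {"Out"}}}};
  for (const auto &k : out_keys(table, "fusion_conv_bn_relu"))
    std::printf("%s\n", k.c_str());  // prints: Out
}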
diff --git a/src/common/variant.h b/src/common/variant.h
index 9d0aa3019fbfdd5acbaed8a1140bc58c33f7f438..00b8eb985d8f7fc22bb93a3e229aa387c358e257 100644
--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -84,7 +84,7 @@ struct Variant {
     if (type_id == typeid(T).hash_code()) {
       return *const_cast<T *>(reinterpret_cast<const T *>(&data));
     } else {
-      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
+      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
       exit(0);
     }
   }
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d484d889d8df8f4171658ae395531b84b0ac0a0d
--- /dev/null
+++ b/src/fpga/api/fpga_api.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include "fpga/api/fpga_api.h"
+
+namespace paddle {
+namespace mobile {
+namespace fpga {
+namespace api {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+// forward the request to the opened FPGA device
+static inline int do_ioctl(int req, void *arg) { return ioctl(fd, req, arg); }
+
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// memory management;
+void *fpga_malloc(size_t size) {
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+}
+
+void fpga_free(void *ptr) { munmap(ptr, 0); }
+
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+}  // namespace api
+}  // namespace fpga
+}  // namespace mobile
+}  // namespace paddle
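A hedged usage sketch of this new memory API as declared in the accompanying header; the buffer size is illustrative and error handling is minimal:

#include <cstring>

#include "fpga/api/fpga_api.h"

namespace api = paddle::mobile::fpga::api;

int main() {
  if (api::open_device() < 0) return 1;  // open() yields -1 on failure
  // fpga_malloc maps device memory into the process via mmap64
  void *dev_buf = api::fpga_malloc(64 * sizeof(float));
  float host[64] = {0};
  api::fpga_copy(dev_buf, host, sizeof(host));  // plain memcpy underneath
  api::fpga_free(dev_buf);
  return 0;
}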
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..65fb1b5d611e8c063d196efa8b8d7ccfa0ff91b3
--- /dev/null
+++ b/src/fpga/api/fpga_api.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sys/ioctl.h>
+#include <cstddef>
+#include <iostream>
+
+// memory management;
+
+namespace paddle {
+namespace mobile {
+namespace fpga {
+namespace api {
+
+int open_device();
+int close_device();
+
+void *fpga_malloc(size_t size);
+void fpga_free(void *ptr);
+void fpga_copy(void *dst, const void *src, size_t num);
+
+struct CnnVersionArgs {
+  void *buf;
+};
+
+struct QuantArgs {
+  float scale;
+};
+
+struct BatchNormalizationArgs {
+  bool enable;
+};
+
+struct ScaleArgs {};
+
+#define IOCTL_CNN_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+
+}  // namespace api
+}  // namespace fpga
+}  // namespace mobile
+}  // namespace paddle
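The _IOW macros above encode the magic number, a command index and the argument type's size into one request code. A sketch of driving one of the declared commands, assuming a driver that actually implements IOCTL_VERSION; fd is the descriptor returned by open_device():

#include <sys/ioctl.h>

#include "fpga/api/fpga_api.h"

int query_version(int fd) {
  // args layout must match the struct registered with _IOW above
  paddle::mobile::fpga::api::CnnVersionArgs args = {nullptr};
  return ioctl(fd, IOCTL_VERSION, &args);  // negative on failure
}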
diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index 36b4663cb603d29bb60cfc297899d1c300e8ca91..765103c241a82ac224d707340f8b66ace827e335 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
   return it->second.second;
 }
 
+template <typename Dtype>
+vector<string> OperatorBase<Dtype>::GetInputKeys() const {
+  auto it = op_input_output_key.find(type_);
+  if (it == op_input_output_key.end()) {
+    DLOG << type_ << " has no inputs";
+    return {};
+  }
+  return it->second.first;
+}
+
 template <typename Dtype>
 OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                   const VariableNameMap &inputs,
@@ -49,6 +59,11 @@ template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
   RunImpl();
 #ifdef PADDLE_MOBILE_DEBUG
+  vector<string> input_keys = GetInputKeys();
+  for (const auto key : input_keys) {
+    Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
+    DLOG << type_ << " input- " << key << "=" << *input;
+  }
   vector<string> output_keys = GetOutKeys();
   for (const auto key : output_keys) {
     Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
diff --git a/src/framework/operator.h b/src/framework/operator.h
index 793551b0cd3eea290243c156c27616a34c37a3d2..084ac3c81185fe489fe1ca67589c1e8edb1d4fdf 100644
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -61,6 +61,7 @@ class OperatorBase {
   virtual ~OperatorBase() {}
   void Run() const;
   std::vector<string> GetOutKeys() const;
+  std::vector<string> GetInputKeys() const;
   virtual void RunImpl() const = 0;
 
   virtual void Init() = 0;
@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
   virtual void InferShape() const = 0;
 
   void Init() {
+    //    for (auto i : this->inputs_) {
+    //      DLOG << i.first;
+    //      DLOG << i.second;
+    //    }
     PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), "  %s kernel init failed",
                           this->type_.c_str());
   }
@@ -146,7 +151,7 @@ class OpKernelBase {
   }
 #endif
   virtual void Compute(const P &para) const = 0;
-  virtual bool Init(P *para) { return true; };
+  virtual bool Init(P *para) { return true; }
   virtual ~OpKernelBase() = default;
 
  private:
diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h
index 1cd6b1dd779f9bc9ff0f5be5513c4fa716d80b10..f16a65c28fb47e1cf4139588742ebe1073c3f3e6 100644
--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -42,8 +42,17 @@ class FusionOpRegister {
     matchers_[matcher->Type()] = shared_matcher;
   }
 
-  const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
-    return matchers_;
+  const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
+    std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
+    for (const auto& match : matchers_) {
+      matchers.push_back(match.second);
+    }
+    std::sort(matchers.begin(), matchers.end(),
+              [](std::shared_ptr<FusionOpMatcher> first,
+                 std::shared_ptr<FusionOpMatcher> second) {
+                return first->BeginNode().Depth() > second->BeginNode().Depth();
+              });
+    return matchers;
   }
 
  private:
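Returning the matchers sorted by pattern depth, deepest first, guarantees that a longer fusion (e.g. conv+add+bn+relu) is attempted before any of its shorter prefixes (e.g. conv+add). A self-contained illustration of the ordering rule only, with hypothetical depths standing in for matcher->BeginNode().Depth():

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // e.g. conv_add (2), conv_add_bn_relu (4), conv_bn_relu (3)
  std::vector<int> depths = {2, 4, 3};
  std::sort(depths.begin(), depths.end(),
            [](int a, int b) { return a > b; });
  for (int d : depths) std::printf("%d ", d);
  // prints "4 3 2": the deepest pattern is matched first, so a conv+add
  // pair inside a conv+add+bn+relu chain is not fused prematurely.
}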
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index e635e07eaf4484c3e390101c3b43fdaf24bbd2c6..a4e1db506da362df4fb61b39827d5e77ebc425eb 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
   return true;
 }
 
-std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
-  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-  OpDescs(size - 1, &op_descs);
-  return op_descs;
-}
-
-void Node::OpDescs(int index,
-                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
-  if (index == 0) {
-    return;
-  }
-  op_desc->push_back(this->op_desc_);
-  for (auto &output : outputs_) {
-    output->OpDescs(index, op_desc);
-  }
-}
-
 std::shared_ptr<Node> Node::To(int size) {
   std::shared_ptr<Node> node = std::make_shared<Node>();
   this->To(size - 1, node);
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index 88bf1e16ed2a5fb3a038eadd546d63ffb3916f68..7eb179c243c28fe2668c3cf2f8f28f81312c0988 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -47,13 +47,10 @@ class Node {
       std::map<std::string, std::vector<std::pair<std::string, std::string>>> change,
       std::vector<std::shared_ptr<Node>> *removed_nodes);
 
-  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
   std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
   std::string Type() { return type_; }
 
  private:
-  void OpDescs(int size,
-               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
   void To(int index, std::shared_ptr<Node>);
   void Folder(
       std::shared_ptr<framework::OpDesc> op_desc,
diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 3619bc79f576651245aa322992df9d318c810cd4..82d33bc65d864e010fbe41b270b71ed98a21b33e 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
   }
 
   for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
-    std::string fusion_type = registed.first;
-    std::shared_ptr<FusionOpMatcher> matcher = registed.second;
-    //  DLOG << " registed node \n " << matcher->BeginNode();
+    std::string fusion_type = registed->Type();
+    std::shared_ptr<FusionOpMatcher> matcher = registed;
 
     auto match_vector = type_map[matcher->BeginType()];
 
diff --git a/src/framework/program/program.h b/src/framework/program/program.h
index 5760efc826667d805695118b12e41efa0305553b..e500d500344d83204bf388401541259b90ea2f78 100644
--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -30,6 +30,7 @@ class Program {
   std::string model_path;
   std::string para_path;
   bool combined = false;
+  bool quantification = false;
 
  private:
 };
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 480f48290cc1bbf4888832d76187a13a4915ec40..65f019d1e3c3f6f6bdb8a18a9ff99bb7ecb2012c 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
 
   tensor->Resize(framework::make_ddim(desc.Dims()));
 
-  void *memory = tensor;
+  void *memory = nullptr;
   int type_size = 0;
   switch (desc.DataType()) {
     case framework::VARTYPE_TYPE_FP16:
@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
     default:
       break;
   }
-
-  for (int n = 0; n < memory_size * type_size; ++n) {
-    static_cast<char *>(memory)[n] = (*data)[n];
+  if (program_.quantification) {
+    float min_value;
+    float max_value;
+
+    memcpy(&min_value, *data, sizeof(float));
+    memcpy(&max_value, *data + sizeof(float), sizeof(float));
+    *data += 2 * sizeof(float);
+    const float factor = (max_value - min_value) / 255.0;
+    uint8_t *uint8_data = (uint8_t *)(*data);
+    for (int k = 0; k < memory_size; ++k) {
+      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
+    }
+    *data += (memory_size * sizeof(uint8_t));
+  } else {
+    for (int n = 0; n < memory_size * type_size; ++n) {
+      static_cast<char *>(memory)[n] = (*data)[n];
+    }
+    (*data) += (sizeof(char) * memory_size * type_size);
   }
-  (*data) += (sizeof(char) * memory_size * type_size);
 }
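The quantified-parameter path reads two floats (the tensor's min and max) followed by one uint8 code per element, reconstructing each weight as q * (max - min) / 255 + min. A standalone check of that arithmetic with made-up values:

#include <cstdint>
#include <cstdio>

int main() {
  const float min_value = -0.5f, max_value = 0.5f;
  const float factor = (max_value - min_value) / 255.0f;  // as in LoadMemory
  const uint8_t quantized[4] = {0, 64, 128, 255};
  for (uint8_t q : quantized) {
    float w = q * factor + min_value;  // 0 -> -0.5, 255 -> 0.5 exactly
    std::printf("%u -> %f\n", q, w);
  }
}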
diff --git a/src/io/loader.cpp b/src/io/loader.cpp
index 51e007a6ab4bce415628649a40f711903bceee92..9ed877d05d51dfbe7139ea2289fdb6480c62f88f 100644
--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname, bool optimize, bool can_add_split) {
-  auto program =
-      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
+    const std::string &dirname, bool optimize, bool quantification,
+    bool can_add_split) {
+  auto program = this->LoadProgram(dirname + "/__model__", optimize,
+                                   quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &model_path, const std::string &para_path,
-    bool optimize) {
+    const std::string &model_path, const std::string &para_path, bool optimize,
+    bool quantification) {
   auto program = this->LoadProgram(model_path, optimize);
+
   program.para_path = para_path;
   program.combined = true;
+  program.quantification = quantification;
   return program;
 }
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
-    const std::string &model_path, bool optimize, bool can_add_split) {
+    const std::string &model_path, bool optimize, bool quantification,
+    bool can_add_split) {
   std::string model_filename = model_path;
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
   uint8_t *buf = NULL;
@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
 
   framework::Program<Dtype, P> program;
   program.originProgram = originProgramDesc;
+  program.quantification = quantification;
 
   auto scope = std::make_shared<framework::Scope>();
   program.scope = scope;
diff --git a/src/io/loader.h b/src/io/loader.h
index 5e3c53dc9db858f506a13d2105339038340344a6..512cee831f0a09f8223c07c531eb9d1c74e75d92 100644
--- a/src/io/loader.h
+++ b/src/io/loader.h
@@ -30,6 +30,7 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &dirname,
                                           bool optimize = false,
+                                          bool quantification = false,
                                           bool can_add_split = false);
 
   /*
@@ -38,11 +39,13 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &model_path,
                                           const std::string &para_path,
-                                          bool optimize = false);
+                                          bool optimize = false,
+                                          bool quantification = false);
 
  private:
   const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
                                                  bool optimize = false,
+                                                 bool quantification = false,
                                                  bool can_add_split = false);
 };
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index cabdd799a0e7d561d8bc56c0913f1389c38f8907..5e2e209d64aa7a00b56a5bdbbff88cb3097b7b94 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
 
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
 
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(dirname, optimize), batch_size, optimize);
+        loader_->Load(dirname, optimize, quantification), batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
                                   const std::string &para_path, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
 
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(model_path, para_path, optimize), batch_size, optimize);
+        loader_->Load(model_path, para_path, optimize, quantification),
+        batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 74c11471566c3db8a37ea2d62e0496e5d40cb3b7..5dc3ccb21dd7e67fbe9b5032d01046b12728dc64 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -39,14 +39,18 @@ class PaddleMobile {
    * @b load the fluid model saved as separate files
    * */
   bool Load(const std::string &dirname, bool optimize = false,
-            int batch_size = 1);
+            bool quantification = false, int batch_size = 1);
 
   /*
    * @b load combine format fluid mode
    * @b load the fluid model saved as one combined file
    * */
   bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, int batch_size = 1);
+            bool optimize = false, bool quantification = false,
+            int batch_size = 1);
 
+  /*
+   * @b set the thread count; takes effect when OpenMP is enabled in cmake
+   * */
   void SetThreadNum(int num);
 
   /*
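A hedged sketch of calling the extended Load() signatures; the model paths are made up, and the default template arguments (CPU device, FP32 precision) are assumed from the class declaration:

#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  // separated model directory: optimize, quantification, batch_size
  bool ok = paddle_mobile.Load("./models/mobilenet_quant", true, true, 1);
  // combined model:
  // ok = paddle_mobile.Load("./model", "./params", true,
  //                         /*quantification=*/true);
  return ok ? 0 : 1;
}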
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 0252f3c07c06487720586b0f650e2179d247234f..178541953323b6ffd1a3339f8209c2839b37a784 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -16,10 +16,32 @@ limitations under the License. */
 #include <cstdlib>
 #include <cstring>
 
+#ifdef PADDLE_MOBILE_FPGA
+
+#include "fpga/api/fpga_api.h"
+
+#endif
+
 namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
+#ifdef PADDLE_MOBILE_FPGA
+namespace api = paddle::mobile::fpga::api;
+
+void Copy(void *dst, const void *src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+void *Alloc(size_t size) { return api::fpga_malloc(size); }
+
+void Free(void *ptr) {
+  if (ptr) {
+    api::fpga_free(ptr);
+  }
+}
+
+#else
+
 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }
@@ -42,5 +64,7 @@ void Free(void *ptr) {
   }
 }
 
+#endif
+
 }  // namespace memory
 }  // namespace paddle_mobile
diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp
index 644a27c586375bc66d327e18ac5182e8fce2893b..f820908404ea637d9680c32d5c4b5568e191dd7e 100644
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.OutputY()->Resize(x_dims);
 }
-template class BatchNormOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp
index dece07d5efcfae9629842aead04d0274b9d82c93..9e57c9021dac1b6857752989727c1c86051e33f7 100644
--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
   this->param_.OutputBox()->Resize(framework::make_ddim(
       {input_targetbox_dims[0], input_priorbox_dims[0], 4}));
 }
-template class BoxCoderOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 9c524df351549fd0141294be805d77b3f1057362..19d771ddd5884412624a0720368ecc80f92678ea 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(out_dims);
 }
 
-template class ConcatOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp
index 1b00ed06eee2b1676667b9c54b8601c8872b6699..c4601995219b32db75f22c7c2ed959e18af85f36 100644
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class ConvOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp
index bee90781cd2de9d65bbbee3193cc922e743706de..8d6b6a143c37537be6de1e60cc095f1052136e26 100644
--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class DepthwiseConvOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp
index f7f5ca2475171f5756ee8cf4f13754d07df8fe01..a632aa0c52b19c591467f94afb216245a596680b 100644
--- a/src/operators/dropout_op.cpp
+++ b/src/operators/dropout_op.cpp
@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class DropoutOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp
index 369589574139c7bc68debb7c55836926a3d5f6b2..49885f783417d61c6348fc4563e7306036994f17 100644
--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -24,7 +24,7 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
   auto x_dim = this->param_.InputX()->dims();
   this->param_.Out()->Resize(x_dim);
 }
-template class ElementwiseAddOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp
index c4357d7993cd91a306fec5856eaa6839e9ab6a6e..4447f2c699fc929805f15a265440803e6ff34b56 100644
--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -14,10 +14,7 @@ limitations under the License. */
 
 #include "feed_op.h"
 namespace paddle_mobile {
-namespace operators {
-
-template class FeedOp<CPU, float>;
-}
+namespace operators {}
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp
index cdbe413c955b931a16e716aa2e18d2a018a53bab..adbd61d5ec364a40b565059ceb5d5d49999c8436 100644
--- a/src/operators/fetch_op.cpp
+++ b/src/operators/fetch_op.cpp
@@ -14,10 +14,7 @@ limitations under the License. */
 
 #include "fetch_op.h"
 namespace paddle_mobile {
-namespace operators {
-
-template class FetchOp<CPU, float>;
-}
+namespace operators {}
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp
index b1dba23be0d8ea010b38844b1897381fbf578617..cdd6a6db2bb11ebf8dce2aca85630aa8805adf3e 100644
--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
@@ -45,7 +45,6 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class FusionConvAddOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h
index ae030ba5767e4039cfa3effe0a7ded4886f261cf..170df9ce33e4ab90297664fbc81d723e7c246f83 100644
--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
@@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
   }
diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp
index 62839c1a5acaf89a3efef39bbe4a67c675da393b..16f4650a64ec0c363d5fa94ee27c15c73cf58a70 100644
--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -44,7 +44,7 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
 }
-template class FusionConvAddBNReluOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h
index 389c76cc83a532fe706d911903a8412bb8bfb4ca..19e33465c06921e9a6a7beb77053f05a03a6c760 100644
--- a/src/operators/fusion_conv_add_bn_relu_op.h
+++ b/src/operators/fusion_conv_add_bn_relu_op.h
@@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
                   {G_OP_TYPE_BATCHNORM,
diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..49fe9c933a5a9695f2c18bd0921c2d36063dc065
--- /dev/null
+++ b/src/operators/fusion_conv_bn_relu_op.cpp
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/fusion_conv_bn_relu_op.h"
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionConvBNReluOp<Dtype, T>::InferShape() const {
+  auto in_dims = this->param_.Input()->dims();
+  auto filter_dims = this->param_.Filter()->dims();
+  const std::vector<int> &strides = this->param_.Strides();
+  std::vector<int> paddings = this->param_.Paddings();
+  int groups = this->param_.Groups();
+  std::vector<int> dilations = this->param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(
+        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                             paddings[i], strides[i]));
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
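InferShape relies on the standard convolution output-size formula, out = (in + 2·pad − (dilation·(k−1)+1)) / stride + 1, applied per spatial dimension. A worked check of that arithmetic; conv_output_size below is an assumed re-statement of math::ConvOutputSize, which actually lives in operators/math/conv_func.h:

#include <cstdio>

int conv_output_size(int in, int k, int dilation, int pad, int stride) {
  int dk = dilation * (k - 1) + 1;  // effective (dilated) kernel extent
  return (in + 2 * pad - dk) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, pad 1, stride 2 -> 112x112 (MobileNet-style)
  std::printf("%d\n", conv_output_size(224, 3, 1, 1, 2));  // prints 112
}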
diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c2c1033ac0a4d6c8e3bc3f188a66884dd9e0642
--- /dev/null
+++ b/src/operators/fusion_conv_bn_relu_op.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/conv_bn_relu_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionConvBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionConvBNReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionConvBNReluParam,
+                               operators::ConvBNReluKernel<DeviceType, T>> {
+ public:
+  FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionConvBNReluParam,
+            operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, FusionConvBNReluParam,
+      operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_CONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
+    new FusionConvBNReluMatcher());
+#define FUSION_CONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_conv_bn_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e55295830e19b5b39a5ae2501e30170ffb1a7854
--- /dev/null
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#include "operators/fusion_dwconv_bn_relu_op.h"
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
+  auto in_dims = this->param_.Input()->dims();
+  auto filter_dims = this->param_.Filter()->dims();
+  const std::vector<int> &strides = this->param_.Strides();
+  std::vector<int> paddings = this->param_.Paddings();
+  int groups = this->param_.Groups();
+  std::vector<int> dilations = this->param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(
+        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                             paddings[i], strides[i]));
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f9f03e4936e082de802ced385060fecb9cc27a9
--- /dev/null
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "op_param.h"
+#include "operators/kernel/dwconv_bn_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDWConvBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDWConvBNReluOp : public framework::OperatorWithKernel<
+                                 DeviceType, FusionDWConvBNReluParam,
+                                 operators::DWConvBNReluKernel<DeviceType, T>> {
+ public:
+  FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const framework::AttributeMap &attrs,
+                       std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDWConvBNReluParam,
+            operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                          attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, FusionDWConvBNReluParam,
+      operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_DWCONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
+    new FusionDWConvBNReluMatcher());
+#define FUSION_DWCONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+
+#ifndef FUSION_DWCONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
+    new FusionDWConvBNReluMatcher());
+#define FUSION_DWCONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_dwconv_bn_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp
index 57a8b1b53f2f98b3218ee8fc40c6c9774ec5a5c7..d564d4d88c16ee09382a9b2dae275807ec4bdb4b 100644
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(ddim);
 }
 
-template class FusionFcOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/im2sequence_op.cpp b/src/operators/im2sequence_op.cpp
index 273ce462d0aa423a6bf023186c6a579e975dfb11..3c929af9cf0a8a1550f197ffdb42ee590cd43235 100644
--- a/src/operators/im2sequence_op.cpp
+++ b/src/operators/im2sequence_op.cpp
@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class Im2SequenceOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
index 1fd1c66d4dc92a9918243b23e400ef5309422050..dbf3745eb15cf56bba32dc8cbae50d242ce2da76 100644
--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_bn_relu_func.h"
+#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23f06c1f0b8a0ed3f22ca9d23d24ae44c59f3618
--- /dev/null
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/kernel/conv_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  //  DLOG << "variance: " << *variance;
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void ConvBNReluKernel<CPU, float>::Compute(
+    const FusionConvBNReluParam &param) const {
+  ConvBNReluCompute<float>(param);
+}
+template class ConvBNReluKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
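Init() folds batch norm into one per-channel affine transform so that Compute() never touches mean or variance at runtime. In the code's naming (scale = \(\gamma\), bias = \(\beta\), mean = \(\mu\), variance = \(\sigma^2\)):

\[
\text{new\_scale}_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}}, \qquad
\text{new\_bias}_c = \beta_c - \mu_c \cdot \text{new\_scale}_c,
\]

so that \(\mathrm{BN}(x)_c = \text{new\_scale}_c \cdot x + \text{new\_bias}_c\), which is exactly what the per-channel loop above computes.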
diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ec08fcecb9fefaa247e0acbb8a085e752b8dba3
--- /dev/null
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#include "operators/kernel/dwconv_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void DWConvBNReluKernel<CPU, float>::Compute(
+    const FusionDWConvBNReluParam &param) const {
+  DWConvBNReluCompute<float>(param);
+}
+template class DWConvBNReluKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
index b2af17eb4aaf0a7ef98442f589162a3b6f371a3b..cc591035065e4cbbe71ff8f6bd6cbab9c6fe9e79 100644
--- a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
+++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
 
   int HXW = H * W;
 
-#ifdef ARMV7
+#if __ARM_NEON
+#if __aarch64__
+  float *inv_std_ptr = new float[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+
+  Tensor new_scale;
+  auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
+  Tensor new_bias;
+  auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
+
+  /// ((x - est_mean) * (inv_var) * scale + bias equal to
+  /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+    {
+      for (int n = 0; n < N; n++) {
+        for (int h = 0; h < H; h++) {
+          int tmp_index = n * stride0 + i * stride1 + h * stride2;
+          for (int w = 0; w < W; w++) {
+            int index = tmp_index + w;
+            out_ptr[index] =
+                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+          }
+        }
+      }
+    }
+  }
+  delete[] inv_std_ptr;
+#else
+
   if (HXW > 32) {
     int NXC = N * C;
     float *inv_std_ptr = new float[NXC * 4];
@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
 
     delete[] inv_std_ptr;
   }
+#endif
 #else
   float *inv_std_ptr = new float[C];
   for (int i = 0; i < C; i++) {
diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
similarity index 96%
rename from src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
rename to src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
index fb49a33c67face81a2615516bffd6aa151868fe3..d3b5bc69760797c4efcc3fb77831d54676d7d5b1 100644
--- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #pragma once
+
+#include <vector>
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/op_param.h"
 
@@ -23,14 +25,9 @@ namespace operators {
 void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
   Tensor new_bias = *param.NewBias();
   Tensor new_scale = *param.NewScale();
-  int axis = param.Axis();
   Tensor *output = param.Output();
-  math::expand_bias(bias, axis, output->dims());
-  output->ShareDataWith(bias);
-
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
@@ -107,7 +104,7 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
 
       math::matmulWithBn<float>(
           filter_slice, false, col_matrix, false, static_cast<float>(1),
-          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias);
+          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
     }
   }
 }
@@ -121,7 +118,7 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
       param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
     math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
                                         param.Output(), param.NewScale(),
-                                        param.NewBias(), 1);
+                                        param.NewBias(), true);
   } else if (param.Groups() == param.Input()->dims()[1] &&
              param.Input()->dims()[1] == param.Output()->dims()[1] &&
              param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..f18d67749b96cd0ee2d84c2731af8a2c3e136db1
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#pragma once
+#include <vector>
+#include "operators/math/depthwise_conv_3x3.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+void ConvBNReluBasic(const FusionConvBNReluParam &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor new_bias = *param.NewBias();
+  Tensor new_scale = *param.NewScale();
+
+  Tensor *output = param.Output();
+
+  int groups = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
+      math::matmulWithBn<float>(
+          filter_slice, false, col_matrix, false, static_cast<float>(1),
+          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+    }
+  }
+}
+
+template <typename P>
+void ConvBNReluCompute(const FusionConvBNReluParam &param) {
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
+                                        param.Output(), param.NewScale(),
+                                        param.NewBias(), true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+    //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
+    //                                        param.Output(), param.NewScale(),
+    //                                        param.NewBias(), 1);
+    math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
+                                          param.Output(), param.NewScale(),
+                                          param.NewBias(), true);
+  } else {
+    ConvBNReluBasic(param);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
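matmulWithBn is expected to apply the folded per-channel scale/bias (and the ReLU) as a GEMM epilogue; the extra g argument it now receives lets it index the per-channel arrays by group. A scalar sketch of that epilogue under those assumptions — the real NEON kernel lives in operators/math and is not shown in this patch:

#include <algorithm>

// Scalar sketch only: C holds the out_step x n GEMM result for group g;
// new_scale/new_bias are the folded per-channel arrays from the kernel's
// Init().
void bn_relu_epilogue(float *C, int out_step, int n, const float *new_scale,
                      const float *new_bias, int g) {
  for (int c = 0; c < out_step; ++c) {
    const float s = new_scale[g * out_step + c];
    const float b = new_bias[g * out_step + c];
    for (int j = 0; j < n; ++j) {
      C[c * n + j] = std::max(0.f, C[c * n + j] * s + b);
    }
  }
}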
diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..7693da2a84c15b8f7b6953eb51e2765b5ea159f8
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
@@ -0,0 +1,137 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#pragma once
+#include <vector>
+#include "operators/math/depthwise_conv_3x3.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+void DWConvBNReluBasic(const FusionDWConvBNReluParam &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor new_bias = *param.NewBias();
+  Tensor new_scale = *param.NewScale();
+
+  Tensor *output = param.Output();
+
+  int groups = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmulWithBn<float>(
+          filter_slice, false, col_matrix, false, static_cast<float>(1),
+          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+    }
+  }
+}
+template <typename P>
+void DWConvBNReluCompute(const FusionDWConvBNReluParam &param) {
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
+                                        param.Output(), param.NewScale(),
+                                        param.NewBias(), true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+    //    math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
+    //                                        param.Output(), param.NewScale(),
+    //                                        param.NewBias(), 1);
+    math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
+                                          param.Output(), param.NewScale(),
+                                          param.NewBias(), true);
+  } else {
+    DWConvBNReluBasic(param);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmulWithBn<float>(
+          filter_slice, false, col_matrix, false, static_cast<float>(1),
+          &out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
+    }
+  }
+}
+template <typename P>
+void DWConvBNReluCompute(const FusionDWConvBNReluParam &param) {
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
+                                        param.Output(), param.NewScale(),
+                                        param.NewBias(), true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
+    // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
+    //                                     param.Output(), param.NewScale(),
+    //                                     param.NewBias(), 1);
+    math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
+                                          param.Output(), param.NewScale(),
+                                          param.NewBias(), true);
+  } else {
+    DWConvBNReluBasic(param);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
index 892dca2ea40d40484b4c32a57f8633849cc9d038..6179df5b0c11ad2a2e19384989029696e9d6c266 100644
--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) {
       }
     } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
-#ifndef IOS
+#if __ARM_NEON
+#if __aarch64__
+      PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+#else
       if (pooling_type == "max") {
         math::Pool2x2Max(strides, paddings, in_x, out);
       } else if (pooling_type == "avg") {
         math::Pool2x2Avg(strides, paddings, in_x, out);
       }
+#endif
 #else
       PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-#endif
+#endif  // __ARM_NEON
+
     } else {
       PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
     }
diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
index daf6ad0e472515c8034a400dfc73de608f5b12d2..c612c4b092143ef8925f81a6d6fefe9cd9dff25b 100644
--- a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) {
       input_outer_ptr++;
     }
   }
+#else
 #endif
 }
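Note: the NewScale()/NewBias() tensors consumed by the fused kernels above are precomputed from the batch-norm parameters when the kernel is initialized. The following standalone sketch shows the folding arithmetic that such an Init step performs; FoldBatchNorm is a hypothetical helper for illustration, not code from this patch.

#include <cmath>
#include <vector>

// y = gamma * (x - mean) / sqrt(var + eps) + beta  ==  x * new_scale + new_bias
void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &variance, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  const size_t c = gamma.size();
  new_scale->resize(c);
  new_bias->resize(c);
  for (size_t i = 0; i < c; ++i) {
    (*new_scale)[i] = gamma[i] / std::sqrt(variance[i] + epsilon);
    (*new_bias)[i] = beta[i] - mean[i] * (*new_scale)[i];
  }
}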
diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9d4df5d8f597deebaf2b53491851b7ce03fc7aa
--- /dev/null
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvBNReluKernel
+    : public OpKernelBase<DeviceType, FusionConvBNReluParam> {
+ public:
+  void Compute(const FusionConvBNReluParam &param) const;
+  bool Init(FusionConvBNReluParam *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
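Both this kernel header and the depthwise one that follows expose the same two-phase OpKernelBase contract: Init() runs once when the program is loaded (e.g. folding BN into NewScale/NewBias), Compute() runs per inference. A minimal sketch of that calling convention, with RunFusedKernel as a purely hypothetical caller:

// Illustrative only; the framework's real executor drives Init/Compute itself.
template <typename KernelT, typename ParamT>
bool RunFusedKernel(KernelT *kernel, ParamT *param) {
  if (!kernel->Init(param)) {  // one-time weight/scale preparation
    return false;
  }
  kernel->Compute(*param);  // per-batch execution
  return true;
}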
diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..91478ae5ecba37472e7e30f774f2c515b6952eee
--- /dev/null
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DWConvBNReluKernel
+    : public OpKernelBase<DeviceType, FusionDWConvBNReluParam> {
+ public:
+  void Compute(const FusionDWConvBNReluParam &param) const;
+  bool Init(FusionDWConvBNReluParam *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index 1a5a8eccc1fc314d27517db8bc286035e573c9be..dde9123edf3568020f933bb7375be99e40f2367b 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(x_dims);
 }
-template class LrnOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
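The line deleted from lrn_op.cpp above (and the matching deletions in mul_op.cpp and multiclass_nms_op.cpp later in this patch) is an explicit template instantiation; removing it defers code generation for the op to whichever translation unit registers it. A minimal illustration of the mechanism, using a toy type rather than the project's classes:

template <typename T>
struct ToyOp {
  T Run(T x) { return x + x; }
};
// Explicit instantiation: object code for ToyOp<float>::Run is emitted in
// this translation unit even if nothing here ever calls it.
template struct ToyOp<float>;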
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index 5db676564e190bf40e8af437ba68aee80b5a5af3..7e353c29b80279f895ad6d0150b31eb1703d97d4 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/depthwise_conv_3x3.h"
-#ifdef __ARM_NEON
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
 #include <vector>
@@ -23,7 +23,6 @@ namespace math {
 void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                       vector<int> paddings, const Tensor *filter, Tensor *bias,
                       Tensor *output, bool if_bias) {
-#ifdef __ARM_NEON
   const int batch_size = input->dims()[0];
 
   const int input_height = input->dims()[2];
@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
             }
 
           } else {
-#if defined(ARMV17)
+#if __ARM_NEON
+#if __aarch64__
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+
+            const float32x4_t v_filter1 = vld1q_f32(filter1);
+            const float32x4_t v_filter2 = vld1q_f32(filter2);
+            const float32x4_t v_filter3 = vld1q_f32(filter3);
+            float32x4_t mula = vmulq_f32(data1, v_filter1);
+            mula = vmlaq_f32(mula, data2, v_filter2);
+            mula = vmlaq_f32(mula, data3, v_filter3);
+            float32x2_t res = vpadd_f32(
+                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
+            res = vpadd_f32(res, res);
+            if (if_bias) {
+              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
+            } else {
+              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            }
+#else
             asm volatile(
 
                 "vld1.32 {q1}, [%[pos1]]        \n\t"
@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                   [filter2] "r"(filter2), [filter3] "r"(filter3),
                   [output_ptr] "r"(output_ptr), [zero] "r"(zero)
                 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+#endif  // __aarch64__
 #else
-            const float32x4_t data1 = vld1q_f32(pos1);
-            const float32x4_t data2 = vld1q_f32(pos2);
-            const float32x4_t data3 = vld1q_f32(pos3);
-            const float32x4_t v_filter1 = vld1q_f32(filter1);
-            const float32x4_t v_filter2 = vld1q_f32(filter2);
-            const float32x4_t v_filter3 = vld1q_f32(filter3);
-            float32x4_t mula = vmulq_f32(data1, v_filter1);
-            mula = vmlaq_f32(mula, data2, v_filter2);
-            mula = vmlaq_f32(mula, data3, v_filter3);
-            float32x2_t res = vpadd_f32(
-                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
-            res = vpadd_f32(res, res);
-            if (if_bias) {
-              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
-            } else {
-              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
-            }
-#endif
+#endif  // __ARM_NEON
           }
         }
       }
@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
     input_data += input_batch_stride;
     output_data += output_batch_stride;
   }
-#endif
 }
 
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                    Tensor *output, const Tensor *new_scale,
                                    const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
 
   const int batch_size = input->dims()[0];
 
@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
 
 void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                             Tensor *output, Tensor bias, bool if_bias) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
 void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                      Tensor *output, const Tensor *new_scale,
                                      const Tensor *new_bias, bool if_relu) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
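On __aarch64__ the ARMv7 inline-asm blocks above are replaced with NEON intrinsics. The core pattern is a three-row multiply-accumulate followed by a horizontal pairwise add. Shown below as a self-contained sketch (Conv3x3Dot is a hypothetical name, assuming a NEON-capable target); lane 3 is zeroed because a 4-lane load covers only the 3-wide filter window:

#include <arm_neon.h>

static inline float Conv3x3Dot(const float *r1, const float *r2,
                               const float *r3, const float *f1,
                               const float *f2, const float *f3) {
  float32x4_t acc = vmulq_f32(vld1q_f32(r1), vld1q_f32(f1));
  acc = vmlaq_f32(acc, vld1q_f32(r2), vld1q_f32(f2));
  acc = vmlaq_f32(acc, vld1q_f32(r3), vld1q_f32(f3));
  acc = vsetq_lane_f32(0.f, acc, 3);  // lane 3 holds out-of-window data
  float32x2_t sum = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  sum = vpadd_f32(sum, sum);  // reduce the remaining pair to one value
  return vget_lane_f32(sum, 0);
}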
*/ #include "operators/math/gemm.h" #include "common/log.h" #include "memory/t_malloc.h" -#ifndef X86 +#if __ARM_NEON #include #endif #ifdef _OPENMP @@ -33,6 +33,7 @@ float *packedA; float *packedB; float *packedC; float *zero; +/* // 将A矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { @@ -60,6 +61,36 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, } } +// 将B矩阵分块复制到连续内存(ColMajor) +void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + int i, j; + const float *Bj, *Bj1, *Bj2, *Bj3; + for (j = 0; j < n - n_tail; j += NR) { + Bj = &B(0, j); + Bj1 = &B(0, j + 1); + Bj2 = &B(0, j + 2); + Bj3 = &B(0, j + 3); + for (i = 0; i < k; ++i) { + *buffer++ = *Bj++; + *buffer++ = *Bj1++; + *buffer++ = *Bj2++; + *buffer++ = *Bj3++; + } + } + if (n_tail != 0) { + for (i = 0; i < k; ++i) { + for (int j = n - n_tail; j < n; ++j) { + *buffer++ = B(i, j); + } + for (int j = n; j < n + (NR - n_tail); ++j) { + *buffer++ = 0; + } + } + } +} +*/ + // 将A矩阵分块复制到连续内存(RowMajor) void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, float *buffer) { @@ -100,35 +131,6 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, } } -// 将B矩阵分块复制到连续内存(ColMajor) -void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { - int i, j; - const float *Bj, *Bj1, *Bj2, *Bj3; - for (j = 0; j < n - n_tail; j += NR) { - Bj = &B(0, j); - Bj1 = &B(0, j + 1); - Bj2 = &B(0, j + 2); - Bj3 = &B(0, j + 3); - for (i = 0; i < k; ++i) { - *buffer++ = *Bj++; - *buffer++ = *Bj1++; - *buffer++ = *Bj2++; - *buffer++ = *Bj3++; - } - } - if (n_tail != 0) { - for (i = 0; i < k; ++i) { - for (int j = n - n_tail; j < n; ++j) { - *buffer++ = B(i, j); - } - for (int j = n; j < n + (NR - n_tail); ++j) { - *buffer++ = 0; - } - } - } -} - // 将B矩阵分块复制到连续内存(RowMajor) void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { @@ -136,13 +138,34 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, for (int j = 0; j < n - n_tail; j += NR) { for (int i = 0; i < k; ++i) { b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[buffer]], #32 \n\t" + : [buffer] "+r"(buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else asm volatile( - "pld [%[b0]] \n\t" - "vld1.32 {q0, q1}, [%[b0]] \n\t" - "vst1.32 {q0, q1}, [%[buffer]]! \n\t" + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[buffer]]! 
\n\t" : [buffer] "+r"(buffer) : [b0] "r"(b0) - : "memory", "q0", "q0"); + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; +#endif // __ARM_NEON } } if (n_tail != 0) { @@ -206,8 +229,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, } } -#if defined(IOS) -void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { +#if __ARM_NEON +#if __aarch64__ + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -234,30 +259,271 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; + + vst1q_f32(c, cv0); + vst1q_f32(c + ldc, cv1); + vst1q_f32(c + 2 * ldc, cv2); + vst1q_f32(c + 3 * ldc, cv3); + // float32x4x4_t cv = {cv0, cv1, cv2, cv3}; +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv0; + float32x4_t bv1; + + float32x2_t av01; + float32x2_t av23; + + for (int p = 0; p < k; p += 1) { + av = vld1q_f32(a); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); + + av01 = vget_low_f32(av); + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); + av23 = vget_high_f32(av); + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); + + a += MR; + b += NR; + } + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); +} + +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); } } } } -} // namespace math -#elif defined(ARMV7) +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void 
WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t bias; + float32x2_t scale; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t bias; + float32x2_t scale; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +#else + void AddDot4x4(int k, const float *a, const 
float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; @@ -328,221 +594,77 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { "q10", "q11", "q12", "q13"); } -#else -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { - float *c0, *c1, *c2, *c3; - c0 = c; - c1 = c + ldc; - c2 = c + 2 * ldc; - c3 = c + 3 * ldc; - for (int p = 0; p < k; p += 1) { - // first row - c0[0] += a[0] * b[0]; - c0[1] += a[0] * b[1]; - c0[2] += a[0] * b[2]; - c0[3] += a[0] * b[3]; +/* +void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - // second row - c1[0] += a[1] * b[0]; - c1[1] += a[1] * b[1]; - c1[2] += a[1] * b[2]; - c1[3] += a[1] * b[3]; + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; - // third row - c2[0] += a[2] * b[0]; - c2[1] += a[2] * b[1]; - c2[2] += a[2] * b[2]; - c2[3] += a[2] * b[3]; + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" - // fourth row - c3[0] += a[3] * b[0]; - c3[1] += a[3] * b[1]; - c3[2] += a[3] * b[2]; - c3[3] += a[3] * b[3]; + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - a += 4; - b += 4; - } -} + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" -#endif + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" -// 32位 float 矩阵乘法 -void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, bool relu) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" - KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" - // make sure MC is multiple of 4, and NC is multiple of 8 - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! 
\n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); - InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, - relu); - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); - paddle_mobile::memory::Free(zero); -} - -void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; - - KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; - - // make sure MC is multiple of 4, and NC is multiple of 8 - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); - InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, new_scale + i, new_bias + i); - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); - paddle_mobile::memory::Free(zero); -} - -void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu) { - float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - - const float *a0, *b0, *b1, *b2, *b3; - float *c0, *C0; - - int volatile kc1 = k / 4; - int volatile kc2 = k % 4; - int volatile nc1 = n / 16; - int _nc1 = n % 16; - int volatile nc2 = _nc1 / 4; - int volatile nc3 = _nc1 % 4; - for (int i = 0; i < kc1; i++) { - a0 = A + i * 4; - b0 = B + i * 4 * ldb; - b1 = b0 + ldb; - b2 = b1 + ldb; - b3 = b2 + ldb; - c0 = bufferC; 
- asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {q0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq i_eq0_%= \n\t" - "bne i_ne0_%= \n\t" - - "i_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "b gemm_nc1_%= \n\t" - - "i_ne0_%=: \n\t" - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" - - "pld [%[b1], #64] \n\t" - "vld1.32 {q2, q3}, [%[b1]]! \n\t" - "vld1.32 {q4, q5}, [%[b1]]! \n\t" - "vmla.f32 q10, q2, d0[1] \n\t" - "vmla.f32 q11, q3, d0[1] \n\t" - "vmla.f32 q12, q4, d0[1] \n\t" - "vmla.f32 q13, q5, d0[1] \n\t" - - "pld [%[b2], #64] \n\t" - "vld1.32 {q2, q3}, [%[b2]]! \n\t" - "vld1.32 {q4, q5}, [%[b2]]! \n\t" - "vmla.f32 q10, q2, d1[0] \n\t" - "vmla.f32 q11, q3, d1[0] \n\t" - "vmla.f32 q12, q4, d1[0] \n\t" - "vmla.f32 q13, q5, d1[0] \n\t" + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" "pld [%[b3], #64] \n\t" "vld1.32 {q2, q3}, [%[b3]]! \n\t" @@ -905,6 +1027,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); } } +*/ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; @@ -1214,6 +1337,21 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { // C = A * B, batchnorm(C) void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + int volatile nc1 = nc / 16; int _nc1 = nc % 16; int volatile nc2 = _nc1 / 4; @@ -1300,6 +1438,24 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, // C = A * B, batchnorm(C), relu(C) void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + int nc1 = nc / 16; int _nc1 = nc % 16; int nc2 = _nc1 / 4; @@ -1390,282 +1546,429 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, "q8", "q10", "q11", "q12", "q13", "q14"); } -// C = A * B -void VecWriteBasic(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); + /* + // C = A * B + void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! 
\n\t" - "vst1.32 {q0, q1}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" - "vld1.32 {q2, q3}, [%[c]]! \n\t" - "vst1.32 {q2, q3}, [%[C]]! \n\t" + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" - "vld1.32 {q4}, [%[c]]! \n\t" - "vst1.32 {q4}, [%[C]]! \n\t" + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - "vld1.32 {q5}, [%[c]]! \n\t" - "vst1.32 {q5}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -} + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + } -// C = alpha * A * B + beta * C -void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + // C = alpha * A * B + beta * C + void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} -// C = A * B + C -void VecWriteWithAdd(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; + // C = A * B + C + void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C++ += *c++; + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } } } -} -// C = A * B + C, relu(C) -void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; + // C = A * B + C, relu(C) + void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C += *c; - if (*C < 0) { - *C = 0; + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; } - C++; - c++; } } -} -// C = A * B, batchnorm(C) -void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); + // C = A * B, batchnorm(C) + void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! 
\n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! \n\t" - "vld1.32 {q12, q13}, [%[bias]]! \n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13"); + } - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" + // C = A * B, batchnorm(C), relu(C) + void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14"); + } + */ + +#endif // __aarch64__ +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; + for (int p = 0; p < k; p += 1) { + // first row + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] * b[3]; + + // second row + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; + + // third row + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; + + // fourth row + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; + + a += 4; + b += 4; + } } -// C = A * B, batchnorm(C), relu(C) -void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); +#endif // __ARM_NEON - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" +// 32位 float 矩阵乘法 +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! \n\t" - "vld1.32 {q12, q13}, [%[bias]]! \n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, + relu); + } + } - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); +} - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! 
\n\t" - "end_nc3_%=: \n\t" + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13", "q14"); + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } + + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, + &C(i, j), ldc, relu, new_scale + i, new_bias + i); + } + } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } +} // namespace math } // namespace operators } // namespace paddle_mobile -} // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index b4bce43c7a29fba09ade7512cbc660f0ac2888ab..d8b305a7282b871d61ed588b1237f4f8f1cb56f8 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -28,6 +28,7 @@ namespace paddle_mobile { namespace operators { namespace math { +/* // 将 A 矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer); @@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, // 将 B 矩阵分块复制到连续内存(ColMajor) void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *buffer); +*/ // 将 A 矩阵分块复制到连续内存(RowMajor) void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, @@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu, float *new_scale, float *new_bias); - +/* // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, @@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias); +*/ // 计算一个更小的 C 矩阵分块 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); @@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias); +/* // 向量矩阵乘法结果回写 // C = A * B void VecWriteBasic(int n, float *c, float 
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index b4bce43c7a29fba09ade7512cbc660f0ac2888ab..d8b305a7282b871d61ed588b1237f4f8f1cb56f8 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -28,6 +28,7 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
 
+/*
 // Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer);
@@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 // Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer);
+*/
 
 // Pack blocks of matrix A into contiguous memory (RowMajor)
 void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
@@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                        const float *b, float beta, float *c, float *C, int ldc,
                        bool relu, float *new_scale, float *new_bias);
-
+/*
 // vector-matrix multiplication (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                   const float *B, int ldb, float beta, float *C, int ldc,
@@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
 void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                         int lda, const float *B, int ldb, float beta, float *C,
                         int ldc, bool relu, float *new_scale, float *new_bias);
+*/
 
 // compute a smaller block of matrix C
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
@@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
 void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                      float *new_scale, float *new_bias);
 
+/*
 // write back the results of vector-matrix multiplication
 // C = A * B
 void VecWriteBasic(int n, float *c, float *C, int ldc);
@@ -96,6 +100,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
 // C = A * B, batchnorm(C), relu(C)
 void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                         float *new_bias);
+*/
 
 // 32-bit float matrix multiplication
 void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp
index 625d120705aab8fcc3ea8d232b4077e213941ec4..7b0b974b542a83d381727128887bef8a48ce937f 100644
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "operators/math/im2col.h"
 #include <vector>
 #ifdef __ARM_NEON
-#include "arm_neon.h"
+#include <arm_neon.h>
 #endif
 #include "common/types.h"
 namespace paddle_mobile {
@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
     int channels_col = im_channels * filter_height * filter_width;
     const T *im_data = im.data<T>();
     T *col_data = col->data<T>();
-#ifdef __ARM_NEON
+#if __ARM_NEON
     const int osize = col_height;
     const int isize = im_height;
     bool pad1 = padding[0] > 0;
diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp
index ca5367788ed87da070dd19900e8d546e51caf337..d881014ccb3f29393ca73fa0e7f4792d4c0d65c7 100644
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -50,7 +50,7 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
                   framework::Tensor *matrix_out, float beta, bool relu,
                   framework::Tensor *new_scale,
-                  framework::Tensor *new_bias) {
+                  framework::Tensor *new_bias, int group) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -71,7 +71,8 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
 
   SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
-              relu, new_scale->data<float>(), new_bias->data<float>());
+              relu, new_scale->data<float>() + group,
+              new_bias->data<float>() + group);
 }
 
 }  // namespace math
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 0ca7815fc2bcff2be0345b581d3dfb26cf55794c..b5179458a2bf9e6817366c7bd4ea1f536fd21642 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -31,7 +31,8 @@ template <typename T>
 void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, T alpha,
                   framework::Tensor *matrix_out, T beta, bool relu,
-                  framework::Tensor *new_scale, framework::Tensor *new_bias);
+                  framework::Tensor *new_scale, framework::Tensor *new_bias,
+                  int group);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
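A note on the new 'group' argument above: with grouped (here, depthwise) convolution, group g's GEMM writes output channels starting at offset g, so the per-output-channel batch-norm scale and bias must start at that same offset. A hypothetical driver loop, purely for illustration of the pointer arithmetic:

#include <vector>

void GemmPerGroup(int groups, const std::vector<float> &new_scale,
                  const std::vector<float> &new_bias) {
  for (int g = 0; g < groups; ++g) {
    // matmulWithBn<float>(..., &scale_tensor, &bias_tensor, g) ends up
    // handing SgemmWithBn these shifted pointers; in the depthwise case each
    // group produces exactly one output channel.
    const float *scale_g = new_scale.data() + g;
    const float *bias_g = new_bias.data() + g;
    (void)scale_g;
    (void)bias_g;
  }
}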
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
index c86003f6f96b632efd50bbb156293510e3d8521c..0a2d96d4d065d7938e6872b4f073e080d7be8c3a 100644
--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef POOL_OP
-#include "pool_2x2.h"
+#include "operators/math/pool_2x2.h"
+#include <algorithm>
+#include <vector>
 
 namespace paddle_mobile {
 namespace operators {
@@ -21,10 +23,10 @@ namespace math {
 
 void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output) {
-#ifdef __ARM_NEON
-
-#ifdef ARMV7
+#if __ARM_NEON
 
+#if __aarch64__
+#else
   const int batch_size = input->dims()[0];
 
   const int input_height = input->dims()[2];
@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
     output_data += output_batch_stride;
   }
 #endif
-
+#else
 #endif
 }
 
 void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output) {
-#ifdef __ARM_NEON
+#if __ARM_NEON
 
-#ifdef ARMV7
+#if __aarch64__
+#else
   const int batch_size = input->dims()[0];
 
   const int input_height = input->dims()[2];
@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
     input_data += input_batch_stride;
     output_data += output_batch_stride;
   }
-#else
-
-// TODO(): to imp other asm
 #endif
-
+#else
 #endif
 }
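With this change, Pool2x2Max and Pool2x2Avg compile to empty bodies on __aarch64__, and the dispatch in pool_arm_func.h (earlier in this patch) routes those shapes to the generic PoolBasic instead. For reference, a scalar equivalent of the 2x2 stride-2 max case, assuming no padding (MaxPool2x2s2 is a hypothetical name):

#include <algorithm>

void MaxPool2x2s2(const float *in, int h, int w, float *out) {
  const int oh = h / 2, ow = w / 2;
  for (int y = 0; y < oh; ++y) {
    for (int x = 0; x < ow; ++x) {
      const float *p = in + (2 * y) * w + 2 * x;  // top-left of 2x2 window
      out[y * ow + x] =
          std::max(std::max(p[0], p[1]), std::max(p[w], p[w + 1]));
    }
  }
}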
"q4"); -#else +#endif const float32x4_t data1 = vld1q_f32(pos1); const float32x4_t data2 = vld1q_f32(pos2); const float32x4_t data3 = vld1q_f32(pos3); @@ -707,7 +709,6 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, vget_low_f32(sum_data)); res = vpadd_f32(res, res); output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; -#endif } } } @@ -715,6 +716,7 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, input_data += input_batch_stride; output_data += output_batch_stride; } +#else #endif } } // namespace math diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index 968915f21e08fce9f25ceb63831ee40ecba9cee6..dba88c93969014f2ad0d2636b4141c734dbc2ed5 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -135,6 +135,7 @@ class SoftmaxFuntor { } } } +#else #endif // ARM_NEON public: diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 60e0c087383388c83ca1711c057af822a6e2a730..044da7012eccde57a87d417f4f3c00b82e01da42 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -50,7 +50,7 @@ void MulOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_dims); this->param_.Out()->Resize(ddim); } -template class MulOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index eea625469ec030e0c7d62baea8312e11f1308ce2..4324cab35298a45ece7e375299909994648a27a4 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -34,7 +34,7 @@ void MultiClassNMSOp::InferShape() const { // pre size, will change in Compute. this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); } -template class MultiClassNMSOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 4ecc1622f91d5ff63d6abe9434ba0222b10d34e6..4b95ceb18740531919c4ef00dfdd912b1067e891 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -371,7 +371,7 @@ class BatchNormParam : OpParam { input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } const Tensor *InputX() const { return input_x_; } @@ -1059,6 +1059,165 @@ class FusionConvAddBNReluParam : public OpParam { Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); #endif +#ifdef FUSION_DWCONVBNRELU_OP +class FusionDWConvBNReluParam : public OpParam { + public: + FusionDWConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return 
output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; + +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); +#endif + +#ifdef FUSION_CONVBNRELU_OP +class FusionConvBNReluParam : public OpParam { + public: + FusionConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; +#endif + #ifdef IM2SEQUENCE_OP 
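The fused params above expose `SetNewScale`/`SetNewBias`, so a kernel-init pass can fold the batch-norm statistics into one per-channel scale and bias instead of normalizing on every forward pass. A sketch of the usual folding arithmetic, my own illustration rather than the repo's kernel code (assumes inference-time BN with running mean/variance):

```cpp
#include <cmath>
#include <vector>

// Fold y = scale * (x - mean) / sqrt(var + eps) + bias into
// y = new_scale * x + new_bias, one pair per output channel.
void FoldBatchNorm(const std::vector<float> &scale,
                   const std::vector<float> &bias,
                   const std::vector<float> &mean,
                   const std::vector<float> &variance, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  new_scale->resize(scale.size());
  new_bias->resize(scale.size());
  for (size_t c = 0; c < scale.size(); ++c) {
    float inv_std = 1.0f / std::sqrt(variance[c] + epsilon);
    (*new_scale)[c] = scale[c] * inv_std;
    (*new_bias)[c] = bias[c] - mean[c] * scale[c] * inv_std;
  }
}
```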
diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp
index 41016d74deb5bcd7d3679b1c762467e2dc65de34..0477c88cf84054090b4c46524284fb0cdf525c0e 100644
--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -54,7 +54,7 @@ void PoolOp::InferShape() const {
   }
   this->param_.Output()->Resize(framework::make_ddim(output_shape));
 }
-template class PoolOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/prelu_op.cpp b/src/operators/prelu_op.cpp
index e78f6b0374336a3d891a1f3e73f63c706b321ccc..245154ca5ea6971dee33e14550bf1e090fa0ec71 100644
--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
@@ -23,7 +23,7 @@ void PReluOp::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class PReluOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp
index 81ba045a209a48105ab895f7687e56ed3db44305..a05a0ddcec5ba9d442b58846468a121e9b655a6a 100644
--- a/src/operators/prior_box_op.cpp
+++ b/src/operators/prior_box_op.cpp
@@ -44,7 +44,7 @@ void PriorBoxOp::InferShape() const {
   this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
   this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
 }
-template class PriorBoxOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp
index b80a56f38aec4bf1bf625d54f4115626447a654a..2a771e81e7a5a0e869984990b52b98d15036543a 100644
--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -23,7 +23,7 @@ void ReluOp::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class ReluOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp
index 193678613cc8dd2b8f9b8ae1654b0adacea09505..dcc15009af2b23129552d58b3fa22c3c67684dce 100644
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -27,7 +27,7 @@ void ReshapeOp::InferShape() const {
   auto out_dims = ValidateShape(shape, input_x_dims);
   this->param_.Out()->Resize(out_dims);
 }
-template class ReshapeOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/resize_op.cpp b/src/operators/resize_op.cpp
index f378ff53f513ccf7cfb986f606378895b5af4b9f..02c50b662665fc9bd2f662922cb88dbce9fc5d53 100644
--- a/src/operators/resize_op.cpp
+++ b/src/operators/resize_op.cpp
@@ -24,7 +24,7 @@ void ResizeOp::InferShape() const {
   auto out_dims = CalOutputShape(this->param_);
   this->param_.Out()->Resize(out_dims);
 }
-template class ResizeOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/scale_op.cpp b/src/operators/scale_op.cpp
index c1931ed4fdc4c058c979fdceba11ea25f7d752f4..968fcd4098e92a47899c9a733c0261d91c314c29 100644
--- a/src/operators/scale_op.cpp
+++ b/src/operators/scale_op.cpp
@@ -24,7 +24,7 @@ void ScaleOp::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class ScaleOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
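Context for the repeated one-line removals in the op sources above and below: each deleted statement was an explicit instantiation definition, which forces the compiler to emit the op's `<CPU, float>` specialization in that translation unit (those template arguments are reconstructed here from the codebase's usual pattern). A minimal, repo-independent illustration of the mechanism:

```cpp
#include <iostream>

template <typename DeviceType, typename T>
struct DemoOp {
  T Double(T x) const { return x + x; }
};

struct CPU {};  // stand-in tag type, not the paddle-mobile one

// Explicit instantiation definition: DemoOp<CPU, float> is fully
// compiled into this translation unit even before anything uses it.
template struct DemoOp<CPU, float>;

int main() {
  std::cout << DemoOp<CPU, float>().Double(2.5f) << std::endl;  // prints 5
  return 0;
}
```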
diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp
index c83738b2c88c3c51ebc0d649fe134da9e44f30ea..8ea4c98942e0630f5b69133991583ee1192c8153 100644
--- a/src/operators/sigmoid_op.cpp
+++ b/src/operators/sigmoid_op.cpp
@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
 void SigmoidOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SigmoidOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp
index 6d70895fcc5edf75f73368813212f7d9177c760b..b77a675e10ed030443e1d4074239a715ddedf772 100644
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -23,7 +23,7 @@ template <typename DeviceType, typename T>
 void SliceOp<DeviceType, T>::InferShape() const {
   /// todo: add InputShape() detection.
 }
-template class SliceOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index db8fe1d94363c1db578a369d9eca00dde17d30af..c9edfccf4ff08e5a12d735526c3d63c689711357 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
 void SoftmaxOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SoftmaxOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp
index 7e578b290174734ba8c210a354c9e56fde364858..5f193f96396c8d4d7cb58143573015384e7a7c28 100644
--- a/src/operators/transpose_op.cpp
+++ b/src/operators/transpose_op.cpp
@@ -47,7 +47,7 @@ void TransposeOp::InferShape() const {
   }
   this->param_.Out()->Resize(out_dims);
 }
-template class TransposeOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index f4215de46c2bafd732b0092b58c25bf6fcefdf7a..bea7d4ba7d2df1344f0819222fbdb389106fa77e 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -19,7 +19,9 @@ int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
-  auto program = loader.Load(g_googlenet, true);
+  //  auto program = loader.Load(g_googlenet, true);
+
+  auto program = loader.Load(g_mobilenet_ssd, true);
   //  auto program = loader.Load(g_googlenet_combine + "/model",
   //                             g_googlenet_combine +
   //                             "/params", true);
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 2ab24736397c1e71350335561abbcabcba6e27a4..d230b9469229946fc74f4dc9e1ee6100196ed9aa 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -23,7 +23,7 @@ int main() {
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    DLOG << "load cost: " << time_diff(time1, time2) << "ms";
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
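Taken together, the test changes above and below settle on one flow: optionally set the OpenMP thread count, load the (optionally fused) program, then time load and prediction separately. A condensed sketch of that flow, assuming the test helpers (`time`, `time_diff`, `GetInput`, the model-path constants) are in scope as in the tests themselves:

```cpp
#include <iostream>
#include <vector>
#include "../test_helper.h"
#include "../test_include.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);  // OpenMP worker threads, as in the ssd test

  auto t0 = time();
  if (paddle_mobile.Load(g_mobilenet, true)) {  // true = run the fusion passes
    std::cout << "load cost: " << time_diff(t0, time()) << "ms" << std::endl;

    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224, &input, dims);

    auto t1 = time();
    auto result = paddle_mobile.Predict(input, dims);
    std::cout << "predict cost: " << time_diff(t1, time()) << "ms"
              << std::endl;
  }
  return 0;
}
```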
diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp
index 1a7c4cd49cb1707b9c7783cf74e87e74da39732e..a3d780a4854d018f948af2890bfe9f1e7a8fefef 100644
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
   auto time1 = time();
-  if (paddle_mobile.Load(g_mobilenet_ssd, true)) {
+  auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
+                                 g_mobilenet_ssd_gesture + "/params", true);
+  //  auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
+  if (isok) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
 
     std::vector<int64_t> dims{1, 3, 300, 300};
     Tensor input_tensor;
@@ -33,7 +37,8 @@ int main() {
     auto time3 = time();
     paddle_mobile.Predict(input, dims);
     auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+              << std::endl;
   }
   return 0;
 }
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
index 2e285695fb79f3ed5471a653c71a10b36ef4e7f2..95ffc59c394782b69d17f16c549b0e6923fd31e8 100644
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
@@ -22,7 +22,7 @@ int main() {
   auto time1 = time();
   if (paddle_mobile.Load(g_mobilenet, true)) {
     auto time2 = time();
-    DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
 
     std::vector<int64_t> dims{1, 3, 224, 224};
     Tensor input_tensor;
@@ -35,7 +35,8 @@ int main() {
   auto vec_result = paddle_mobile.Predict(input, dims);
   auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+            << std::endl;
   }
 
   return 0;
diff --git a/test/test_helper.h b/test/test_helper.h
index 81ad23ff3b4e53db0225630eebaa34878ad4c139..fb6724f9c5764497ec81de0d73406709f098e0e0 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <fstream>
 #include <random>
+#include <string>
+#include <vector>
 
 #include "common/common.h"
 #include "common/log.h"
@@ -23,6 +25,8 @@ limitations under the License. */
 #include "framework/tensor.h"
 
 static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const std::string g_mobilenet_ssd_gesture =
+    "../models/mobilenet+ssd_gesture";
 static const std::string g_squeezenet = "../models/squeezenet";
 static const std::string g_googlenet = "../models/googlenet";
 static const std::string g_mobilenet = "../models/mobilenet";
@@ -62,9 +66,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
     size *= dim;
   }
 
-  T *input_ptr = (T *)malloc(sizeof(T) * size);
+  T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), size * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
   in.close();
   for (int i = 0; i < size; ++i) {
     input->push_back(input_ptr[i]);
@@ -79,6 +83,6 @@ void GetInput(const std::string &input_name,
   T *input_ptr = input->mutable_data<T>(dims);
 
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), input->numel() * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
   in.close();
 }
diff --git a/tools/build.sh b/tools/build.sh
index ce330e6d631ea1009f28ccf987a50e5f79a032b6..54680f50efd04272b183c738541b9153b9e74416 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -38,7 +38,8 @@ build_for_android() {
     fi
 
     if [ -z "$PLATFORM" ]; then
-        PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
+        PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
+#        PLATFORM="arm-v8a"
     fi
 
     if [ "${PLATFORM}" = "arm-v7a" ]; then
@@ -92,23 +93,28 @@ build_for_ios() {
 #    rm -rf "../build"
     PLATFORM="ios"
     MODE="Release"
-    BUILD_DIR=../build/release/"${PLATFORM}"
+#    IOS_ARCH="armv7"
+#    IOS_ARCH="armv7s"
+    IOS_ARCH="arm64" # Users could choose "armv7" or "armv7s" platforms.
+    BUILD_DIR=../build/release/"${PLATFORM}"/"${IOS_ARCH}"
     TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
     mkdir -p "${BUILD_DIR}"
     if [ $# -eq 1 ]; then
         cmake .. \
             -B"${BUILD_DIR}" \
             -DCMAKE_BUILD_TYPE="${MODE}" \
-            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
             -DIOS_PLATFORM=OS \
+            -DIOS_ARCH="${IOS_ARCH}" \
+            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
            -DNET=$1 \
            -DIS_IOS="true"
     else
         cmake .. \
             -B"${BUILD_DIR}" \
             -DCMAKE_BUILD_TYPE="${MODE}" \
-            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
             -DIOS_PLATFORM=OS \
+            -DIOS_ARCH="${IOS_ARCH}" \
+            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
            -DIS_IOS="true"
     fi
     cd "${BUILD_DIR}"
diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake
index a8735adc8d853a5825a23f1ddf129d0a95199275..a81f066c11b3ad6614b8df3ee2c18f80469d1cd2 100644
--- a/tools/ios-cmake/ios.toolchain.cmake
+++ b/tools/ios-cmake/ios.toolchain.cmake
@@ -159,7 +159,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
 
 # set the architecture for iOS
 if (${IOS_PLATFORM} STREQUAL "OS")
-    set (IOS_ARCH armv7 armv7s arm64)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
     set (IOS_ARCH i386)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
diff --git a/tools/op.cmake b/tools/op.cmake
index 71defeffcc919848e165ea836f4bfed2fcc7e0ff..50fca67092d964baeafecf10146e22efce24c98a 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -42,6 +42,16 @@ elseif (NET STREQUAL "resnet")
     set(MUL_OP ON)
     set(POOL_OP ON)
     set(RELU_OP ON)
+elseif (NET STREQUAL "FPGAnets")
+    set(FUSION_CONVRELU_OP ON)
+    set(FUSION_CONVBNSCALE_OP ON)
+    set(FUSION_CONVBNSCALERELU_OP ON)
+    set(FUSION_POOLBN_OP ON)
+    set(FUSION_ELEMENTWISEADDRELU_OP ON)
+    set(REGION_OP ON)
+    set(POOL_OP ON)
+    set(CONCAT_OP ON)
+    set(SOFTMAX_OP ON)
 else ()
     set(BATCHNORM_OP ON)
     set(BOXCODER_OP ON)
@@ -64,6 +74,8 @@ else ()
     set(TRANSPOSE_OP ON)
     set(FUSION_CONVADD_RELU_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
+    set(FUSION_DWCONVBNRELU_OP ON)
+    set(FUSION_CONVBNRELU_OP ON)
     set(PRELU_OP ON)
     set(RESIZE_OP ON)
     set(SCALE_OP ON)
@@ -155,6 +167,14 @@ endif()
 if (FUSION_CONVADDBNRELU_OP)
     add_definitions(-DFUSION_CONVADDBNRELU_OP)
 endif()
+if (FUSION_DWCONVBNRELU_OP)
+    add_definitions(-DFUSION_DWCONVBNRELU_OP)
+endif()
+
+if (FUSION_CONVBNRELU_OP)
+    add_definitions(-DFUSION_CONVBNRELU_OP)
+endif()
+
 if (PRELU_OP)
     add_definitions(-DPRELU_OP)
 endif()
@@ -173,3 +193,23 @@ endif()
 if (IM2SEQUENCE_OP)
     add_definitions(-DIM2SEQUENCE_OP)
 endif()
+
+if (FUSION_CONVRELU_OP)
+    add_definitions(-DFUSION_CONVRELU_OP)
+endif()
+if (FUSION_CONVBNSCALE_OP)
+    add_definitions(-DFUSION_CONVBNSCALE_OP)
+endif()
+if (FUSION_CONVBNSCALERELU_OP)
+    add_definitions(-DFUSION_CONVBNSCALERELU_OP)
+endif()
+if (FUSION_POOLBN_OP)
+    add_definitions(-DFUSION_POOLBN_OP)
+endif()
+if (FUSION_ELEMENTWISEADDRELU_OP)
+    add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP)
+endif()
+if (REGION_OP)
+    add_definitions(-DREGION_OP)
+endif()
+
diff --git a/tools/quantification/CMakeLists.txt b/tools/quantification/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1dfb9ee056a4126f65c2ab6fac4c1417039f66ec
--- /dev/null
+++ b/tools/quantification/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(dir ${CMAKE_CURRENT_SOURCE_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
+
+ADD_EXECUTABLE(convert convert.cpp)
+target_link_libraries(convert paddle-mobile)
\ No newline at end of file
diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7a9511a654f3de9ac9ace5d3b9621c360bd86ad9
--- /dev/null
+++ b/tools/quantification/convert.cpp
@@ -0,0 +1,202 @@
+#include "io/paddle_mobile.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+using std::string;
+
+static const std::string g_googlenet_combine = "../models/googlenet_combine";
+static const std::string g_googlenet = "../models/googlenet";
+using paddle_mobile::Executor;
+using paddle_mobile::framework::Program;
+
+char *Get_binary_data(std::string filename) {
+  FILE *file = fopen(filename.c_str(), "rb");
+  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                        filename.c_str());
+  fseek(file, 0, SEEK_END);
+  int64_t size = ftell(file);
+  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+  rewind(file);
+  char *data = new char[size];
+  size_t bytes_read = fread(data, 1, size, file);
+  PADDLE_MOBILE_ENFORCE(bytes_read == size,
+                        "read binary file bytes do not match with fseek");
+  DLOG << "Get_binary_data end";
+  fclose(file);
+  return data;
+}
+
+void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
+                  paddle_mobile::framework::LoDTensor *tensor, char **data,
+                  FILE *out_file) {
+  // 1. version
+  uint32_t version = *reinterpret_cast<uint32_t *>(*data);
+  // write version
+  fwrite(&version, sizeof(uint32_t), 1, out_file);
+  (*data) += sizeof(uint32_t);
+  // 2. lod information (always written as 0; assumes the stored params
+  // carry no lod content)
+  uint64_t *lod_level_ptr = new uint64_t();
+  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
+  uint64_t lod_level = 0;
+  // write lod information
+  fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
+  delete lod_level_ptr;
+  (*data) += sizeof(uint64_t);
+  auto &lod = *tensor->mutable_lod();
+  lod.resize(lod_level);
+  for (uint64_t i = 0; i < lod_level; ++i) {
+    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
+    // write lod size
+    fwrite(&size, sizeof(uint64_t), 1, out_file);
+    (*data) += sizeof(uint64_t);
+    std::vector<size_t> tmp(size / sizeof(size_t));
+    for (size_t k = 0; k < tmp.size(); ++k) {
+      tmp[k] = *reinterpret_cast<size_t *>(*data);
+      (*data) += sizeof(size_t);
+    }
+    // write the lod size vector contents, not the vector object
+    fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
+
+    lod[i] = tmp;
+  }
+
+  // 3. tensor version
+  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
+  // write tensor version
+  fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
+  (*data) += sizeof(uint32_t);
+
+  // 4. tensor desc
+  int32_t size = *reinterpret_cast<int32_t *>(*data);
+  // write tensor desc
+  fwrite(&size, sizeof(int32_t), 1, out_file);
+  (*data) += sizeof(int32_t);
+
+  std::unique_ptr<char[]> buf(new char[size]);
+  for (int m = 0; m < size; ++m) {
+    buf.get()[m] = (*data)[m];
+  }
+  fwrite(buf.get(), sizeof(char), size, out_file);
+  (*data) += (sizeof(char) * size);
+
+  const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
+  int memory_size = 1;
+  for (auto l : desc.Dims()) {
+    memory_size *= l;
+  }
+  tensor->Resize(paddle_mobile::framework::make_ddim(desc.Dims()));
+
+  void *memory = tensor;
+  int type_size = 0;
+  switch (desc.DataType()) {
+    case paddle_mobile::framework::VARTYPE_TYPE_FP16:
+      type_size = 2;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP32:
+      type_size = 4;
+      memory = tensor->mutable_data<float>();
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT32:
+      type_size = 4;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
+      type_size = 1;
+      break;
+    default:
+      break;
+  }
+  for (int n = 0; n < memory_size * type_size; ++n) {
+    static_cast<char *>(memory)[n] = (*data)[n];
+  }
+  (*data) += (sizeof(char) * memory_size * type_size);
+  // for float 32: scan the range, then quantize each value to uint8
+  float min_value = std::numeric_limits<float>::max();
+  float max_value = std::numeric_limits<float>::lowest();
+  for (int k = 0; k < memory_size; ++k) {
+    min_value = std::min(min_value, static_cast<float *>(memory)[k]);
+    max_value = std::max(max_value, static_cast<float *>(memory)[k]);
+  }
+  fwrite(&min_value, sizeof(float), 1, out_file);
+  fwrite(&max_value, sizeof(float), 1, out_file);
+  for (int g = 0; g < memory_size; ++g) {
+    float value = static_cast<float *>(memory)[g];
+    uint8_t factor = static_cast<uint8_t>(
+        std::round((value - min_value) / (max_value - min_value) * 255));
+    fwrite(&factor, sizeof(uint8_t), 1, out_file);
+  }
+}
+
+void quantificate_combined(std::string model_path, std::string param_path,
+                           std::string param_min_path) {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  bool optimize = true;
+  auto program = loader.Load(model_path, param_path, optimize);
+  char *origin_data = Get_binary_data(program.para_path);
+  char *data = origin_data;
+  FILE *out_file = fopen(param_min_path.c_str(), "wb");
+  for (const auto &block : program.originProgram->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        auto tensor =
+            var->template GetMutable<paddle_mobile::framework::LoDTensor>();
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        LoadWithDump(*var_desc, tensor, &data, out_file);
+      }
+    }
+  }
+  fclose(out_file);
+  delete[] origin_data;
+}
+
+void quantificate_seperated(std::string model_dir,
+                            std::string param_min_path) {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  bool optimize = true;
+  auto program = loader.Load(model_dir, optimize);
+  std::string shell_command = "mkdir " + param_min_path;
+  system(shell_command.c_str());
+  for (const auto &block : program.originProgram->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program.scope->Var(var_desc->Name());
+      if (var_desc->Persistable()) {
+        auto tensor =
+            var->template GetMutable<paddle_mobile::framework::LoDTensor>();
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        std::string file_name = param_min_path + "/" + var_desc->Name();
+
+        FILE *out_file = fopen(file_name.c_str(), "wb");
+        char *origin_data =
+            Get_binary_data(program.model_path + "/" + var_desc->Name());
+        char *data = origin_data;
+        LoadWithDump(*var_desc, tensor, &data, out_file);
+        delete[] origin_data;
+        fclose(out_file);
+      }
+    }
+  }
+}
+
+int main() {
+  std::string filename = "params_min";
+  std::string model_path = g_googlenet_combine + "/model";
+  std::string param_path = g_googlenet_combine + "/params";
+  std::string dirname = "param_min_dir";
+  std::string model_dir = g_googlenet;
+  //  quantificate_combined(model_path, param_path, filename);
+  quantificate_seperated(model_dir, dirname);
+
+  return 0;
+}
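To make the quantized format concrete: for each persistable tensor, `convert` emits the original headers, then `min` and `max` as two floats, then one `uint8` per element, so the worst-case reconstruction error is half a quantization step, `(max - min) / 510`. A round-trip sketch of that scheme (my own illustration of the arithmetic, not code from the repo):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize as convert.cpp does: map [min_v, max_v] linearly onto 0..255.
std::vector<uint8_t> Quantize(const std::vector<float> &v, float min_v,
                              float max_v) {
  std::vector<uint8_t> q(v.size());
  for (size_t i = 0; i < v.size(); ++i) {
    q[i] = static_cast<uint8_t>(
        std::round((v[i] - min_v) / (max_v - min_v) * 255));
  }
  return q;
}

// The matching dequantization a loader would apply after reading the
// two-float min/max header that precedes each tensor's bytes.
float Dequantize(uint8_t factor, float min_v, float max_v) {
  return min_v + factor * (max_v - min_v) / 255.0f;
}
```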