diff --git a/README.md b/README.md
index c9d15d4960a6330ff6614b6dfc8fd20b81386c9c..59ef597dd749ea16658977cd6d548cedaa90d166 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.

 - **ARM CPU**

-![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)

 The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a large amount of hand-written CPU assembly, and we are coding intensively to exploit every bit of acceleration the hardware offers.
 ARM CPU optimization is still in progress; so far only conventional CPU optimizations are in place. On an ARM Cortex-A73, paddle-mobile arm-v7 currently runs one MobileNet 1.0 inference in 110+ ms on a single core. That is clearly not our final goal: we are rewriting the hot paths in assembly, so there is still large headroom. Only armv7 is supported at the moment; armv8 support will follow.
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 14924c4a2129292aca32e307569fc8dc9a00f913..2f366eb9e5a10ea11e3153e6e32b18204c6dd9cd 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -17,39 +17,39 @@ limitations under the License. */

 namespace paddle_mobile {

-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
-const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
-const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
     "fusion_elementwise_add_relu";
-const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
-const std::string G_OP_TYPE_REGION = "region";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";

 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
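A note on the `types.cpp` change above: turning the `extern const std::string` op-type globals into `const char *` drops one static `std::string` constructor per symbol (and the associated static-initialization-order hazards) without disturbing callers, because C strings still convert and compare against `std::string` transparently. A minimal sketch, illustrative only and not part of the patch:

```cpp
#include <string>
#include <unordered_map>

// Same shape as the patched globals in src/common/types.cpp.
const char *G_OP_TYPE_CONV = "conv2d";

int main() {
  // std::string-keyed containers accept the C string as a key directly;
  // the key is built via std::string's const char * constructor.
  std::unordered_map<std::string, int> op_count;
  ++op_count[G_OP_TYPE_CONV];

  // Comparisons against std::string also keep working unchanged.
  std::string type = "conv2d";
  return (type == G_OP_TYPE_CONV) ? 0 : 1;  // exits 0: the strings match
}
```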
diff --git a/src/common/types.h b/src/common/types.h
index ae993f8034d7136a4badac2bbaf0353c6ef05222..6066879305d5ea7d1b6dcb0bb618c234338cc171 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -20,7 +20,9 @@ limitations under the License. */

 #include

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;

 template <Precision p>
 struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
   typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};

 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -73,40 +79,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };

-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
-
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
-
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
-extern const std::string G_OP_TYPE_FUSION_POOL_BN;
-extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
-extern const std::string G_OP_TYPE_FUSION_FC_RELU;
-extern const std::string G_OP_TYPE_REGION;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;

 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
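With `Precision::FP16` and the `half` stand-in added above, code can pick its element type from `PrecisionTrait` at compile time. A self-contained sketch of the intended usage follows; the `TypedBuffer` consumer is hypothetical (not part of the patch), and note that `half` is only a 16-bit storage pattern, so host code can move FP16 data around but not do arithmetic on it directly:

```cpp
#include <cstddef>
#include <cstdint>

// Mirrors the patched declarations in src/common/types.h.
enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;

template <Precision p>
struct PrecisionTrait {};

template <>
struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
  typedef half ptype;
};

// Hypothetical consumer: a buffer whose element type follows the precision.
template <Precision P>
struct TypedBuffer {
  typedef typename PrecisionTrait<P>::ptype value_type;  // float or half
  value_type *data = nullptr;
  size_t numel = 0;
  size_t bytes() const { return numel * sizeof(value_type); }
};

static_assert(sizeof(TypedBuffer<Precision::FP16>::value_type) == 2,
              "FP16 elements occupy two bytes");
```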
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
index d484d889d8df8f4171658ae395531b84b0ac0a0d..706614e9449eb33e70e261b221d8882a9353ddbf 100644
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -29,15 +29,15 @@ limitations under the License. */

 #include "fpga/api/fpga_api.h"

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
+  return ioctl(req, (long unsigned int)arg);
+}

 int open_device() {
   if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {

 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 }

 void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,7 +58,9 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }

-}  // namespace api
+int ComputeFpgaConv(struct FpgaConvArgs args) { return 0; }
+int ComputeFpgaPool(struct FpgaPoolArgs args) { return 0; }
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args) { return 0; }
+
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
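Since `fpga_malloc` now returns an `mmap64`-backed mapping of driver memory, buffers obtained from it must go back through `fpga_free` (which unmaps) rather than `free()`. A usage sketch under these assumptions; `stage_input` is a hypothetical helper, and `open_device()` must have opened `/dev/fpgadrv0` before any mapping is requested:

```cpp
#include <cstddef>

#include "fpga/api/fpga_api.h"

// Hypothetical helper: stage `len` floats into driver-visible memory.
void *stage_input(const float *src, size_t len) {
  using namespace paddle_mobile::fpga;

  open_device();                                 // opens /dev/fpgadrv0 once
  void *dst = fpga_malloc(len * sizeof(float));  // mmap64-backed mapping
  fpga_copy(dst, src, len * sizeof(float));      // plain memcpy underneath
  return dst;  // pair with fpga_free(dst), not free()
}
```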
*/

#pragma once

+#include <stdint.h>
 #include
 #include
 #include

// memory management;

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 int open_device();
 int close_device();

-void *fpga_malloc(size_t size);
-void fpga_free(void *ptr);
-void fpga_copy(void *dst, const void *src, size_t num);
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);

-struct CnnVersionArgs {
-  void *buf;
+struct FpgaVersionArgs {
+  void* buf;
 };

-struct QuantArgs {
+struct MemoryToPhysicalArgs {
+  const void* src;
+  uint64_t physical;
+};
+
+struct MemoryCopyArgs {
+  void* src;
+  void* dst;
+  size_t size;
+};
+
+struct FpgaQuantArgs {
   float scale;
 };

-struct BatchNormalizationArgs {
-  bool enable;
+struct FpgaBNArgs {
+  bool enabled = false;
+  void* bias_addr;
+  void* scale_addr;
+};
+
+struct FpgaKernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_h;
+  uint32_t stride_w;
+};
+
+struct FpgaImageArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t channels;
+  uint32_t pad_h;
+  uint32_t pad_w;
+};
+
+struct FpgaConvArgs {
+  bool relu_enabled;
+  struct FpgaBNArgs BNargs;
+  void* image_addr;
+  void* filter_addr;
+  void* bias_addr;
+  void* output_addr;
+  float quant_scale;
+  struct FpgaImageArgs image;
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaPoolArgs {
+  void* image_addr;
+  void* output_addr;
+  struct FpgaImageArgs image;
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaEWAddArgs {
+  bool relu_enabled;
+  void* image0_addr;
+  void* image1_addr;
+  void* result_addr;
+  uint32_t const0;
+  uint32_t const1;
+  uint32_t data_len;  // aligned element count
 };

-struct ScaleArgs {};
+int ComputeFpgaConv(struct FpgaConvArgs args);
+int ComputeFpgaPool(struct FpgaPoolArgs args);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);

-#define IOCTL_CNN_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+#define IOCTL_FPGA_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)

-}  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
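The new `_IOW` codes give one request per accelerator operation, so user space follows the usual ioctl driver protocol: open the device node, fill in the args struct, issue the request. A hedged sketch of one such call (the patch's own `do_ioctl` helper is file-local to `fpga_api.cpp`, so this uses `ioctl` directly; error handling kept minimal):

```cpp
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "fpga/api/fpga_api.h"

// Sketch: ask the driver to run one pooling pass described by `args`.
int run_fpga_pool(paddle_mobile::fpga::FpgaPoolArgs args) {
  int fd = open("/dev/fpgadrv0", O_RDWR);  // same node fpga_api.cpp opens
  if (fd < 0) return -1;

  // IOCTL_CONFIG_POOLING is _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs):
  // the kernel side copies `args` in and kicks off the pooling unit.
  int ret = ioctl(fd, IOCTL_CONFIG_POOLING, &args);

  close(fd);
  return ret;
}
```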
diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h
index f16a65c28fb47e1cf4139588742ebe1073c3f3e6..a5890d34c600f6c4f4838ec94c202801b3044d3f 100644
--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -14,11 +14,13 @@ limitations under the License. */

 #pragma once

+#include
 #include
 #include
+#include

 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
 namespace framework {
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 56e6d6bf18740489c195a66db70331cbab42aeea..954a65a3605c4d0204890d9414aeb074371b0d69 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -16,14 +16,15 @@ limitations under the License. */

 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include

-#include "common/enforce.h"
-#include
 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
   size_t size = functor(type);
   PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 65f019d1e3c3f6f6bdb8a18a9ff99bb7ecb2012c..d6434b64aa752fd62bc637a882298228d59880b8 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
       memcpy(&max_value, *data + sizeof(float), sizeof(float));
       *data += 2 * sizeof(float);
       const float factor = (max_value - min_value) / 255.0;
-      uint8_t *uint8_data = (uint8_t *)(*data);
+      uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
       for (int k = 0; k < memory_size; ++k) {
         static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
       }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 }

 template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP32>;

 }  // namespace paddle_mobile
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 178541953323b6ffd1a3339f8209c2839b37a784..42b8c4551871c58955251d94845ca13576d7735b 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -27,17 +27,17 @@ namespace memory {
 const int MALLOC_ALIGN = 64;

 #ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;

 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }

-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

 void Free(void *ptr) {
   if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
   }
 }
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 19d771ddd5884412624a0720368ecc80f92678ea..f767f3481c999a16da46e75e314e8ebcb54193fa 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -14,7 +14,9 @@ limitations under the License.
*/ #ifdef CONCAT_OP -#include "concat_op.h" +#include + +#include "operators/concat_op.h" namespace paddle_mobile { namespace operators { @@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp); #endif #endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 7aedaab4b1fa00707661ada428c7c1dc27f124cd..bad0015917c2a9d4016def26c8a332d076b39c99 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -53,6 +53,7 @@ USE_OP_CPU(concat); USE_OP_MALI_GPU(concat); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(concat); #endif #endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 723747874da8fc8ee2c02eb1be4c89189c2af746..e45ad38fd68cb9b4616b7e363be117e2039c93a9 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase { std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, scope.get()) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } void Init() {} diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index af16048a1b4eba2ff36f842b6cf968031989576e..db942b018d7085ca3986533937328101afb08ff9 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef DROPOUT_OP -#pragma once - #include "operators/kernel/dropout_kernel.h" #include diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index 5a3783971959db8fba9ca6b701fb6eb6340fcb3f..3ef6b9dd62d88f012eba3456c676ac0d33bf9e52 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c691988f4a388c7835a7016602d7a1ac9cb5f9b6 --- /dev/null +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConcatKernel::Init(ConcatParam *param) { + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam ¶m) const { + auto inputs = param.Inputs(); + auto *out = param.Out(); + int64_t axis = param.Axis(); + out->mutable_data(); + + DDim out_dim = out->dims(); + int pixels = out_dim[1] * out_dim[2]; + auto out_channel = out_dim[3]; + + auto out_offset = 0; + + for (int i = 0; i < inputs.size(); ++i) { + auto input = inputs[i]; + auto channels = input[3]; + out_offset += channels; + auto src = input->data(); + for (int j = 0; j < pixels; ++j) { + auto dst = out->data() + out_offset; + memory::Copy(dst, src, sizeof(half)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index cb592613f73d90dae5a7d6e515f8bc091981776e..aa798fd6af5592a062de207714dc9fee2afb93df 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ b/src/operators/kernel/im2sequence_kernel.h @@ -20,13 +20,11 @@ limitations under the License. */ #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceKernel : public framework::OpKernelBase { diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index a76c3c46012a758a05cf8f846a15376ad1b9f33c..44a7ce2af62a1d27aff8181f6742bebda1d6d066 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index 9f5dcb23ee9bf44ffa8bbdd98879d533d07c39f9..15696174377f04ad9a62366e03ded1f2cdcdee9e 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index cc76a4b59b0be24dd3a3fb82c0e3d9fb1a4dbf24..98ac71d0bbad86f595171ad7ac5b2a1cdf5908fa 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index e308364602f401b1c6c6f8e2e35385aefa055360..fd3b8dc767076c5244509f6015c42bee87df100b 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 4966ca14594cfe4680b4de2f7f56ef85e345e437..b9b61f4d1c59a0e2c8e7822742c54472ad540981 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, */ // 将A矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const float *a0, *a1, *a2, *a3; for (int i = 0; i < m - m_tail; i += MR) { a0 = A + i * lda; @@ -131,9 +131,62 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, } } +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const float *a0, *a1, *a2, *a3, *a4, *a5; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + a4 = A + (i + 4) * lda; + a5 = A + (i + 5) * lda; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; + } + } + int i = m - m_tail; + a0 = &A(i, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + a4 = a0 + 4 * lda; + a5 = a0 + 5 * lda; + if (m_tail != 0) { + if (m_tail <= 5) { + a5 = zero; + } + if (m_tail <= 4) { + a4 = zero; + } + if (m_tail <= 3) { + a3 = zero; + } + if (m_tail <= 2) { + a2 = zero; + } + if (m_tail <= 1) { + a1 = zero; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; + } + } +} + // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const float *b0; for (int j = 0; j < n - n_tail; j += NR) { for (int i = 0; i < k; ++i) { @@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - // make sure MC is multiple of 4, and NC 
is multiple of 8 + // make sure MC is multiple of MR, and NC is multiple of NR int mblock_num = (m + MC - 1) / MC; MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; + MC = (MC + MR - 1) / MR * MR; // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; int nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; + NC = (NC + NR - 1) / NR * NR; // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast( @@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu); } @@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, bool relu, float *new_scale, float *new_bias) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - // make sure MC is multiple of 4, and NC is multiple of 8 + // make sure MC is multiple of MR, and NC is multiple of NR int mblock_num = (m + MC - 1) / MC; MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; + MC = (MC + MR - 1) / MR * MR; // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; int nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; + NC = (NC + NR - 1) / NR * NR; // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast( @@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu, new_scale + i, new_bias + i); } @@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, paddle_mobile::memory::Free(zero); } +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { +#if __ARM_NEON +#if __aarch64__ + + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); + float32x4_t cv8 = vdupq_n_f32(0.0); + float32x4_t cv9 = vdupq_n_f32(0.0); + float32x4_t cv10 = vdupq_n_f32(0.0); + float32x4_t cv11 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv0; + float32x4_t bv1; + + float32x2_t av01; + float32x2_t av23; + float32x2_t av45; + + for (int p = 0; p < k; p += 1) { + av = 
vld1q_f32(a); + av01 = vget_low_f32(av); + av23 = vget_high_f32(av); + av45 = vld1_f32(a + 4); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); + + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); + + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); + + cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0); + cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0); + cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1); + cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1); + + a += MR; + b += NR; + } + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); + vst1q_f32(c + 4 * ldc, cv8); + vst1q_f32(c + 4 * ldc + 4, cv9); + vst1q_f32(c + 5 * ldc, cv10); + vst1q_f32(c + 5 * ldc + 4, cv11); + +#else + + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vmov.f32 q4, #0.0 \n\t" + "vmov.f32 q5, #0.0 \n\t" + "vmov.f32 q6, #0.0 \n\t" + "vmov.f32 q7, #0.0 \n\t" + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + // "pld [%[a_ptr], #128] \n\t" + // "pld [%[b_ptr], #128] \n\t" + // "pld [%[a_ptr], #192] \n\t" + // "pld [%[b_ptr], #192] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q4, q5}, [r5], r6 \n\t" + "vst1.32 {q6, q7}, [r5], r6 \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + +#endif // __aarch64__ +#else + +#endif // __ARM_NEON +} + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index d8b305a7282b871d61ed588b1237f4f8f1cb56f8..2044c264ed1c0f8624690874ed248661a753804c 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,7 +19,7 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] -#define MR 4 +#define MR 6 #define NR 8 #define s_min(i, j) ((i) < (j) ? (i) : (j)) @@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, */ // 将 A 矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer); +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, @@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, // 计算一个更小的 C 矩阵分块 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); // 分块矩阵乘法结果回写 // C = A * B diff --git a/src/operators/op_param.h b/src/operators/op_param.h index e2795b3aefe3c67df9b51c882298a717a388ae15..dc790ee348e4070bd2891debe21c6f8e7bcdcdf0 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -22,6 +22,9 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" +#ifdef PADDLE_MOBILE_FPGA +#include "fpga/api/fpga_api.h" +#endif namespace paddle_mobile { namespace operators { @@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam { Tensor *input_y_; Tensor *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaEWAddArgs fpga_EW_add_args; + + public: + const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } + void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; } +#endif }; #ifdef FUSION_ELEMENTWISEADDRELU_OP @@ -450,80 +462,15 @@ class PoolParam : public OpParam { vector paddings_; bool ceil_mode_; bool global_pooling_ = false; -}; -#endif - -#ifdef FUSION_POOLBN_OP -class FusionPoolBNParam : OpParam { - public: - FusionPoolBNParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_ = InputXFrom(inputs, scope); - pooling_type_ = GetAttr("pooling_type", attrs); - ksize_ = GetAttr>("ksize", attrs); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - ceil_mode_ = GetAttr("ceil_mode", attrs); - global_pooling_ = GetAttr("global_pooling", attrs); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); - } - const Tensor *Input() const { return input_; } - - const string &PoolingType() const { return pooling_type_; } - - const vector &Ksize() const { return ksize_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - bool isCeilMode() const { return ceil_mode_; } - - bool isGlobalPooling() const { return global_pooling_; } - - Tensor *OutputY() const { return output_y_; } - - const Tensor *InputBias() const { return input_bias_; } - - const Tensor *InputMean() const { return input_mean_; } - - const Tensor *InputScale() const { return input_scale_; } - - const Tensor *InputVariance() const { return input_variance_; } - - const float &Epsilon() const { return epsilon_; } - - const float &Momentum() const { return momentum_; } - - const bool &IsTest() const { return is_test_; } - - const string &DataFormat() const { return data_format_; } +#ifdef PADDLE_MOBILE_FPGA private: - Tensor *input_; - string pooling_type_; - vector ksize_; - vector strides_; - vector paddings_; - bool ceil_mode_; - bool global_pooling_ = false; - Tensor *output_y_; - Tensor *input_bias_; - Tensor *input_mean_; - Tensor *input_scale_; - Tensor *input_variance_; - float epsilon_; - float momentum_; - bool is_test_; - string data_format_; + fpga::FpgaPoolArgs fpga_pool_args; + + public: + const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; } + void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; } +#endif }; #endif @@ -704,10 +651,10 @@ class MultiClassNMSParam : public OpParam { class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - auto var = scope.Var("batch_size"); + const AttributeMap &attrs, Scope *scope) { + input_x_ = InputXFrom(inputs, 
*scope); + out_ = OutFrom(outputs, *scope); + auto var = scope->Var("batch_size"); batch_size = var->GetValue(); } const Tensor *InputX() const { return input_x_; } @@ -983,6 +930,15 @@ class FusionFcParam : public OpParam { int x_num_col_dims_; int y_num_col_dims_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #ifdef FUSION_FCRELU_OP @@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam { vector paddings_; vector dilations_; int groups; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); @@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam { bool is_test_; Tensor *new_bias_; Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif @@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam { bool is_test_; Tensor *new_bias_; Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::FpgaConvArgs fpga_conv_args; + + public: + const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif @@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam { }; #endif -#ifdef REGION_OP -class RegionParam : public OpParam {}; -#endif - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h index 9e7fd6b8029aebfdf4b7c53439936189b0c8eb8e..6cab048dea350d668c92fda56f6b6b197c38093d 100644 --- a/src/operators/resize_op.h +++ b/src/operators/resize_op.h @@ -33,7 +33,7 @@ class ResizeOp DeviceType, ResizeParam, operators::ResizeKernel> { public: ResizeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) : framework::OperatorWithKernel>( diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f747b8202c07b50511b252cc8217d1a4be7c37a9..418ebff79161675e8b23a4cca8f4319121aa6002 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,6 +145,10 @@ else () ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-add-relu-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif() diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm.cpp index 8cb778c458034aecf6cea89fcf0d3e2a3d8118ba..35241fbd535e062be1c7f1f28eb3860d118a3455 100644 --- a/test/common/test_gemm.cpp +++ b/test/common/test_gemm.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include +#include #include #include "../test_helper.h" #include "common/log.h" @@ -20,13 +22,21 @@ limitations under the License. */ #define a(i, j) a[(i)*lda + (j)] #define b(i, j) b[(i)*ldb + (j)] +#define c(i, j) c[(i)*ldc + (j)] #define c1(i, j) c1[(i)*ldc + (j)] -#define m 62 -#define n 63 -#define k 74 +void print_matirx(int m, int n, int ldc, float *c) { + for (int i = 0; i < m; ++i) { + std::cout << c(i, 0); + for (int j = 1; j < n; ++j) { + std::cout << " | " << c(i, j); + } + std::cout << std::endl; + } + std::cout << std::endl; +} -int main() { +int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { int lda = k; int ldb = n; int ldc = n; @@ -39,44 +49,88 @@ int main() { static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); float *c1 = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + float *scale = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); + float *bias = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); + srand(unsigned(time(0))); for (int i = 0; i < m * k; ++i) { - a[i] = 2; + a[i] = t1 + rand() % t2; } for (int i = 0; i < k * n; ++i) { - b[i] = 2; + b[i] = t1 + rand() % t2; } - for (int i = 0; i < m * n; ++i) { - c[i] = 2; - c1[i] = 2; + for (int i = 0; i < m; ++i) { + scale[i] = t1 + rand() % t2; } - - auto time1 = time(); - // paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, - // c, - // ldc); - auto time2 = time(); - DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n"; - for (int i = 0; i < m * n; ++i) { - std::cout << c[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; - } + for (int i = 0; i < m; ++i) { + bias[i] = t1 + rand() % t2; } - for (int j = 0; j < n; ++j) { - for (int i = 0; i < m; ++i) { - c1(i, j) *= 0.3; - for (int p = 0; p < k; ++p) { - c1(i, j) += 0.9 * a(i, p) * b(p, j); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + float r = 0; + for (int p = 0; p < k; p++) { + r += a(i, p) * b(p, j); } + r *= scale[i]; + r += bias[i]; + if (relu && (r < 0)) { + r = 0; + } + c1(i, j) = r; } } - std::cout << "正确结果对比:" << std::endl; + + paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, + c, ldc, relu, scale, bias); + int eq = 0; + int neq = 0; for (int i = 0; i < m * n; ++i) { - std::cout << c1[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; + if (static_cast(c[i]) == static_cast(c1[i])) { + ++eq; + } else { + ++neq; } } + + if (pr > 0) { + std::cout << "A:" << std::endl; + print_matirx(m, k, lda, a); + std::cout << "B:" << std::endl; + print_matirx(k, n, ldb, b); + std::cout << "C:" << std::endl; + print_matirx(m, n, ldc, c); + std::cout << "C1:" << std::endl; + print_matirx(m, n, ldc, c1); + } + + std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu + << " eq=" << eq << " neq=" << neq << std::endl; + + paddle_mobile::memory::Free(a); + paddle_mobile::memory::Free(b); + paddle_mobile::memory::Free(c); + paddle_mobile::memory::Free(c1); + paddle_mobile::memory::Free(scale); + paddle_mobile::memory::Free(bias); + + return 0; +} + +int main() { + do_sgemm(9, 9, 9, true, 10, 10, 10); + do_sgemm(10, 6, 12, false, 10, 10, 0); + do_sgemm(512, 256, 384, false, 10, 10, 0); + do_sgemm(1366, 768, 256, false, 10, 10, 0); + do_sgemm(1255, 755, 333, false, 10, 10, 0); + do_sgemm(555, 777, 999, false, 10, 10, 0); + + do_sgemm(10, 6, 12, true, -4, 10, 0); + 
do_sgemm(512, 256, 384, true, -4, 10, 0); + do_sgemm(1366, 768, 256, true, -4, 10, 0); + do_sgemm(1255, 755, 333, true, -4, 10, 0); + do_sgemm(555, 777, 999, true, -4, 10, 0); return 0; } diff --git a/test/executor_for_test.h b/test/executor_for_test.h index c9ab4783d6826992ee81ffd63b0391169645576c..93847af20a6d48a6df33dc50f6c6a1db76facf51 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -43,7 +43,7 @@ template class Executor4Test : public Executor { public: Executor4Test(Program p, string op_type, - bool use_optimize = false) + bool use_optimize = false, int predict_op_count = 1) : Executor() { this->use_optimize_ = use_optimize; this->program_ = p; @@ -57,12 +57,14 @@ class Executor4Test : public Executor { LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "to_predict_program_ == nullptr"; } + const std::vector> blocks = this->to_predict_program_->Blocks(); for (std::shared_ptr block_desc : blocks) { std::vector> ops = block_desc->Ops(); - for (std::shared_ptr op : ops) { - if (op->Type() == op_type) { + for (int i = 0; i < ops.size(); ++i) { + auto op = ops[i]; + if (op->Type() == op_type && i < predict_op_count) { DLOG << "匹配到: " << op->Type(); /// test first meeting op in program @@ -72,11 +74,17 @@ class Executor4Test : public Executor { op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), this->program_.scope); this->ops_of_block_[*block_desc.get()].push_back(op_ptr); - break; } } } this->InitMemory(); + + std::shared_ptr to_predict_block = + this->to_predict_program_->Block(0); + auto &ops = this->ops_of_block_[*to_predict_block.get()]; + for (const auto &op : ops) { + op->Init(); + } } template @@ -130,9 +138,6 @@ class Executor4Test : public Executor { auto *output_tensor = con_output->GetMutable(); output_tensor->mutable_data(dDim); - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - std::shared_ptr to_predict_block = this->to_predict_program_->Block(0); for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); @@ -141,6 +146,7 @@ class Executor4Test : public Executor { op->Run(); } - return out_tensor; + return std::make_shared( + paddle_mobile::framework::Tensor(*output_tensor)); } }; diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index bea7d4ba7d2df1344f0819222fbdb389106fa77e..25cad4feaa706899122902dee2a8f0c915e78975 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "../test_helper.h" #include "io/loader.h" @@ -20,12 +22,10 @@ int main() { // ../../../test/models/googlenet // ../../../test/models/mobilenet // auto program = loader.Load(g_googlenet, true); + // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(g_mobilenet_ssd, true); - // auto program = loader.Load(g_googlenet_combine + "/model", - // g_googlenet_combine + - // "/params", true); - + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); return 0; } diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index a3d780a4854d018f948af2890bfe9f1e7a8fefef..9b4e5f2d3a431001e138977b78994f5dfedbe0a3 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -20,22 +20,20 @@ int main() { paddle_mobile::PaddleMobile paddle_mobile; paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model", - g_mobilenet_ssd_gesture + "/params", true); + auto isok = paddle_mobile.Load( + std::string(g_mobilenet_ssd_gesture) + "/model", + std::string(g_mobilenet_ssd_gesture) + "/params", true); // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); if (isok) { auto time2 = time(); std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector input; std::vector dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), - static_cast(1)); + GetInput(g_hand, &input, dims); - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); auto time3 = time(); - paddle_mobile.Predict(input, dims); + auto output = paddle_mobile.Predict(input, dims); auto time4 = time(); std::cout << "predict cost :" << time_diff(time3, time4) << "ms" << std::endl; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 95ffc59c394782b69d17f16c549b0e6923fd31e8..9fc7226fc12fa7a0c631c9920487c0bd56c90816 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -24,19 +24,21 @@ int main() { auto time2 = time(); std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector input; std::vector dims{1, 3, 224, 224}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - auto vec_result = paddle_mobile.Predict(input, dims); - auto time4 = time(); - - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; + GetInput(g_test_image_1x3x224x224, &input, dims); + + for (int i = 0; i < 10; ++i) { + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } } return 0; diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81400d987195364c06b4b93d0859469b43f90e7b --- /dev/null +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,62 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  //  ../models/image_classification_resnet.inference.model
+  auto program = loader.Load(g_mobilenet, true);
+
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::FusionConvAddBNReluOp<
+                    paddle_mobile::CPU, float>>
+      executor(program, "fusion_conv_add_bn_relu", true);
+
+  std::cout << "executor 4 test: " << std::endl;
+
+  paddle_mobile::framework::Tensor input;
+  GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
+  //  // use SetupTensor if not has local input image .
+  //  SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
+  //                     static_cast<float>(1));
+
+  DLOG << " input: " << input;
+
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
+  std::cout << "before predict: " << std::endl;
+  auto output = executor.Predict<LoDTensor>(input, "data",
+                                            "conv2_1_dw_bn.tmp_2", out_ddim);
+  std::cout << "after predict " << std::endl;
+  auto output_ptr = output->data<float>();
+
+  int stride = output->numel() / 100;
+  for (int i = 0; i < 100; i++) {
+    DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
+  }
+
+  //  for (int i = 0; i < 100; i++) {
+  //    DLOG << " index:" << i << " value: " << output_ptr[i];
+  //  }
+
+  //  for (int j = 0; j < output->numel(); ++j) {
+  //    std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
+  //  }
+  std::cout << std::endl;
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index fb6724f9c5764497ec81de0d73406709f098e0e0..658af447d6cfcd85c68ff350b104c2468d442e40 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -24,18 +24,22 @@ limitations under the License.
*/ #include "framework/ddim.h" #include "framework/tensor.h" -static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; -static const std::string g_mobilenet_ssd_gesture = - "../models/mobilenet+ssd_gesture"; -static const std::string g_squeezenet = "../models/squeezenet"; -static const std::string g_googlenet = "../models/googlenet"; -static const std::string g_mobilenet = "../models/mobilenet"; -static const std::string g_resnet_50 = "../models/resnet_50"; -static const std::string g_resnet = "../models/resnet"; -static const std::string g_googlenet_combine = "../models/googlenet_combine"; -static const std::string g_yolo = "../models/yolo"; -static const std::string g_test_image_1x3x224x224 = +static const char *g_ocr = "../models/ocr"; +static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; +static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; +static const char *g_squeezenet = "../models/squeezenet"; +static const char *g_googlenet = "../models/googlenet"; +static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_resnet_50 = "../models/resnet_50"; +static const char *g_resnet = "../models/resnet"; +static const char *g_googlenet_combine = "../models/googlenet_combine"; +static const char *g_yolo = "../models/yolo"; +static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; +static const char *g_test_image_1x3x224x224_banana = + "../images/input_3x224x224_banana"; +static const char *g_hand = "../images/hand_image"; + using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; diff --git a/tools/android-cmake/android.toolchain.cmake b/tools/android-cmake/android.toolchain.cmake index a57d9c102ff65d4c10cc9bd3773ffa4c87e482fa..55b90ba65260b99d9af4a29832ed6f8ff5b235c8 100644 --- a/tools/android-cmake/android.toolchain.cmake +++ b/tools/android-cmake/android.toolchain.cmake @@ -65,6 +65,8 @@ endif() file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK) # Android NDK revision +message("${ANDROID_NDK}") + file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") @@ -159,7 +161,7 @@ endif() # Default values for configurable variables. if(NOT ANDROID_TOOLCHAIN) - set(ANDROID_TOOLCHAIN clang) + set(ANDROID_TOOLCHAIN gcc) endif() if(NOT ANDROID_ABI) set(ANDROID_ABI armeabi-v7a) diff --git a/tools/build.sh b/tools/build.sh index db809f71076e6b6d4aacc53bd8e144db3935cb91..bf3545ef162c86c16c0877f5f25f3a1e09de1fd4 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -63,7 +63,7 @@ build_for_android() { TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. \ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -99,7 +99,7 @@ build_for_ios() { BUILD_DIR=../build/release/"${PLATFORM}"/ TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" mkdir -p "${BUILD_DIR}" - if [ "${#NETS}" > 1 ]; then + if [ "${#NETS}" -gt 1 ]; then cmake .. 
\ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ diff --git a/tools/op.cmake b/tools/op.cmake index ec9768443c5e9825931111803acf1f51c1aa1acd..361381b81a603274207e50aeb8f0feddcff4e2ed 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET) set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) - set(FUSION_POOLBN_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) set(FUSION_FCRELU_OP ON) - set(REGION_OP ON) set(POOL_OP ON) set(CONCAT_OP ON) set(SOFTMAX_OP ON)