diff --git a/.gitignore b/.gitignore index 547e94ea11f048c9e007996d4ee716d22a13742e..4c86068b7ee3024416094613d2f8f8d74ce89921 100644 --- a/.gitignore +++ b/.gitignore @@ -75,4 +75,5 @@ cmake-build-release demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/ demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a -*.xcuserstate \ No newline at end of file +*.xcuserstate +/tools/quantification/quantify diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp index a913d6e39cddda97b347c0675717c265dfa89d18..f91c21beb2d6b5fbce86b56d49b7d8c6a3ec9219 100644 --- a/src/fpga/api/fpga_api.cpp +++ b/src/fpga/api/fpga_api.cpp @@ -29,15 +29,15 @@ limitations under the License. */ #include "fpga/api/fpga_api.h" -namespace paddle { -namespace mobile { +namespace paddle_mobile { namespace fpga { -namespace api { static int fd = -1; static const char *device_path = "/dev/fpgadrv0"; -static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); } +static inline int do_ioctl(int req, void *arg) { + return ioctl(req, (unsigned int64_t)arg); +} int open_device() { if (fd == -1) { @@ -48,8 +48,8 @@ int open_device() { // memory management; void *fpga_malloc(size_t size) { - return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); + return reinterpret_cast( + mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); } void fpga_free(void *ptr) { munmap(ptr, 0); } @@ -58,11 +58,13 @@ void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } -int ComputeFpgaConv(struct FpgaConvArgs) {} -int ComputeFpgaPool(struct FpgaPoolArgs) {} -int ComputeFpgaEWAdd(struct FpgaEWAddArgs) {} +int ComputeFpgaConv(const struct ConvArgs &args) { return do_ioctl(21, &args); } +int ComputeFpgaPool(const struct PoolingArgs &args) { + return do_ioctl(22, &args); +} +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { + return do_ioctl(23, &args); +} -} // namespace api } // namespace fpga -} // namespace mobile -} // namespace paddle +} // namespace paddle_mobile diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h index 42e99f4e4238d6974d23c1fb33bf238ca8a8626d..08635cdb5c01b50f59eb35554bba9a7b70f6ebfb 100644 --- a/src/fpga/api/fpga_api.h +++ b/src/fpga/api/fpga_api.h @@ -31,90 +31,150 @@ void* fpga_malloc(size_t size); void fpga_free(void* ptr); void fpga_copy(void* dst, const void* src, size_t num); -struct FpgaVersionArgs { - void* buf; +enum DataConvertType { + DATA_NO_CONVERT = 0, + DATA_FP32_TO_FP16 = 1, + DATA_FP16_TO_FP32 = 2, }; -struct MemoryToPhysicalArgs { - const void* src; - uint64_t physical; +enum LayoutConvertType { + LAYOUT_NO_CONVERT = 0, + LAYOUT_CHW_TO_HWC = 1, + LAYOUT_HWC_TO_CHW = 2, +}; + +struct VersionArgs { + void* buffer; }; struct MemoryCopyArgs { void* src; - void* dst; + void* dest; size_t size; }; -struct FpgaQuantArgs { - float scale; -}; - -struct FpgaBNArgs { - bool enabled = false; - void* bias_addr; - void* scale_addr; +struct BNArgs { + bool enabled; + void* bias_address; + void* scale_address; }; -struct FpgaKernelArgs { +/** +Conv and Pooling kernel +*/ +struct KernelArgs { uint32_t width; uint32_t height; - uint32_t stride_h; uint32_t stride_w; + uint32_t stride_h; }; -struct FpgaImageArgs { - uint32_t width; - uint32_t height; +struct ImageInputArgs { + void* address; // input featuremap virtual address + float* scale_address; // input scale address; uint32_t channels; - uint32_t pad_h; - uint32_t pad_w; + uint32_t width; // featuremap width + uint32_t height; + uint32_t pad_width; // padding width; + uint32_t pad_height; }; -struct FpgaConvArgs { +struct ImageOutputArgs { + void* address; // output result address; + float* scale_address; // output scale address; +}; + +struct ConvArgs { bool relu_enabled; - struct FpgaBNArgs BNargs; - void* image_addr; - void* filter_addr; - void* bias_addr; - void* output_addr; - float quant_scale; - struct FpgaImageArgs image; + void* bias_address; + void* filter_address; uint32_t filter_num; uint32_t group_num; - struct FpgaKernelArgs kernel; + void* sb_address; // scale and bias are interlaced; + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; }; -struct FpgaPoolArgs { - void* image_addr; - void* output_addr; - struct FpgaImageArgs image; - struct FpgaKernelArgs kernel; +struct PoolingArgs { + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; }; -struct FpgaEWAddArgs { +// elementwise add arguments +struct EWAddArgs { bool relu_enabled; - void* image0_addr; - void* image1_addr; - void* result_addr; - uint32_t const0; - uint32_t const1; - uint32_t data_len; // aligned element count + + float const0; // output0 = const0 x input0 + const1 x input1; + float const1; + struct ImageInputArgs image0; + struct ImageInputArgs image1; + struct ImageOutputArgs output; }; -int ComputeFpgaConv(struct FpgaConvArgs args); -int ComputeFpgaPool(struct FpgaPoolArgs args); -int ComputeFpgaEWAdd(struct FpgaEWAddArgs args); +struct BypassArgs { + enum DataConvertType convert_type; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; + +struct FpgaRegWriteArgs { + uint64_t address; // + uint64_t value; +}; + +struct FpgaRegReadArgs { + uint64_t address; + uint64_t value; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' + +#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) + +#define IOCTL_SEPARATOR_0 10 -#define IOCTL_FPGA_MAGIC 'CNN' -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs) -#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs) -#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs) #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs) + +#define IOCTL_SEPARATOR_1 20 + +#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) +#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) +#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) +#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) +#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) + +enum FPGA_ERR_TYPE { + ERR_IOCTL_CMD = -1, + ERR_TIMEOUT = -2, + ERR_COMPLETION_TIMEOUT = -3, + ERR_INVALID_FPGA_ADDR = -4, + ERR_NOMEM = -5, + ERR_NO_RESERVE_MEM = -6, + ERR_COPY_FROM_USER = -7, + ERR_COPY_TO_USER = -8, + ERR_DEL_TIMER = -9, + ERR_ENABLE_MSI = -10, + ERR_REGISTER_IRQ = -11, + ERR_PCIE_REGISTER = -12, + ERR_PCIE_PROBE = -13, + ERR_REGISTER_BLOCK = -14, + ERR_ALLOC_GENDISK = -15, + ERR_INIT_QUEUE = -16, + ERR_WAIT = -17, + ERR_ECC_ERROR = -31, + ERR_FPGA_FAIL_STOP = -64, + ERR_FPGA_DEBUG_STOP = -113, + DEV_TMP_UNAVAILABLE = -128 +}; + +//============================== API ============================= + +int ComputeFpgaConv(const struct ConvArgs& args); +int ComputeFpgaPool(const struct PoolingArgs& args); +int ComputeFpgaEWAdd(const struct EWAddArgs& args); } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h new file mode 100644 index 0000000000000000000000000000000000000000..d2d2d61835de84c94760c10a25a973d4eaff1fbe --- /dev/null +++ b/src/fpga/fpga_quantilization.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include "common/types.h" +#include "framework/lod_tensor.h" +#include "framework/operator.h" +#include "framework/scope.h" +#include "framework/tensor.h" + +namespace paddle_mobile { + +bool is_conv(std::string type) { + if (type.compare(G_OP_TYPE_CONV) == 0) { + return true; + } + if (type.compare(G_OP_TYPE_FUSION_CONV_ADD) == 0) { + return true; + } + if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_RELU) == 0) { + return true; + } + if (type.compare(G_OP_TYPE_FUSION_CONV_BN_RELU) == 0) { + return true; + } + if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_BN) == 0) { + return true; + } + return false; +} + +template +void quantilize_op(std::shared_ptr> op, + std::shared_ptr scope) { + if (!is_conv(op.get()->Type())) { + return; + } + framework::Tensor* filter = nullptr; + auto var_vec = op.get()->Inputs().at("Filter"); + if (!var_vec.empty()) { + auto var = scope.get()->FindVar(var_vec[0]); + filter = var->template GetMutable(); + } + float scale = 0; + + // 32bit filter -> 8bit filter; + if (filter->type() == typeid(float)) { + framework::Tensor* originalFilter = filter; + framework::Tensor* quantFilter = new framework::Tensor(); + float* floatData = originalFilter->data(); + int8_t* intData = quantFilter->mutable_data(); + } +} + +} // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 954a65a3605c4d0204890d9414aeb074371b0d69..3dba76d790d44f154f359454250d15b81ff717a3 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -253,6 +253,18 @@ class Tensor { "Tensor's dims_ is out of bound. "); } +#ifdef PADDLE_MOBILE_FPGA + struct FPGAArgs { + float scale; + + inline float *scale_pointer() { return &scale; } + }; + + struct FPGAArgs &fpga_args() { + return fpgaArgs_; + } +#endif + private: /** * @note Placeholder hides type T, so it doesn't appear as a @@ -319,6 +331,10 @@ class Tensor { * begins. */ size_t offset_; + +#ifdef PADDLE_MOBILE_FPGA + FPGAArgs fpgaArgs_; +#endif }; #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 6b0af3454e0cb9c41633bd793b76250028644abe..c09fe2c58532437336307ce007532d43689d8fd2 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -32,6 +32,10 @@ limitations under the License. */ #include "common/threadpool.h" #endif +#ifdef PADDLE_MOBILE_FPGA +#include "fpga/fpga_quantilization.h" +#endif + namespace paddle_mobile { using framework::Variable; @@ -96,6 +100,11 @@ Executor::Executor(const framework::Program p, int batch_size, for (const auto &op : ops) { op->Init(); } +#ifdef PADDLE_MOBILE_FPGA + for (const auto &op : ops) { + quantilize_op(op, program_.scope); + } +#endif } template @@ -420,6 +429,6 @@ std::vector::Ptype> Executor::Predict( template class Executor; template class Executor; -template class Executor; +template class Executor; } // namespace paddle_mobile diff --git a/src/io/loader.cpp b/src/io/loader.cpp index 9ed877d05d51dfbe7139ea2289fdb6480c62f88f..cdcecf02ab8af22dec0e32113052ac26e9c5fcfc 100644 --- a/src/io/loader.cpp +++ b/src/io/loader.cpp @@ -56,7 +56,8 @@ template const framework::Program Loader::Load( const std::string &model_path, const std::string ¶_path, bool optimize, bool quantification) { - auto program = this->LoadProgram(model_path, optimize); + auto program = this->LoadProgram(model_path, optimize, quantification); + program.para_path = para_path; program.combined = true; program.quantification = quantification; diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp index d4eb9e0f0733814cbe367a1873e241383127340a..66150e24e0ac957773161904948c10cf4637ee42 100644 --- a/src/jni/paddle_mobile_jni.cpp +++ b/src/jni/paddle_mobile_jni.cpp @@ -61,6 +61,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, optimize); } +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( + JNIEnv *env, jclass thiz, jstring modelPath) { + ANDROIDLOGI("loadQualified invoked"); + bool optimize = true; + bool qualified = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize, qualified); +} + JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { ANDROIDLOGI("loadCombined invoked"); @@ -70,6 +79,16 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( optimize); } +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + ANDROIDLOGI("loadCombinedQualified invoked"); + bool optimize = true; + bool qualified = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize, qualified); +} + JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) { ANDROIDLOGI("predictImage invoked"); diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h index a830ab43c8ee0598fbf75e1fef5f3eb7da06c27b..06fabe04c739dfcee06110a3592a88591e3d37b9 100644 --- a/src/jni/paddle_mobile_jni.h +++ b/src/jni/paddle_mobile_jni.h @@ -27,12 +27,24 @@ namespace jni { JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath); + +/** + * load separated qualified model for android + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( + JNIEnv *env, jclass thiz, jstring modelPath); /** * load combined model for android */ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath); +/** + * load combined qualified model for android + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath); + /** * object detection for anroid */ diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 178541953323b6ffd1a3339f8209c2839b37a784..42b8c4551871c58955251d94845ca13576d7735b 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -27,17 +27,17 @@ namespace memory { const int MALLOC_ALIGN = 64; #ifdef PADDLE_MOBILE_FPGA -namespace api = paddle::mobile::fpga::api; +namespace fpga = paddle_mobile::fpga; void Copy(void *dst, const void *src, size_t num) { std::memcpy(dst, src, num); } -void *Alloc(size_t size) { return api::malloc(size); } +void *Alloc(size_t size) { return fpga::fpga_malloc(size); } void Free(void *ptr) { if (ptr) { - api::fpga_free(ptr); + fpga::fpga_free(ptr); } } diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 723747874da8fc8ee2c02eb1be4c89189c2af746..e45ad38fd68cb9b4616b7e363be117e2039c93a9 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase { std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, scope.get()) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } void Init() {} diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index af16048a1b4eba2ff36f842b6cf968031989576e..db942b018d7085ca3986533937328101afb08ff9 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef DROPOUT_OP -#pragma once - #include "operators/kernel/dropout_kernel.h" #include diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index 5a3783971959db8fba9ca6b701fb6eb6340fcb3f..3ef6b9dd62d88f012eba3456c676ac0d33bf9e52 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index c691988f4a388c7835a7016602d7a1ac9cb5f9b6..627a94242ca4638640a7961120b36c9f763a0e85 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -39,7 +39,7 @@ void ConcatKernel::Compute(const ConcatParam ¶m) const { for (int i = 0; i < inputs.size(); ++i) { auto input = inputs[i]; - auto channels = input[3]; + auto channels = input->dims()[3]; out_offset += channels; auto src = input->data(); for (int j = 0; j < pixels; ++j) { diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index cb592613f73d90dae5a7d6e515f8bc091981776e..aa798fd6af5592a062de207714dc9fee2afb93df 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ b/src/operators/kernel/im2sequence_kernel.h @@ -20,13 +20,11 @@ limitations under the License. */ #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceKernel : public framework::OpKernelBase { diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index a76c3c46012a758a05cf8f846a15376ad1b9f33c..44a7ce2af62a1d27aff8181f6742bebda1d6d066 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index 9f5dcb23ee9bf44ffa8bbdd98879d533d07c39f9..15696174377f04ad9a62366e03ded1f2cdcdee9e 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index cc76a4b59b0be24dd3a3fb82c0e3d9fb1a4dbf24..98ac71d0bbad86f595171ad7ac5b2a1cdf5908fa 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index e308364602f401b1c6c6f8e2e35385aefa055360..fd3b8dc767076c5244509f6015c42bee87df100b 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index b9b61f4d1c59a0e2c8e7822742c54472ad540981..20d71907ff9e391d97ce75e38b6e08dc1286a9a3 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -107,20 +107,22 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, *buffer++ = *a3++; } } - int i = m - m_tail; - a0 = &A(i, 0); - a1 = a0 + lda; - a2 = a0 + 2 * lda; - a3 = a0 + 3 * lda; + if (m_tail != 0) { - if (m_tail <= 3) { - a3 = zero; - } - if (m_tail <= 2) { - a2 = zero; - } - if (m_tail <= 1) { - a1 = zero; + a0 = &A(m - m_tail, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + break; + default: + break; } for (int j = 0; j < k; ++j) { *buffer++ = *a0++; @@ -150,28 +152,89 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, *buffer++ = *a5++; } } - int i = m - m_tail; - a0 = &A(i, 0); - a1 = a0 + lda; - a2 = a0 + 2 * lda; - a3 = a0 + 3 * lda; - a4 = a0 + 4 * lda; - a5 = a0 + 5 * lda; if (m_tail != 0) { - if (m_tail <= 5) { - a5 = zero; + a0 = &A(m - m_tail, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + a4 = a0 + 4 * lda; + a5 = a0 + 5 * lda; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + break; + default: + break; } - if (m_tail <= 4) { - a4 = zero; - } - if (m_tail <= 3) { - a3 = zero; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; } - if (m_tail <= 2) { - a2 = zero; + } +} + +void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const float *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + a4 = A + (i + 4) * lda; + a5 = A + (i + 5) * lda; + a6 = A + (i + 6) * lda; + a7 = A + (i + 7) * lda; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; + *buffer++ = *a6++; + *buffer++ = *a7++; } - if (m_tail <= 1) { - a1 = zero; + } + if (m_tail != 0) { + a0 = &A(m - m_tail, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + a4 = a0 + 4 * lda; + a5 = a0 + 5 * lda; + a6 = a0 + 6 * lda; + a7 = a0 + 7 * lda; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + case 6: + a6 = zero; + case 7: + a7 = zero; + break; + default: + break; } for (int j = 0; j < k; ++j) { *buffer++ = *a0++; @@ -180,6 +243,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, *buffer++ = *a3++; *buffer++ = *a4++; *buffer++ = *a5++; + *buffer++ = *a6++; + *buffer++ = *a7++; } } } @@ -234,15 +299,78 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, } } +#if __aarch64__ +void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const float *b0; + for (int j = 0; j < n - n_tail; j += NR) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s}, [%[buffer]], #48 \n\t" + : [buffer] "+r"(buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2"); + } + } + if (n_tail != 0) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, n - n_tail); + for (int j = n - n_tail; j < n; ++j) { + *buffer++ = *b0++; + } + for (int j = n; j < n + (NR - n_tail); ++j) { + *buffer++ = 0; + } + } + } +} + +void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const float *b0; + for (int j = 0; j < n - n_tail; j += NR) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[buffer]], #64 \n\t" + : [buffer] "+r"(buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2", "v3"); + } + } + if (n_tail != 0) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, n - n_tail); + for (int j = n - n_tail; j < n; ++j) { + *buffer++ = *b0++; + } + for (int j = n; j < n + (NR - n_tail); ++j) { + *buffer++ = 0; + } + } + } +} +#endif // __aarch64__ + // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu) { #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif } } @@ -271,9 +399,14 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, #pragma omp parallel for for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif } } @@ -1956,10 +2089,20 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); +#if __aarch64__ + // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#else PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#endif for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); +#if __aarch64__ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); + // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#else + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#endif InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu); } @@ -2009,10 +2152,20 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, int mc, nc; for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); +#if __aarch64__ + // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB); + PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#else PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); +#endif for (int i = 0; i < m; i += MC) { mc = s_min(m - i, MC); +#if __aarch64__ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); + // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#else + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); +#endif InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, relu, new_scale + i, new_bias + i); } @@ -2239,6 +2392,192 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { #endif // __ARM_NEON } +#if __aarch64__ +void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + asm volatile( + "dup v5.4s, wzr \n\t" + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "prfm pldl1keep, [%[a_ptr], #32] \n\t" + "prfm pldl1keep, [%[b_ptr], #48] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t" + + "fmla v5.4s, v2.4s, v0.s[0] \n\t" + "fmla v6.4s, v3.4s, v0.s[0] \n\t" + "fmla v7.4s, v4.4s, v0.s[0] \n\t" + "fmla v8.4s, v2.4s, v0.s[1] \n\t" + "fmla v9.4s, v3.4s, v0.s[1] \n\t" + "fmla v10.4s, v4.4s, v0.s[1] \n\t" + "fmla v11.4s, v2.4s, v0.s[2] \n\t" + "fmla v12.4s, v3.4s, v0.s[2] \n\t" + "fmla v13.4s, v4.4s, v0.s[2] \n\t" + "fmla v14.4s, v2.4s, v0.s[3] \n\t" + "fmla v15.4s, v3.4s, v0.s[3] \n\t" + "fmla v16.4s, v4.4s, v0.s[3] \n\t" + + "fmla v17.4s, v2.4s, v1.s[0] \n\t" + "fmla v18.4s, v3.4s, v1.s[0] \n\t" + "fmla v19.4s, v4.4s, v1.s[0] \n\t" + "fmla v20.4s, v2.4s, v1.s[1] \n\t" + "fmla v21.4s, v3.4s, v1.s[1] \n\t" + "fmla v22.4s, v4.4s, v1.s[1] \n\t" + "fmla v23.4s, v2.4s, v1.s[2] \n\t" + "fmla v24.4s, v3.4s, v1.s[2] \n\t" + "fmla v25.4s, v4.4s, v1.s[2] \n\t" + "fmla v26.4s, v2.4s, v1.s[3] \n\t" + "fmla v27.4s, v3.4s, v1.s[3] \n\t" + "fmla v28.4s, v4.4s, v1.s[3] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t" + "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t" + "st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t" + "st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t" + "st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t" + "st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] "r"(step) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); +} + +void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + int step1 = 4 * 6; + asm volatile( + + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + "dup v29.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "prfm pldl1keep, [%[a_ptr], #24] \n\t" + "prfm pldl1keep, [%[b_ptr], #64] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t" + "ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t" + + "fmla v6.4s, v2.4s, v0.s[0] \n\t" + "fmla v7.4s, v3.4s, v0.s[0] \n\t" + "fmla v8.4s, v4.4s, v0.s[0] \n\t" + "fmla v9.4s, v5.4s, v0.s[0] \n\t" + + "fmla v10.4s, v2.4s, v0.s[1] \n\t" + "fmla v11.4s, v3.4s, v0.s[1] \n\t" + "fmla v12.4s, v4.4s, v0.s[1] \n\t" + "fmla v13.4s, v5.4s, v0.s[1] \n\t" + + "fmla v14.4s, v2.4s, v0.s[2] \n\t" + "fmla v15.4s, v3.4s, v0.s[2] \n\t" + "fmla v16.4s, v4.4s, v0.s[2] \n\t" + "fmla v17.4s, v5.4s, v0.s[2] \n\t" + + "fmla v18.4s, v2.4s, v0.s[3] \n\t" + "fmla v19.4s, v3.4s, v0.s[3] \n\t" + "fmla v20.4s, v4.4s, v0.s[3] \n\t" + "fmla v21.4s, v5.4s, v0.s[3] \n\t" + + "fmla v22.4s, v2.4s, v1.s[0] \n\t" + "fmla v23.4s, v3.4s, v1.s[0] \n\t" + "fmla v24.4s, v4.4s, v1.s[0] \n\t" + "fmla v25.4s, v5.4s, v1.s[0] \n\t" + + "fmla v26.4s, v2.4s, v1.s[1] \n\t" + "fmla v27.4s, v3.4s, v1.s[1] \n\t" + "fmla v28.4s, v4.4s, v1.s[1] \n\t" + "fmla v29.4s, v5.4s, v1.s[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t" + "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t" + "st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t" + "st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"); +} + +#endif // __aarch64__ + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 2044c264ed1c0f8624690874ed248661a753804c..a9593b15ae73f46aa287028ba74efdb0d303fdde 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,8 +19,13 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] +#if __aarch64__ +#define MR 6 +#define NR 16 +#else #define MR 6 #define NR 8 +#endif #define s_min(i, j) ((i) < (j) ? (i) : (j)) @@ -43,10 +48,16 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, float *buffer); void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, float *buffer); +void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor) void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, float *buffer); +void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); +void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, @@ -70,6 +81,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); +void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); +void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc); // 分块矩阵乘法结果回写 // C = A * B @@ -114,10 +127,6 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias); -// 64位 double 矩阵乘法 -void dgemm(int m, int n, int k, float alpha, const double *A, int lda, - const double *B, int ldb, float beta, double *C, int ldc); - } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index a1c9baad79df159b1784ef0dd5d12ccf7ed7fe11..88c1886ad7ade5960d1d8175a1b46e12363ca849 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -262,11 +262,11 @@ class ElementwiseAddParam : OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaEWAddArgs fpga_EW_add_args; + fpga::EWAddArgs fpga_EW_add_args; public: - const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } - void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; } + const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } + void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } #endif }; @@ -465,11 +465,11 @@ class PoolParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaPoolArgs fpga_pool_args; + fpga::PoolingArgs fpga_pool_args; public: - const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; } - void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; } + const fpga::PoolingArgs &FpgaArgs() const { return fpga_pool_args; } + void SetFpgaArgs(const fpga::PoolingArgs &args) { fpga_pool_args = args; } #endif }; #endif @@ -651,10 +651,10 @@ class MultiClassNMSParam : public OpParam { class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope const &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - auto var = scope.Var("batch_size"); + const AttributeMap &attrs, Scope *scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + auto var = scope->Var("batch_size"); batch_size = var->GetValue(); } const Tensor *InputX() const { return input_x_; } @@ -933,11 +933,11 @@ class FusionFcParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaConvArgs fpga_conv_args; + fpga::ConvArgs fpga_conv_args; public: - const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -991,11 +991,11 @@ class FusionConvAddParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaConvArgs fpga_conv_args; + fpga::ConvArgs fpga_conv_args; public: - const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1096,11 +1096,11 @@ class FusionConvAddBNReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaConvArgs fpga_conv_args; + fpga::ConvArgs fpga_conv_args; public: - const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1190,11 +1190,11 @@ class FusionConvAddBNParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::FpgaConvArgs fpga_conv_args; + fpga::ConvArgs fpga_conv_args; public: - const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; } + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } #endif }; #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 418ebff79161675e8b23a4cca8f4319121aa6002..8839079fecfdbefcdaff85354d3a6a8208af10ee 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -114,8 +114,12 @@ else () target_link_libraries(test-softmax paddle-mobile) # gen test - ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) - target_link_libraries(test-gemm paddle-mobile) + ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) + target_link_libraries(test-gemm-accuracy paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) + target_link_libraries(test-gemm-perf paddle-mobile) # gen test ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm_accuracy.cpp similarity index 100% rename from test/common/test_gemm.cpp rename to test/common/test_gemm_accuracy.cpp diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp new file mode 100644 index 0000000000000000000000000000000000000000..260236e24ea44a6fc5708d4d0dac239252d28945 --- /dev/null +++ b/test/common/test_gemm_perf.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "operators/math/gemm.h" +#include "operators/math/math_function.h" + +#define a(i, j) a[(i)*lda + (j)] +#define b(i, j) b[(i)*ldb + (j)] +#define c1(i, j) c1[(i)*ldc + (j)] + +#define m 1024 +#define n 1024 +#define k 1024 + +int main() { + Tensor aa, bb, cc, scale, bias; + auto aaptr = aa.mutable_data({m, k}); + auto bbptr = bb.mutable_data({k, n}); + auto ccptr = cc.mutable_data({m, n}); + auto scaleptr = scale.mutable_data({m}); + auto biasptr = bias.mutable_data({m}); + + for (int i = 0; i < m * k; ++i) { + aaptr[i] = 2; + } + for (int i = 0; i < k * n; ++i) { + bbptr[i] = 2; + } + for (int i = 0; i < m * n; ++i) { + ccptr[i] = 2; + } + for (int i = 0; i < m; ++i) { + scaleptr[i] = 1; + biasptr[i] = 0; + } + + auto time1 = time(); + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul(aa, false, bb, false, + static_cast(1), &cc, + static_cast(0), false); + + // paddle_mobile::operators::math::matmulWithBn( + // aa, false, bb, false, static_cast(1), &cc, + // static_cast(0), true, &scale, &bias, 0); + } + auto time2 = time(); + std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; + + return 0; +} diff --git a/test/fpga/test_tensor_quant.cpp b/test/fpga/test_tensor_quant.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e30b9be551c608c5200460ebb80526270da5aed --- /dev/null +++ b/test/fpga/test_tensor_quant.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + bool optimize = false; + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time2 = time(); + DLOG << "load cost: " << time_diff(time1, time1) << "ms"; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + } + return 0; +} diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index bea7d4ba7d2df1344f0819222fbdb389106fa77e..25cad4feaa706899122902dee2a8f0c915e78975 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "../test_helper.h" #include "io/loader.h" @@ -20,12 +22,10 @@ int main() { // ../../../test/models/googlenet // ../../../test/models/mobilenet // auto program = loader.Load(g_googlenet, true); + // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(g_mobilenet_ssd, true); - // auto program = loader.Load(g_googlenet_combine + "/model", - // g_googlenet_combine + - // "/params", true); - + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); return 0; } diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index d230b9469229946fc74f4dc9e1ee6100196ed9aa..02882bedb01df49b8032325e506c9118f3434a2f 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include #include "../test_helper.h" #include "../test_include.h" @@ -23,15 +23,20 @@ int main() { auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { auto time2 = time(); - DLOG << "load cost: " << time_diff(time1, time1) << "ms"; + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); - auto time3 = time(); + // 预热一次 auto vec_result = paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; } return 0; } diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index 9b4e5f2d3a431001e138977b78994f5dfedbe0a3..ae6c40961ca96ea032b1822f17a663baedc8f661 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -32,10 +32,14 @@ int main() { std::vector dims{1, 3, 300, 300}; GetInput(g_hand, &input, dims); - auto time3 = time(); + // 预热一次 auto output = paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" << std::endl; } return 0; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 9fc7226fc12fa7a0c631c9920487c0bd56c90816..d7793f729866024e2560ad13ac5613172eecc4dd 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -26,19 +26,22 @@ int main() { std::vector input; std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224, &input, dims); + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + // 预热一次 + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + auto time3 = time(); for (int i = 0; i < 10; ++i) { - auto time3 = time(); auto vec_result = paddle_mobile.Predict(input, dims); - auto time4 = time(); - std::vector::iterator biggest = - std::max_element(std::begin(vec_result), std::end(vec_result)); - std::cout << " Max element is " << *biggest << " at position " - << std::distance(std::begin(vec_result), biggest) << std::endl; - std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; } return 0; diff --git a/test/test_helper.h b/test/test_helper.h index 9a5c62c79c44fdf52657ea5facb5f0768810c440..658af447d6cfcd85c68ff350b104c2468d442e40 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "framework/ddim.h" #include "framework/tensor.h" +static const char *g_ocr = "../models/ocr"; static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; static const char *g_squeezenet = "../models/squeezenet"; diff --git a/test_gemm.cpp b/test_gemm.cpp deleted file mode 100644 index 6a49193256d8293dc2cef559b1d1e73bc6dfc7bb..0000000000000000000000000000000000000000 --- a/test_gemm.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "../test_helper.h" -#include "common/log.h" -#include "memory/t_malloc.h" -#include "operators/math/gemm.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c(i, j) c[(i)*ldc + (j)] -#define c1(i, j) c1[(i)*ldc + (j)] - - -void print_matirx(int m, int n, int ldc, float *c) { - for (int i = 0; i < m; ++i) { - std::cout << c(i, 0); - for (int j = 1; j < n; ++j) { - std::cout << " | " << c(i, j); - } - std::cout << std::endl; - } - std::cout << std::endl; -} - -int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { - int lda = k; - int ldb = n; - int ldc = n; - - float *a = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); - float *b = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); - float *c = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); - float *c1 = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); - float* scale = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); - float* bias = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m)); - - srand(unsigned(time(0))); - for (int i = 0; i < m * k; ++i) { - a[i] = t1 + rand() % t2; - } - for (int i = 0; i < k * n; ++i) { - b[i] = t1 + rand() % t2; - } - for (int i = 0; i < m; ++i) { - scale[i] = t1 + rand() % t2; - } - for (int i = 0; i < m; ++i) { - bias[i] = t1 + rand() % t2; - } - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - float r = 0; - for (int p = 0; p < k; p++) { - r += a(i, p) * b(p, j); - } - r *= scale[i]; - r += bias[i]; - if (relu && (r < 0)) { - r = 0; - } - c1(i, j) = r; - } - } - - paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, - b, ldb, 0.3, c, ldc, relu, scale, bias); - int eq = 0; - int neq = 0; - for (int i = 0; i < m * n; ++i) { - if (static_cast(c[i]) == static_cast(c1[i])) { - ++eq; - } else { - ++neq; - } - } - - if (pr > 0) { - std::cout << "A:" << std::endl; - print_matirx(m, k, lda, a); - - std::cout << "B:" << std::endl; - print_matirx(k, n, ldb, b); - - std::cout << "C:" << std::endl; - print_matirx(m, n, ldc, c); - - std::cout << "C1:" << std::endl; - print_matirx(m, n, ldc, c1); - } - - std::cout << "mnk=" << m << " " << n << " " << k << - " relu=" << relu << - " eq=" << eq << " neq=" << neq << std::endl; - - paddle_mobile::memory::Free(a); - paddle_mobile::memory::Free(b); - paddle_mobile::memory::Free(c); - paddle_mobile::memory::Free(c1); - paddle_mobile::memory::Free(scale); - paddle_mobile::memory::Free(bias); - - return 0; -} - -int main() { - - do_sgemm(9, 9, 9, true, 10, 10, 10); - do_sgemm(10, 6, 12, false, 10, 10, 0); - do_sgemm(512, 256, 384, false, 10, 10, 0); - do_sgemm(1366, 768, 256, false, 10, 10, 0); - do_sgemm(1255, 755, 333, false, 10, 10, 0); - do_sgemm(555, 777, 999, false, 10, 10, 0); - - do_sgemm(10, 6, 12, true, -4, 10, 0); - do_sgemm(512, 256, 384, true, -4, 10, 0); - do_sgemm(1366, 768, 256, true, -4, 10, 0); - do_sgemm(1255, 755, 333, true, -4, 10, 0); - do_sgemm(555, 777, 999, true, -4, 10, 0); - return 0; -} diff --git a/tools/build.sh b/tools/build.sh index ced18a180762826ffa2c45949e5aab9bfe5c8f88..bf3545ef162c86c16c0877f5f25f3a1e09de1fd4 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -40,8 +40,8 @@ build_for_android() { fi if [ -z "$PLATFORM" ]; then -# PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. - PLATFORM="arm-v8a" + PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. +# PLATFORM="arm-v8a" fi if [ "${PLATFORM}" = "arm-v7a" ]; then diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp index 88eef48b39ab8d2aeb1d4e3858ba97ef6360c9a9..282b22073fc96ddb2ed0d421f113604aadcc4afc 100644 --- a/tools/quantification/convert.cpp +++ b/tools/quantification/convert.cpp @@ -3,8 +3,8 @@ #include "src/enforce.h" #include "src/var_desc.h" #include "src/program_desc.h" +#include #include -#include #include #include #include @@ -13,7 +13,7 @@ #include "src/protobuf-c.h" #include #include - +#include const size_t kSize64 = sizeof(uint64_t); const size_t kSize32 = sizeof(uint32_t); @@ -68,60 +68,60 @@ std::shared_ptr loadParams(const std::string &model_path) { } -void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) { +void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) { // 1. version - uint32_t version = *reinterpret_cast(dataP); + uint32_t version = *reinterpret_cast(*dataP); // write version fwrite(&version, kSize32, 1, out_file); - dataP += kSize32; + *dataP += kSize32; // 2 Lod information auto *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, dataP, kSize64); + memcpy(lod_level_ptr, *dataP, kSize64); uint64_t lod_level = 0; // write lod Information fwrite(&lod_level, kSize64, 1, out_file); delete lod_level_ptr; - dataP += kSize64; + *dataP += kSize64; for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *reinterpret_cast(dataP); + uint64_t size = *reinterpret_cast(*dataP); // write lod size fwrite(&size, kSize64, 1, out_file); - (dataP) += kSize64; + (*dataP) += kSize64; std::vector tmp(size / sizeof(size_t)); for (unsigned long &k : tmp) { - k = *reinterpret_cast(dataP); - (dataP) += sizeof(size_t); + k = *reinterpret_cast(*dataP); + (*dataP) += sizeof(size_t); } // write lod size vector fwrite(&tmp, sizeof(size_t), tmp.size(), out_file); } // 3. tensor version - uint32_t tensor_version = *reinterpret_cast(dataP); + uint32_t tensor_version = *reinterpret_cast(*dataP); // write tensor version fwrite(&tensor_version, kSize32, 1, out_file); - (dataP) += kSize32; + (*dataP) += kSize32; // 4. tensor desc - int32_t size = *reinterpret_cast(dataP); + int32_t size = *reinterpret_cast(*dataP); // write tensor desc fwrite(&size, sizeof(int32_t), 1, out_file); - (dataP) += sizeof(int32_t); + (*dataP) += sizeof(int32_t); std::unique_ptr buf(new char[size]); for (int m = 0; m < size; ++m) { - buf.get()[m] = (dataP)[m]; + buf.get()[m] = (*dataP)[m]; } fwrite(buf.get(), sizeof(char), static_cast(size), out_file); - (dataP) += (sizeof(char) * size); + (*dataP) += (sizeof(char) * size); const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc(); int memory_size = 1; @@ -158,9 +158,9 @@ void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP memory = new char[tensorSize]; for (int n = 0; n < tensorSize; ++n) { - static_cast(memory)[n] = (dataP)[n]; + static_cast(memory)[n] = (*dataP)[n]; } - dataP += tensorSize; + *dataP += tensorSize; // for float 32 float min_value = std::numeric_limits::max(); @@ -194,7 +194,7 @@ quantificate_combined(const std::string &model_path, const std::string ¶m_pa if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadWithDump(*var_desc, data, out_file); + LoadWithDump(*var_desc, &data, out_file); } } } @@ -220,7 +220,7 @@ void quantificate_seperated(const std::string model_dir, const std::string param FILE *out_file = fopen(file_name.c_str(), "wb"); char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name()); char *data = origin_data; - LoadWithDump(*var_desc, data, out_file); + LoadWithDump(*var_desc, &data, out_file); delete origin_data; fclose(out_file); } diff --git a/tools/quantification/src/block_desc_local.h b/tools/quantification/src/block_desc_local.h index 41c2dc0abbdf8bb006f4152674e92dd1f7d01500..2ee8132af7f21ed0e62678c8da510bfd7fba9dbd 100644 --- a/tools/quantification/src/block_desc_local.h +++ b/tools/quantification/src/block_desc_local.h @@ -19,6 +19,7 @@ limitations under the License. */ #ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_ #define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_ +#include #include #include "src/var_desc.h"