diff --git a/README.md b/README.md
index de8fe0bb7f4613bd9d6dfebd82db1d407ee682f4..fb4daf3bde4658223cff6e2ebdad55d78412f339 100644
--- a/README.md
+++ b/README.md
@@ -26,22 +26,6 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
 
 - **ARM CPU**
 
-|mobilenet arm v7|1 thread|2 threads|4 threads|
-|------------|----|-----|-----|
-|Kirin 960 (ms)|110.586|63.285|38.215|
-|||||
-|mobilenetssd arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|220.248|128.473|79.334|
-|||||
-|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|341.965|228.724|161.531|
-|||||
-|squeezenet arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|84.080|55.641|37.182|
-|||||
-|yolo arm v7|1 thread|2 threads|4 threads|
-|Kirin 960 (ms)|129.445|80.627|50.936|
-
 The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a large amount of hand-written CPU assembly, and we are coding at full speed to squeeze every bit of acceleration out of the hardware.
 
 ARM CPU optimization is still in progress; only conventional CPU optimizations are applied so far. On an ARM A73, paddle-mobile arm-v7 currently runs one MobileNet 1.0 inference in 110+ ms on a single core. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom for performance. Only armv7 is supported at present; armv8 support will follow.
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..01c610ce5b445bc603da3c0dc43ad21c35d95ae6
--- /dev/null
+++ b/src/fpga/api.cpp
@@ -0,0 +1,168 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+#include <iostream>
+
+#include "api.h"
+
+#define FPGA_TEST_MODE
+#ifdef FPGA_TEST_MODE
+#include "common/log.h"
+#endif
+
+namespace paddle_mobile {
+namespace fpga {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  return ioctl(fd, req, reinterpret_cast<uint64_t>(arg));
+#else
+  return -1;
+#endif
+}
+
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// memory management;
+void *fpga_malloc(size_t size) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#else
+  return malloc(size);
+#endif
+}
+
+void fpga_free(void *ptr) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, 0);
+#else
+  free(ptr);
+#endif
+}
+
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+int ComputeFpgaConv(const struct ConvArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " relu_enabled:" << args.relu_enabled
+       << " sb_address:" << args.sb_address
+       << " filter_address:" << args.filter_address
+       << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num;
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+}
+
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
+}
+
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0
+       << " const1:" << args.const1;
+  DLOG << " image0_address:" << args.image0.address
+       << " image0_scale_address:" << args.image0.scale_address
+       << " image0_channels:" << args.image0.channels
+       << " image0_height:" << args.image0.height
+       << " image0_width:" << args.image0.width
+       << " pad0_height:" << args.image0.pad_height
+       << " pad0_width:" << args.image0.pad_width;
+  DLOG << " image1_address:" << args.image1.address
+       << " image1_scale_address:" << args.image1.scale_address
+       << " image1_channels:" << args.image1.channels
+       << " image1_height:" << args.image1.height
+       << " image1_width:" << args.image1.width
+       << " pad1_height:" << args.image1.pad_height
+       << " pad1_width:" << args.image1.pad_width;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_EW, &args);
+}
+
+int PerformBypass(const struct BypassArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << " layout_type:" << args.layout_type
+       << " convert_type:" << args.convert_type;
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
+#endif
+
+  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api.h
similarity index 100%
rename from src/fpga/api/fpga_api.h
rename to src/fpga/api.h
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
deleted file mode 100644
index 779c846d1f3c465e5113f805b2b3856a1a7894c5..0000000000000000000000000000000000000000
--- a/src/fpga/api/fpga_api.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
-#include <iostream>
-
-#include "fpga/api/fpga_api.h"
-
-namespace paddle_mobile {
-namespace fpga {
-
-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-
-static inline int do_ioctl(int req, const void *arg) {
-  return ioctl(req, (unsigned int64_t)arg);
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  return reinterpret_cast<void *>(
-      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
-}
-
-void fpga_free(void *ptr) { munmap(ptr, 0); }
-
-void fpga_copy(void *dest, const void *src, size_t num) {
-  memcpy(dest, src, num);
-}
-
-int ComputeFpgaConv(const struct ConvArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
-}
-int ComputeFpgaPool(const struct PoolingArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
-}
-int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_EW, &args);
-}
-int PerformBypass(const struct BypassArgs &args) {
-  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
-}
-
-}  // namespace fpga
-}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
deleted file mode 100644
index 8b351f1a81e0a92f0e2f12a3f61dd2a7d3948c85..0000000000000000000000000000000000000000
--- a/src/fpga/fpga_quantilization.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "fpga/fpga_quantilization.h"
-#include <algorithm>
-
-namespace paddle_mobile {
-namespace fpga {
-
-template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width) {
-  int offset_height = 0;
-
-  for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
-    for (int c = 0; c < channel; c++) {
-      for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
-        for (int w = 0; w < width; w++) {
-          *(data_out + offset_height + w * channel + c) = *(data_in++);
-        }
-      }
-    }
-    data_out += num;
-  }
-}
-
-template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
-  Dtype max = 0;
-  for (int i = 0; i < num; ++i) {
-    max = std::max(max, data[i]);
-  }
-  return max;
-}
-
-// template <typename Dtype>
-void quantify_filter(framework::Tensor* filter) {
-  DLOG << "quantilize_filter........";
-
-  float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
-
-  const int batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
-  const int height = filter->dims()[2];
-  const int width = filter->dims()[3];
-
-  int8_t* int_data = nullptr;
-  int8_t* tmp_data = new int8_t[filter->numel()];
-
-  // 32bit filter -> 8bit filter;
-  if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
-    float max = find_max(float_data, filter->numel());
-
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
-    }
-    int_data = filter->mutable_data<int8_t>();
-  } else {
-    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
-  }
-  // NCHW -> NHWC;
-  chw_to_hwc(tmp_data, int_data, batch_size, channel, height, width);
-  delete tmp_data;
-  *(filter->fpga_args().scale_pointer()) = scale;
-}
-
-}  // namespace fpga
-}  // namespace paddle_mobile
diff --git a/src/fpga/quantization.cpp b/src/fpga/quantization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44994d4c353490b533110d0965fb63b4fb5c7aa2
--- /dev/null
+++ b/src/fpga/quantization.cpp
@@ -0,0 +1,92 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/quantization.h"
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace paddle_mobile {
+namespace fpga {
+
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width) {
+  for (int n = 0; n < num; n++) {
+    int64_t amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_out + offset_height + w * channel + c) = *(data_in++);
+        }
+      }
+    }
+    data_out += num;
+  }
+}
+
+template <typename Dtype>
+static Dtype find_max(Dtype* data, int64_t num) {
+  Dtype max = 0;
+  for (int i = 0; i < num; ++i) {
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
+  }
+  return max;
+}
+
+// template <typename Dtype>
+void quantize_filter(framework::Tensor* filter) {
+  DLOG << "quantize_filter........" << filter->dims();
+
+  float scale = 0;
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
+
+  auto* tmp_data = new int8_t[filter->numel()];
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    auto* float_data = filter->data<float>();
+    auto max = find_max(float_data, filter->numel());
+
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;
+
+    for (int i = 0; i < filter->numel(); ++i) {
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
+    }
+  } else {
+    auto max = find_max(filter->data<int8_t>(), filter->numel());
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
+  }
+
+  if (filter->dims().size() == 4) {
+    const auto batch_size = filter->dims()[0];
+    const auto channel = filter->dims()[1];
+    const auto height = filter->dims()[2];
+    const auto width = filter->dims()[3];
+    chw_to_hwc(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+               channel, height, width);
+  } else if (filter->dims().size() == 2) {
+    std::memcpy(filter->mutable_data<int8_t>(), tmp_data,
+                (size_t)filter->numel());
+  }
+
+  delete[] tmp_data;
+  filter->SetFpgaScale(scale);
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/quantization.h
similarity index 80%
rename from src/fpga/fpga_quantilization.h
rename to src/fpga/quantization.h
index 4f1f6ad402a3ff4df773ecbd2121820f4c7dc265..0d6c2405fccd814f73d44eef20b6735dc0ad0eab 100644
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/quantization.h
@@ -21,11 +21,10 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width);
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width);
 
-// template <typename Dtype>
-void quantify_filter(framework::Tensor* filter);
+void quantize_filter(framework::Tensor* filter);
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 797fcf5bffbe5e738fe352d1ca84602f0e5d86a0..6fc16a01a2874f04ecea3edb89774f4deea93dd5 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t,
+                    int8_t>
       functor;
   size_t size = functor(type);
@@ -115,8 +116,8 @@ class Tensor {
     PADDLE_MOBILE_ENFORCE(
        (std::is_same<T, void>::value ||
         holder_->type().hash_code() == typeid(T).hash_code()),
-        "Tensor holds the wrong type, it holds %s",
-        this->holder_->type().name());
+        "Tensor holds the wrong type, it holds %s, requested %s",
+        this->holder_->type().name(), typeid(T).name());
 
     return reinterpret_cast<T *>(
         reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
@@ -255,14 +256,26 @@ class Tensor {
 
 #ifdef PADDLE_MOBILE_FPGA
   struct FPGAArgs {
-    float scale;
+    friend class Tensor;
+
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
 
-    inline float *scale_pointer() { return &scale; }
+   private:
+    float *scale_;
   };
 
   struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
   }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
 #endif
 
  private:
@@ -331,10 +344,6 @@ class Tensor {
    * begins.
    */
   size_t offset_;
-
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
@@ -342,9 +351,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
   printer << " dims: " << tensor.dims() << "\n";
   int stride = tensor.numel() / 20;
   stride = stride > 0 ? stride : 1;
+#ifndef PADDLE_MOBILE_FPGA
   for (int i = 0; i < tensor.numel(); i += stride) {
     printer << tensor.data<float>()[i] << " ";
   }
+#endif
+
   return printer;
 }
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 8902543347b2db7caee7126b2a28fa460ca741db..2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #ifdef PADDLE_MOBILE_FPGA
 
-#include "fpga/api/fpga_api.h"
+#include "fpga/api.h"
 
 #endif
 
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
-#ifdef PADDLE_MOBILE_FPGA__VV
+#ifdef PADDLE_MOBILE_FPGA
 namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h
index 286b0076ef2b9ad806f141c4d6124f1233dc78dc..5969e679552345d25c8c9c7a4950eb3b6d72eca2 100644
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -38,10 +38,15 @@ class FeedOp : public framework::OperatorBase {
   }
 
 #ifdef PADDLE_MOBILE_FPGA
-  void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); }
+  void Init() {
+    Tensor *output = param_.Out();
+    output->mutable_data<half>();
+  }
+
+  void RunImpl() const {
     const Tensor *input = param_.InputX();
-    auto input_ptr = (const_cast<Tensor *>(input))->mutable_data<float>();
+    auto input_ptr = input->data<float>();
     Tensor *output = param_.Out();
     auto output_ptr = output->mutable_data<half>();
     fpga::BypassArgs args;
     args.image.height = input->dims()[2];
     args.image.width = input->dims()[3];
     args.output.address = output_ptr;
-    param_.SetFpgaArgs(args);
+    fpga::PerformBypass(args);
   }
 
 #else
-  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
   void Init() {}
+  void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
 #endif
 
  protected:
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 93bbfe9c1a8ae3d9930c759ba0efcef04e5e572f..152b200cfa88d010bb4c8e8022c01ee3663cc179 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDBN_OP
 
 #include "operators/kernel/conv_add_bn_kernel.h"
-#include "fpga/api/fpga_api.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -37,11 +37,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] &&
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
                             bias->dims()[0] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = reinterpret_cast<float *>(
       fpga::fpga_malloc(2 * channel * sizeof(float)));
   Tensor *new_scale = new Tensor();
@@ -60,8 +60,8 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index d5e79a39b79494d543e6e9485497a540a15152aa..caa1e94c6bb9b583efb15e181d46c80f0b66c7ff 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #include "operators/kernel/conv_add_bn_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -35,11 +35,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] &&
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
                             bias->dims()[0] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   Tensor *new_scale = new Tensor();
   Tensor *new_bias = new Tensor();
@@ -56,8 +56,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 3b44506f65cc6700323c3d5f7d0765c9e52f7e0a..33e55773ad0be4f174916f0e5f066b6eeec1d46e 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/kernel/conv_add_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -31,17 +31,17 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
   Tensor *out = param->Output();
   auto out_ptr = out->mutable_data<half>();
 
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0],
-                        "Image channel should be equal to bias number");
-  int channel = input->dims()[1];
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
+                        "Output channel should be equal to bias number");
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = bias_ptr[i];
   }
 
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index fd95f47a1fcb8c444172909abc67ad7f5e0de632..3ad65a254f95bde431efbd3c5995df6cc2295d3d 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 
 #ifdef FUSION_CONVBN_OP
 
 #include "operators/kernel/conv_bn_kernel.h"
-#include "fpga/api/fpga_api.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -35,10 +35,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = reinterpret_cast<float *>(
       fpga::fpga_malloc(2 * channel * sizeof(float)));
   Tensor *new_scale = new Tensor();
@@ -55,8 +55,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index fbb3ca512ea863c49ca4da3f9a133f8c91897b53..18ef4b4e15e488f01a435d89218992e63873bb14 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #ifdef FUSION_CONVBNRELU_OP
 
 #include "operators/kernel/conv_bn_relu_kernel.h"
-#include "fpga/fpga_quantilization.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -33,10 +33,10 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   auto bn_scale_ptr = param->InputScale()->data<float>();
   auto bn_bias_ptr = param->InputBias()->data<float>();
   const float epsilon = param->Epsilon();
-  PADDLE_MOBILE_ENFORCE(input->dims()[1] == param->InputBias()->dims()[0],
-                        "Image channel should be equal to bias number");
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
 
-  const int channel = input->dims()[1];
+  const int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   Tensor *new_scale = new Tensor();
   Tensor *new_bias = new Tensor();
@@ -52,8 +52,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   }
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
-  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  fpga::quantize_filter(filter);
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp
index 21e334b12b70be1980d9417ed11161143106d1c6..fb6a3e7508bf11f0bba1c3e34c065fa63caa2100 100644
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef FUSION_FCRELU_OP
 #include "operators/kernel/fc_relu_kernel.h"
-#include "fpga/api/fpga_api.h"
+
+#include "fpga/api.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -23,8 +25,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
   bool relu_enabled = true;
   const Tensor *input_x = param->InputX();
   auto input_x_ptr = input_x->data<half>();
-  const Tensor *input_y = param->InputY();
-  auto input_y_ptr = input_y->data<float>();
+  Tensor *input_y = param->InputY();
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   Tensor *out = param->Out();
@@ -32,13 +33,16 @@
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = input_x->dims()[1];
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = input_z_ptr[i];
   }
 
+  fpga::quantize_filter(input_y);
+  auto input_y_ptr = input_y->data<int8_t>();
+
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
   convArgs.filter_address = (void *)input_y_ptr;
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
index 505b8768565dc4003152c3493b558448f9d73d04..5479deb6c19cf085dcea03555e4895d4ad98c4e8 100644
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #ifdef FUSION_FC_OP
 
 #include "operators/kernel/fusion_fc_kernel.h"
+#include "fpga/quantization.h"
 
 namespace paddle_mobile {
 namespace operators {
 
@@ -23,8 +24,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
   bool relu_enabled = false;
   const Tensor *input_x = param->InputX();
   auto input_x_ptr = input_x->data<half>();
-  const Tensor *input_y = param->InputY();
-  auto input_y_ptr = input_y->data<float>();
+  Tensor *input_y = param->InputY();
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
   Tensor *out = param->Out();
@@ -32,13 +32,16 @@
   PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = input_x->dims()[1];
+  int channel = out->dims()[1];
   float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i * 2] = 1;
     bs_ptr[i * 2 + 1] = input_z_ptr[i];
   }
 
+  fpga::quantize_filter(input_y);
+  auto input_y_ptr = input_y->data<int8_t>();
+
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
   convArgs.filter_address = (void *)input_y_ptr;
@@ -55,11 +58,9 @@
   convArgs.image.width = input_x->dims()[3];
   convArgs.image.pad_height = 0;
   convArgs.image.pad_width = 0;
-  convArgs.image.scale_address =
-      input_x->fpga_args().scale_pointer();  // fc input has scale attribute??
+  convArgs.image.scale_address = input_x->fpga_args().scale_pointer();
   convArgs.output.address = (void *)out_ptr;
-  convArgs.output.scale_address =
-      out->fpga_args().scale_pointer();  // fc output has scale attribute??
+  convArgs.output.scale_address = out->fpga_args().scale_pointer();
   param->SetFpgaArgs(convArgs);
   return true;
 }
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d58ab0f751eeb584f286a0920d08e9473be38402
--- /dev/null
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = (void *)(input_ptr);
+    args.image.height = input->dims()[0];
+    args.image.width = input->dims()[1];
+    args.image.channels = 1;
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+
+  return true;
+}
+
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute(param);
+}
+
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index f61599ab51e6a06c26dd188d5a1b33aa8b1df200..06da537e419f3a54ffc9986b12274f9853f12774 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "framework/tensor.h"
 #include "framework/variable.h"
 #ifdef PADDLE_MOBILE_FPGA
-#include "fpga/api/fpga_api.h"
+#include "fpga/api.h"
 #endif
 
 namespace paddle_mobile {
@@ -585,6 +585,21 @@ class SoftmaxParam : public OpParam {
  private:
   Tensor *input_x_;
   Tensor *out_;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif
 
@@ -670,16 +685,6 @@ class FeedParam : public OpParam {
   Tensor *input_x_;
   Tensor *out_;
   int batch_size;
-
-#ifdef PADDLE_MOBILE_FPGA
-
- private:
-  fpga::BypassArgs fpga_bypass_args;
-
- public:
-  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
-  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
-#endif
 };
 
 class FetchParam : public OpParam {
@@ -1143,7 +1148,6 @@ class FusionConvBNParam : public OpParam {
   FusionConvBNParam(const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     const Scope &scope) {
-    axis_ = GetAttr<int>("axis", attrs);
     filter_ = FilterFrom<LoDTensor>(inputs, scope);
     input_ = InputFrom<LoDTensor>(inputs, scope);
     output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
@@ -1160,8 +1164,6 @@
     // is_test_ = GetAttr("is_test", attrs);
   }
 
-  const int &Axis() const { return axis_; }
-
   const Tensor *Input() const { return input_; }
 
 #ifdef PADDLE_MOBILE_FPGA
@@ -1202,7 +1204,6 @@ class FusionConvBNParam : public OpParam {
   const Tensor *NewBias() const { return new_bias_; }
 
  protected:
-  int axis_;
   Tensor *input_;
   Tensor *output_y_;
   Tensor *filter_;
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index c9edfccf4ff08e5a12d735526c3d63c689711357..e85edc69c3291c794f2eeb8119b91b2926c4d870 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index f645d7edf7a3b9f7a92cf286feec58e960a5e3b7..bacae23b522daf1cc689a2d7af6b14cd2bc794bb 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif
 
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 632f1f55c24c524ee56a15e91940517fc44af06c..8f92b6dab9e5c2c51c485f61fa2860926ce50b1f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -27,10 +27,14 @@ elseif("resnet" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
 elseif("FPGAnets" IN_LIST NET)
-    # ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
-    # target_link_libraries(test-resnet paddle-mobile)
+    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet paddle-mobile)
+
     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-tensor-quant paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-fpga-concat-op paddle-mobile)
 elseif("mobilenetssd" IN_LIST NET)
     # gen test
     ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a5828b36b3d9ed371a271af6db82657ff1596
--- /dev/null
+++ b/test/fpga/test_concat_op.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/concat_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::FPGA,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
+  int input_n = 1;
+  int input_c = 2;
+  int input_h = 0;
+  int input_w = 1;
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
+  /// inputx1 (4,10,2,2),
+  /// inputx2 (4,20,2,2),
+  /// inputx3 (4,30,2,2),
+  /// inputx4 (4,40,2,2),
+  /// axis = 1
+  /// output (4,100,2,2)
+  int input_index =
+      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
+  int output_index = input_n * 100 * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
+                     input_h * 2 + input_w;
+
+  DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
+  return 0;
+}
diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp
index 73ac88ef77b0c02545ef55b6493d4681c61c192d..82fdc22763d11d4b06439465d56d0e6fa663a317 100644
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -17,7 +17,13 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
+#ifdef PADDLE_MOBILE_FPGA
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+#endif
+
+#ifdef PADDLE_MOBILE_CPU
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+#endif
   paddle_mobile.SetThreadNum(4);
   auto time1 = time();
   if (paddle_mobile.Load(g_resnet, true)) {
diff --git a/tools/op.cmake b/tools/op.cmake
index 8f5ffb52aeae29c76d0d456a1392b5411cb5d04a..fd2b103842a3017fa5c93d39602a4c2bee47d94e 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -86,6 +86,8 @@ if ("resnet" IN_LIST NET)
     set(RELU_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(POOL_OP ON)
+    set(BATCHNORM_OP ON)
+    set(MUL_OP ON)
     set(RESHAPE_OP ON)
     set(SOFTMAX_OP ON)