diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 9d33c742e3a4bc76d1f2766a8b5476579ace2789..10787b915594a12a826a087e5453b2c2e8c03f9a 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -29,9 +29,7 @@ namespace fpga { static int fd = -1; static const char *device_path = "/dev/fpgadrv0"; -#ifdef PADDLE_MOBILE_OS_LINUX static std::map memory_map; -#endif static inline int do_ioctl(int req, const void *arg) { #ifdef PADDLE_MOBILE_OS_LINUX @@ -53,32 +51,38 @@ int open_device() { // memory management; void *fpga_malloc(size_t size) { static uint64_t counter = 0; - counter += size; - DLOG << size << " bytes allocated. Total " << counter << " bytes"; + #ifdef PADDLE_MOBILE_OS_LINUX auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - memory_map.insert(std::make_pair(ptr, size)); - return ptr; #else - return malloc(size); + auto ptr = malloc(size); #endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + << counter << " bytes"; + return ptr; } void fpga_free(void *ptr) { -#ifdef PADDLE_MOBILE_OS_LINUX static uint64_t counter = 0; size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator if (iter != memory_map.end()) { size = iter->second; - munmap(ptr, size); memory_map.erase(iter); - } - counter += size; - DLOG << size << " bytes freed. Total " << counter << " bytes"; +#ifdef PADDLE_MOBILE_OS_LINUX + munmap(ptr, size); #else - free(ptr); + free(ptr); #endif + counter += size; + DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " + << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } } void fpga_copy(void *dest, const void *src, size_t num) { @@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) { int ComputeFPGAConcat(const struct ConcatArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " out_address:" << args.image_out + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out << " out_scale_address:" << args.scale_out; DLOG << " image_height:" << args.height << " image_width:" << args.width; for (int i = 0; i < args.image_num; i++) { @@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) { auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->data(); size_t memory_size = channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); image::format_image(&new_data, channel, height, width); image_tensor->reset_data_ptr(new_data); @@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, auto out_ptr = out->data(); arg->group_num = (uint32_t)group_num; - arg->split_num = (uint32_t)fpga::get_plit_num(filter); + // Either group_num or split_num = 1; + arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1; arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num * - sizeof(fpga::ConvArgs)); + arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; @@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.width = (uint32_t)filter->dims()[3]; int n = arg->split_num; - arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *)); - arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *)); - arg->concat_arg.channel_num = - (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t)); + arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *)); + arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *)); + arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); arg->concat_arg.image_out = out_ptr; auto channel = (int)out->dims()[1]; - int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num); - int element_num = fpga::get_aligned_filter_element_num( + int filter_num_per_div = get_filter_num_per_div(filter, group_num); + int element_num = get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); for (int i = 0; i < n; i++) { @@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_args[i].filter_num = - (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( - channel - (n - 1) * filter_num_per_div) + (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div : filter_num_per_div); if (n > 1) { arg->conv_args[i].output.scale_address = - (float *)fpga::fpga_malloc(2 * sizeof(float)); - arg->conv_args[i].output.address = - fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * - arg->conv_args[i].filter_num * sizeof(half)); + (float *)fpga_malloc(2 * sizeof(float)); + arg->conv_args[i].output.address = fpga_malloc( + input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); } else { @@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, } arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; - arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address; + arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; } } diff --git a/src/io/executor.cpp b/src/io/executor.cpp index c12f1ce02c8ab32d04d00d76cad5dc7d6ce45bc2..33a6ff359515b0cb6f8e9c2dd2c10af6001490e5 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -79,7 +79,7 @@ Executor::Executor(const framework::Program p, int batch_size, std::vector> ops = block_desc->Ops(); for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op = ops[j]; - DLOG << "create op: " << op->Type(); + DLOG << "create op: " << j << " " << op->Type(); auto op_base = framework::OpRegistry::CreateOp( op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), program_.scope); @@ -103,7 +103,9 @@ Executor::Executor(const framework::Program p, int batch_size, std::shared_ptr to_predict_block = to_predict_program_->Block(0); auto &ops = ops_of_block_[*to_predict_block.get()]; + int i = 0; for (const auto &op : ops) { + DLOG << "Init op: " << i++ << " " << op->Type(); op->Init(); } } @@ -695,6 +697,7 @@ void Executor::Predict_From_To(int start, int end) { clock_gettime(CLOCK_MONOTONIC, &ts); profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif + DLOG << "Running op: " << i << " " << ops[i]->Type(); ops[i]->Run(); #ifdef PADDLE_MOBILE_PROFILE diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 2cc7fda7f8a6bb6f6856a937b9e14ab9792224e1..cccd4f52ebdc368e4f68eaf9dc3f25ee3693fdd2 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase { auto input_ptr = input->data(); fpga::format_image(input); Tensor *output = param_.Out(); - auto output_ptr = output->data(); + auto output_ptr = output->data(); fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index 79f1453fc8e77e35b52a5617064c164d93aa9207..7cfd0c7d76c1a8e73955dbec1971d86ceebde259 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -26,7 +26,8 @@ template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); auto input_ptr = input->data(); - auto float_input = new Tensor(*input); + auto float_input = new Tensor; + float_input->mutable_data(input->dims()); fpga::format_fp32_ofm(float_input); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; diff --git a/src/operators/op_param.h b/src/operators/op_param.h index bb2355d80fc581ff4e4501030632628b33394d6d..5b53743b75bfe65a9e029e44114b339603388c08 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -341,7 +341,6 @@ class OpParam { } }; -#ifdef CONV_OP template class ConvParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -386,7 +385,6 @@ class ConvParam : public OpParam { }; template Print &operator<<(Print &printer, const ConvParam &conv_param); -#endif template class ElementwiseAddParam : OpParam { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ef03205ae595ade0765377b1dcc0178471a6553e..a19df61fd1e692108d760d2d0a6914f6e1e4033f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,6 +30,27 @@ elseif("FPGAnets" IN_LIST NET) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) + ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet50 paddle-mobile) + + ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-EW paddle-mobile) + + ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-conv paddle-mobile) + + ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-pooling paddle-mobile) + + ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-bypass paddle-mobile) + + ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-softmax paddle-mobile) + + ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-concat paddle-mobile) + ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-tensor-quant paddle-mobile) @@ -66,6 +87,9 @@ else () ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) + ADD_EXECUTABLE(test-resnet50 net/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet50 paddle-mobile) + # gen test ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) @@ -235,13 +259,4 @@ else () #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) - - - - endif() - -# if(FPGA) -# ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) -# target_link_libraries(test-tensor-quant paddle-mobile) -# endif() diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cca6793f10da5a0784cf8a3ba2d0104f3508028d --- /dev/null +++ b/test/fpga/test_resnet50.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +static const char *g_resnet_combine = "../models/resnet50"; + +int main() { + DLOG << paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", + std::string(g_resnet_combine) + "/params", true)) { + std::vector dims{1, 3, 224, 224}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + // paddle_mobile.Predict_From(73); + // paddle_mobile.Predict_From_To(72, 73); + + DLOG << "Computation done"; + return 0; + } +}