diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 9607961c4785f631afb4b5e207ebff2c8e33623e..b8f131634e9eb4c56218db8f0643f10834089393 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -30,9 +30,10 @@ void format_image(framework::Tensor *image_tensor) { auto data_ptr = image_tensor->data(); auto external_ptr = reinterpret_cast(image_tensor->external_data); float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; - float *old_p = p_data; + image::format_image(&p_data, channel, height, width); - if (old_p != p_data) { + + if (p_data != data_ptr && external_ptr == nullptr) { image_tensor->reset_data_ptr(p_data); } } @@ -48,9 +49,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3]; - memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; + memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * + sizeof(half); } else if (dims.size() == 2) { memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); } else { @@ -713,7 +714,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { - // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; arg->split_conv_args[i] @@ -775,19 +775,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size); - /*{ - static int cnt = 0; - std::string str = "deconv_filter"; - if(cnt <= 1){ - cnt++; - str += std::to_string(cnt); - int8_t result = 0; - fpga::savefile(str, - arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result); - } - - }*/ - size_t bs_align_num = align_to_x( arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); size_t bs_size = 2 * bs_align_num * sizeof(float); @@ -803,20 +790,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); - /* { - static int cnt = 0; - std::string str = "deconv_sb"; - if(cnt <= 1){ - cnt++; - str += std::to_string(cnt); - float result = 0; - fpga::savefile(str, - arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num, - result); - } - - }*/ - if (split_num == 1) { arg->split_conv_args[i]->conv_arg[j].output.address = arg->split_conv_args[i]->output.address; @@ -863,10 +836,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { + auto deleter = [](void *p) { fpga_free(p); }; + arg->vector_dwconv_space.push_back( + std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); + auto filter_ptr = filter->data(); auto input_ptr = input->data(); - auto output_ptr = out->data(); - + auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index a52521b8470886c3ee2d3c4979d513a6e8b5aa93..de905f39a244d955011c4e879bd080a53ed66d01 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -49,6 +49,9 @@ void FeedKernel::Compute(const FeedParam ¶m) { fpga::format_image(input); auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + float *p_data = external_ptr == nullptr ? input_ptr : external_ptr; + auto output_ptr = output->data(); fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; @@ -57,7 +60,7 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP16; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_ptr; + args.image.address = p_data; args.image.channels = (uint32_t)input->dims()[1]; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index b575d952371c5352d2d23d465b08d7749b82d140..ad3bcfbaa0ec96545007459ceda20bc13c7efe4b 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -56,8 +56,9 @@ void FetchKernel::Compute(const FetchParam ¶m) { return; } fpga::BypassArgs args = param.fpga_bypass_args; - auto data = (input->mutable_data()); - args.image.address = static_cast(data); + auto input_address = (input->data()); + args.image.address = static_cast(input_address); + fpga::PerformBypass(args); fpga::fpga_invalidate(param.fpga_bypass_args.output.address, param.fpga_bypass_args.image.channels * sizeof(float)); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fdd7c46fedc98b3f1811cd10ffe6bcec7d0e3a46..3af55f075805361fd0cff40ab2e53752ea63f781 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -77,6 +77,10 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-rfcn paddle-mobile) + ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-marker paddle-mobile) + + set(FOUND_MATCH ON) endif () diff --git a/test/fpga/test_marker.cpp b/test/fpga/test_marker.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5abbcd3629c5c084f1258f5140f3190d99bf2344 --- /dev/null +++ b/test/fpga/test_marker.cpp @@ -0,0 +1,167 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif +#include +#include + + +void readStream(std::string filename, char *buf) { + std::ifstream in; + in.open(filename, std::ios::in|std::ios::binary); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width,int num, + int16_t *data_tmp) { + int64_t amount_per_side = width * height; +for(int n = 0; n < num; n++){ + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n* amount_per_side*channel + c * amount_per_side + width * h + w) = *((*data_in)++); + } + } + } + } +} + + + + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if(input_tensor.dims().size()!=4)return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw){ + data_tmp = + reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w,n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if(data_tmp!=data_ptr_16){free(data_tmp);} +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + + +void dump_stride(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + static int i=0; + if (input_tensor.numel() == 0) { + return; + } + if(input_tensor.type() == typeid(float)){ + DLOG << "op: " < paddle_mobile; + + //if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + // std::string(g_rfcn_combine) + "/params", true, false, + // 1, true)) { + if(paddle_mobile.Load(std::string(g_marker_combine),true)){ + float img_info[3] = {720, 1280, 800.0f / 960.0f}; + auto img = reinterpret_cast(fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, nullptr); + paddle_mobile.FeedData({ img}); + paddle_mobile.Predict_To(-1); + + for (int i = 47; i < 52; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "marker_" + std::to_string(i); + //if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + + dump_stride(saveName, (*tensor_ptr),tensor_ptr->numel(), true);//20);//tensor_ptr->numel()); + +/* float result = 0; + std::string str = "softmax_input_data"; + float* data = static_cast(fpga::fpga_malloc(tensor_ptr->numel() * sizeof(float))); + str = "softmax_output_data"; + auto output_ptr = static_cast((*tensor_ptr).get_data()); + for (int idx = 0; idx < tensor_ptr->numel(); ++idx) + { + data[idx] = fpga::fp16_2_fp32(output_ptr[idx]); + } + fpga::savefile(str,data, tensor_ptr->numel(), result ); */ + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +}