未验证 提交 9c96f3e6 编写于 作者: Z zhangyang0701 提交者: GitHub

Merge pull request #1005 from zhangyang0701/develop

Fix op kernel bugs for FPGA tracks close #1004
......@@ -29,9 +29,7 @@ namespace fpga {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
#ifdef PADDLE_MOBILE_OS_LINUX
static std::map<void *, size_t> memory_map;
#endif
static inline int do_ioctl(int req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
......@@ -53,32 +51,38 @@ int open_device() {
// memory management;
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
counter += size;
DLOG << size << " bytes allocated. Total " << counter << " bytes";
#ifdef PADDLE_MOBILE_OS_LINUX
auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
memory_map.insert(std::make_pair(ptr, size));
return ptr;
#else
return malloc(size);
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
<< counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
#ifdef PADDLE_MOBILE_OS_LINUX
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
munmap(ptr, size);
memory_map.erase(iter);
}
counter += size;
DLOG << size << " bytes freed. Total " << counter << " bytes";
#ifdef PADDLE_MOBILE_OS_LINUX
munmap(ptr, size);
#else
free(ptr);
free(ptr);
#endif
counter += size;
DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
<< counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
......@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " out_address:" << args.image_out
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
......@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
auto new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
......@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = (uint32_t)fpga::get_plit_num(filter);
// Either group_num or split_num = 1;
arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num *
sizeof(fpga::ConvArgs));
arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
......@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.width = (uint32_t)filter->dims()[3];
int n = arg->split_num;
arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
arg->concat_arg.channel_num =
(uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1];
int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
int element_num = fpga::get_aligned_filter_element_num(
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
for (int i = 0; i < n; i++) {
......@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * filter_num_per_div)
(uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
: filter_num_per_div);
if (n > 1) {
arg->conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.address =
fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
arg->conv_args[i].filter_num * sizeof(half));
(float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.address = fpga_malloc(
input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
}
else {
......@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
}
arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address;
arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
}
}
......
......@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << op->Type();
DLOG << "create op: " << j << " " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope);
......@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
int i = 0;
for (const auto &op : ops) {
DLOG << "Init op: " << i++ << " " << op->Type();
op->Init();
}
}
......@@ -695,6 +697,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
DLOG << "Running op: " << i << " " << ops[i]->Type();
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
......
......@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto input_ptr = input->data<float>();
fpga::format_image(input);
Tensor *output = param_.Out();
auto output_ptr = output->data<half>();
auto output_ptr = output->data<float>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......
......@@ -26,7 +26,8 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto float_input = new Tensor(*input);
auto float_input = new Tensor;
float_input->mutable_data<float>(input->dims());
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......
......@@ -341,7 +341,6 @@ class OpParam {
}
};
#ifdef CONV_OP
template <typename Dtype>
class ConvParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
......@@ -386,7 +385,6 @@ class ConvParam : public OpParam {
};
template <typename Dtype>
Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
#endif
template <typename Dtype>
class ElementwiseAddParam : OpParam {
......
......@@ -30,6 +30,27 @@ elseif("FPGAnets" IN_LIST NET)
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-EW paddle-mobile)
ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-conv paddle-mobile)
ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-pooling paddle-mobile)
ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-bypass paddle-mobile)
ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-softmax paddle-mobile)
ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-concat paddle-mobile)
ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-tensor-quant paddle-mobile)
......@@ -66,6 +87,9 @@ else ()
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
ADD_EXECUTABLE(test-resnet50 net/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile)
......@@ -235,13 +259,4 @@ else ()
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
# if(FPGA)
# ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
# target_link_libraries(test-tensor-quant paddle-mobile)
# endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
static const char *g_resnet_combine = "../models/resnet50";
int main() {
DLOG << paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
std::string(g_resnet_combine) + "/params", true)) {
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(-1);
// paddle_mobile.Predict_From(73);
// paddle_mobile.Predict_From_To(72, 73);
DLOG << "Computation done";
return 0;
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册