From 85d6c44926b3ef99a8341273db8510f7e3cea7f6 Mon Sep 17 00:00:00 2001 From: qnqinan Date: Wed, 27 Feb 2019 16:09:37 +0800 Subject: [PATCH] update --- src/fpga/V1/api.cpp | 65 +++++++++++++------ .../kernel/fpga/V1/conv_transpose_kernel.cpp | 2 +- src/operators/kernel/fpga/V1/pad2d_kernel.cpp | 60 +++++++++++++++++ 3 files changed, 106 insertions(+), 21 deletions(-) create mode 100644 src/operators/kernel/fpga/V1/pad2d_kernel.cpp diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 570ab6a198..9607961c47 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -30,9 +30,9 @@ void format_image(framework::Tensor *image_tensor) { auto data_ptr = image_tensor->data(); auto external_ptr = reinterpret_cast(image_tensor->external_data); float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; - + float *old_p = p_data; image::format_image(&p_data, channel, height, width); - if (p_data != data_ptr) { + if (old_p != p_data) { image_tensor->reset_data_ptr(p_data); } } @@ -48,9 +48,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; - memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * - sizeof(half); + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); } else if (dims.size() == 2) { memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); } else { @@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { fpga_copy(new_data, data_ptr, memory_size); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(typeid(int8_t)); + filter_tensor->set_type(typeid(int16_t)); } void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, 
@@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) { // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | USE_BIAS; - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | - ((args.deconv_tx_param.sub_conv_num) << 16) | + auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | + ((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.omit_size) << 0); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); @@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga::format_fp16_ofm(out, dims_out_new); auto out_ptr = out->data(); arg->output.address = - out_ptr + + (half *)out_ptr + // NOLINT omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { + // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; arg->split_conv_args[i] @@ -758,9 +759,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &filter_ptr[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = &(( + int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size); + /*{ + static int cnt = 0; + 
std::string str = "deconv_filter"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + int8_t result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result); + } + + }*/ + size_t bs_align_num = align_to_x( arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); size_t bs_size = 2 * bs_align_num * sizeof(float); @@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); + /* { + static int cnt = 0; + std::string str = "deconv_sb"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + float result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num, + result); + } + + }*/ + if (split_num == 1) { arg->split_conv_args[i]->conv_arg[j].output.address = arg->split_conv_args[i]->output.address; @@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto deleter = [](void *p) { fpga_free(p); }; - arg->vector_dwconv_space.push_back( - std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); - - auto filter_ptr = filter->data(); + auto filter_ptr = filter->data(); auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto output_ptr = out->data(); + arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; @@ -960,10 +985,10 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, sizeof(int16_t)); arg->dw_conv_args[i]->output.scale_address = static_cast(fpga_malloc(2 * sizeof(float))); - arg->vector_dw_conv_space.push_back(std::shared_ptr( // NOLINT + arg->vector_dw_conv_space.push_back(std::shared_ptr( 
reinterpret_cast(arg->dw_conv_args[i]->output.address), deleter)); - arg->vector_dw_conv_space.push_back(std::shared_ptr( // NOLINT + arg->vector_dw_conv_space.push_back(std::shared_ptr( reinterpret_cast(arg->dw_conv_args[i]->output.scale_address), deleter)); } diff --git a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp index 0dd39fbc34..788504df5d 100644 --- a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp @@ -43,7 +43,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = 1; - // bs_ptr[i] = bias_ptr[i % (channel)]; + bs_ptr[i] = 0; // bias_ptr[i % (channel)]; } PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp new file mode 100644 index 0000000000..f47a585ee4 --- /dev/null +++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/kernel/pad2d_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool Pad2dKernel::Init(Pad2dParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} +void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) { + auto input_data = (input->data()); + auto output_data = (output->data()); + auto input_c = input->dims()[1]; + auto input_h = input->dims()[2]; + auto input_w = input->dims()[3]; + auto output_c = output->dims()[1]; + auto output_w = output->dims()[3]; + auto copysize = input_c * input_w; + for (int h = 0; h < input_h; ++h) { + auto input_offset = h * input_c * input_w; + auto output_offset = h * paddle_mobile::fpga::align_to_x( + output_c * output_w, IMAGE_ALIGNMENT); + memcpy((output_data + output_offset), (input_data + input_offset), + copysize * sizeof(half)); + } +} +template <> +void Pad2dKernel::Compute(const Pad2dParam &param) { + auto in_x = param.InputX(); + auto out = param.Out(); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(half)); + pad2dFunc(in_x, out); + (out->scale)[0] = (in_x->scale)[0]; + (out->scale)[1] = (in_x->scale)[1]; + DLOG << (out->scale)[0]; + DLOG << (out->scale)[1]; + size_t outputSize = + out->dims()[2] * + paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]), + IMAGE_ALIGNMENT) * + sizeof(half); + fpga::fpga_flush(out->data(), outputSize); +} +} // namespace operators +} // namespace paddle_mobile -- GitLab