/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/deconv_filter.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"

namespace paddle_mobile {
namespace fpga {

#define USE_RELU 1
#define USE_BIAS 2

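// Re-lays out an input image tensor into the FPGA-aligned format; if the
// aligned copy ends up in a new buffer, the tensor's data pointer is swapped.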
void format_image(framework::Tensor *image_tensor) {
  auto dims = image_tensor->dims();
  auto channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = image_tensor->data<float>();
  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
  float *old_p = p_data;
  image::format_image(&p_data, channel, height, width);
  if (old_p != p_data) {
    image_tensor->reset_data_ptr(p_data);
  }
}

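// The format_*_ofm helpers allocate zero-filled, alignment-padded memory for an
// output feature map (OFM) tensor and attach it to the tensor as fp16 or fp32 data.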
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3];
    memory_size =
        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
  } else {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(half));
}

void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
  // auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3];
    memory_size =
        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
  } else {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(half));
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3];
    memory_size =
        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
  } else {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(float));
}

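// Largest filter value (as reported by filter::find_max); used below as the
// quantization range when filters are converted to int8.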
float filter_find_max(framework::Tensor *filter_tensor) {
  auto filter_ptr = filter_tensor->data<float>();
  return filter::find_max(filter_ptr, filter_tensor->numel());
}

int get_plit_num(framework::Tensor *filter_tensor) {
  auto dims = filter_tensor->dims();
  auto chw = dims[1] * dims[2] * dims[3];
  auto num = dims[0];
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_split_num(num, div_capacity);
}
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
  auto dims = filter_tensor->dims();
  auto chw = dims[1] * dims[2] / stride * dims[3] / stride;
  auto num = dims[0] * stride;
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_split_num(num, div_capacity);
}

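// Number of filters assigned to each hardware division, derived from the group
// count and the division capacity for this filter's chw size.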
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto dims = filter_tensor->dims();
  auto chw = dims[1] * dims[2] * dims[3];
  auto num = dims[0];
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_num_per_div(num, group_num, div_capacity);
}

int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
                                  int group_num, int stride) {
  auto dims = filter_tensor->dims();
  auto chw = dims[1] * dims[2] / stride * dims[3] / stride;
  auto num = dims[0] * stride;
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_num_per_div(num, group_num, div_capacity);
}

int get_aligned_filter_element_num(int chw) {
  return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
}

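// Quantizes a convolution filter to int8 and rearranges it into the FPGA layout.
// The forward/inverse quantization scales are stored in filter_tensor->scale.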
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
  auto dims = filter_tensor->dims();
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  fpga_copy(new_data, data_ptr, memory_size);
  filter::format_filter(&new_data, num, channel, height, width, group_num,
                        max_value);
  filter_tensor->reset_data_ptr(new_data);
  filter_tensor->set_type(typeid(int8_t));
}
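// Formats a depthwise-convolution filter using the supplied scale values and
// replaces the tensor data with the int8 FPGA layout.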
void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
  auto dims = filter_tensor->dims();
  auto num = dims[0], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  fpga_copy(new_data, data_ptr, memory_size);
  filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
  filter_tensor->reset_data_ptr(new_data);
  filter_tensor->set_type(typeid(int8_t));
}

void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
                           int stride) {
  auto dims = filter_tensor->dims();
  auto num = dims[0], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  fpga_copy(new_data, data_ptr, memory_size);

  int hw = height * width;
  deconv_filter::deconv_NC_convert(&new_data, num, 1, hw);

  num = dims[1];
  int channel = dims[0];

  deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width,
                                       scale_ptr, stride);

  //  framework::DDim dims_new =
  //      framework::make_ddim({num, 1, height, width});
  //  filter_tensor->Resize(dims_new);
  filter_tensor->reset_data_ptr(new_data);
  filter_tensor->set_type(typeid(int8_t));
}

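// Same as format_filter, but for fully-connected weights (group count fixed to 1).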
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
  auto dims = filter_tensor->dims();
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  fpga_copy(new_data, data_ptr, memory_size);
  filter::format_fc_filter(&new_data, num, channel, height, width, 1,
                           max_value);
  filter_tensor->reset_data_ptr(new_data);
  filter_tensor->set_type(typeid(int8_t));
}
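// Formats a deconvolution filter: swaps the N/C layout, splits the filter across
// the stride sub-convolutions, quantizes to int8 and resizes the tensor to match.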
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                          int group_num, int stride) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
  auto dims = filter_tensor->dims();
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  memcpy(new_data, data_ptr, memory_size);

  int hw = height * width;
  deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);

  num = dims[1];
  channel = dims[0];
  deconv_filter::deconv_format_filter(
      &new_data, (int)num, (int)channel,          // NOLINT
      (int)height,                                // NOLINT
      (int)width, group_num, max_value, stride);  // NOLINT

  framework::DDim dims_new =
      framework::make_ddim({num, channel, height, width});
  filter_tensor->Resize(dims_new);
  filter_tensor->reset_data_ptr(new_data);
  filter_tensor->set_type(typeid(int8_t));
}

void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(bias_scale_array,
                                      element_num_per_division, num);
}
void format_bias_array(float **bias_array, int num) {
  bias_scale::format_bias_array(bias_array, num);
}

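// Allocates one aligned fp16 buffer large enough to hold the concatenation of
// image_num images along the channel dimension and resizes `out` accordingly.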
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int sum_channel = 0, sum_cw = 0;
  for (int i = 0; i < image_num; i++) {
    sum_channel += channel_num[i];
  }

  sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
  auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
  auto ddim = framework::make_ddim({1, sum_channel, height, width});
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
  out->set_type(typeid(half));
}
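// Convenience helpers that prepare the filter, bias/scale and output tensors for a
// (de)convolution op in one call.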
void format_conv_data(framework::Tensor *filter_tensor,
                      framework::Tensor *ofm_tensor, float **bs_ptr,
                      int group) {
  float max_value = fpga::filter_find_max(filter_tensor);
  fpga::format_filter(filter_tensor, max_value, group);
  int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group);
  fpga::format_bias_scale_array(bs_ptr, element_num_per_div,
                                ofm_tensor->dims()[1]);
  fpga::format_fp16_ofm(ofm_tensor);
}
void format_deconv_data(framework::Tensor *filter_tensor,
                        framework::Tensor *ofm_tensor, float **bs_ptr,
                        int group, int sub_conv_n) {
  int channel = ofm_tensor->dims()[1];
  float max_value = filter_find_max(filter_tensor);
  format_deconv_filter(filter_tensor, max_value, group, sub_conv_n);
  int element_num_per_div =
      get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n);
  format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n);
  format_fp16_ofm(ofm_tensor);
}

void format_dwconv_data(framework::Tensor *filter_tensor,
                        framework::Tensor *ofm_tensor, float *scale_ptr,
                        float **bias_ptr) {
  auto channel = ofm_tensor->dims()[1];
  format_dwconv_filter(filter_tensor, scale_ptr);
  format_bias_array(bias_ptr, channel);
  format_fp16_ofm(ofm_tensor);
}
void format_DWDeconv_data(framework::Tensor *filter_tensor,
                          framework::Tensor *ofm_tensor, float **bs_ptr,
                          int group, int sub_conv_n) {
  int channel = ofm_tensor->dims()[1];
  // dw-deconv
  format_DWDconv_filter(
      filter_tensor,
      (reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
  format_bias_array(bs_ptr, channel);
  format_fp16_ofm(ofm_tensor);
}
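// Derives the low-level driver parameters (output size, row strides, block
// partitioning, physical addresses, command word) from a ConvArgs and stores
// them in arg->driver for the FPGA kernel.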
void expand_conv_arg(ConvArgs *arg) {
  ConvArgs args = *arg;

  auto fpga_bias_scale_len =
      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;

  auto output_height =
      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
          args.kernel.stride_h +
      1;
  auto output_width =
      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
          args.kernel.stride_w +
      1;

  auto filter_per_group = args.filter_num / args.group_num;
  auto channel_per_group = args.image.channels / args.group_num;

  auto image_row_count = args.image.width * args.image.channels;
  auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
  auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
                               args.image.pad_width * args.image.channels;
  auto filter_amount_all =
      align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
                 FILTER_ELEMENT_ALIGNMENT);

  auto output_amount_per_row = align_to_x(
      (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num,
      IMAGE_ALIGNMENT);

  // find the optimal partition strategy
  uint64_t res_win;
  uint64_t res_fit = 0;
  for (res_win = 1; res_win <= output_width; res_win++) {
    if ((align_to_x(
             (args.image.channels *
              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
             IMAGE_ALIGNMENT) /
             16 +
         1) *
            args.kernel.height >
        2048) {
      break;
    }
  }

  if (res_win != output_width) {
    res_win -= 1;
  }

  if (((res_win % 2) != 0) && (res_win != 1)) {
    res_win = res_win - 1;
  }
  res_fit = res_win;

  auto block_num = (output_width + res_fit - 1) / res_fit;
  auto block_len = res_fit;
  auto block_last = output_width - res_fit * (block_num - 1);

  auto res_amount_per_row =
      (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num;
  auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;

  auto image_block_amount_per_row =
      args.kernel.stride_w * res_fit * args.image.channels;
  auto filter_pad_width_mul_channel =
      args.image.pad_width * args.image.channels;
  auto image_amount_per_row_multi_win_first =
      image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height);
  auto image_amount_per_row_multi_win =
      image_amount_per_row * (2 * args.kernel.stride_h);

  auto image_block_num = block_num;
  auto image_block_len =
      align_to_x((args.image.channels *
                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
                 IMAGE_ALIGNMENT) /
          16 +
      1;
  auto image_block_len_last =
      align_to_x(
          (args.image.channels *
           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
          IMAGE_ALIGNMENT) /
          16 +
      1;
  auto image_win_cnt = block_len;
  auto image_win_cnt_last = block_last;
  auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
  auto prog_full_cnt = 1024 / (filter_amount_all / 16 * 2) - 1;
  if (prog_full_cnt == 511) {
    prog_full_cnt--;
  }
  auto post_prog_full_cnt =
      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
          : 0;
  // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
  auto cmd = 0UL | USE_BIAS;

  auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) |
                      ((args.deconv_tx_param.sub_conv_num) << 16) |
                      ((args.deconv_tx_param.omit_size) << 0);
  (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
  (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
  (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
  (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) +
                                     args.deconv_tx_param.out_addr_offset;
  (*arg).driver.output_height = output_height;
  (*arg).driver.output_width = output_width;
  (*arg).driver.filter_per_group = filter_per_group;
  (*arg).driver.channel_per_group = channel_per_group;
  (*arg).driver.image_amount_per_row = image_amount_per_row;
  (*arg).driver.image_one_pad_per_row = image_one_pad_per_row;
  (*arg).driver.filter_amount_all = filter_amount_all;
  (*arg).driver.output_amount_per_row = output_amount_per_row;
  (*arg).driver.image_block_amount_per_row = image_block_amount_per_row;
  (*arg).driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
  (*arg).driver.image_amount_per_row_multi_win_first =
      image_amount_per_row_multi_win_first;
  (*arg).driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
  (*arg).driver.image_block_num = image_block_num;
  (*arg).driver.image_block_len = image_block_len;
  (*arg).driver.image_block_len_last = image_block_len_last;
  (*arg).driver.image_win_cnt = image_win_cnt;
  (*arg).driver.image_win_cnt_last = image_win_cnt_last;
  (*arg).driver.res_row_data_align4_pad = res_row_data_align4_pad;
  (*arg).driver.prog_full_cnt = prog_full_cnt;
  (*arg).driver.post_prog_full_cnt = post_prog_full_cnt;
  (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len;
  (*arg).driver.cmd = cmd;
  (*arg).driver.deconv_param = deconv_param;
}  // expand_conv_arg()

void expand_EW_arg(EWAddArgs *arg) {
  EWAddArgs args = *arg;
  // uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
  uint64_t cmd = 0;
  uint64_t datalen = (uint64_t)args.image0.width *
                     (uint64_t)args.image0.height *
                     (uint64_t)args.image0.channels;
  uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
  uint64_t image0_address_phy = vaddr_to_paddr(args.image0.address);
  uint64_t image1_address_phy = vaddr_to_paddr(args.image1.address);
  uint64_t output_address_phy = vaddr_to_paddr(args.output.address);

  uint64_t image_amount_per_row =
      align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
                 IMAGE_ALIGNMENT);
  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
                               ((uint64_t)args.image0.width << 16) |
                               (uint64_t)args.image0.height;

  (*arg).driver.image0_address_phy = image0_address_phy;
  (*arg).driver.image1_address_phy = image1_address_phy;
  (*arg).driver.datalen = datalen;
  (*arg).driver.image_image_pixel = image_image_pixel;
  (*arg).driver.image_amount_per_row = image_amount_per_row;
  (*arg).driver.output_address_phy = output_address_phy;
  (*arg).driver.coefficient = coefficient;
  (*arg).driver.cmd = cmd;
}  // expand_EW_arg

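// Builds a SplitConvArgs: splits the convolution into `split_num` pieces that fit
// the hardware, copies the per-split filter and bias/scale slices into FPGA memory,
// sets up the concat of the partial outputs, and expands each piece's driver args.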
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    ActivationType activation_enable,
                    int16_t leaky_relu_negative_slope, int group_num,
                    int stride_h, int stride_w, int padding_h, int padding_w,
                    float *bs_ptr) {
  auto input_ptr = input->data<half>();
  auto filter_ptr = filter->data<int8_t>();
  auto out_ptr = out->data<half>();
  auto deleter = [](void *p) { fpga_free(p); };

  arg->group_num = (uint32_t)group_num;
  // Either group_num or split_num = 1;
  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
  arg->conv_arg =
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT

  arg->shared_conv_arg = std::shared_ptr<ConvArgs>(arg->conv_arg, deleter);

  memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs));

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];

  int n = arg->split_num;
  arg->concat_arg.images_in =
      static_cast<int16_t **>(fpga_malloc(n * sizeof(int *)));
  arg->concat_arg.scales_in =
      static_cast<float **>(fpga_malloc(n * sizeof(float *)));
  arg->concat_arg.channel_num =
      static_cast<uint32_t *>(fpga_malloc(n * sizeof(uint32_t)));
  arg->vector_concat_space.push_back(std::shared_ptr<char>(
      reinterpret_cast<char *>(arg->concat_arg.images_in), deleter));
  arg->vector_concat_space.push_back(std::shared_ptr<char>(
      reinterpret_cast<char *>(arg->concat_arg.scales_in), deleter));
  arg->vector_concat_space.push_back(std::shared_ptr<char>(
      reinterpret_cast<char *>(arg->concat_arg.channel_num), deleter));

  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
      (int)(filter->dims()[1] * filter->dims()[2] *  // NOLINT
            filter->dims()[3]));

  for (int i = 0; i < n; i++) {
    // arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].output.activation.activation_type = activation_enable;
    arg->conv_arg[i].output.activation.leaky_relu_negative_slope =
        leaky_relu_negative_slope;
    arg->conv_arg[i].group_num = (uint32_t)group_num;
    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
    arg->conv_arg[i].filter_num = (uint32_t)(
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);

    size_t filter_size =
        element_num *
        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
        sizeof(int8_t);
    auto filter_head = &(
        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    arg->vector_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);

    size_t bs_size = 2 *
                     align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
                     sizeof(float);
    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
    arg->vector_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
    fpga_flush(arg->conv_arg[i].sb_address, bs_size);

    if (n > 1) {
      arg->conv_arg[i].output.scale_address =
          static_cast<float *>(fpga_malloc(2 * sizeof(float)));
      arg->conv_arg[i].output.address =
          fpga_malloc(out->dims()[2] *
                      align_to_x((int)(out->dims()[3] *  // NOLINT
                                       arg->conv_arg[i].filter_num),
                                 IMAGE_ALIGNMENT) *
                      sizeof(half));
      arg->vector_conv_space.push_back(std::shared_ptr<char>(
          reinterpret_cast<char *>(arg->conv_arg[i].output.scale_address),
          deleter));
      arg->vector_conv_space.push_back(std::shared_ptr<char>(
          reinterpret_cast<char *>(arg->conv_arg[i].output.address), deleter));
    } else {
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
    }

    arg->concat_arg.images_in[i] =
        (half *)arg->conv_arg[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;

    expand_conv_arg(&arg->conv_arg[i]);
  }
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
}  // fill_split_arg

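// Builds a DeconvArgs: the deconvolution is decomposed into `stride_h`
// sub-convolutions, each of which is itself a SplitConvArgs; filter and
// bias/scale data are sliced per sub-convolution and per split.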
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     ActivationType activation_enable,
                     int16_t leaky_relu_negative_slope, int group_num,
                     int stride_h, int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
  auto input_ptr = input->data<half>();
  auto filter_ptr = filter->data<int8_t>();
  auto deleter = [](void *p) { fpga_free(p); };

  arg->group_num = (uint32_t)group_num;
  arg->sub_conv_num = (uint32_t)stride_h;
  arg->filter_num = (uint32_t)filter->dims()[0];
  uint32_t sub_conv_num = arg->sub_conv_num;
  int sub_pad =
      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
                                         padding_w, stride_w);
  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
      (int)filter->dims()[3], stride_w);  // NOLINT

  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT

  arg->sub_output_width = (uint32_t)sub_output_width;
  arg->sub_output_height = (uint32_t)sub_output_height;
  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT

  auto sub_channels = (int)input->dims()[1];  // NOLINT
  uint32_t omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int sub_filter_num = sub_conv_num * (arg->filter_num);

  framework::DDim dims_out_new = framework::make_ddim(
      {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
  fpga::format_fp16_ofm(out, dims_out_new);
  auto out_ptr = out->data<half>();
  arg->output.address =
      (half *)out_ptr +  // NOLINT
      omit_size * sizeof(half) *
          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
  arg->output.scale_address = out->scale;

  uint32_t conv_output_size =
      (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
      sub_output_height;
  uint32_t split_num =
      group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;

  for (int i = 0; i < sub_conv_num; ++i) {
    arg->split_conv_args.push_back(std::make_shared<SplitConvArgs>());
    arg->split_conv_args[i]->filter_num =
        (arg->sub_conv_num) * (arg->filter_num);
    arg->split_conv_args[i]->group_num = (uint32_t)group_num;
    arg->split_conv_args[i]->split_num = split_num;
    arg->split_conv_args[i]->concat_arg.height = sub_output_height;
    arg->split_conv_args[i]->concat_arg.width = sub_output_width;
    arg->split_conv_args[i]->concat_arg.image_num = split_num;

    arg->split_conv_args[i]->conv_arg =
        static_cast<ConvArgs *>(fpga_malloc(split_num * sizeof(ConvArgs)));
    arg->split_conv_args[i]->concat_arg.images_in =
        static_cast<int16_t **>(fpga_malloc(split_num * sizeof(int16_t *)));
    arg->split_conv_args[i]->concat_arg.scales_in =
        static_cast<float **>(fpga_malloc(split_num * sizeof(float *)));
    arg->split_conv_args[i]->concat_arg.channel_num =
        static_cast<uint32_t *>(fpga_malloc(split_num * sizeof(uint32_t)));
    arg->split_conv_args[i]->shared_conv_arg =
        std::shared_ptr<ConvArgs>(arg->split_conv_args[i]->conv_arg, deleter);
    arg->split_conv_args[i]->vector_concat_space.push_back(
        std::shared_ptr<char>(
            reinterpret_cast<char *>(
                arg->split_conv_args[i]->concat_arg.images_in),
            deleter));
    arg->split_conv_args[i]->vector_concat_space.push_back(
        std::shared_ptr<char>(
            reinterpret_cast<char *>(
                arg->split_conv_args[i]->concat_arg.scales_in),
            deleter));
    arg->split_conv_args[i]->vector_concat_space.push_back(
        std::shared_ptr<char>(
            reinterpret_cast<char *>(
                arg->split_conv_args[i]->concat_arg.channel_num),
            deleter));
  }

  auto filter_num_per_div =
      (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w);
  int element_num = get_aligned_filter_element_num(
      (int)(sub_channels * sub_filter_width * sub_filter_width));  // NOLINT

  int chw = sub_channels * sub_filter_width * sub_filter_width;
  int division_capacity = filter::calc_division_capacity(chw);
  int num_per_div_before_alignment =
      filter::calc_num_per_div(sub_filter_num, group_num, division_capacity);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
  int div_num = (sub_filter_num + num_per_div_before_alignment - 1) /
                num_per_div_before_alignment;
  int residual = sub_filter_num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, FILTER_NUM_ALIGNMENT);

  int filter_sub_conv_offset = element_num * num_after_alignment;
  uint32_t out_addr_offset = 0;
  for (int i = 0; i < sub_conv_num; ++i) {
    if (sub_conv_num == 1) {
      arg->split_conv_args[i]->output.address = arg->output.address;
      arg->split_conv_args[i]->output.scale_address = arg->output.scale_address;
      out_addr_offset = 0;

    } else {
      out_addr_offset =
          sizeof(int16_t) * (sub_conv_num - 1 - i) *
          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));

      arg->split_conv_args[i]->output.address = out_ptr;
      arg->split_conv_args[i]->output.scale_address =
          static_cast<float *>(fpga_malloc(2 * sizeof(float)));
      arg->split_conv_args[i]->vector_conv_space.push_back(
          std::shared_ptr<char>(
              reinterpret_cast<char *>(
                  arg->split_conv_args[i]->output.scale_address),
              deleter));
    }

    for (int j = 0; j < split_num; ++j) {
      // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
      arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
          activation_enable;
      arg->split_conv_args[i]
          ->conv_arg[j]
          .output.activation.leaky_relu_negative_slope =
          leaky_relu_negative_slope;
      arg->split_conv_args[i]->conv_arg[j].group_num = (uint32_t)group_num;

      arg->split_conv_args[i]->conv_arg[j].kernel.width =
          (uint32_t)sub_filter_width;
      arg->split_conv_args[i]->conv_arg[j].kernel.height =
          (uint32_t)sub_filter_width;
      arg->split_conv_args[i]->conv_arg[j].kernel.stride_w = 1;
      arg->split_conv_args[i]->conv_arg[j].kernel.stride_h = 1;

      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.deconv_en = 1;
      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.sub_conv_num =
          sub_conv_num;
      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.omit_size =
          omit_size;
      arg->split_conv_args[i]->conv_arg[j].deconv_tx_param.out_addr_offset =
731 732
          out_addr_offset;

Z
zhangyang 已提交
733 734
      arg->split_conv_args[i]->conv_arg[j].image.scale_address = input->scale;
      arg->split_conv_args[i]->conv_arg[j].image.channels =
Z
zhangyang 已提交
735
          (uint32_t)sub_channels;
Z
zhangyang 已提交
736
      arg->split_conv_args[i]->conv_arg[j].image.width =
Z
zhangyang 已提交
737
          (uint32_t)input->dims()[3];
Z
zhangyang 已提交
738
      arg->split_conv_args[i]->conv_arg[j].image.height =
Z
zhangyang 已提交
739
          (uint32_t)input->dims()[2];
Z
zhangyang 已提交
740 741 742
      arg->split_conv_args[i]->conv_arg[j].image.pad_width = (uint32_t)sub_pad;
      arg->split_conv_args[i]->conv_arg[j].image.pad_height = (uint32_t)sub_pad;
      arg->split_conv_args[i]->conv_arg[j].image.address = input_ptr;
Z
zhangyang 已提交
743

Z
zhangyang 已提交
744 745
      arg->split_conv_args[i]->conv_arg[j].filter_scale_address = filter->scale;
      arg->split_conv_args[i]->conv_arg[j].filter_num =
746 747 748
          (uint32_t)(j == split_num - 1
                         ? sub_filter_num - (split_num - 1) * filter_num_per_div
                         : filter_num_per_div);
Z
zhangyang 已提交
749 750 751

      size_t filter_size =
          element_num *
Z
zhangyang 已提交
752
          align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
Z
zhangyang 已提交
753 754
                     FILTER_NUM_ALIGNMENT) *
          sizeof(int8_t);
755 756 757
      auto filter_head = &((
          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
                               i * filter_sub_conv_offset];
Z
zhangyang 已提交
758
      arg->split_conv_args[i]->conv_arg[j].filter_address =
Z
zhangyang 已提交
759
          fpga_malloc(filter_size);
Z
zhangyang 已提交
760 761 762 763 764 765 766
      arg->split_conv_args[i]->vector_conv_space.push_back(
          std::shared_ptr<char>(
              reinterpret_cast<char *>(
                  arg->split_conv_args[i]->conv_arg[j].filter_address),
              deleter));

      memcpy(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_head,
Z
zhangyang 已提交
767
             filter_size);
Z
zhangyang 已提交
768
      fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
Z
zhangyang 已提交
769 770 771
                 filter_size);

      size_t bs_align_num = align_to_x(
Z
zhangyang 已提交
772
          arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
Z
zhangyang 已提交
773 774 775
      size_t bs_size = 2 * bs_align_num * sizeof(float);
      auto bs_head = &bs_ptr[j * filter_num_per_div * 2];

Z
zhangyang 已提交
776 777 778 779 780 781 782 783 784
      arg->split_conv_args[i]->conv_arg[j].sb_address = fpga_malloc(bs_size);
      arg->split_conv_args[i]->vector_conv_space.push_back(
          std::shared_ptr<char>(
              reinterpret_cast<char *>(
                  arg->split_conv_args[i]->conv_arg[j].sb_address),
              deleter));

      memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
      fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);

      if (split_num == 1) {
        arg->split_conv_args[i]->conv_arg[j].output.address =
            arg->split_conv_args[i]->output.address;
        arg->split_conv_args[i]->conv_arg[j].output.scale_address =
            arg->split_conv_args[i]->output.scale_address;
      } else {
        arg->split_conv_args[i]->conv_arg[j].output.address =
            fpga_malloc(conv_output_size * sizeof(int16_t));
        arg->split_conv_args[i]->conv_arg[j].output.scale_address =
            static_cast<float *>(fpga_malloc(2 * sizeof(float)));
        arg->split_conv_args[i]->vector_conv_space.push_back(
            std::shared_ptr<char>(
                reinterpret_cast<char *>(
                    arg->split_conv_args[i]->conv_arg[j].output.address),
                deleter));
        arg->split_conv_args[i]->vector_conv_space.push_back(
            std::shared_ptr<char>(
                reinterpret_cast<char *>(
                    arg->split_conv_args[i]->conv_arg[j].output.scale_address),
                deleter));
      }
      arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
          arg->split_conv_args[i]->conv_arg[j].output.address);
      arg->split_conv_args[i]->concat_arg.scales_in[j] =
          arg->split_conv_args[i]->conv_arg[j].output.scale_address;
      arg->split_conv_args[i]->concat_arg.channel_num[j] =
          arg->split_conv_args[i]->conv_arg[j].filter_num;

      expand_conv_arg(&(arg->split_conv_args[i]->conv_arg[j]));
    }

    arg->split_conv_args[i]->concat_arg.image_out =
        arg->split_conv_args[i]->output.address;
    arg->split_conv_args[i]->concat_arg.scale_out =
        arg->split_conv_args[i]->output.scale_address;
  }
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
}  // fill_deconv_arg

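// Fills a DWconvArgs for a depthwise convolution; no splitting is needed, so the
// struct simply points at the already formatted input, filter, bias and output buffers.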
void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     ActivationType activation_enable,
                     int16_t leaky_relu_negative_slope, int stride_h,
                     int stride_w, int padding_h, int padding_w,
                     float *bias_ptr) {
  auto filter_ptr = filter->data<uint8_t>();
  auto input_ptr = input->data<half>();
  auto output_ptr = out->mutable_data<half>();
  arg->sub_conv_num = 1;
  // arg->relu_enabled = relu_enabled;
  arg->output.activation.activation_type = activation_enable;
  arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
  arg->bias_address = bias_ptr;
  arg->filter_address = filter_ptr;
  arg->kernel.height = (uint32_t)filter->dims()[2];
  arg->kernel.width = (uint32_t)filter->dims()[3];
  arg->kernel.stride_h = (uint32_t)stride_h;
  arg->kernel.stride_w = (uint32_t)stride_w;
  arg->image.address = input_ptr;
  arg->image.channels = (uint32_t)input->dims()[1];
  arg->image.height = (uint32_t)input->dims()[2];
  arg->image.width = (uint32_t)input->dims()[3];
  arg->image.pad_height = (uint32_t)padding_h;
  arg->image.pad_width = (uint32_t)padding_w;
  arg->image.scale_address = input->scale;
  arg->output.address = output_ptr;
  arg->output.scale_address = out->scale;
}  // end dwconv arg fill

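// Fills a DWDeconvArgs: a depthwise deconvolution is expressed as `stride_w`
// depthwise sub-convolutions whose filter slices and intermediate outputs are
// allocated here.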
void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                       framework::Tensor *out, framework::Tensor *filter,
                       ActivationType activation_enable,
                       int16_t leaky_relu_negative_slope, int stride_h,
                       int stride_w, int padding_h, int padding_w,
                       float *bias_ptr) {
  auto filter_ptr = filter->data<int8_t>();
  auto input_ptr = input->data<half>();

  auto deleter = [](void *p) { fpga_free(p); };

  arg->group_num = (uint32_t)filter->dims()[0];
  arg->sub_conv_num = (uint32_t)stride_w;
  arg->filter_num = (uint32_t)filter->dims()[0];

  int sub_conv_num = stride_w;

  int sub_pad =
      deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],  // NOLINT
                                         padding_w, stride_w);
  auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
      (int)filter->dims()[3], stride_w);  // NOLINT

  auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
      (int)input->dims()[3], sub_pad, sub_filter_width);  // NOLINT
  auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
      (int)input->dims()[2], sub_pad, sub_filter_width);  // NOLINT

  arg->sub_output_width = (uint32_t)sub_output_width;
  arg->sub_output_height = (uint32_t)sub_output_height;
  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
      stride_w, (int)filter->dims()[3], padding_w);  // NOLINT

  auto sub_channels = (int)input->dims()[1];  // NOLINT
  uint32_t omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
  int sub_filter_num = sub_conv_num * (arg->filter_num);

  framework::DDim dims_out_new = framework::make_ddim(
      {1, arg->filter_num, real_out_height, real_out_width});
  fpga::format_fp16_ofm(out, dims_out_new);
  auto out_ptr = out->data<half>();

  /*====For Addition
  arg->output.address =
      (half *)out_ptr +  // NOLINT
      omit_size * sizeof(half) *
          (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
          */
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;

  int filter_offset = sub_filter_width * sub_filter_width *
                      align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) *
                      arg->sub_conv_num;

  for (int i = 0; i < sub_conv_num; ++i) {
    arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());

    arg->dw_conv_args[i]->sub_conv_num = sub_conv_num;
    // arg->dw_conv_args[i]->relu_enabled = relu_enabled;
    arg->dw_conv_args[i]->output.activation.activation_type = activation_enable;
    arg->dw_conv_args[i]->output.activation.leaky_relu_negative_slope =
        leaky_relu_negative_slope;
    arg->dw_conv_args[i]->bias_address = bias_ptr;

    arg->dw_conv_args[i]->filter_address =
        fpga_malloc(filter_offset * sizeof(int16_t));
    memcpy(arg->dw_conv_args[i]->filter_address,
           (reinterpret_cast<half *>(filter_ptr) + i * filter_offset),
           filter_offset * sizeof(int16_t));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(arg->dw_conv_args[i]->filter_address),
        deleter));

    arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width;
    arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width;

    arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1;
    arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1;
    arg->dw_conv_args[i]->image.address = input_ptr;
    arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1];
    arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2];
    arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3];

    arg->dw_conv_args[i]->image.pad_height = sub_pad;
    arg->dw_conv_args[i]->image.pad_width = sub_pad;
    arg->dw_conv_args[i]->image.scale_address = input->scale;

    arg->dw_conv_args[i]->output.address =
        fpga_malloc(sub_output_height *
                    align_to_x(sub_output_width * sub_channels * sub_conv_num,
                               IMAGE_ALIGNMENT) *
                    sizeof(int16_t));
    arg->dw_conv_args[i]->output.scale_address =
        static_cast<float *>(fpga_malloc(2 * sizeof(float)));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.address),
        deleter));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(arg->dw_conv_args[i]->output.scale_address),
        deleter));
  }

  // arg->output.scale_address = out->scale;
}  // end DWDeconv arg fill

}  // namespace fpga
}  // namespace paddle_mobile