api.cpp 26.4 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangyang 已提交
15 16
#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
Z
zhangyang 已提交
17
#include "fpga/V1/deconv_filter.h"
Z
zhangyang 已提交
18 19
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
Z
zhangyang 已提交
20

Z
zhangyang 已提交
21
namespace paddle_mobile {
H
hanbuhe 已提交
22 23
namespace fpga {

24 25 26
// Bit flags OR-ed together to form the `cmd` word stored in the driver
// arguments (see expand_conv_arg and expand_EW_arg in this file).
#define USE_RELU 1
#define USE_BIAS 2

Z
zhangyang 已提交
27 28
// Copies the tensor's float data into a freshly fpga_malloc'ed buffer, lets
// image::format_image() rearrange that copy, and re-points the tensor at it.
// The buffer is sized from dims[1..3] only; dims[0] is not consulted.
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = image_tensor->data<float>();
  size_t byte_count = channel * height * width * sizeof(float);

  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);

  // format_image receives the address of the pointer, so it may replace the
  // buffer; the tensor is pointed at whatever `formatted` holds afterwards.
  image::format_image(&formatted, channel, height, width);
  image_tensor->reset_data_ptr(formatted);
}

38
// Allocates a zero-filled half-precision output buffer for `ofm_tensor` and
// attaches it to the tensor.
//   4-D tensor: height * align(channel * width, IMAGE_ALIGNMENT) elements.
//   2-D tensor: align(dims[1], IMAGE_ALIGNMENT) elements.
// Any other rank is logged and results in a zero-byte allocation request.
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  size_t byte_count = 0;

  switch (shape.size()) {
    case 4: {
      auto channel = shape[1], height = shape[2], width = shape[3];
      byte_count =
          height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
      break;
    }
    case 2:
      byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(half);
      break;
    default:
      DLOG << "Wrong ofm dimension";
      break;
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

// Float (fp32) counterpart of format_fp16_ofm: allocates a zero-filled output
// buffer whose rows are padded to IMAGE_ALIGNMENT elements and attaches it to
// `ofm_tensor`. Ranks other than 4 and 2 are logged and request zero bytes.
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  const auto rank = shape.size();
  size_t byte_count = 0;

  if (rank == 4) {
    auto channel = shape[1], height = shape[2], width = shape[3];
    auto padded_row = align_to_x(channel * width, IMAGE_ALIGNMENT);
    byte_count = height * padded_row * sizeof(float);
  } else if (rank == 2) {
    byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(float);
  } else {
    DLOG << "Wrong ofm dimension";
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
72 73 74 75
// Returns filter::find_max() evaluated over all numel() float elements of
// `filter_tensor`.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto weights = filter_tensor->data<float>();
  auto element_count = filter_tensor->numel();
  return filter::find_max(weights, element_count);
}
Z
zhangyang 已提交
76 77 78

// Returns the number of splits filter::calc_split_num() reports for a filter
// tensor with dims[0] filters of dims[1] * dims[2] * dims[3] elements each.
int get_plit_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto elements_per_filter = shape[1] * shape[2] * shape[3];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}
Z
zhangyang 已提交
84 85 86 87 88 89 90
// Deconvolution variant of get_plit_num(): the filter count is multiplied by
// `stride` and the per-filter element count is divided by `stride` twice.
// `stride` must be non-zero.
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
  auto shape = filter_tensor->dims();

  // Integer arithmetic evaluated strictly left to right; the intermediate
  // truncations are part of the result, so the expression is kept as-is.
  auto elements_per_filter = shape[1] * shape[2] / stride * shape[3] / stride;
  auto filter_count = shape[0] * stride;

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}
Z
zhangyang 已提交
91

92
// Returns filter::calc_num_per_div() for a filter tensor with dims[0] filters
// of dims[1] * dims[2] * dims[3] elements each, split over `group_num` groups.
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto elements_per_filter = shape[1] * shape[2] * shape[3];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}

Z
zhangyang 已提交
100 101 102 103 104 105 106 107 108
// Deconvolution variant of get_filter_num_per_div(): the filter count is
// multiplied by `stride` and the per-filter element count is divided by
// `stride` twice. `stride` must be non-zero.
int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
                                  int group_num, int stride) {
  auto shape = filter_tensor->dims();

  // Evaluated left to right with integer truncation, exactly as written.
  auto elements_per_filter = shape[1] * shape[2] / stride * shape[3] / stride;
  auto filter_count = shape[0] * stride;

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}

Z
zhangyang 已提交
109 110 111 112
// Returns `chw` passed through align_to_x() with FILTER_ELEMENT_ALIGNMENT.
int get_aligned_filter_element_num(int chw) {
  const int aligned_count = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return aligned_count;
}

Z
zhangyang 已提交
113 114
// Records the quantization scales derived from `max_value` on the tensor
// (scale[0] = max_value / 127, scale[1] = 127 / max_value), then copies the
// float weights into an fpga_malloc'ed buffer, runs filter::format_filter()
// on the copy and re-points the tensor at the result.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto num = shape[0];
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = num * channel * height * width * sizeof(float);
  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);

  // The formatter receives the address of the pointer, so it may replace the
  // buffer; the tensor takes whatever `formatted` holds afterwards.
  filter::format_filter(&formatted, num, channel, height, width, group_num,
                        max_value);
  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
128 129 130 131 132 133 134 135 136 137 138 139 140
// Fully-connected variant of format_filter(): same scale bookkeeping and
// copy, but the data is rearranged by filter::format_fc_filter() with a
// fixed group count of 1.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto num = shape[0];
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = num * channel * height * width * sizeof(float);
  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);

  filter::format_fc_filter(&formatted, num, channel, height, width, 1,
                           max_value);
  filter_tensor->reset_data_ptr(formatted);
}
Z
zhangyang 已提交
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
// Prepares a deconvolution filter tensor:
//   1. stores the quantization scales derived from `max_value`
//      (scale[0] = max_value / 127, scale[1] = 127 / max_value);
//   2. copies the float weights into an fpga_malloc'ed buffer;
//   3. runs deconv_NC_convert() on the copy, after which the first two
//      dimensions are treated as swapped;
//   4. runs deconv_format_filter() with the swapped dimensions;
//   5. resizes the tensor to {dims[1], dims[0], height, width} and points it
//      at the new buffer.
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                          int group_num, int stride) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto dims = filter_tensor->dims();
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = static_cast<float *>(fpga_malloc(memory_size));
  // Use the same copy helper as format_filter() / format_fc_filter().
  fpga_copy(new_data, data_ptr, memory_size);

  int hw = static_cast<int>(height * width);
  deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);

  // From here on the first two dimensions are used in swapped order.
  num = dims[1];
  channel = dims[0];
  deconv_filter::deconv_format_filter(
      &new_data, static_cast<int>(num), static_cast<int>(channel),
      static_cast<int>(height), static_cast<int>(width), group_num, max_value,
      stride);

  framework::DDim dims_new =
      framework::make_ddim({num, channel, height, width});
  filter_tensor->Resize(dims_new);
  filter_tensor->reset_data_ptr(new_data);
}
Z
zhangyang 已提交
167

Z
zhangyang 已提交
168 169 170 171 172 173
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(bias_scale_array,
                                      element_num_per_division, num);
}

Z
zhangyang 已提交
174 175 176 177 178 179 180 181 182
// Sizes and attaches the output buffer of a channel-wise concat.
// The first `image_num` entries of `channel_num` are summed; `out` is resized
// to {1, sum, height, width} and given an fpga_malloc'ed half buffer of
// height * align(width * sum, IMAGE_ALIGNMENT) elements. The buffer is not
// cleared here.
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int total_channels = 0;
  int idx = 0;
  while (idx < image_num) {
    total_channels += channel_num[idx];
    ++idx;
  }

  int padded_row = align_to_x(width * total_channels, IMAGE_ALIGNMENT);
  auto buffer = fpga_malloc(height * padded_row * sizeof(half));

  auto shape = framework::make_ddim({1, total_channels, height, width});
  out->Resize(shape);
  out->reset_data_ptr(buffer);
}

188 189
// Derives every field of arg->driver from the user-facing fields of *arg
// (image, kernel, filter_num, group_num, addresses, relu_enabled).
//
// The computation works on a snapshot of *arg taken on entry; only
// arg->driver is written. Besides plain geometry (output size, per-group
// counts, aligned row lengths) it chooses how many output columns are
// processed per block (`res_fit`) and derives the block layout from that.
void expand_conv_arg(ConvArgs *arg) {
  ConvArgs args = *arg;

  auto fpga_bias_scale_len =
      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;

  auto output_height =
      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
          args.kernel.stride_h +
      1;
  auto output_width =
      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
          args.kernel.stride_w +
      1;

  auto filter_per_group = args.filter_num / args.group_num;
  auto channel_per_group = args.image.channels / args.group_num;

  auto image_row_count = args.image.width * args.image.channels;
  auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
  auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
                               args.image.pad_width * args.image.channels;
  auto filter_amount_all =
      align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
                 FILTER_ELEMENT_ALIGNMENT);

  auto output_amount_per_row =
      align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT);

  // find the opt partition strategy: the largest number of output columns
  // (res_win) whose input window still passes the 2048 size check below.
  // NOTE(review): 2048 is presumably a hardware buffer capacity - confirm.
  uint64_t res_win;
  uint64_t res_fit = 0;
  for (res_win = 1; res_win <= output_width; res_win++) {
    if ((align_to_x(
             (args.image.channels *
              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
             IMAGE_ALIGNMENT) /
             16 +
         1) *
            args.kernel.height >
        2048) {
      break;
    }
  }

  // The loop always stops one past the last count that passed the check:
  // either at the first failing count (break) or at output_width + 1.
  // Step back unconditionally; skipping the step when the failing count
  // happens to equal output_width would keep a window that failed.
  res_win -= 1;
  if (res_win == 0) {
    // Not even a single column passed (or output_width is 0). Fall back to
    // one column so the divisions below stay defined.
    DLOG << "expand_conv_arg: no window size passes the size check";
    res_win = 1;
  }

  // Use an even column count unless only a single column is possible.
  if (((res_win % 2) != 0) && (res_win != 1)) {
    res_win = res_win - 1;
  }
  res_fit = res_win;

  auto block_num = (output_width + res_fit - 1) / res_fit;
  auto block_len = res_fit;
  auto block_last = output_width - res_fit * (block_num - 1);

  auto res_amount_per_row = output_width * args.filter_num;
  auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;

  auto image_block_amount_per_row =
      args.kernel.stride_w * res_fit * args.image.channels;
  auto filter_pad_width_mul_channel =
      args.image.pad_width * args.image.channels;
  auto image_amount_per_row_multi_win_first =
      image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
  auto image_amount_per_row_multi_win =
      image_amount_per_row * (4 * args.kernel.stride_h);

  auto image_block_num = block_num;
  auto image_block_len =
      align_to_x((args.image.channels *
                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
                 IMAGE_ALIGNMENT) /
          16 +
      1;
  auto image_block_len_last =
      align_to_x(
          (args.image.channels *
           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
          IMAGE_ALIGNMENT) /
          16 +
      1;
  auto image_win_cnt = block_len;
  auto image_win_cnt_last = block_last;
  auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
  auto prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
  if (prog_full_cnt == 1023) {
    prog_full_cnt--;
  }
  auto post_prog_full_cnt =
      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
          : 0;

  // The bias flag is always set; ReLU only when requested.
  auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;

  arg->driver.image_address_phy = vaddr_to_paddr(args.image.address);
  arg->driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
  arg->driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
  arg->driver.output_address_phy = vaddr_to_paddr(args.output.address);
  arg->driver.output_height = output_height;
  arg->driver.output_width = output_width;
  arg->driver.filter_per_group = filter_per_group;
  arg->driver.channel_per_group = channel_per_group;
  arg->driver.image_amount_per_row = image_amount_per_row;
  arg->driver.image_one_pad_per_row = image_one_pad_per_row;
  arg->driver.filter_amount_all = filter_amount_all;
  arg->driver.output_amount_per_row = output_amount_per_row;
  arg->driver.image_block_amount_per_row = image_block_amount_per_row;
  arg->driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
  arg->driver.image_amount_per_row_multi_win_first =
      image_amount_per_row_multi_win_first;
  arg->driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
  arg->driver.image_block_num = image_block_num;
  arg->driver.image_block_len = image_block_len;
  arg->driver.image_block_len_last = image_block_len_last;
  arg->driver.image_win_cnt = image_win_cnt;
  arg->driver.image_win_cnt_last = image_win_cnt_last;
  arg->driver.res_row_data_align4_pad = res_row_data_align4_pad;
  arg->driver.prog_full_cnt = prog_full_cnt;
  arg->driver.post_prog_full_cnt = post_prog_full_cnt;
  arg->driver.fpga_bias_scale_len = fpga_bias_scale_len;
  arg->driver.cmd = cmd;
}  // expand_conv_arg()

// Fills arg->driver for an element-wise add from the user-facing fields of
// *arg. All geometry is taken from image0. Works on a snapshot of *arg taken
// on entry; only arg->driver is written.
void expand_EW_arg(EWAddArgs *arg) {
  const EWAddArgs snapshot = *arg;

  const uint64_t width = static_cast<uint64_t>(snapshot.image0.width);
  const uint64_t height = static_cast<uint64_t>(snapshot.image0.height);
  const uint64_t channels = static_cast<uint64_t>(snapshot.image0.channels);

  // ReLU is the only command bit used here.
  const uint64_t command = snapshot.relu_enabled ? USE_RELU : 0;
  const uint64_t element_count = width * height * channels;

  // const0 occupies the upper 32 bits, const1 is OR-ed into the lower part.
  const uint64_t packed_constants =
      (static_cast<uint64_t>(snapshot.const0) << 32) |
      static_cast<uint64_t>(snapshot.const1);

  const uint64_t phy_image0 = vaddr_to_paddr(snapshot.image0.address);
  const uint64_t phy_image1 = vaddr_to_paddr(snapshot.image1.address);
  const uint64_t phy_output = vaddr_to_paddr(snapshot.output.address);

  const uint64_t padded_row = align_to_x(width * channels, IMAGE_ALIGNMENT);

  // channels from bit 32, width from bit 16, height from bit 0.
  const uint64_t packed_shape = (channels << 32) | (width << 16) | height;

  auto &driver = arg->driver;
  driver.image0_address_phy = phy_image0;
  driver.image1_address_phy = phy_image1;
  driver.datalen = element_count;
  driver.image_image_pixel = packed_shape;
  driver.image_amount_per_row = padded_row;
  driver.output_address_phy = phy_output;
  driver.coefficient = packed_constants;
  driver.cmd = command;
}  // expand_EW_arg

Z
zhangyang 已提交
343 344 345 346
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
347 348
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
349
  auto out_ptr = out->data<float>();
350 351

  arg->group_num = (uint32_t)group_num;
352 353
  // Either group_num or split_num = 1;
  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
354 355 356
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
Z
zhangyang 已提交
357
  arg->conv_arg =
358
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
359 360 361 362

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
363 364
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
365 366

  int n = arg->split_num;
367 368 369 370
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
371
  arg->concat_arg.channel_num =
372
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
373

374 375 376
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
377
      (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
378 379

  for (int i = 0; i < n; i++) {
Z
zhangyang 已提交
380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
    arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].group_num = (uint32_t)group_num;
    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
    arg->conv_arg[i].filter_num = (uint32_t)(
395 396
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);
397

Z
zhangyang 已提交
398
    size_t filter_size =
399 400 401
        element_num *
        align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
        sizeof(int8_t);
Z
zhangyang 已提交
402 403 404 405 406 407
    auto filter_head =
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);

408 409 410
    size_t bs_size = 2 *
                     align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
                     sizeof(float);
Z
zhangyang 已提交
411 412 413 414 415
    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
    fpga_flush(arg->conv_arg[i].sb_address, bs_size);

416
    if (n > 1) {
Z
zhangyang 已提交
417
      arg->conv_arg[i].output.scale_address =
418
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
419 420 421 422 423
      arg->conv_arg[i].output.address = fpga_malloc(
          out->dims()[2] *
          align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
                     IMAGE_ALIGNMENT) *
          sizeof(half));
424
    } else {
Z
zhangyang 已提交
425 426
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
427 428
    }

429
    arg->concat_arg.images_in[i] =
Z
zhangyang 已提交
430 431 432
        (half *)arg->conv_arg[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
433 434

    expand_conv_arg(&arg->conv_arg[i]);
435
  }
Z
zhangyang 已提交
436 437
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
438 439
}  // fill_split_arg

Z
zhangyang 已提交
440 441 442 443 444 445 446 447 448 449
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
  auto out_ptr = out->data<float>();

  arg->group_num = (uint32_t)group_num;
450
  arg->sub_conv_num = (uint32_t)stride_h;
Z
zhangyang 已提交
451 452
  arg->filter_num = (uint32_t)filter->dims()[0];
  int sub_conv_num = arg->sub_conv_num;
453 454 455 456
  int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
                                                   padding_w, stride_w);
  int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
      (int)filter->dims()[3], stride_w);
457

Z
zhangyang 已提交
458
  int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
459
      (int)input->dims()[3], sub_pad, sub_filter_width);
Z
zhangyang 已提交
460
  int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
461
      (int)input->dims()[2], sub_pad, sub_filter_width);
Z
zhangyang 已提交
462

463 464 465 466
  arg->sub_output_width = (uint32_t)sub_output_width;
  arg->sub_output_height = (uint32_t)sub_output_height;
  arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
      stride_w, (int)filter->dims()[3], padding_w);
Z
zhangyang 已提交
467

Z
zhangyang 已提交
468 469 470 471 472 473 474
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;

  int sub_channels = (int)input->dims()[1];
  int omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
Z
zhangyang 已提交
475 476 477 478 479
  int sub_filter_num = sub_conv_num * (arg->filter_num);

  int conv_output_size =
      (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
      sub_output_height;
Z
zhangyang 已提交
480
  int ouput_size = conv_output_size * sub_conv_num;
Z
zhangyang 已提交
481 482 483 484 485 486 487 488

  int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
  int align_sub_filter_count =
      align_to_x(sub_filter_width * sub_filter_width * sub_channels,
                 FILTER_ELEMENT_ALIGNMENT);
  int align_conv_sub_filter_count =
      align_sub_filter_count * align_sub_filter_num;

Z
zhangyang 已提交
489 490 491 492 493
  int split_num =
      group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;

  arg->split_conv_args =
      (SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs));
Z
zhangyang 已提交
494
  for (int i = 0; i < sub_conv_num; ++i) {
Z
zhangyang 已提交
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515
    arg->split_conv_args[i].filter_num =
        (arg->sub_conv_num) * (arg->filter_num);
    arg->split_conv_args[i].group_num = (uint32_t)group_num;
    arg->split_conv_args[i].split_num = split_num;
    arg->split_conv_args[i].conv_arg =
        (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));

    arg->split_conv_args[i].concat_arg.height = sub_output_height;
    arg->split_conv_args[i].concat_arg.width = sub_output_width;
    arg->split_conv_args[i].concat_arg.image_num = split_num;
    arg->split_conv_args[i].concat_arg.images_in =
        (half **)fpga_malloc(split_num * sizeof(half *));
    arg->split_conv_args[i].concat_arg.scales_in =
        (float **)fpga_malloc(split_num * sizeof(float *));
    arg->split_conv_args[i].concat_arg.channel_num =
        (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));
    // arg->split_conv_args[i].concat_arg.image_out =
    // fpga_malloc(conv_output_size * sizeof(half));
    // arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 *
    // sizeof(float));
  }
Z
zhangyang 已提交
516

Z
zhangyang 已提交
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
  int filter_num_per_div =
      get_deconv_filter_num_per_div(filter, group_num, stride_w);
  int element_num = get_aligned_filter_element_num(
      (int)(sub_channels * sub_filter_width * sub_filter_width));

  int chw = sub_channels * sub_filter_width * sub_filter_width;
  int division_capacity = filter::calc_division_capacity(chw);
  int num_per_div_before_alignment =
      filter::calc_num_per_div(sub_filter_num, group_num, division_capacity);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
  int div_num = (sub_filter_num + num_per_div_before_alignment - 1) /
                num_per_div_before_alignment;
  int residual = sub_filter_num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, FILTER_NUM_ALIGNMENT);

  int filter_sub_conv_offset = element_num * num_after_alignment;
  for (int i = 0; i < sub_conv_num; ++i) {
Z
zhangyang 已提交
537
    if (sub_conv_num == 1) {
Z
zhangyang 已提交
538 539 540
      arg->split_conv_args[i].output.address = arg->output.address;
      arg->split_conv_args[i].output.scale_address = arg->output.scale_address;

Z
zhangyang 已提交
541
    } else {
Z
zhangyang 已提交
542 543
      auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
      arg->split_conv_args[i].output.address = (void *)((half *)ptr_output);
544
      auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
Z
zhangyang 已提交
545
      arg->split_conv_args[i].output.scale_address = ptr_output_scale;
Z
zhangyang 已提交
546 547
    }

Z
zhangyang 已提交
548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
    for (int j = 0; j < split_num; ++j) {
      arg->split_conv_args[i].conv_arg[j].relu_enabled = relu_enabled;
      arg->split_conv_args[i].conv_arg[j].group_num = (uint32_t)group_num;

      arg->split_conv_args[i].conv_arg[j].kernel.width =
          (uint32_t)sub_filter_width;
      arg->split_conv_args[i].conv_arg[j].kernel.height =
          (uint32_t)sub_filter_width;
      arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1;
      arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1;

      arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale;
      arg->split_conv_args[i].conv_arg[j].image.channels =
          (uint32_t)sub_channels;
      arg->split_conv_args[i].conv_arg[j].image.width =
          (uint32_t)input->dims()[3];
      arg->split_conv_args[i].conv_arg[j].image.height =
          (uint32_t)input->dims()[2];
      arg->split_conv_args[i].conv_arg[j].image.pad_width = (uint32_t)sub_pad;
      arg->split_conv_args[i].conv_arg[j].image.pad_height = (uint32_t)sub_pad;
      arg->split_conv_args[i].conv_arg[j].image.address = input_ptr;

      arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale;
      arg->split_conv_args[i].conv_arg[j].filter_num = (uint32_t)(
          j == split_num - 1
              ? sub_filter_num - (split_num - 1) * filter_num_per_div  // NOLINT
              : filter_num_per_div);

      size_t filter_size =
          element_num *
          align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num,
                     FILTER_NUM_ALIGNMENT) *
          sizeof(int8_t);
      auto filter_head =
          &((int8_t *)filter_ptr)[j * element_num * filter_num_per_div +
                                  i * filter_sub_conv_offset];
      arg->split_conv_args[i].conv_arg[j].filter_address =
          fpga_malloc(filter_size);
      memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head,
             filter_size);
      fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address,
                 filter_size);

      {
        static int test_cnt = 0;
        signed char result = 0;
        if (test_cnt <= 1) {
          std::string filename = "deconv_split_flt" + std::to_string(test_cnt);

          fpga::savefile<signed char>(
              filename, arg->split_conv_args[i].conv_arg[j].filter_address,
              filter_size, result);
          test_cnt++;
        }
      }

      size_t bs_align_num = align_to_x(
          arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
      size_t bs_size = 2 * bs_align_num * sizeof(float);
      auto bs_head = &bs_ptr[j * filter_num_per_div * 2];

      arg->split_conv_args[i].conv_arg[j].sb_address = fpga_malloc(bs_size);
      memcpy(arg->split_conv_args[i].conv_arg[j].sb_address, bs_head, bs_size);
      fpga_flush(arg->split_conv_args[i].conv_arg[j].sb_address, bs_size);

      if (split_num == 1) {
        arg->split_conv_args[i].conv_arg[j].output.address =
            arg->split_conv_args[i].output.address;
        arg->split_conv_args[i].conv_arg[j].output.scale_address =
            arg->split_conv_args[i].output.scale_address;
      } else {
        auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
        arg->split_conv_args[i].conv_arg[j].output.address =
            (void *)((half *)ptr_output);
        auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
        arg->split_conv_args[i].conv_arg[j].output.scale_address =
            ptr_output_scale;
      }
      arg->split_conv_args[i].concat_arg.images_in[j] =
          (half *)arg->split_conv_args[i].conv_arg[j].output.address;  // NOLINT
      arg->split_conv_args[i].concat_arg.scales_in[j] =
          arg->split_conv_args[i].conv_arg[j].output.scale_address;
      arg->split_conv_args[i].concat_arg.channel_num[j] =
          arg->split_conv_args[i].conv_arg[j].filter_num;

      expand_conv_arg(&(arg->split_conv_args[i].conv_arg[j]));
    }

    arg->split_conv_args[i].concat_arg.image_out =
        arg->split_conv_args[i].output.address;
    arg->split_conv_args[i].concat_arg.scale_out =
        arg->split_conv_args[i].output.scale_address;
  }
641
  filter->reset_data_ptr(nullptr);
Z
zhangyang 已提交
642
  fpga_free(bs_ptr);
643 644
}  // fill_deconv_arg

H
hanbuhe 已提交
645
}  // namespace fpga
Z
zhangyang 已提交
646
}  // namespace paddle_mobile