api.cpp 16.8 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include "fpga/api.h"
H
hanbuhe 已提交
16 17
#include <fcntl.h>
#include <sys/ioctl.h>
18
#include <sys/mman.h>
H
hanbuhe 已提交
19
#include <algorithm>
20 21 22 23
#include <map>
#include "fpga/bias_scale.h"
#include "fpga/filter.h"
#include "fpga/image.h"
Z
zhangyang 已提交
24
#define FPGA_TEST_MODE
25
//#define PADDLE_MOBILE_OS_LINUX
Z
zhangyang 已提交
26

Z
zhangyang 已提交
27
namespace paddle_mobile {
H
hanbuhe 已提交
28 29 30 31
namespace fpga {

// Descriptor of the FPGA device; stays -1 until open_device() obtains a valid
// descriptor from open().
static int fd = -1;
// Path of the device node that open_device() passes to open().
static const char *device_path = "/dev/fpgadrv0";
32
// Bookkeeping for fpga_malloc()/fpga_free(): maps every pointer handed out by
// fpga_malloc() to the size that was requested for it.
static std::map<void *, size_t> memory_map;
H
hanbuhe 已提交
33

H
hanbuhe 已提交
34
// Sends request `req` with argument block `arg` to the FPGA driver through
// the module-level descriptor `fd`.
//
// On PADDLE_MOBILE_OS_LINUX builds the ioctl result is enforced to be 0 and
// then returned. On every other build no driver call is made and -1 is
// returned unconditionally.
static inline int do_ioctl(int req, const void *arg) {
#ifndef PADDLE_MOBILE_OS_LINUX
  return -1;
#else
  // The driver receives the argument pointer as a 64-bit integer.
  const int status = ioctl(fd, req, reinterpret_cast<uint64_t>(arg));
  PADDLE_MOBILE_ENFORCE(status == 0, "ioctl didn't return correctly");
  return status;
#endif
}
H
hanbuhe 已提交
43 44 45 46 47 48 49 50 51 52

// Opens the FPGA device node read/write the first time it is called (i.e.
// while `fd` is still -1) and returns the cached descriptor afterwards.
// Returns whatever open() produced, so -1 is returned if the open failed.
int open_device() {
  if (fd != -1) {
    return fd;
  }
  fd = open(device_path, O_RDWR);
  return fd;
}

// memory management;
void *fpga_malloc(size_t size) {
53 54
  static uint64_t counter = 0;

H
hanbuhe 已提交
55
#ifdef PADDLE_MOBILE_OS_LINUX
56
  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
H
hanbuhe 已提交
57
#else
58
  auto ptr = malloc(size);
H
hanbuhe 已提交
59
#endif
60 61
  counter += size;
  memory_map.insert(std::make_pair(ptr, size));
Z
zhangyang 已提交
62 63
  //  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
  //       << counter << " bytes";
64
  return ptr;
H
hanbuhe 已提交
65 66
}

67
// Releases memory previously returned by fpga_malloc().
//
// The pointer is looked up in `memory_map`; an unknown pointer is only
// reported through DLOG and nothing is released. A known pointer is removed
// from the map and then unmapped (PADDLE_MOBILE_OS_LINUX builds) or free()d.
void fpga_free(void *ptr) {
  // Running total of bytes released; only consumed by the debug log below.
  static uint64_t counter = 0;

  const auto entry = memory_map.find(ptr);
  if (entry == memory_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }

  // Read the size before the entry is erased.
  const size_t size = entry->second;
  memory_map.erase(entry);
#ifdef PADDLE_MOBILE_OS_LINUX
  munmap(ptr, size);
#else
  free(ptr);
#endif
  counter += size;
  //    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
  //         << counter << " bytes";
}
H
hanbuhe 已提交
87 88 89 90 91

// Copies `num` bytes from `src` to `dest` with memcpy, so the two regions
// must not overlap.
void fpga_copy(void *dest, const void *src, size_t num) {
  (void)memcpy(dest, src, num);
}

92 93 94 95 96 97 98 99 100 101 102 103 104 105
int fpga_flush(void *address, size_t size) {
  struct MemoryCacheArgs args = {nullptr};
  args.address = address;
  args.size = size;
  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}

int fpga_invalidate(void *address, size_t size) {
  struct MemoryCacheArgs args = {nullptr};
  args.address = address;
  args.size = size;
  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}

Z
zhangyang 已提交
106
// Converts a 32-bit float to 16-bit half-precision bits.
//
// Layout handled here: fp32 = 1 sign / 8 exponent (bias 127) / 23 fraction
// bits, fp16 = 1 sign / 5 exponent (bias 15) / 10 fraction bits, hence the
// exponent re-bias of 112. Rounding looks only at the highest dropped
// fraction bit (round half up), as the original implementation did.
//
// Inputs in the normal fp16 range produce exactly the same bits as before.
// Outside that range the re-biased exponent used to wrap around, so this
// version handles those cases explicitly:
//   * NaN                      -> fp16 quiet NaN
//   * infinity / too large     -> signed fp16 infinity
//   * too small for a normal   -> fp16 subnormal, or signed zero
// NOTE(review): assumes `half` is a 16-bit integer type carrying raw fp16
// bits - confirm against fpga/api.h.
half fp32_2_fp16(float fp32_num) {
  // Copy the bit pattern out with memcpy: dereferencing the float through an
  // `unsigned long *` reads 8 bytes from a 4-byte object on LP64 targets.
  uint32_t bits = 0;
  memcpy(&bits, &fp32_num, sizeof(bits));

  const uint32_t sign = (bits & 0x80000000u) >> 16;
  const uint32_t exp32 = (bits & 0x7f800000u) >> 23;
  const uint32_t frac32 = bits & 0x007fffffu;

  uint32_t result = sign;
  if (exp32 == 255) {
    // Infinity or NaN: keep NaN-ness by forcing a non-zero fraction.
    result = sign | 0x7c00u | (frac32 != 0 ? 0x0200u : 0u);
  } else if (exp32 >= 143) {
    // Exponent does not fit in 5 bits: saturate to infinity.
    result = sign | 0x7c00u;
  } else if (exp32 >= 113) {
    // Normal fp16 range: re-bias the exponent and drop 13 fraction bits.
    result = sign | ((exp32 - 112) << 10) | (frac32 >> 13);
    if (bits & 0x1000u) {
      result++;  // roundoff; a carry into the exponent is the correct result
    }
  } else if (exp32 >= 102) {
    // Subnormal fp16 range: shift the full significand (implicit leading 1
    // made explicit) down to units of 2^-24.
    const uint32_t significand = frac32 | 0x00800000u;
    const uint32_t shift = 126 - exp32;  // 14..24
    result = sign | (significand >> shift);
    if (significand & (1u << (shift - 1))) {
      result++;  // roundoff
    }
  }
  // Anything smaller (including fp32 zero and subnormals) stays signed zero.

  return static_cast<half>(static_cast<uint16_t>(result));
}

// Converts 16-bit half-precision bits to a 32-bit float.
//
// Layout handled here: fp16 = 1 sign / 5 exponent (bias 15) / 10 fraction
// bits, fp32 = 1 sign / 8 exponent (bias 127) / 23 fraction bits, hence the
// exponent re-bias of 112.
//
// Normal fp16 values decode exactly as before. The special exponent fields
// are now handled explicitly, because re-biasing them blindly turned zero
// into 2^-15 and infinity into 65536:
//   * exponent field 0  -> zero or subnormal, value = fraction * 2^-24
//   * exponent field 31 -> infinity (fraction 0) or NaN
// NOTE(review): assumes `half` is a 16-bit integer type carrying raw fp16
// bits - confirm against fpga/api.h.
float fp16_2_fp32(half fp16_num) {
  // Work on unsigned bits: shifting the sign bit of a signed int into bit 31
  // overflows.
  const uint32_t bits16 = static_cast<uint16_t>(fp16_num);
  const uint32_t sign = (bits16 & 0x8000u) << 16;
  const uint32_t exp16 = (bits16 & 0x7c00u) >> 10;
  const uint32_t frac16 = bits16 & 0x03ffu;

  if (exp16 == 0) {
    // Zero or subnormal: no implicit leading 1, fixed scale of 2^-24.
    const float magnitude = frac16 * (1.0f / 16777216.0f);
    return sign != 0 ? -magnitude : magnitude;
  }

  uint32_t bits32 = 0;
  if (exp16 == 31) {
    // Infinity or NaN: all-ones fp32 exponent, fraction carried over.
    bits32 = sign | 0x7f800000u | (frac16 << 13);
  } else {
    bits32 = sign | ((exp16 + 112) << 23) | (frac16 << 13);
  }

  // memcpy instead of casting the integer's address to `float *`.
  float fp32_num = 0.0f;
  memcpy(&fp32_num, &bits32, sizeof(fp32_num));
  return fp32_num;
}

127
int ComputeBasicConv(const struct ConvArgs &args) {
128
#ifdef FPGA_TEST_MODE
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
  DLOG << "======Compute Basic Conv======";
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   sb_address:" << args.sb_address
       << "   filter_address:" << args.filter_address
       << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num;
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
148
#endif
149 150 151
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
}

Z
zhangyang 已提交
152
int ComputeFpgaConv(const struct WrapperConvArgs &args) {
Z
zhangyang 已提交
153
#ifdef FPGA_TEST_MODE
154 155 156 157
  DLOG << "=============ComputeFPGAConv===========";
  DLOG << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num
       << "   split_num:" << args.split_num;
Z
zhangyang 已提交
158
#endif
159

Z
zhangyang 已提交
160 161
  int split_num = args.split_num;
  for (int i = 0; i < split_num; i++) {
162
    ComputeBasicConv(args.conv_args[i]);
Z
zhangyang 已提交
163
  }
Z
zhangyang 已提交
164

Z
zhangyang 已提交
165 166 167
  if (split_num > 1) {
    ComputeFPGAConcat(args.concat_arg);
  }
H
hanbuhe 已提交
168
}
Z
zhangyang 已提交
169

H
hanbuhe 已提交
170
int ComputeFpgaPool(const struct PoolingArgs &args) {
Z
zhangyang 已提交
171
#ifdef FPGA_TEST_MODE
172
  DLOG << "=============ComputeFpgaPool===========";
Z
zhangyang 已提交
173 174
  DLOG << "   mode:" << args.mode
       << "   kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
Z
zhangyang 已提交
175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

H
hanbuhe 已提交
190
  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
H
hanbuhe 已提交
191
}
Z
zhangyang 已提交
192

H
hanbuhe 已提交
193
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
Z
zhangyang 已提交
194
#ifdef FPGA_TEST_MODE
195
  DLOG << "=============ComputeFpgaEWAdd===========";
196 197 198
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   const0:" << fp16_2_fp32(short(args.const0))
       << "   const1:" << fp16_2_fp32(short(args.const1));
Z
zhangyang 已提交
199 200 201 202 203 204 205 206
  DLOG << "   image0_address:" << args.image0.address
       << "   image0_scale_address:" << args.image0.scale_address
       << "   image0_channels:" << args.image0.channels
       << "   image0_height:" << args.image0.height
       << "   image0_width:" << args.image0.width
       << "   pad0_height:" << args.image0.pad_height
       << "   pad0_width:" << args.image0.pad_width;
  DLOG << "   image1_address:" << args.image1.address
Z
zhangyang 已提交
207
       << "   image1_scale_address:" << args.image1.scale_address
Z
zhangyang 已提交
208 209 210 211 212 213 214 215 216
       << "   image1_channels:" << args.image1.channels
       << "   image1_height:" << args.image1.height
       << "   image1_width:" << args.image1.width
       << "   pad1_height:" << args.image1.pad_height
       << "   pad_width:" << args.image1.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

H
hanbuhe 已提交
217 218 219
  return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int PerformBypass(const struct BypassArgs &args) {
Z
zhangyang 已提交
220
#ifdef FPGA_TEST_MODE
221 222 223 224 225
  DLOG << "=============ComputeFpgaBypass===========";
  DLOG << "   input_type:" << args.input_data_type
       << "   output_type:" << args.output_data_type
       << "   input_layout_type:" << args.input_layout_type
       << "   output_layout_type:" << args.output_layout_type;
Z
zhangyang 已提交
226 227 228 229 230 231 232 233 234 235 236
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

H
hanbuhe 已提交
237
  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
H
hanbuhe 已提交
238
}
Z
zhangyang 已提交
239

Z
zhangyang 已提交
240
int ComputeFPGAConcat(const struct ConcatArgs &args) {
241 242 243 244 245 246 247 248 249 250 251 252 253 254
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << "   Image_num: " << args.image_num
       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out;
  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
    DLOG << "   " << i << "th:        ";
    DLOG << "   channel_num:" << args.channel_num[i]
         << "   image_address:" << args.images_in[i]
         << "   image_scale_address:" << args.scales_in[i];
  }
#endif

Z
zhangyang 已提交
255 256 257 258 259 260
  image::concat_images(args.images_in, args.scales_in, args.image_out,
                       args.scale_out, args.image_num, args.channel_num,
                       args.height, args.width);
  return 0;
}

261 262
// Returns `cw` passed through align_to_x() with IMAGE_ALIGNMENT.
int get_align_image_cw(int cw) {
  const int aligned_cw = align_to_x(cw, IMAGE_ALIGNMENT);
  return aligned_cw;
}

Z
zhangyang 已提交
263 264
// Replaces the tensor's float data with a copy that lives in fpga_malloc()
// memory and has been processed by image::format_image().
// Dimensions 1, 2 and 3 of the tensor are used as channel, height and width.
// The tensor's previous buffer is not released here.
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto original = image_tensor->data<float>();
  size_t byte_count = c * h * w * sizeof(float);

  // format_image() receives the address of the pointer, so it may replace it.
  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, original, byte_count);
  image::format_image(&formatted, c, h, w);

  image_tensor->reset_data_ptr(formatted);
}

274
// Gives the output tensor a zero-filled fpga_malloc() buffer sized for half
// elements.
//   4-D tensor: height * align(channel * width) elements
//   2-D tensor: align(dims[1]) elements
// Any other rank is reported through DLOG and a zero-byte allocation is
// requested.
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  size_t byte_count = 0;

  switch (shape.size()) {
    case 4: {
      auto c = shape[1];
      auto h = shape[2];
      auto w = shape[3];
      byte_count = h * align_to_x(c * w, IMAGE_ALIGNMENT) * sizeof(half);
      break;
    }
    case 2:
      byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(half);
      break;
    default:
      DLOG << "Wrong ofm dimension";
      break;
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

// Gives the output tensor a zero-filled fpga_malloc() buffer sized for float
// elements.
//   4-D tensor: height * align(channel * width) elements
//   2-D tensor: align(dims[1]) elements
// Any other rank is reported through DLOG and a zero-byte allocation is
// requested.
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  size_t byte_count = 0;

  switch (shape.size()) {
    case 4: {
      auto c = shape[1];
      auto h = shape[2];
      auto w = shape[3];
      byte_count = h * align_to_x(c * w, IMAGE_ALIGNMENT) * sizeof(float);
      break;
    }
    case 2:
      byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(float);
      break;
    default:
      DLOG << "Wrong ofm dimension";
      break;
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
308 309 310 311
// Returns filter::find_max() evaluated over all numel() float elements of the
// filter tensor.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto weights = filter_tensor->data<float>();
  auto element_count = filter_tensor->numel();
  return filter::find_max(weights, element_count);
}
Z
zhangyang 已提交
312 313 314

// Returns filter::calc_split_num() for this filter: dims[0] is the filter
// count and dims[1] * dims[2] * dims[3] is the per-filter element count used
// to obtain the division capacity.
int get_plit_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto elements_per_filter = shape[1] * shape[2] * shape[3];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_split_num(filter_count, capacity);
}

321
// Returns filter::calc_num_per_div() for this filter and `group_num`: dims[0]
// is the filter count and dims[1] * dims[2] * dims[3] is the per-filter
// element count used to obtain the division capacity.
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto elements_per_filter = shape[1] * shape[2] * shape[3];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  return filter::calc_num_per_div(filter_count, group_num, capacity);
}

Z
zhangyang 已提交
329 330 331 332 333 334 335 336
// Returns `chw` passed through align_to_x() with FILTER_ELEMENT_ALIGNMENT.
int get_aligned_filter_element_num(int chw) {
  const int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return aligned_chw;
}

// Returns `num` passed through align_to_x() with FILTER_NUM_ALIGNMENT.
int get_aligned_filter_num(int num) {
  const int aligned_num = align_to_x(num, FILTER_NUM_ALIGNMENT);
  return aligned_num;
}

Z
zhangyang 已提交
337 338
// Stores the quantisation scales derived from `max_value` on the tensor
// (scale[0] = max_value / 127, scale[1] = 127 / max_value) and replaces the
// tensor's float data with a copy in fpga_malloc() memory that has been
// processed by filter::format_filter().
// Dimensions 0..3 of the tensor are used as num, channel, height and width.
// The tensor's previous buffer is not released here.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto original = filter_tensor->data<float>();
  size_t byte_count = filter_count * c * h * w * sizeof(float);

  // format_filter() receives the address of the pointer, so it may replace it.
  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, original, byte_count);
  filter::format_filter(&formatted, filter_count, c, h, w, group_num,
                        max_value);

  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
352 353 354 355 356 357 358 359 360 361 362 363 364 365
// Fully-connected counterpart of format_filter(): stores the scales derived
// from `max_value` on the tensor (scale[0] = max_value / 127,
// scale[1] = 127 / max_value) and replaces the tensor's float data with a
// copy in fpga_malloc() memory processed by filter::format_fc_filter() with a
// group count of 1.
// The tensor's previous buffer is not released here.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto filter_count = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto original = filter_tensor->data<float>();
  size_t byte_count = filter_count * c * h * w * sizeof(float);

  // format_fc_filter() receives the address of the pointer, so it may
  // replace it.
  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, original, byte_count);
  filter::format_fc_filter(&formatted, filter_count, c, h, w, 1, max_value);

  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
366 367 368 369 370 371
// Public entry point that forwards its arguments unchanged to
// bias_scale::format_bias_scale_array().
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(
      bias_scale_array, element_num_per_division, num);
}

Z
zhangyang 已提交
372 373 374 375 376 377 378 379 380
// Prepares `out` as the destination of a concat over `image_num` images:
// the channel counts are summed, a half buffer of
// height * align(width * total_channels) elements is obtained from
// fpga_malloc(), and the tensor is resized to
// {1, total_channels, height, width} and pointed at that buffer.
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int total_channels = 0;
  for (int idx = 0; idx < image_num; ++idx) {
    total_channels += channel_num[idx];
  }

  int aligned_row = align_to_x(width * total_channels, IMAGE_ALIGNMENT);
  auto buffer = fpga_malloc(height * aligned_row * sizeof(half));

  auto out_shape = framework::make_ddim({1, total_channels, height, width});
  out->Resize(out_shape);
  out->reset_data_ptr(buffer);
}

386 387 388 389 390 391
void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
                   framework::Tensor *out, framework::Tensor *filter,
                   bool relu_enabled, int group_num, int stride_h, int stride_w,
                   int padding_h, int padding_w, float *bs_ptr) {
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
392
  auto out_ptr = out->data<float>();
393 394

  arg->group_num = (uint32_t)group_num;
395 396
  // Either group_num or split_num = 1;
  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
397 398 399
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
400 401
  arg->conv_args =
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
402 403 404 405

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
406 407
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
408 409

  int n = arg->split_num;
410 411 412 413
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
414
  arg->concat_arg.channel_num =
415
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
416

417 418 419
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
    arg->conv_args[i].relu_enabled = relu_enabled;
    arg->conv_args[i].group_num = (uint32_t)group_num;
    arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
    arg->conv_args[i].image.address = input_ptr;
    arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_args[i].image.scale_address = input->scale;
    arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
436 437 438 439 440 441 442
    arg->conv_args[i].filter_scale_address = filter->scale;
    arg->conv_args[i].filter_address = &(
        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_args[i].filter_num = (uint32_t)(
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);
443 444 445

    if (n > 1) {
      arg->conv_args[i].output.scale_address =
446 447 448 449 450 451 452
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
      arg->conv_args[i].output.address = fpga_malloc(
          input->dims()[2] *
          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
                     IMAGE_ALIGNMENT) *
          sizeof(half));
    } else {
453 454 455 456
      arg->conv_args[i].output.scale_address = out->scale;
      arg->conv_args[i].output.address = out_ptr;
    }

457 458 459
    arg->concat_arg.images_in[i] =
        (half *)arg->conv_args[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
460 461 462 463
    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
  }
}

H
hanbuhe 已提交
464
}  // namespace fpga
Z
zhangyang 已提交
465
}  // namespace paddle_mobile