api.cpp 14.7 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangyang 已提交
15 16
#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
Z
zhangyang 已提交
17
#include "fpga/V1/deconv_filter.h"
Z
zhangyang 已提交
18 19
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
Z
zhangyang 已提交
20

Z
zhangyang 已提交
21
namespace paddle_mobile {
H
hanbuhe 已提交
22 23
namespace fpga {

24 25
// Aligns a channel*width element count to IMAGE_ALIGNMENT via align_to_x().
int get_align_image_cw(int cw) {
  const int aligned_cw = align_to_x(cw, IMAGE_ALIGNMENT);
  return aligned_cw;
}

Z
zhangyang 已提交
26 27
// Copies the tensor's float data into a buffer obtained from fpga_malloc(),
// hands that buffer to image::format_image(), and makes the tensor point at
// the resulting buffer. dims()[1], dims()[2] and dims()[3] are read as
// channel, height and width.
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];
  auto src = image_tensor->data<float>();

  size_t byte_count = c * h * w * sizeof(float);
  auto formatted = (float *)fpga_malloc(byte_count);  // NOLINT
  fpga_copy(formatted, src, byte_count);

  // The formatter receives the address of the pointer, so the tensor is given
  // whatever pointer value it leaves behind.
  image::format_image(&formatted, c, h, w);
  image_tensor->reset_data_ptr(formatted);
}

37
// Allocates a zero-filled half-precision output buffer for `ofm_tensor` and
// installs it as the tensor's data pointer.
//   * 4-D tensor: height rows, each align_to_x(channel * width) elements.
//   * 2-D tensor: a single row of align_to_x(dims[1]) elements.
//   * any other rank: logs "Wrong ofm dimension" and continues with size 0.
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  size_t byte_count = 0;

  switch (shape.size()) {
    case 4: {
      auto c = shape[1];
      auto h = shape[2];
      auto w = shape[3];
      byte_count = h * align_to_x(c * w, IMAGE_ALIGNMENT) * sizeof(half);
      break;
    }
    case 2: {
      byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(half);
      break;
    }
    default: {
      DLOG << "Wrong ofm dimension";
      break;
    }
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

// Allocates a zero-filled float output buffer for `ofm_tensor` and installs
// it as the tensor's data pointer.
//   * 4-D tensor: height rows, each align_to_x(channel * width) elements.
//   * 2-D tensor: a single row of align_to_x(dims[1]) elements.
//   * any other rank: logs "Wrong ofm dimension" and continues with size 0.
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
  auto shape = ofm_tensor->dims();
  size_t byte_count = 0;

  switch (shape.size()) {
    case 4: {
      auto c = shape[1];
      auto h = shape[2];
      auto w = shape[3];
      byte_count = h * align_to_x(c * w, IMAGE_ALIGNMENT) * sizeof(float);
      break;
    }
    case 2: {
      byte_count = align_to_x(shape[1], IMAGE_ALIGNMENT) * sizeof(float);
      break;
    }
    default: {
      DLOG << "Wrong ofm dimension";
      break;
    }
  }

  auto buffer = fpga_malloc(byte_count);
  memset(buffer, 0, byte_count);
  ofm_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
71 72 73 74
// Returns filter::find_max() evaluated over all numel() float elements of
// `filter_tensor`.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto *weights = filter_tensor->data<float>();
  const auto element_count = filter_tensor->numel();
  return filter::find_max(weights, element_count);
}
Z
zhangyang 已提交
75 76 77

// Returns filter::calc_split_num() for this filter: dims[0] is passed as the
// filter count, and the division capacity is derived from the per-filter
// element count dims[1] * dims[2] * dims[3].
int get_plit_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  auto elements_per_filter = shape[1] * shape[2] * shape[3];
  auto filter_count = shape[0];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  int split_count = filter::calc_split_num(filter_count, capacity);
  return split_count;
}

84
// Returns filter::calc_num_per_div() for this filter and `group_num`: dims[0]
// is passed as the filter count, and the division capacity is derived from
// the per-filter element count dims[1] * dims[2] * dims[3].
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto shape = filter_tensor->dims();
  auto elements_per_filter = shape[1] * shape[2] * shape[3];
  auto filter_count = shape[0];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  int per_division =
      filter::calc_num_per_div(filter_count, group_num, capacity);
  return per_division;
}

Z
zhangyang 已提交
92 93 94 95 96 97 98 99
// Aligns a per-filter element count (`chw`) to FILTER_ELEMENT_ALIGNMENT via
// align_to_x().
int get_aligned_filter_element_num(int chw) {
  const int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return aligned_chw;
}

// Aligns a filter count (`num`) to FILTER_NUM_ALIGNMENT via align_to_x().
int get_aligned_filter_num(int num) {
  const int aligned_num = align_to_x(num, FILTER_NUM_ALIGNMENT);
  return aligned_num;
}

Z
zhangyang 已提交
100 101
// Prepares a 4-D convolution filter tensor:
//   1. stores max_value / 127 in scale[0] and 127 / max_value in scale[1];
//   2. copies the float data into a buffer obtained from fpga_malloc();
//   3. lets filter::format_filter() process that buffer;
//   4. makes the tensor point at the resulting buffer.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];
  auto src = filter_tensor->data<float>();

  size_t byte_count = n * c * h * w * sizeof(float);
  auto formatted = (float *)fpga_malloc(byte_count);  // NOLINT
  fpga_copy(formatted, src, byte_count);

  // The formatter receives the address of the pointer, so the tensor is given
  // whatever pointer value it leaves behind.
  filter::format_filter(&formatted, n, c, h, w, group_num, max_value);
  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
115 116 117 118 119 120 121 122 123 124 125 126 127
// Fully-connected counterpart of format_filter(): stores the two scale
// factors, copies the float data into a buffer obtained from fpga_malloc(),
// lets filter::format_fc_filter() process it with a group count of 1, and
// makes the tensor point at the resulting buffer.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];
  auto src = filter_tensor->data<float>();

  size_t byte_count = n * c * h * w * sizeof(float);
  auto formatted = (float *)fpga_malloc(byte_count);  // NOLINT
  fpga_copy(formatted, src, byte_count);

  // The formatter receives the address of the pointer, so the tensor is given
  // whatever pointer value it leaves behind.
  filter::format_fc_filter(&formatted, n, c, h, w, 1, max_value);
  filter_tensor->reset_data_ptr(formatted);
}
Z
zhangyang 已提交
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
// Prepares a 4-D deconvolution filter tensor:
//   1. stores max_value / 127 in scale[0] and 127 / max_value in scale[1];
//   2. copies the float data into a buffer obtained from fpga_malloc();
//   3. runs deconv_filter::deconv_NC_convert() on it with the original
//      (dims[0], dims[1]) as (num, channel);
//   4. swaps num and channel and runs deconv_filter::deconv_format_filter();
//   5. resizes the tensor to {dims[1], dims[0], height, width} and makes it
//      point at the resulting buffer.
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                          int group_num, int stride) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT
  auto dims = filter_tensor->dims();
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  // Use fpga_copy like format_filter() / format_fc_filter() do for this step.
  fpga_copy(new_data, data_ptr, memory_size);

  int hw = (int)(height * width);  // NOLINT
  deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);

  // From here on the first two axes are exchanged.
  num = dims[1];
  channel = dims[0];
  deconv_filter::deconv_format_filter(
      &new_data, (int)num, (int)channel,          // NOLINT
      (int)height,                                // NOLINT
      (int)width, group_num, max_value, stride);  // NOLINT

  framework::DDim dims_new =
      framework::make_ddim({num, channel, height, width});
  filter_tensor->Resize(dims_new);
  filter_tensor->reset_data_ptr(new_data);
}
Z
zhangyang 已提交
154

Z
zhangyang 已提交
155 156 157 158 159 160
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(bias_scale_array,
                                      element_num_per_division, num);
}

Z
zhangyang 已提交
161 162 163 164 165 166 167 168 169
// Sizes and allocates the output tensor of a concat over `image_num` inputs.
// The output channel count is the sum of channel_num[0 .. image_num).
// `out` is resized to {1, sum, height, width} and given a buffer from
// fpga_malloc() holding height rows of align_to_x(width * sum) half elements.
// The buffer is not cleared here.
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int total_channels = 0;
  int idx = 0;
  while (idx < image_num) {
    total_channels += channel_num[idx];
    ++idx;
  }

  int row_elements = align_to_x(width * total_channels, IMAGE_ALIGNMENT);
  auto buffer = fpga_malloc(height * row_elements * sizeof(half));

  auto out_shape = framework::make_ddim({1, total_channels, height, width});
  out->Resize(out_shape);
  out->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
175 176 177 178
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
179 180
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
181
  auto out_ptr = out->data<float>();
182 183

  arg->group_num = (uint32_t)group_num;
184 185
  // Either group_num or split_num = 1;
  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
186 187 188
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
Z
zhangyang 已提交
189
  arg->conv_arg =
190
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
191 192 193 194

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
195 196
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
197 198

  int n = arg->split_num;
199 200 201 202
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
203
  arg->concat_arg.channel_num =
204
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
205

206 207 208
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
209 210 211
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
Z
zhangyang 已提交
212 213 214 215 216 217 218 219 220 221 222 223 224 225
    arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].group_num = (uint32_t)group_num;
    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
Z
zhangyang 已提交
226 227 228 229 230
    //    arg->conv_arg[i].filter_address = &(
    //        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  //
    //        NOLINT
    //    arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];

Z
zhangyang 已提交
231
    arg->conv_arg[i].filter_num = (uint32_t)(
232 233
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);
234

Z
zhangyang 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248
    size_t filter_size =
        element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
    auto filter_head =
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);

    size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
    fpga_flush(arg->conv_arg[i].sb_address, bs_size);

249
    if (n > 1) {
Z
zhangyang 已提交
250
      arg->conv_arg[i].output.scale_address =
251
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
Z
zhangyang 已提交
252
      arg->conv_arg[i].output.address =
Z
zhangyang 已提交
253 254
          fpga_malloc(out->dims()[2] *
                      align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
Z
zhangyang 已提交
255 256
                                 IMAGE_ALIGNMENT) *
                      sizeof(half));
257
    } else {
Z
zhangyang 已提交
258 259
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
260 261
    }

262
    arg->concat_arg.images_in[i] =
Z
zhangyang 已提交
263 264 265
        (half *)arg->conv_arg[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
266
  }
Z
zhangyang 已提交
267 268
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
269
}
Z
zhangyang 已提交
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                     framework::Tensor *out, framework::Tensor *filter,
                     bool relu_enabled, int group_num, int stride_h,
                     int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
  auto out_ptr = out->data<float>();

  arg->group_num = (uint32_t)group_num;
  arg->sub_conv_num = stride_h;
  arg->filter_num = (uint32_t)filter->dims()[0];

  int sub_conv_num = arg->sub_conv_num;
  int sub_stride = 1;
  int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w,
                                                   stride_w);
  int sub_filter_width =
      deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w);
289

Z
zhangyang 已提交
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
  int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
      input->dims()[3], sub_pad, sub_filter_width);
  int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
      input->dims()[2], sub_pad, sub_filter_width);

  arg->sub_output_width = sub_output_width;
  arg->sub_output_height = sub_output_height;
  arg->omit_size =
      deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w);
  arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));

  int sub_channels = (int32_t)input->dims()[1];
  int omit_size = arg->omit_size;
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
  int sub_filter_num = sub_conv_num * (arg->filter_num);

  int conv_output_size =
      (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
      sub_output_height;
  int ouput_size = conv_output_size * sub_conv_num;

  int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
  int align_sub_filter_count =
      align_to_x(sub_filter_width * sub_filter_width * sub_channels,
                 FILTER_ELEMENT_ALIGNMENT);
  int align_conv_sub_filter_count =
      align_sub_filter_count * align_sub_filter_num;

  for (int i = 0; i < sub_conv_num; ++i) {
    arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
    arg->conv_args[i].group_num = group_num;

    arg->conv_args[i].filter_scale_address = filter->scale;
    arg->conv_args[i].relu_enabled = relu_enabled;

    arg->conv_args[i].kernel.width = sub_filter_width;
    arg->conv_args[i].kernel.height = sub_filter_width;
    arg->conv_args[i].kernel.stride_w = 1;
    arg->conv_args[i].kernel.stride_h = 1;

    // DeconvParam.conv_args[i].image.address = (void*)ptr_image;
    arg->conv_args[i].image.scale_address = input->scale;
    arg->conv_args[i].image.channels = sub_channels;
    arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_args[i].image.pad_width = sub_pad;
    arg->conv_args[i].image.pad_height = sub_pad;
    arg->conv_args[i].image.address = input_ptr;

    arg->conv_args[i].sb_address = (void *)bs_ptr;

    char *filter_sub_space =
        (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
    fpga_copy(filter_sub_space,
              (char *)filter_ptr + i * align_conv_sub_filter_count,
              align_conv_sub_filter_count);
    arg->conv_args[i].filter_address = (void *)(filter_sub_space);
    fpga_flush(filter_sub_space, align_conv_sub_filter_count);

    if (sub_conv_num == 1) {
      arg->conv_args[i].output.address = out_ptr;
      arg->conv_args[i].output.scale_address = out->scale;
    } else {
      half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
      arg->conv_args[i].output.address = (void *)((half *)ptr_output);
      float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
      arg->conv_args[i].output.scale_address = ptr_output_scale;
    }
  }

  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
  // fpga_free(filter_ptr);
}
H
hanbuhe 已提交
365
}  // namespace fpga
Z
zhangyang 已提交
366
}  // namespace paddle_mobile