api.cpp 9.7 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangyang 已提交
15 16 17 18
#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
Z
zhangyang 已提交
19

Z
zhangyang 已提交
20
namespace paddle_mobile {
H
hanbuhe 已提交
21 22
namespace fpga {

23 24
// Returns `cw` aligned via align_to_x() using IMAGE_ALIGNMENT.
int get_align_image_cw(int cw) {
  const int aligned_cw = align_to_x(cw, IMAGE_ALIGNMENT);
  return aligned_cw;
}

Z
zhangyang 已提交
25 26
// Re-formats the float data held by `image_tensor`.
//
// Dimensions 1, 2 and 3 of the tensor are read as channel, height and width.
// A buffer of channel * height * width floats is obtained from fpga_malloc(),
// the tensor's current data is copied into it, and image::format_image() is
// run on the copy. That call receives the address of the buffer pointer, so
// the pointer handed to reset_data_ptr() is whatever it leaves there.
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = image_tensor->data<float>();
  size_t byte_count = channel * height * width * sizeof(float);

  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);
  image::format_image(&formatted, channel, height, width);
  image_tensor->reset_data_ptr(formatted);
}

36
// Gives `ofm_tensor` a freshly allocated, zero-filled buffer sized for `half`
// elements.
//
// Size computation:
//   - 4 dimensions: dims[2] * align_to_x(dims[1] * dims[3], IMAGE_ALIGNMENT)
//     elements (height times the aligned channel*width row).
//   - 2 dimensions: align_to_x(dims[1], IMAGE_ALIGNMENT) elements.
//   - anything else: "Wrong ofm dimension" is logged and the size stays 0.
//
// The buffer comes from fpga_malloc() and is installed with reset_data_ptr().
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3];
    memory_size =
        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
  } else {
    DLOG << "Wrong ofm dimension";
  }

  auto p = fpga_malloc(memory_size);
  // memset() on a null destination is undefined behaviour, even for a length
  // of zero, so only clear the buffer when the allocator returned one.
  if (p != nullptr) {
    memset(p, 0, memory_size);
  }
  ofm_tensor->reset_data_ptr(p);
}

// Gives `ofm_tensor` a freshly allocated, zero-filled buffer sized for `float`
// elements.
//
// Size computation:
//   - 4 dimensions: dims[2] * align_to_x(dims[1] * dims[3], IMAGE_ALIGNMENT)
//     elements (height times the aligned channel*width row).
//   - 2 dimensions: align_to_x(dims[1], IMAGE_ALIGNMENT) elements.
//   - anything else: "Wrong ofm dimension" is logged and the size stays 0.
//
// The buffer comes from fpga_malloc() and is installed with reset_data_ptr().
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto channel = dims[1], height = dims[2], width = dims[3];
    memory_size =
        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
  } else if (dims.size() == 2) {
    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
  } else {
    DLOG << "Wrong ofm dimension";
  }

  auto p = fpga_malloc(memory_size);
  // memset() on a null destination is undefined behaviour, even for a length
  // of zero, so only clear the buffer when the allocator returned one.
  if (p != nullptr) {
    memset(p, 0, memory_size);
  }
  ofm_tensor->reset_data_ptr(p);
}

Z
zhangyang 已提交
70 71 72 73
// Returns filter::find_max() evaluated over all numel() float elements of
// `filter_tensor`.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto weights = filter_tensor->data<float>();
  const auto weight_count = filter_tensor->numel();
  return filter::find_max(weights, weight_count);
}
Z
zhangyang 已提交
74 75 76

// Returns the split count reported by filter::calc_split_num() for
// `filter_tensor`.
//
// dims[0] is taken as the number of filters and dims[1] * dims[2] * dims[3]
// as the element count of one filter; the latter determines the division
// capacity via filter::calc_division_capacity().
int get_plit_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  auto elements_per_filter = shape[1] * shape[2] * shape[3];
  auto filter_count = shape[0];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  int split_count = filter::calc_split_num(filter_count, capacity);
  return split_count;
}

83
// Returns the per-division filter count reported by
// filter::calc_num_per_div() for `filter_tensor` and `group_num`.
//
// dims[0] is taken as the number of filters and dims[1] * dims[2] * dims[3]
// as the element count of one filter; the latter determines the division
// capacity via filter::calc_division_capacity().
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto shape = filter_tensor->dims();
  auto elements_per_filter = shape[1] * shape[2] * shape[3];
  auto filter_count = shape[0];

  int capacity = filter::calc_division_capacity(elements_per_filter);
  int per_division = filter::calc_num_per_div(filter_count, group_num, capacity);
  return per_division;
}

Z
zhangyang 已提交
91 92 93 94 95 96 97 98
// Returns `chw` aligned via align_to_x() using FILTER_ELEMENT_ALIGNMENT.
int get_aligned_filter_element_num(int chw) {
  const int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  return aligned_chw;
}

// Returns `num` aligned via align_to_x() using FILTER_NUM_ALIGNMENT.
int get_aligned_filter_num(int num) {
  const int aligned_num = align_to_x(num, FILTER_NUM_ALIGNMENT);
  return aligned_num;
}

Z
zhangyang 已提交
99 100
// Records the filter scale on `filter_tensor` and re-formats its float data.
//
// scale[0] is set to max_value / 127 and scale[1] to 127 / max_value. The
// tensor's data (dims[0] * dims[1] * dims[2] * dims[3] floats) is copied into
// a buffer from fpga_malloc() and passed to filter::format_filter() together
// with `group_num` and `max_value`. That call receives the address of the
// buffer pointer, so the pointer handed to reset_data_ptr() is whatever it
// leaves there.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto num = shape[0];
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = num * channel * height * width * sizeof(float);

  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);
  filter::format_filter(&formatted, num, channel, height, width, group_num,
                        max_value);
  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
114 115 116 117 118 119 120 121 122 123 124 125 126 127
// Records the filter scale on `filter_tensor` and re-formats its float data
// with filter::format_fc_filter().
//
// scale[0] is set to max_value / 127 and scale[1] to 127 / max_value. The
// tensor's data (dims[0] * dims[1] * dims[2] * dims[3] floats) is copied into
// a buffer from fpga_malloc() and passed to filter::format_fc_filter() with a
// fixed group count of 1. That call receives the address of the buffer
// pointer, so the pointer handed to reset_data_ptr() is whatever it leaves
// there.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto num = shape[0];
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = num * channel * height * width * sizeof(float);

  auto formatted = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(formatted, source, byte_count);
  filter::format_fc_filter(&formatted, num, channel, height, width, 1,
                           max_value);
  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
128 129 130 131 132 133
void format_bias_scale_array(float **bias_scale_array,
                             int element_num_per_division, int num) {
  bias_scale::format_bias_scale_array(bias_scale_array,
                                      element_num_per_division, num);
}

Z
zhangyang 已提交
134 135 136 137 138 139 140 141 142
// Resizes `out` to {1, sum(channel_num), height, width} and gives it a new
// buffer large enough for the concatenated result.
//
// `channel_num` must point to `image_num` entries; their sum is the output
// channel count. The buffer holds height rows of
// align_to_x(width * sum_channel, IMAGE_ALIGNMENT) `half` elements, is
// obtained from fpga_malloc() and is installed with reset_data_ptr(). Its
// contents are not initialised here.
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int sum_channel = 0;
  for (int i = 0; i < image_num; i++) {
    sum_channel += channel_num[i];
  }

  const int sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
  // Widen to size_t before multiplying so that height * sum_cw is not
  // evaluated in int, where it could overflow.
  const size_t buffer_bytes =
      static_cast<size_t>(height) * sum_cw * sizeof(half);
  auto data_ptr = fpga_malloc(buffer_bytes);

  auto ddim = framework::make_ddim({1, sum_channel, height, width});
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
}

Z
zhangyang 已提交
148 149 150 151
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
152 153
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
154
  auto out_ptr = out->data<float>();
155 156

  arg->group_num = (uint32_t)group_num;
157 158
  // Either group_num or split_num = 1;
  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
159 160 161
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
Z
zhangyang 已提交
162
  arg->conv_arg =
163
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
164 165 166 167

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
168 169
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
170 171

  int n = arg->split_num;
172 173 174 175
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
176
  arg->concat_arg.channel_num =
177
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
178

179 180 181
  auto channel = (int)out->dims()[1];  // NOLINT
  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
  int element_num = get_aligned_filter_element_num(
182 183 184
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
Z
zhangyang 已提交
185 186 187 188 189 190 191 192 193 194 195 196 197 198
    arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].group_num = (uint32_t)group_num;
    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
Z
zhangyang 已提交
199 200 201 202 203
    //    arg->conv_arg[i].filter_address = &(
    //        (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  //
    //        NOLINT
    //    arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];

Z
zhangyang 已提交
204
    arg->conv_arg[i].filter_num = (uint32_t)(
205 206
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);
207

Z
zhangyang 已提交
208 209 210 211 212 213 214 215 216 217 218 219 220 221
    size_t filter_size =
        element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
    auto filter_head =
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);

    size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
    auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
    fpga_flush(arg->conv_arg[i].sb_address, bs_size);

222
    if (n > 1) {
Z
zhangyang 已提交
223
      arg->conv_arg[i].output.scale_address =
224
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
Z
zhangyang 已提交
225
      arg->conv_arg[i].output.address =
Z
zhangyang 已提交
226 227
          fpga_malloc(out->dims()[2] *
                      align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
Z
zhangyang 已提交
228 229
                                 IMAGE_ALIGNMENT) *
                      sizeof(half));
230
    } else {
Z
zhangyang 已提交
231 232
      arg->conv_arg[i].output.scale_address = out->scale;
      arg->conv_arg[i].output.address = out_ptr;
233 234
    }

235
    arg->concat_arg.images_in[i] =
Z
zhangyang 已提交
236 237 238
        (half *)arg->conv_arg[i].output.address;  // NOLINT
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
239
  }
Z
zhangyang 已提交
240 241
  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
242 243
}

H
hanbuhe 已提交
244
}  // namespace fpga
Z
zhangyang 已提交
245
}  // namespace paddle_mobile