api.cpp 10.7 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangyang 已提交
15
#include "fpga/V2/api.h"
H
hanbuhe 已提交
16
#include <algorithm>
Z
zhangyang 已提交
17
#include "fpga/V2/bias_scale.h"
Z
zhangyang 已提交
18
#include "fpga/V2/config.h"
Z
zhangyang 已提交
19
#include "fpga/V2/driver/driver.h"
Z
zhangyang 已提交
20 21
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
Z
zhangyang 已提交
22

Z
zhangyang 已提交
23
namespace paddle_mobile {
H
hanbuhe 已提交
24
namespace fpga {
Z
zhangyang 已提交
25

26
// Bookkeeping for live allocations: maps each address returned by
// fpga_malloc() to the number of bytes requested. fpga_free() consults and
// erases entries here before releasing the memory.
static std::map<void *, size_t> memory_map;
H
hanbuhe 已提交
27

Z
zhangyang 已提交
28
int open_device() {
Z
zhangyang 已提交
29
  int ret = driver::open_device_driver();
Z
zhangyang 已提交
30
  return ret;
Z
zhangyang 已提交
31
}
H
hanbuhe 已提交
32

Z
zhangyang 已提交
33
int close_device() {
Z
zhangyang 已提交
34
  int ret = driver::close_device_driver();
Z
zhangyang 已提交
35
  return ret;
H
hanbuhe 已提交
36 37 38
}

// Allocates `size` bytes and registers the allocation in `memory_map` so that
// fpga_free() can later recognise and release it.
//
// The memory comes from driver::fpga_malloc_driver() when PADDLE_MOBILE_ZU5
// is defined, and from malloc() otherwise.
//
// Returns the allocated address. If the underlying allocator returns a null
// pointer, that null pointer is returned and nothing is registered.
void *fpga_malloc(size_t size) {
#ifdef PADDLE_MOBILE_ZU5
  auto ptr = driver::fpga_malloc_driver(size);
#else
  auto ptr = malloc(size);
#endif
  if (ptr != nullptr) {
    // operator[] (rather than insert) overwrites any stale entry left behind
    // for this address, so the recorded size always matches this allocation.
    memory_map[ptr] = size;
  }
  return ptr;
}

52
// Releases memory previously returned by fpga_malloc().
//
// `ptr` is looked up in `memory_map`; if it is not registered there, the
// pointer is left untouched and "Invalid pointer" is logged. Otherwise the
// entry is removed and the memory is released with
// driver::fpga_free_driver() when PADDLE_MOBILE_ZU5 is defined, or free()
// otherwise.
void fpga_free(void *ptr) {
  auto iter = memory_map.find(ptr);
  if (iter == memory_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }
  memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
  driver::fpga_free_driver(ptr);
#else
  free(ptr);
#endif
}
Z
zhangyang 已提交
71 72 73 74 75 76
// Copies `num` bytes from `src` to `dest`.
// Uses memcpy() unless PADDLE_MOBILE_ZU5 is defined, in which case the copy
// is delegated to driver::fpga_copy_driver().
void fpga_copy(void *dest, const void *src, size_t num) {
#ifndef PADDLE_MOBILE_ZU5
  memcpy(dest, src, num);
#else
  driver::fpga_copy_driver(dest, src, num);
#endif
}

Z
zhangyang 已提交
79 80 81 82 83 84 85 86 87 88 89 90 91
// Forwards to driver::fpga_flush_driver(address, size) and returns its result
// when PADDLE_MOBILE_ZU5 is defined. In every other build this is a no-op
// that returns 0.
int fpga_flush(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  (void)address;
  (void)size;
  return 0;
#else
  return driver::fpga_flush_driver(address, size);
#endif
}
// Forwards to driver::fpga_invalidate_driver(address, size) and returns its
// result when PADDLE_MOBILE_ZU5 is defined. In every other build this is a
// no-op that returns 0.
int fpga_invalidate(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  (void)address;
  (void)size;
  return 0;
#else
  return driver::fpga_invalidate_driver(address, size);
#endif
}

Z
zhangyang 已提交
94 95
// Replaces the data of `image_tensor` with a copy processed by
// image::format_image().
//
// Dimensions 1, 2 and 3 of the tensor are read as channel, height and width.
// The tensor's float data is copied into a buffer obtained from
// fpga_malloc(), that buffer is handed (by address, so it may be replaced) to
// image::format_image() together with the aligned channel count, and the
// resulting pointer is installed on the tensor via reset_data_ptr().
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto source = image_tensor->data<float>();
  size_t byte_count = c * h * w * sizeof(float);

  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(formatted, source, byte_count);

  int padded_channel = filter::calc_aligned_channel(static_cast<int>(c));
  image::format_image(&formatted, static_cast<int>(c), static_cast<int>(h),
                      static_cast<int>(w), padded_channel);

  image_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
108
// Allocates a zero-filled half-precision output buffer for `ofm_tensor` and
// installs it as the tensor's data pointer.
//
// Buffer size depends on the tensor rank:
//   - rank 4: height (dim 2) rounded up to an even number, times width
//     (dim 3), times `aligned_channel`, times sizeof(half);
//   - rank 2: `aligned_channel` times sizeof(half).
// For any other rank "Wrong ofm dimension" is logged and the tensor is left
// untouched, rather than being pointed at a zero-length allocation.
// The tensor is likewise left untouched if the allocation fails.
void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto height = dims[2], width = dims[3];
    // (height + 1) / 2 * 2 rounds the height up to the next even value.
    memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = aligned_channel * sizeof(half);
  } else {
    DLOG << "Wrong ofm dimension";
    return;
  }

  auto p = fpga_malloc(memory_size);
  if (p == nullptr) {
    // Never hand a null pointer to memset or to the tensor.
    DLOG << "Failed to allocate ofm buffer";
    return;
  }
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
}

Z
zhangyang 已提交
124
// Allocates a zero-filled single-precision output buffer for `ofm_tensor`
// and installs it as the tensor's data pointer.
//
// Buffer size depends on the tensor rank:
//   - rank 4: height (dim 2) times width (dim 3) times `aligned_channel`
//     times sizeof(float);
//   - rank 2: `aligned_channel` times sizeof(float).
// For any other rank "Wrong ofm dimension" is logged and the tensor is left
// untouched, rather than being pointed at a zero-length allocation.
// The tensor is likewise left untouched if the allocation fails.
void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto height = dims[2], width = dims[3];
    memory_size = height * width * aligned_channel * sizeof(float);
  } else if (dims.size() == 2) {
    memory_size = aligned_channel * sizeof(float);
  } else {
    DLOG << "Wrong ofm dimension";
    return;
  }

  auto p = fpga_malloc(memory_size);
  if (p == nullptr) {
    // Never hand a null pointer to memset or to the tensor.
    DLOG << "Failed to allocate ofm buffer";
    return;
  }
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
}

Z
zhangyang 已提交
140 141
// Returns the result of filter::find_max() over all numel() float elements
// of `filter_tensor`.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto weights = filter_tensor->data<float>();
  const int element_count = static_cast<int>(filter_tensor->numel());
  return filter::find_max(weights, element_count);
}
Z
zhangyang 已提交
144

Z
zhangyang 已提交
145 146
// Returns filter::calc_aligned_channel(channel_num).
int get_aligned_channel_num(int channel_num) {
  const int aligned = filter::calc_aligned_channel(channel_num);
  return aligned;
}

Z
zhangyang 已提交
149
// Returns filter::calc_aligned_num() applied to dimensions 0 and 1 of
// `filter_tensor` (in that order), each converted to int.
int get_aligned_filter_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  const int dim0 = static_cast<int>(shape[0]);
  const int dim1 = static_cast<int>(shape[1]);
  return filter::calc_aligned_num(dim0, dim1);
}

Z
zhangyang 已提交
154 155 156
// Computes the aligned filter number of `filter_tensor` and then passes that
// value through get_aligned_channel_num(); the final result is returned.
int get_conv_output_channel(framework::Tensor *filter_tensor) {
  return get_aligned_channel_num(get_aligned_filter_num(filter_tensor));
}
Z
zhangyang 已提交
158 159
// Replaces the data of `filter_tensor` with a copy processed by
// filter::format_filter(), and records the scale pair on the tensor.
//
// scale[0] is set to max_value / 127 and scale[1] to 127 / max_value.
// Dimensions 0..3 of the tensor are read as num, channel, height and width.
// The float data is copied into a buffer from fpga_malloc(), passed (by
// address, so it may be replaced) to filter::format_filter() along with
// `group_num` and `max_value`, and the resulting pointer is installed via
// reset_data_ptr().
//
// NOTE(review): a max_value of 0 makes scale[1] a division by zero — confirm
// callers never pass an all-zero filter.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = n * c * h * w * sizeof(float);

  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(formatted, source, byte_count);

  filter::format_filter(&formatted, static_cast<int>(n), static_cast<int>(c),
                        static_cast<int>(h), static_cast<int>(w), group_num,
                        max_value);

  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
174 175 176 177 178 179 180 181
// Replaces the data of `filter_tensor` with a copy processed by
// filter::format_fc_filter(), and records the scale pair on the tensor.
//
// scale[0] is set to max_value / 127 and scale[1] to 127 / max_value.
// Dimensions 0..3 of the tensor are read as num, channel, height and width.
// The float data is copied into a buffer from fpga_malloc(), passed (by
// address, so it may be replaced) to filter::format_fc_filter() with a group
// count of 1 and `max_value`, and the resulting pointer is installed via
// reset_data_ptr().
//
// NOTE(review): a max_value of 0 makes scale[1] a division by zero — confirm
// callers never pass an all-zero filter.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto source = filter_tensor->data<float>();
  size_t byte_count = n * c * h * w * sizeof(float);

  float *formatted = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(formatted, source, byte_count);

  filter::format_fc_filter(&formatted, static_cast<int>(n),
                           static_cast<int>(c), static_cast<int>(h),
                           static_cast<int>(w), 1, max_value);

  filter_tensor->reset_data_ptr(formatted);
}

Z
zhangyang 已提交
189 190 191 192 193 194
// Forwards `bias_scale_array` and `filter_num` to
// bias_scale::format_bias_scale_array(), together with an aligned count
// derived from `filter_channel` via filter::calc_aligned_num().
//
// NOTE(review): `filter_channel` is passed as BOTH arguments of
// calc_aligned_num(), whereas get_aligned_filter_num() passes two different
// dimensions. Confirm this is intentional and not meant to be
// (filter_num, filter_channel).
void format_bias_scale_array(float **bias_scale_array, int filter_num,
                             int filter_channel) {
  const int aligned_count =
      filter::calc_aligned_num(filter_channel, filter_channel);
  bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
                                      aligned_count);
}

Z
zhangyang 已提交
197
void format_concat_output(framework::Tensor *out, int height, int width,
Z
zhangyang 已提交
198 199 200
                          uint32_t out_channel) {
  auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
  auto ddim = framework::make_ddim({1, out_channel, height, width});
Z
zhangyang 已提交
201 202 203 204
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
}

Z
zhangyang 已提交
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
// Prepares filter, bias/scale and output buffers for a convolution.
//
// Steps, in order: find the filter's maximum value, format the filter with
// it and `group`, format the bias/scale array using dimension 0 of the filter
// and its aligned filter number, then allocate a half-precision output
// buffer on `ofm_tensor`. The aligned output channel count is logged and
// returned.
//
// NOTE(review): `bs_ptr` is received by value and its address is what gets
// passed on. If format_bias_scale_array() replaces the pointer it is given,
// the caller's pointer is not updated — confirm.
int format_conv_data(framework::Tensor *filter_tensor,
                     framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
  const float filter_max = fpga::filter_find_max(filter_tensor);
  fpga::format_filter(filter_tensor, filter_max, group);

  const int aligned_filter_num = get_aligned_filter_num(filter_tensor);
  const int filter_count = static_cast<int>(filter_tensor->dims()[0]);
  fpga::format_bias_scale_array(&bs_ptr, filter_count, aligned_filter_num);

  int out_channel = fpga::get_conv_output_channel(filter_tensor);
  fpga::format_fp16_ofm(ofm_tensor, out_channel);
  DLOG << out_channel;
  return out_channel;
}

// Prepares filter, bias/scale and output buffers for a fully-connected
// layer.
//
// Steps, in order: find the filter's maximum value, format the filter with
// format_fc_filter(), format the bias/scale array using dimension 0 of the
// filter and its aligned filter number, then allocate a half-precision
// output buffer on `ofm_tensor`. The aligned output channel count is logged
// and returned.
//
// NOTE(review): `bs_ptr` is received by value and its address is what gets
// passed on. If format_bias_scale_array() replaces the pointer it is given,
// the caller's pointer is not updated — confirm.
int format_fc_data(framework::Tensor *filter_tensor,
                   framework::Tensor *ofm_tensor, float *bs_ptr) {
  const float filter_max = fpga::filter_find_max(filter_tensor);
  fpga::format_fc_filter(filter_tensor, filter_max);

  const int aligned_filter_num = get_aligned_filter_num(filter_tensor);
  const int filter_count = static_cast<int>(filter_tensor->dims()[0]);
  fpga::format_bias_scale_array(&bs_ptr, filter_count, aligned_filter_num);

  int out_channel = fpga::get_conv_output_channel(filter_tensor);
  fpga::format_fp16_ofm(ofm_tensor, out_channel);
  DLOG << out_channel;
  return out_channel;
}

Z
zhangyang 已提交
233 234 235 236
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
237 238
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
239
  auto out_ptr = out->data<float>();
240 241

  arg->group_num = (uint32_t)group_num;
Z
zhangyang 已提交
242
  arg->split_num = 1;
243 244 245
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
Z
zhangyang 已提交
246
  arg->conv_arg =
247
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
248 249 250 251

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
252 253
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
254 255

  int n = arg->split_num;
256 257 258 259
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
260
  arg->concat_arg.channel_num =
261
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
262 263

  for (int i = 0; i < n; i++) {
Z
zhangyang 已提交
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
    arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].sb_address = bs_ptr;
    arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
    arg->conv_arg[i].filter_scale_address = filter->scale;
    arg->conv_arg[i].filter_num = arg->filter_num;
    arg->conv_arg[i].group_num = (uint32_t)group_num;

    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];

    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;

    arg->conv_arg[i].output.address = out_ptr;
    arg->conv_arg[i].output.scale_address = out->scale;

    int num_after_alignment =
        filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
    arg->conv_arg[i].free_space =
        fpga_malloc(num_after_alignment * 2 * sizeof(half));
291 292 293
  }
}

H
hanbuhe 已提交
294
}  // namespace fpga
Z
zhangyang 已提交
295
}  // namespace paddle_mobile