api.cpp 9.1 KB
Newer Older
H
hanbuhe 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangyang 已提交
15 16 17 18
#include "fpga/V2/api.h"
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
Z
zhangyang 已提交
19

Z
zhangyang 已提交
20
namespace paddle_mobile {
H
hanbuhe 已提交
21
namespace fpga {
Z
zhangyang 已提交
22

Z
zhangyang 已提交
23 24
// Prepares an image tensor for the FPGA.
//
// The tensor's float data is copied into a buffer obtained from fpga_malloc,
// that buffer is handed to image::format_image together with the channel
// count after filter::calc_aligned_channel, and the resulting buffer becomes
// the tensor's storage via reset_data_ptr.
//
// The tensor is indexed as dims[1] = channel, dims[2] = height,
// dims[3] = width; dims[0] is not used when sizing the copy.
void format_image(framework::Tensor *image_tensor) {
  auto shape = image_tensor->dims();
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto *source = image_tensor->data<float>();
  size_t byte_count = c * h * w * sizeof(float);

  auto *buffer = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(buffer, source, byte_count);

  const int channel_count = static_cast<int>(c);
  int aligned_channel = filter::calc_aligned_channel(channel_count);

  // The buffer is passed by address, so image::format_image may swap it for a
  // different allocation; whatever it leaves in `buffer` is what gets
  // installed below.
  image::format_image(&buffer, channel_count, static_cast<int>(h),
                      static_cast<int>(w), aligned_channel);
  image_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
37
// Allocates a zero-filled half-precision output buffer for ofm_tensor and
// installs it as the tensor's storage.
//
// Buffer size depends on the tensor rank:
//   rank 4: height (dims[2]) rounded up to an even count, times width
//           (dims[3]), times aligned_channel, in units of sizeof(half);
//   rank 2: aligned_channel * sizeof(half).
// Any other rank is logged and the tensor is left untouched. Previously the
// function carried on with a size of 0 and replaced the tensor's storage with
// a zero-byte allocation.
void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto height = dims[2], width = dims[3];
    // (height + 1) / 2 * 2 pads an odd height up to the next even value.
    memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
  } else if (dims.size() == 2) {
    memory_size = aligned_channel * sizeof(half);
  } else {
    DLOG << "Wrong ofm dimension";
    return;
  }

  auto p = fpga_malloc(memory_size);
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
}

Z
zhangyang 已提交
53
// Allocates a zero-filled float output buffer for ofm_tensor and installs it
// as the tensor's storage.
//
// Buffer size depends on the tensor rank:
//   rank 4: height (dims[2]) * width (dims[3]) * aligned_channel floats;
//   rank 2: aligned_channel floats.
// Any other rank is logged and the tensor is left untouched. Previously the
// function carried on with a size of 0 and replaced the tensor's storage with
// a zero-byte allocation.
void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
  auto dims = ofm_tensor->dims();
  size_t memory_size = 0;
  if (dims.size() == 4) {
    auto height = dims[2], width = dims[3];
    memory_size = height * width * aligned_channel * sizeof(float);
  } else if (dims.size() == 2) {
    memory_size = aligned_channel * sizeof(float);
  } else {
    DLOG << "Wrong ofm dimension";
    return;
  }

  auto p = fpga_malloc(memory_size);
  memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
}

Z
zhangyang 已提交
69 70
// Returns the result of filter::find_max over the tensor's float data. The
// element count (numel) is narrowed to int before being passed on.
float filter_find_max(framework::Tensor *filter_tensor) {
  auto *values = filter_tensor->data<float>();
  const int element_count = static_cast<int>(filter_tensor->numel());
  return filter::find_max(values, element_count);
}
Z
zhangyang 已提交
73

Z
zhangyang 已提交
74 75
// Thin wrapper: forwards channel_num to filter::calc_aligned_channel and
// returns its result.
int get_aligned_channel_num(int channel_num) {
  const int aligned = filter::calc_aligned_channel(channel_num);
  return aligned;
}

Z
zhangyang 已提交
78
// Returns filter::calc_aligned_num applied to the first two dimensions of the
// filter tensor (dims[0] and dims[1]), each narrowed to int.
int get_aligned_filter_num(framework::Tensor *filter_tensor) {
  auto shape = filter_tensor->dims();
  const int first_dim = static_cast<int>(shape[0]);
  const int second_dim = static_cast<int>(shape[1]);
  return filter::calc_aligned_num(first_dim, second_dim);
}

Z
zhangyang 已提交
83 84 85
// Computes the convolution output channel count for a filter tensor: the
// aligned filter number (get_aligned_filter_num) is itself passed through
// get_aligned_channel_num.
int get_conv_output_channel(framework::Tensor *filter_tensor) {
  return get_aligned_channel_num(get_aligned_filter_num(filter_tensor));
}
Z
zhangyang 已提交
87 88
// Prepares a convolution filter tensor for the FPGA.
//
// Stores max_value / 127 in scale[0] and 127 / max_value in scale[1], copies
// the tensor's float data into a buffer obtained from fpga_malloc, hands that
// buffer to filter::format_filter, and installs the result as the tensor's
// storage. The tensor is indexed as 4-D: dims[0..3] = num, channel, height,
// width.
//
// NOTE(review): max_value == 0 makes scale[1] non-finite; confirm callers
// never pass an all-zero filter.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto *source = filter_tensor->data<float>();
  size_t byte_count = n * c * h * w * sizeof(float);
  auto *buffer = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(buffer, source, byte_count);

  // The buffer is passed by address, so filter::format_filter may replace it;
  // whatever it leaves in `buffer` is installed on the tensor.
  filter::format_filter(&buffer, static_cast<int>(n), static_cast<int>(c),
                        static_cast<int>(h), static_cast<int>(w), group_num,
                        max_value);
  filter_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
103 104 105 106 107 108 109 110
// Prepares a fully-connected filter tensor for the FPGA.
//
// Mirrors the convolution path, but calls filter::format_fc_filter with a
// fixed group count of 1: the two scale entries are set from max_value, the
// float data is copied into a buffer obtained from fpga_malloc, the buffer is
// handed to the formatter, and the result becomes the tensor's storage. The
// tensor is indexed as 4-D (dims[0..3]).
//
// NOTE(review): max_value == 0 makes scale[1] non-finite; confirm callers
// never pass an all-zero filter.
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = static_cast<float>(max_value / 127.0);
  filter_tensor->scale[1] = static_cast<float>(127.0 / max_value);

  auto shape = filter_tensor->dims();
  auto n = shape[0];
  auto c = shape[1];
  auto h = shape[2];
  auto w = shape[3];

  auto *source = filter_tensor->data<float>();
  size_t byte_count = n * c * h * w * sizeof(float);
  auto *buffer = static_cast<float *>(fpga_malloc(byte_count));
  memcpy(buffer, source, byte_count);

  const int fc_group_num = 1;
  filter::format_fc_filter(&buffer, static_cast<int>(n), static_cast<int>(c),
                           static_cast<int>(h), static_cast<int>(w),
                           fc_group_num, max_value);
  filter_tensor->reset_data_ptr(buffer);
}

Z
zhangyang 已提交
118 119 120 121 122 123
// Forwards the bias/scale array to bias_scale::format_bias_scale_array along
// with filter_num and an aligned count derived from filter_channel.
//
// NOTE(review): filter_channel is passed for BOTH arguments of
// filter::calc_aligned_num, whereas the other call sites in this file pass
// (num, channel). Behaviour is kept as-is here; confirm whether
// (filter_num, filter_channel) was intended.
void format_bias_scale_array(float **bias_scale_array, int filter_num,
                             int filter_channel) {
  const int aligned_count =
      filter::calc_aligned_num(filter_channel, filter_channel);
  bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
                                      aligned_count);
}

Z
zhangyang 已提交
126
void format_concat_output(framework::Tensor *out, int height, int width,
Z
zhangyang 已提交
127 128 129
                          uint32_t out_channel) {
  auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
  auto ddim = framework::make_ddim({1, out_channel, height, width});
Z
zhangyang 已提交
130 131 132 133
  out->Resize(ddim);
  out->reset_data_ptr(data_ptr);
}

Z
zhangyang 已提交
134
int format_conv_data(framework::Tensor *filter_tensor,
qnqinan's avatar
qnqinan 已提交
135
                     framework::Tensor *ofm_tensor, float **bs_ptr, int group) {
Z
zhangyang 已提交
136 137 138
  float max_value = fpga::filter_find_max(filter_tensor);
  fpga::format_filter(filter_tensor, max_value, group);
  int aligned_num = get_aligned_filter_num(filter_tensor);
qnqinan's avatar
qnqinan 已提交
139
  fpga::format_bias_scale_array(bs_ptr,
Z
zhangyang 已提交
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
                                (int)filter_tensor->dims()[0],  // NOLINT
                                aligned_num);
  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
  fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
  DLOG << aligned_channel;
  return aligned_channel;
}

// Runs the full formatting sequence for a fully-connected layer and returns
// the aligned output channel count.
//
// Same sequence as the convolution variant, except the filter goes through
// format_fc_filter.
//
// NOTE(review): bs_ptr is received by value and its address is what gets
// passed to format_bias_scale_array. If that call stores a new pointer through
// it, the caller's pointer is not updated (format_conv_data takes float **
// instead). Changing this needs a signature change, so it is only flagged
// here; confirm what callers expect.
int format_fc_data(framework::Tensor *filter_tensor,
                   framework::Tensor *ofm_tensor, float *bs_ptr) {
  const float filter_max = fpga::filter_find_max(filter_tensor);
  fpga::format_fc_filter(filter_tensor, filter_max);

  const int aligned_filter_num = get_aligned_filter_num(filter_tensor);
  const int filter_num = static_cast<int>(filter_tensor->dims()[0]);
  fpga::format_bias_scale_array(&bs_ptr, filter_num, aligned_filter_num);

  const int output_channel = fpga::get_conv_output_channel(filter_tensor);
  fpga::format_fp16_ofm(ofm_tensor, output_channel);
  DLOG << output_channel;
  return output_channel;
}

Z
zhangyang 已提交
162 163 164 165
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                    framework::Tensor *out, framework::Tensor *filter,
                    bool relu_enabled, int group_num, int stride_h,
                    int stride_w, int padding_h, int padding_w, float *bs_ptr) {
166 167
  auto input_ptr = input->data<float>();
  auto filter_ptr = filter->data<float>();
168
  auto out_ptr = out->data<float>();
169 170

  arg->group_num = (uint32_t)group_num;
Z
zhangyang 已提交
171
  arg->split_num = 1;
172 173 174
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
Z
zhangyang 已提交
175
  arg->conv_arg =
176
      (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));  // NOLINT
177 178 179 180

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
  arg->concat_arg.scale_out = out->scale;
181 182
  arg->concat_arg.height = (uint32_t)out->dims()[2];
  arg->concat_arg.width = (uint32_t)out->dims()[3];
183 184

  int n = arg->split_num;
185 186 187 188
  arg->concat_arg.images_in =
      (half **)fpga_malloc(n * sizeof(int *));  // NOLINT
  arg->concat_arg.scales_in =
      (float **)fpga_malloc(n * sizeof(float *));  // NOLINT
189
  arg->concat_arg.channel_num =
190
      (uint32_t *)fpga_malloc(n * sizeof(uint32_t));  // NOLINT
191 192

  for (int i = 0; i < n; i++) {
Z
zhangyang 已提交
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
    arg->conv_arg[i].relu_enabled = relu_enabled;
    arg->conv_arg[i].sb_address = bs_ptr;
    arg->conv_arg[i].filter_address = (int8_t *)filter_ptr;  // NOLINT
    arg->conv_arg[i].filter_scale_address = filter->scale;
    arg->conv_arg[i].filter_num = arg->filter_num;
    arg->conv_arg[i].group_num = (uint32_t)group_num;

    arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
    arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
    arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
    arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];

    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.scale_address = input->scale;
    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;

    arg->conv_arg[i].output.address = out_ptr;
    arg->conv_arg[i].output.scale_address = out->scale;

Z
zhangyang 已提交
216
    int num_after_alignment = filter::calc_aligned_num(
qnqinan's avatar
qnqinan 已提交
217
        arg->filter_num, (int)input->dims()[1]);  // NOLINT
Z
zhangyang 已提交
218 219
    arg->conv_arg[i].free_space =
        fpga_malloc(num_after_alignment * 2 * sizeof(half));
220 221 222
  }
}

H
hanbuhe 已提交
223
}  // namespace fpga
Z
zhangyang 已提交
224
}  // namespace paddle_mobile