conv_pe.hpp 10.5 KB
Newer Older
Y
Yan Chunwei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <arm_neon.h>
T
TianXiaogang 已提交
18
#include <algorithm>
Y
Yan Chunwei 已提交
19 20
#include <vector>

21 22 23 24 25 26 27
#include "lite/backends/fpga/KD/pe.hpp"
#include "lite/backends/fpga/KD/pe_params.hpp"
#include "lite/backends/fpga/KD/pes/concat_pe.hpp"
#include "lite/backends/fpga/KD/pes/conv_pe.hpp"
#include "lite/backends/fpga/KD/pes/conv_process.hpp"
#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
#include "lite/backends/fpga/KD/pes/scale_pe.hpp"
28
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
Y
Yan Chunwei 已提交
29 30 31 32 33 34

namespace paddle {
namespace zynqmp {

class ConvPE : public PE {
 public:
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
  // Reference CPU convolution over HWC-layout data, used when the FPGA path
  // is bypassed. Reads param_.input / filter / scale / bias and writes
  // param_.output. Works in FP32 via staging tensors, then applies the
  // per-output-channel scale and bias: out = sum * scale[oc] + bias[oc].
  void cpu_conv_half_hwc() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;

    Shape& input_shape = input->shape();
    Shape& out_shape = output->shape();

    int image_height = input_shape.height();
    int image_width = input_shape.width();
    int image_channels = input_shape.channel();
    int image_pad_h = param_.paddings[0];
    // Fix: width padding comes from paddings[1]; the original read
    // paddings[0] for both axes (copy-paste bug, cf. strides[0]/strides[1]).
    int image_pad_w = param_.paddings[1];
    int kernel_height = param_.filter->shape().height();
    int kernel_width = param_.filter->shape().width();
    int kernel_step_h = param_.strides[0];
    int kernel_step_w = param_.strides[1];
    int dilation_rate = 1;  // dilation is not parameterized on this path
    int out_channel = out_shape.channel();
    int pooled_height_ = out_shape.height();
    int pooled_width_ = out_shape.width();
    int filter_chw = image_channels * kernel_height * kernel_width;

    // Effective ("dilated") kernel extent in each axis.
    int kernel_rw = kernel_width + (dilation_rate - 1) * (kernel_width - 1);
    int kernel_rh = kernel_height + (dilation_rate - 1) * (kernel_height - 1);

    float* weight = param_.filter->data<float>();

    // Stage input/output as FP32 for the reference computation.
    Tensor float_input;
    Tensor float_output;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    input->syncToDevice();
    float_input.copyFrom(input);
    float_input.invalidate();

    float* out = float_output.mutableData<float>(FP32, output->shape());

    for (int ph = 0; ph < pooled_height_; ph++) {
      for (int pw = 0; pw < pooled_width_; pw++) {
        // Window start in input coordinates (may be negative in the pad area).
        int hstart = ph * kernel_step_h - image_pad_h;
        int wstart = pw * kernel_step_w - image_pad_w;
        int hend = std::min(hstart + kernel_rh, static_cast<int>(image_height));
        int wend = std::min(wstart + kernel_rw, static_cast<int>(image_width));

        // First in-bounds sample position when the window starts in the pad.
        // Fix: ceil() now wraps the whole division, matching wstart_plus
        // (the original closed ceil() before dividing by dilation_rate).
        int hstart_plus =
            dilation_rate *
                ceil(static_cast<float>(image_pad_h - ph * kernel_step_h) /
                     static_cast<float>(dilation_rate)) -
            image_pad_h + ph * kernel_step_h;
        int wstart_plus =
            dilation_rate *
                ceil(static_cast<float>(image_pad_w - pw * kernel_step_w) /
                     static_cast<float>(dilation_rate)) -
            image_pad_w + pw * kernel_step_w;

        int hstart_ = hstart < 0 ? hstart_plus : hstart;
        int wstart_ = wstart < 0 ? wstart_plus : wstart;

        for (int oc = 0; oc < out_channel; oc++) {
          float sum = 0.0f;
          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
          for (int c = 0; c < image_channels; c++) {
            for (int h = hstart_; h < hend; h += dilation_rate) {
              // hi/wi: kernel-tap index corresponding to input position h/w.
              int hi = 0;
              if (hstart < 0) {
                hi = (kernel_rh - (hend - h)) / dilation_rate;
              } else {
                hi = (h - hstart_) / dilation_rate;
              }

              for (int w = wstart_; w < wend; w += dilation_rate) {
                int wi = 0;
                if (wstart < 0) {
                  wi = (kernel_rw - (wend - w)) / dilation_rate;
                } else {
                  wi = (w - wstart_) / dilation_rate;
                }

                // Input is HWC; filter is laid out [oc][c][kh][kw].
                const int index = (h * image_width + w) * image_channels + c;
                int weight_index = oc * filter_chw +
                                   kernel_width * kernel_height * c +
                                   kernel_width * hi + wi;
                sum += image_addr[index] * weight[weight_index];
              }
            }
          }
          float s = param_.scale()->data<float>()[oc];
          float b = param_.bias()->data<float>()[oc];
          out[pool_index] = sum * s + b;
        }
      }
    }
    float_output.flush();
    output->copyFrom(&float_output);
    output->invalidate();
  }

Y
Yan Chunwei 已提交
136 137 138 139 140 141 142 143
  // Prepare the output tensor before dispatch: data is device-resident
  // and must be treated as aligned by downstream consumers.
  bool init() {
    param_.output->setAligned(true);
    param_.output->setDataLocation(Device);
    return true;
  }

  // One-time configuration after param() is filled in.  For non-deconv runs,
  // decides how the convolution is split across FPGA invocations and wires up
  // the auxiliary concat/split PEs; may also route the whole op to the CPU.
  void apply() {
    if (param_.deconv == false) {
      // fill_split_arg populates param_.splitParams() and returns the axis
      // the convolution was split on (0 = output channel, 1 = accumulation).
      split_axis = fill_split_arg(param_);

      // Grouped conv that was split into several sub-convs needs the input
      // split per-channel as well.
      split_channel = param_.groups != 1 && param_.splitParams().size() > 1;

      // Split on axis 0: each sub-conv produces a slice of the output
      // channels, so their outputs are concatenated back together.
      if (split_axis == 0 && param_.splitParams().size() > 1) {
        ConcatParam& concat_param = concatPE_.param();
        for (auto conv_param : param_.splitParams()) {
          concat_param.inputs.push_back(&conv_param->output);
        }
        concat_param.output = param_.output;
        concatPE_.init();
        concatPE_.apply();
      }

      // Channel split: feed each sub-conv its own slice of the input.
      if (split_channel) {
        SplitParam& split_param = splitPE_.param();
        split_param.input = param_.input;
        for (auto conv_param : param_.splitParams()) {
          split_param.outputs.push_back(&conv_param->input);
        }
        splitPE_.init();
        splitPE_.apply();
      }
    }

    // ZU3 workaround: very deep 1-wide inputs (>= 2048 channels) are computed
    // on the CPU instead of the FPGA.  NOTE(review): presumably a hardware
    // limitation on ZU3 — confirm against the FPGA driver constraints.
    if (DLEngine::get_instance().isZU3() &&
        param_.input->shape().dimSize() == 4 &&
        param_.input->shape().width() == 1 &&
        param_.input->shape().channel() >= 2048) {
      use_cpu_ = true;
    }

    // FPGA path: the raw filter data has been repacked into splitParams(),
    // so the original filter buffer can be released.  The CPU path still
    // reads param_.filter directly, hence the guard.
    if (!use_cpu_) {
      param_.filter->releaseData();
    }
  }
Y
Yan Chunwei 已提交
183 184 185 186 187 188 189 190 191
  // CPU fallback used when use_cpu_ is set (see apply()): treats the filter
  // as a dense [out_channel x in_channel] matrix and computes, per output
  // channel, the dot product of the (flattened) input with that filter row.
  // Note: unlike cpu_conv_half_hwc(), no scale/bias is applied here —
  // this matches the original behavior.
  void cpu_compute() {
    Tensor* input = param_.input;
    Tensor* output = param_.output;
    input->syncToCPU();

    // Stage input/output as FP32.
    Tensor float_input;
    Tensor float_output;
    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
    float_input.copyFrom(input);

    float* out = float_output.mutableData<float>(FP32, output->shape());

    int out_channel = output->shape().channel();
    int in_channel = input->shape().channel();

    float* filter_data = param_.filter->data<float>();
    // Per-element products buffer; std::vector replaces the original raw
    // new[]/delete[] pair (RAII, no leak on early exit).
    std::vector<float> products(in_channel);
    for (int i = 0; i < out_channel; i++) {
      float* filter_ptr = filter_data + i * in_channel;
#pragma omp parallel for
      for (int j = 0; j < in_channel; j++) {
        products[j] = image_addr[j] * filter_ptr[j];
      }

      // Sequential reduction keeps the summation order deterministic.
      float sum = 0;
      for (int j = 0; j < in_channel; j++) {
        sum += products[j];
      }
      out[i] = sum;
    }
    float_output.flush();
    output->flush();
    output->copyFrom(&float_output);
    output->invalidate();
  }

  // Execute the convolution.  Returns true on success.
  // Configures the in-place activation hardware, runs every sub-conv on the
  // FPGA, restores the activation state, then merges sub-conv outputs
  // (concat for an output-channel split, elementwise-add for an
  // accumulation split).
  bool dispatch() {
    // CPU fallback path (selected in apply()).
    if (use_cpu_) {
      cpu_compute();
      return true;
    }
    inplace_.global_pool_en = false;
    // Map the requested activation onto the in-place hardware flags.
    if (param_.activeParam.type == TYPE_RELU) {
      inplace_.relu_enable = true;
    } else if (param_.activeParam.type == TYPE_RELU6) {
      inplace_.relu6_enable = true;
    } else if (param_.activeParam.type == TYPE_SIGMOID) {
      inplace_.sigmoid_enable = true;
    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
      inplace_.leaky_relu_enable = true;
    }

    // Program the activation unit before launching the conv.  Leaky ReLU
    // additionally needs its slope factor configured (as fp16).
    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
        inplace_.relu6_enable || inplace_.sigmoid_enable) {
      config_inplace(inplace_);
      if (inplace_.leaky_relu_enable) {
        activeParamterArgs.type = TYPE_LEAKY_RELU;
        activeParamterArgs.leaky_relu_factor =
            float_to_half(param_.activeParam.leaky_relu_factor);
        config_activation(activeParamterArgs);
      }
    }

    std::vector<BasicConvParam*>& params = param_.splitParams();

    // Feed each sub-conv its slice of the input (grouped-conv split).
    if (split_channel && param_.deconv == false) {
      splitPE_.dispatch();
    }

    // Run every sub-convolution; any non-zero return marks failure.
    int ret = 0;
    for (auto conv_param : params) {
      ret |= compute_fpga_conv_basic(conv_param->args);
    }

    // Reset the activation hardware so later PEs are not affected.
    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
        inplace_.relu6_enable || inplace_.sigmoid_enable) {
      inplace_.relu_enable = false;
      inplace_.leaky_relu_enable = false;
      inplace_.relu6_enable = false;
      inplace_.sigmoid_enable = false;
      inplace_.global_pool_en = false;
      config_inplace(inplace_);

      if (param_.activeParam.type == TYPE_LEAKY_RELU) {
        activeParamterArgs.type = TYPE_LEAKY_RELU;
        activeParamterArgs.leaky_relu_factor = float_to_half(0);
        config_activation(activeParamterArgs);
      }
    }

    size_t size = params.size();
    // Output-channel split: concatenate the sub-conv outputs.
    if (split_axis == 0 && ret == 0 && size > 1 && param_.deconv == false) {
      concatPE_.dispatch();
    }
    // Accumulation split: partial sums are combined with elementwise add.
    // NOTE(review): only the first two partials are added — appears to
    // assume size == 2 for axis-1 splits; confirm against fill_split_arg.
    if (split_axis == 1 && ret == 0 && size > 1) {
      ElementwiseAddParam& add_param = addPE_.param();
      add_param.inputs = {&params[0]->output, &params[1]->output};
      add_param.output = param_.output;
      addPE_.init();
      addPE_.apply();
      addPE_.dispatch();
    }

    return ret == 0;
  }

  ConvParam& param() { return param_; }

 private:
  bool use_cpu_ = false;       // route the whole op to cpu_compute() (set in apply())
  bool split_channel = false;  // grouped conv split: input must be split per-channel
  ConvParam param_;            // convolution configuration (filled via param())
  ConcatPE concatPE_;          // merges sub-conv outputs for an axis-0 split
  SplitPE splitPE_;            // slices the input for a channel split
  ElementwiseAddPE addPE_;     // combines partial sums for an axis-1 split
  int split_axis = 0;          // 0 = split on output channel, 1 = on accumulation
  InplaceArgs inplace_ = {0};  // in-place activation hardware flags
  ActiveParamterArgs activeParamterArgs;  // leaky-relu factor args for the HW
};

}  // namespace zynqmp
}  // namespace paddle