// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include <algorithm>
#include <cmath>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"

namespace paddle {
namespace lite {

template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  if (set_value == -1) {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}

template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
                             const dtype *y_data,
                             dtype *out_data,
                             const DDim &x_dims,
                             const DDim &y_dims,
                             int axis,
                             const std::string &elt_type,
                             bool use_relu = false) {
  if (axis < 0) {
    axis = x_dims.size() - y_dims.size();
  }
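  // View x as [batch, channels, num]: the dims before `axis` form batch, the
  // dims covered by y form channels, and the trailing dims form num. For
  // example, x_dims = {2, 3, 4, 5}, y_dims = {3, 4}, axis = 1 gives
  // batch = 2, channels = 12, num = 5; y is broadcast over batch and num.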
  int batch = 1;
  int channels = 1;
  int num = 1;
  for (int i = 0; i < axis; ++i) {
    batch *= x_dims[i];
  }
  for (int i = 0; i < y_dims.size(); ++i) {
    channels *= y_dims[i];
  }
  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
    num *= x_dims[i];
  }
  VLOG(4) << "axis:" << axis;
  VLOG(4) << "batch:" << batch;
  VLOG(4) << "cahnnels:" << channels;
  VLOG(4) << "num:" << num;
  // do elementwise add/sub/max/...
  if (elt_type == "add" && axis == 1 && y_dims.size() == 1) {
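    // y (1-D) is broadcast along x's flattened elements: y_data repeats every
    // y_dims.production() entries of x.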
    for (int i = 0; i < x_dims.production(); ++i) {
      auto w = i % y_dims.production();
      out_data[i] = x_data[i] + y_data[w];
      if (use_relu) {
        // keep the fused-relu behavior consistent with the broadcast branch
        out_data[i] = std::max(out_data[i], static_cast<dtype>(0));
      }
    }
  } else if (elt_type == "add") {
    for (int i = 0; i < batch; ++i) {
      for (int j = 0; j < channels; ++j) {
        int offset = (i * channels + j) * num;
        const dtype *din_ptr = x_data + offset;
        const dtype diny_data = y_data[j];
        dtype *dout_ptr = out_data + offset;
        for (int k = 0; k < num; ++k) {
          *dout_ptr = *din_ptr + diny_data;
          if (use_relu) {
            *dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
          }
          dout_ptr++;
          din_ptr++;
        }
      }
    }
  } else {
    LOG(FATAL) << "unsupported Elementwise type: " << elt_type;
  }
}

// #define PRINT_RESULT
// image
TEST(elementwise_add_image, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img on cpu) -> "
               "elementwise_add(img) -> "
               "layout(img2buf on cpu) "
               "-> host";

  // elementwise_add's kernel selection routing strategy (4 cases):
  // --------------------------------------------------------
  //  1. elementwise_add: Need y_dim.size() == 4
  //  2. elementwise_add (used by fuse_elementwise_activation op):
  //                      Need y_dim.size() == 4 && act_type == "relu"
  //  3. width_add:   Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 3
  //  4. channel_add: Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 1
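  // The four cases below (y_dim_v / axis_v / relu_flag_v) exercise these
  // routes in order: plain add, fused add+relu, width_add, channel_add.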

  // dims
  const int n = 1;
  const int c = 3;
  const int h = 2;
  const int w = 2;

  const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
  auto out_dim = x_dim;
  // y_dim / axis / relu_flag
  std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{w}),
                            DDim(std::vector<DDim::value_type>{w})};
  std::vector<int> axis_v{-1, -1, 3, 1};
  std::vector<bool> relu_flag_v{false, true, false, false};
  CHECK(y_dim_v.size() == axis_v.size() && axis_v.size() == relu_flag_v.size())
      << "y_dim_v, axis_v and relu_flag_v should have the same size and "
         "correspond to each other one by one";

  // start loop
  for (size_t case_idx = 0; case_idx < y_dim_v.size(); ++case_idx) {
    auto y_dim = y_dim_v[case_idx];
    auto axis = axis_v[case_idx];
    auto relu_flag = relu_flag_v[case_idx];
    LOG(INFO) << "================== elementwise_add, case_idx:" << case_idx + 1
              << "/" << y_dim_v.size() << " ===================";
    LOG(INFO) << "x_dim:" << x_dim;
    LOG(INFO) << "y_dim:" << y_dim;
    LOG(INFO) << "out_dim:" << out_dim;
    LOG(INFO) << "axis:" << axis;
    LOG(INFO) << "relu_flag:" << relu_flag;

    // tensor
    VLOG(4) << "set tensors about op param";
    lite::Tensor eleadd_x, eleadd_y, eleadd_out;
    eleadd_x.Resize(x_dim);
    eleadd_y.Resize(y_dim);
    eleadd_out.Resize(out_dim);

    // initialize tensors
    VLOG(4) << "initialize tensors";
    paddle::lite::CLImageConverterDefault default_convertor;
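    // buf2img on the host: the default converter packs the NCHW float buffer
    // into a half-precision RGBA image (4 values per pixel);
    // InitImageDimInfoWith reports the image {width, height} for given dims.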
    // x
    std::vector<float> x_v(x_dim.production());
    fill_data<float>(x_v.data(), x_v.size());  // fill with index value
    auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim);  // w, h
    auto x_img_w = x_img_shape[0];
    auto x_img_h = x_img_shape[1];
    std::vector<half_t> x_img_v(x_img_w * x_img_h * 4);  // 4: RGBA
    default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
    eleadd_x.mutable_data<half_t, cl::Image2D>(
        x_img_w, x_img_h, x_img_v.data());

    // y
    std::vector<float> y_v(y_dim.production());
    fill_data<float>(y_v.data(), y_v.size());  // fill with index value
    auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim);  // w, h
    auto y_img_w = y_img_shape[0];
    auto y_img_h = y_img_shape[1];
    std::vector<half_t> y_img_v(y_img_w * y_img_h * 4);  // 4: RGBA
    default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
    eleadd_y.mutable_data<half_t, cl::Image2D>(
        y_img_w, y_img_h, y_img_v.data());

    // out
    auto out_img_shape =
        default_convertor.InitImageDimInfoWith(out_dim);  // w, h
    auto out_img_w = out_img_shape[0];
    auto out_img_h = out_img_shape[1];
    eleadd_out.mutable_data<half_t, cl::Image2D>(out_img_w, out_img_h);

    std::vector<half_t> out_img_v(out_img_w * out_img_h * 4);
    fill_data<half_t>(
        out_img_v.data(), out_img_v.size(), 0);  // fill with zero value

    std::vector<float> out_v(out_dim.production());

    // operator param
    operators::FusionElementwiseActivationParam
        fuseEleaddParam;  // enabled if relu_flag is true
    fuseEleaddParam.X = &eleadd_x;
    fuseEleaddParam.Y = &eleadd_y;
    fuseEleaddParam.Out = &eleadd_out;
    fuseEleaddParam.axis = axis;
    fuseEleaddParam.act_type = relu_flag ? "relu" : "";

    operators::ElementwiseParam eleaddParam;
    eleaddParam.X = &eleadd_x;
    eleaddParam.Y = &eleadd_y;
    eleaddParam.Out = &eleadd_out;
    eleaddParam.axis = axis;

    auto op_param = relu_flag ? fuseEleaddParam : eleaddParam;

    // set kernel
    auto eleadd_img_kernels =
        KernelRegistry::Global().Create("elementwise_add",
                                        TARGET(kOpenCL),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kImageDefault));
    ASSERT_FALSE(eleadd_img_kernels.empty());

    auto eleadd_img_kernel = std::move(eleadd_img_kernels.front());
    VLOG(4) << "get eleadd kernel: " << eleadd_img_kernel->doc();

    // set context and kernel args
    VLOG(4) << "set context and kernel args";
    std::unique_ptr<KernelContext> context(new KernelContext);
    context->As<OpenCLContext>().InitOnce();
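    // A single OpenCL runtime context is initialized once here and shared
    // with the kernel's own KernelContext via CopySharedTo below.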

    eleadd_img_kernel->SetParam(op_param);
    std::unique_ptr<KernelContext> eleadd_img_context(new KernelContext);
    context->As<OpenCLContext>().CopySharedTo(
        &(eleadd_img_context->As<OpenCLContext>()));
    eleadd_img_kernel->SetContext(std::move(eleadd_img_context));

    // run kernel
    VLOG(4) << "run kernel";
    eleadd_img_kernel->Launch();

    // download gpu result to cpu
    const size_t cl_image2d_row_pitch{0};
    const size_t cl_image2d_slice_pitch{0};
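    // img2buf on the host: copy the result image back to out_img_v (pitch 0
    // lets the OpenCL runtime pick the default row/slice pitch), then unpack
    // the RGBA half data back to NCHW floats.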
    TargetWrapperCL::ImgcpySync(out_img_v.data(),
                                eleadd_out.data<half_t, cl::Image2D>(),
                                out_img_w,
                                out_img_h,
                                cl_image2d_row_pitch,
                                cl_image2d_slice_pitch,
                                IoDirection::DtoH);
    default_convertor.ImageToNCHW(
        out_img_v.data(), out_v.data(), out_img_shape, out_dim);

    // compute cpu reference
    std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
    elementwise_compute_ref<float>(x_v.data(),
                                   y_v.data(),
                                   out_ref.get(),
                                   x_dim,
                                   y_dim,
                                   op_param.axis,
                                   "add",
                                   relu_flag);

#ifdef PRINT_RESULT  // enable to check value of x and y
    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                << out_dim.production() << ", x_v[" << eidx << "]:" << x_v[eidx]
                << ", value[" << eidx << "]:" << value << ", ref_value[" << eidx
                << "]:" << ref_value;
    }

    for (size_t i = 0; i < y_v.size(); i++) {
      LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
    }
#endif

    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      EXPECT_NEAR(value, ref_value, 1e-6);
      if (std::abs(value - ref_value) > 1e-6) {
        LOG(INFO) << "first diff in this case at eidx:" << eidx << " / "
                  << out_dim.production() << ", value[" << eidx << "]:" << value
                  << ", ref_value[" << eidx << "]:" << ref_value;
        break;
      }
    }
  }
}

}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(elementwise_add, kOpenCL, kFP16, kImageDefault, def);
USE_LITE_KERNEL(
    fusion_elementwise_add_activation, kOpenCL, kFP16, kImageDefault, def);