// Copyright 2018 Xiaomi, Inc.  All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/kernels/opencl/helper.h"

#include <algorithm>
#include <string>
#include <vector>

#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

namespace {
28
// [(C + 3) / 4 * W, N * H]
29
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
W
wuchenghui 已提交
30
                           std::vector<size_t> *image_shape) {
31
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
32 33 34
  image_shape->resize(2);
  (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
  (*image_shape)[1] = shape[0] * shape[1];
35 36
}

37
// [Ic, H * W * (Oc + 3) / 4]
38
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
W
wuchenghui 已提交
39
                               std::vector<size_t> *image_shape) {
40
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
41
  image_shape->resize(2);
42
  (*image_shape)[0] = shape[3];
43
  (*image_shape)[1] = shape[0] * shape[1] * RoundUpDiv4(shape[2]);
44 45 46
}

// [H * W * M, (Ic + 3) / 4]
47 48
void CalDepthwiseConv2dFilterImageShape(
    const std::vector<index_t> &shape, /* HWIM */
W
wuchenghui 已提交
49
    std::vector<size_t> *image_shape) {
50
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
51 52 53
  image_shape->resize(2);
  (*image_shape)[0] = shape[0] * shape[1] * shape[3];
  (*image_shape)[1] = RoundUpDiv4(shape[2]);
54 55 56 57
}

// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
W
wuchenghui 已提交
58
                      std::vector<size_t> *image_shape) {
59
  MACE_CHECK(shape.size() == 1);
W
wuchenghui 已提交
60 61 62
  image_shape->resize(2);
  (*image_shape)[0] = RoundUpDiv4(shape[0]);
  (*image_shape)[1] = 1;
63 64
}

65 66
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
67 68
void CalWinogradFilterImageShape(
    const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
W
wuchenghui 已提交
69
    std::vector<size_t> *image_shape) {
70
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
71 72 73
  image_shape->resize(2);
  (*image_shape)[0] = RoundUpDiv4(shape[1]);
  (*image_shape)[1] = (shape[0] << 4);
74 75 76 77
}

// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
W
wuchenghui 已提交
78
                              std::vector<size_t> *image_shape) {
79
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
80 81 82
  image_shape->resize(2);
  (*image_shape)[0] = shape[2] * shape[3];
  (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
83 84 85 86
}

// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
W
wuchenghui 已提交
87
                             std::vector<size_t> *image_shape) {
88
  MACE_CHECK(shape.size() == 4);
W
wuchenghui 已提交
89 90 91
  image_shape->resize(2);
  (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
  (*image_shape)[1] = shape[0] * shape[1];
92 93
}

L
liuqi 已提交
94 95
// [W, (H + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
W
wuchenghui 已提交
96
                               std::vector<size_t> *image_shape) {
L
liuqi 已提交
97
  MACE_CHECK(shape.size() == 2);
W
wuchenghui 已提交
98 99 100
  image_shape->resize(2);
  (*image_shape)[0] = shape[1];
  (*image_shape)[1] = RoundUpDiv4(shape[0]);
L
liuqi 已提交
101 102
}

L
liuqi 已提交
103 104
// [(W + 3) / 4, H]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
W
wuchenghui 已提交
105
                              std::vector<size_t> *image_shape) {
L
liuqi 已提交
106
  MACE_CHECK(shape.size() == 2);
W
wuchenghui 已提交
107 108 109
  image_shape->resize(2);
  (*image_shape)[0] = RoundUpDiv4(shape[1]);
  (*image_shape)[1] = shape[0];
L
liuqi 已提交
110
}
}  // namespace

113 114
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,
W
wuchenghui 已提交
115 116
                     std::vector<size_t> *image_shape) {
  MACE_CHECK_NOTNULL(image_shape);
117
  switch (type) {
118 119 120 121 122
    case CONV2D_FILTER:
      CalConv2dFilterImageShape(shape, image_shape);
      break;
    case DW_CONV2D_FILTER:
      CalDepthwiseConv2dFilterImageShape(shape, image_shape);
123
      break;
124
    case IN_OUT_CHANNEL:
125
      CalInOutputImageShape(shape, image_shape);
126
      break;
127 128
    case ARGUMENT:
      CalArgImageShape(shape, image_shape);
129
      break;
130 131 132 133 134 135 136 137 138
    case IN_OUT_HEIGHT:
      CalInOutHeightImageShape(shape, image_shape);
      break;
    case IN_OUT_WIDTH:
      CalInOutWidthImageShape(shape, image_shape);
      break;
    case WINOGRAD_FILTER:
      CalWinogradFilterImageShape(shape, image_shape);
      break;
L
liuqi 已提交
139 140 141
    case WEIGHT_HEIGHT:
      CalWeightHeightImageShape(shape, image_shape);
      break;
L
liuqi 已提交
142 143 144
    case WEIGHT_WIDTH:
      CalWeightWidthImageShape(shape, image_shape);
      break;
145 146 147 148 149 150 151 152 153
    default:
      LOG(FATAL) << "Mace not supported yet.";
  }
}

std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                      const BufferType type) {
  if (type == WINOGRAD_FILTER) {
    return {16, shape[0], shape[1], 1};
154 155
  } else if (type == IN_OUT_HEIGHT) {
    index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
156 157 158
    return {16, shape[3], out_width, 1};
  } else {
    LOG(FATAL) << "Mace not supported yet.";
159
    return std::vector<index_t>();
160 161 162
  }
}

163 164
std::string DtToCLDt(const DataType dt) {
  switch (dt) {
165 166 167 168 169 170
    case DT_FLOAT:
      return "float";
    case DT_HALF:
      return "half";
    default:
      LOG(FATAL) << "Unsupported data type";
171 172 173 174 175 176
      return "";
  }
}

std::string DtToCLCMDDt(const DataType dt) {
  switch (dt) {
177 178 179 180 181 182
    case DT_FLOAT:
      return "f";
    case DT_HALF:
      return "h";
    default:
      LOG(FATAL) << "Not supported data type for opencl cmd data type";
183 184 185 186 187
      return "";
  }
}

std::string DtToUpstreamCLDt(const DataType dt) {
188 189
  switch (dt) {
    case DT_FLOAT:
190 191 192 193
    case DT_HALF:
      return "float";
    default:
      LOG(FATAL) << "Unsupported data type";
194 195 196 197
      return "";
  }
}

198
std::string DtToUpstreamCLCMDDt(const DataType dt) {
199 200
  switch (dt) {
    case DT_FLOAT:
201 202 203 204
    case DT_HALF:
      return "f";
    default:
      LOG(FATAL) << "Not supported data type for opencl cmd data type";
205 206 207 208
      return "";
  }
}

209 210 211 212 213 214 215 216 217 218 219 220 221 222
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[1] * lws[2];
  lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
  return lws;
}

W
wuchenghui 已提交
223
void TuningOrRun3DKernel(const cl::Kernel &kernel,
L
liuqi 已提交
224 225
                         const std::string tuning_key,
                         const uint32_t *gws,
226
                         const std::vector<uint32_t> &lws,
L
liuqi 已提交
227 228
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
Y
yejianwu 已提交
229

L
liuqi 已提交
230
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
231 232
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
233 234
    std::vector<std::vector<uint32_t>> results;
    std::vector<std::vector<uint32_t>> candidates = {
W
wuchenghui 已提交
235
        // TODO(heliangliang): tuning these magic numbers
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
        {gws[0], gws[1], gws[2], 0},
        {gws[0], gws[1], gws[2] / 8, 0},
        {gws[0], gws[1], gws[2] / 4, 0},
        {gws[0], gws[1], 8, 0},
        {gws[0], gws[1], 4, 0},
        {gws[0], gws[1], 1, 0},
        {gws[0] / 4, gws[1], gws[2], 0},
        {gws[0] / 4, gws[1], gws[2] / 8, 0},
        {gws[0] / 4, gws[1], gws[2] / 4, 0},
        {gws[0] / 4, gws[1], 8, 0},
        {gws[0] / 4, gws[1], 4, 0},
        {gws[0] / 4, gws[1], 1, 0},
        {gws[0] / 8, gws[1], gws[2], 0},
        {gws[0] / 8, gws[1], gws[2] / 8, 0},
        {gws[0] / 8, gws[1], gws[2] / 4, 0},
        {gws[0] / 8, gws[1], 8, 0},
        {gws[0] / 8, gws[1], 4, 0},
        {gws[0] / 8, gws[1], 1, 0},
        {4, gws[1], gws[2], 0},
        {4, gws[1], gws[2] / 8, 0},
        {4, gws[1], gws[2] / 4, 0},
        {4, gws[1], 8, 0},
        {4, gws[1], 4, 0},
        {4, gws[1], 1, 0},
        {1, gws[1], gws[2], 0},
        {1, gws[1], gws[2] / 8, 0},
        {1, gws[1], gws[2] / 4, 0},
        {1, gws[1], 8, 0},
        {1, gws[1], 4, 0},
        {1, gws[1], 1, 0},
L
liuqi 已提交
266
    };
267 268 269 270 271 272 273
    for (auto &ele : candidates) {
      const uint32_t tmp = ele[0] * ele[1] * ele[2];
      if (0 < tmp && tmp <= kwg_size) {
        results.push_back(ele);
      }
    }
    return results;
L
liuqi 已提交
274 275
  };
  cl::Event event;
276
  auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
277
                  std::vector<uint32_t> *tuning_result) -> cl_int {
278 279
    MACE_CHECK(params.size() == 4)
        << "Tuning parameters of 3D kernel must be 4D";
L
liuqi 已提交
280
    cl_int error = CL_SUCCESS;
281
    std::vector<uint32_t> internal_gws(gws, gws+3);
282
    if (!runtime->IsNonUniformWorkgroupsSupported()) {
Y
yejianwu 已提交
283
      for (size_t i = 0; i < 3; ++i) {
284
        internal_gws[i] = RoundUp(gws[i], params[i]);
Y
yejianwu 已提交
285
      }
Y
yejianwu 已提交
286 287
    }

L
liuqi 已提交
288
    if (timer == nullptr) {
289 290 291
      uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
      const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2],
                                                       block_size);
L
liuqi 已提交
292
      for (uint32_t i = 0; i < num_blocks; ++i) {
293 294 295 296
        uint32_t gws2 = block_size;
        if (runtime->IsNonUniformWorkgroupsSupported()
            && (i == num_blocks - 1)) {
          gws2 = (internal_gws[2] - (i * block_size));
Y
yejianwu 已提交
297
        }
298 299 300 301
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, 0, i * block_size),
            cl::NDRange(internal_gws[0], internal_gws[1], gws2),
            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
L
Liangliang He 已提交
302
        MACE_CHECK_CL_SUCCESS(error);
L
liuqi 已提交
303 304
      }
    } else {
305
      timer->ClearTiming();
306 307 308 309
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel, cl::NullRange,
          cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
L
Liangliang He 已提交
310
      MACE_CHECK_CL_SUCCESS(error);
311 312 313 314 315 316
      timer->AccumulateTiming();
      tuning_result->assign(params.begin(), params.end());

      if (LimitKernelTime()) {
        double elapse_time = timer->AccumulatedMicros();
        timer->ClearTiming();
317 318
        uint32_t num_blocks = std::min(
            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
319 320 321 322 323 324
        uint32_t block_size = gws[2] / num_blocks;
        if (!runtime->IsNonUniformWorkgroupsSupported()) {
          block_size = RoundUp(block_size, params[2]);
        }
        (*tuning_result)[3] = block_size;
        num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
325
        for (uint32_t i = 0; i < num_blocks; ++i) {
326 327 328 329
          uint32_t gws2 = block_size;
          if (runtime->IsNonUniformWorkgroupsSupported()
              && (i == num_blocks - 1)) {
            gws2 = (internal_gws[2] - (i * block_size));
Y
yejianwu 已提交
330
          }
331 332 333 334
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, 0, i * block_size),
              cl::NDRange(internal_gws[0], internal_gws[1], gws2),
              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
L
Liangliang He 已提交
335
          MACE_CHECK_CL_SUCCESS(error);
336 337
          timer->AccumulateTiming();
        }
L
liuqi 已提交
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
      }
    }
    return error;
  };
  OpenCLProfilingTimer timer(&event);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);

  if (future != nullptr) {
    future->wait_fn = [event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        OpenCLRuntime::Global()->GetCallStats(event, stats);
      }
    };
  }
}

W
wuchenghui 已提交
356
void TuningOrRun2DKernel(const cl::Kernel &kernel,
L
liuqi 已提交
357 358
                         const std::string tuning_key,
                         const uint32_t *gws,
359
                         const std::vector<uint32_t> &lws,
L
liuqi 已提交
360 361
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
Y
yejianwu 已提交
362

L
liuqi 已提交
363
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
364 365
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
    std::vector<std::vector<uint32_t>> results;
    std::vector<std::vector<uint32_t>> candidates = {
        {kwg_size / 2, 2, 0},
        {kwg_size / 4, 4, 0},
        {kwg_size / 8, 8, 0},
        {kwg_size / 16, 16, 0},
        {kwg_size / 32, 32, 0},
        {kwg_size / 64, 64, 0},
        {kwg_size / 128, 128, 0},
        {kwg_size / 256, 256, 0},
        {kwg_size, 1, 0},
        {1, kwg_size, 0}
    };
    for (auto &ele : candidates) {
      const uint32_t tmp = ele[0] * ele[1] * ele[2];
      if (0 < tmp && tmp <= kwg_size) {
        results.push_back(ele);
      }
    }
    return results;
L
liuqi 已提交
386 387
  };
  cl::Event event;
388
  auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
389
                  std::vector<uint32_t> *tuning_result) -> cl_int {
390 391
    MACE_CHECK(params.size() == 3)
        << "Tuning parameters of 2D kernel must be 3d";
L
liuqi 已提交
392
    cl_int error = CL_SUCCESS;
393
    std::vector<uint32_t> internal_gws(gws, gws+2);
394
    if (!runtime->IsNonUniformWorkgroupsSupported()) {
Y
yejianwu 已提交
395
      for (size_t i = 0; i < 2; ++i) {
396
        internal_gws[i] = RoundUp(gws[i], params[i]);
Y
yejianwu 已提交
397
      }
Y
yejianwu 已提交
398 399
    }

L
liuqi 已提交
400
    if (timer == nullptr) {
401 402 403
      uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
      const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1],
                                                       block_size);
L
liuqi 已提交
404
      for (uint32_t i = 0; i < num_blocks; ++i) {
405 406 407 408
        uint32_t gws1 = block_size;
        if (runtime->IsNonUniformWorkgroupsSupported()
            && (i == num_blocks - 1)) {
          gws1 = (internal_gws[1] - (i * block_size));
Y
yejianwu 已提交
409
        }
410 411 412 413
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NDRange(0, i * block_size),
            cl::NDRange(internal_gws[0], gws1),
            cl::NDRange(params[0], params[1]), nullptr, &event);
L
Liangliang He 已提交
414
        MACE_CHECK_CL_SUCCESS(error);
L
liuqi 已提交
415 416
      }
    } else {
417
      timer->ClearTiming();
418 419 420
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]),
          cl::NDRange(params[0], params[1]), nullptr, &event);
L
Liangliang He 已提交
421
      MACE_CHECK_CL_SUCCESS(error);
422 423 424 425 426 427
      timer->AccumulateTiming();
      tuning_result->assign(params.begin(), params.end());

      if (LimitKernelTime()) {
        double elapse_time = timer->AccumulatedMicros();
        timer->ClearTiming();
428 429
        uint32_t num_blocks = std::min(
            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
430 431 432 433 434 435
        uint32_t block_size = gws[1] / num_blocks;
        if (!runtime->IsNonUniformWorkgroupsSupported()) {
          block_size = RoundUp(block_size, params[1]);
        }
        (*tuning_result)[2] = block_size;
        num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
436
        for (uint32_t i = 0; i < num_blocks; ++i) {
437 438 439 440
          uint32_t gws1 = block_size;
          if (runtime->IsNonUniformWorkgroupsSupported()
              && (i == num_blocks - 1)) {
            gws1 = (internal_gws[1] - (i * block_size));
Y
yejianwu 已提交
441
          }
442 443 444 445
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, i * block_size),
              cl::NDRange(internal_gws[0], gws1),
              cl::NDRange(params[0], params[1]), nullptr, &event);
L
Liangliang He 已提交
446
          MACE_CHECK_CL_SUCCESS(error);
447 448
          timer->AccumulateTiming();
        }
L
liuqi 已提交
449 450 451 452 453
      }
    }
    return error;
  };
  OpenCLProfilingTimer timer(&event);
454 455
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);
L
liuqi 已提交
456 457 458 459 460 461 462 463 464 465
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
}

}  // namespace kernels
}  // namespace mace