Commit d246608c, authored by 刘琦

Merge branch 'dynamic_opencl' into 'master'

Refactor the OpenCL kernel build logic.

See merge request !105
@@ -48,8 +48,6 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
       LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
       return false;
     }
-    if (device_type_ == DeviceType::OPENCL)
-      OpenCLRuntime::Get()->command_queue().finish();
     if (op_stats) {
       op_stats->set_op_end_rel_micros(NowInMicroSec() -
                                       op_stats->all_start_micros());
......
@@ -7,8 +7,6 @@
 #include <memory>
 #include <mutex>
-#include <dirent.h>
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
@@ -32,57 +30,9 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
   return true;
 }
-bool BuildProgram(OpenCLRuntime *runtime,
-                  const std::string &path,
-                  cl::Program *program) {
-  MACE_CHECK_NOTNULL(program);
-
-  auto closer = [](DIR *d) {
-    if (d != nullptr) closedir(d);
-  };
-  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
-  MACE_CHECK_NOTNULL(dir.get());
-
-  const std::string kSourceSuffix = ".cl";
-  cl::Program::Sources sources;
-  errno = 0;
-  dirent *entry = readdir(dir.get());
-  MACE_CHECK(errno == 0);
-  while (entry != nullptr) {
-    if (entry->d_type == DT_REG) {
-      std::string d_name(entry->d_name);
-      if (d_name.size() > kSourceSuffix.size() &&
-          d_name.compare(d_name.size() - kSourceSuffix.size(),
-                         kSourceSuffix.size(), kSourceSuffix) == 0) {
-        std::string filename = path + d_name;
-        std::string kernel_source;
-        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
-        sources.push_back({kernel_source.c_str(), kernel_source.length()});
-      }
-    }
-    entry = readdir(dir.get());
-    MACE_CHECK(errno == 0);
-  };
-
-  *program = cl::Program(runtime->context(), sources);
-  std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
-  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  cl_int ret = program->build({runtime->device()}, build_options.c_str());
-  if (ret != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
-        CL_BUILD_ERROR) {
-      std::string build_log =
-          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-      LOG(INFO) << "Program build log: " << build_log;
-    }
-    LOG(FATAL) << "Build program failed: " << ret;
-  }
-  return true;
-}
 }  // namespace

 OpenCLRuntime *OpenCLRuntime::Get() {
   static std::once_flag init_once;
   static OpenCLRuntime *instance = nullptr;
@@ -140,7 +90,10 @@ OpenCLRuntime *OpenCLRuntime::Get() {
 OpenCLRuntime::OpenCLRuntime(cl::Context context,
                              cl::Device device,
                              cl::CommandQueue command_queue)
-    : context_(context), device_(device), command_queue_(command_queue) {}
+    : context_(context), device_(device), command_queue_(command_queue) {
+  const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
+}

 OpenCLRuntime::~OpenCLRuntime() {}
@@ -151,15 +104,76 @@ cl::Device &OpenCLRuntime::device() { return device_; }
 cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }

 cl::Program &OpenCLRuntime::program() {
-  // TODO(heliangliang) Support binary format
-  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
-  std::string path(kernel_path == nullptr ? "" : kernel_path);
-  std::call_once(build_flag_, [this, &path]() {
-    MACE_CHECK(BuildProgram(this, path, &program_));
-  });
-
-  return program_;
+  // TODO(liuqi) : useless, leave it for old code.
+  return program_;
+}
+
+// TODO(heliangliang) Support binary format
+const std::map<std::string, std::string>
+    OpenCLRuntime::program_map_ = {
+  {"addn", "addn.cl"},
+  {"batch_norm", "batch_norm.cl"},
+  {"conv_2d_1x1", "conv_2d_1x1.cl"},
+  {"conv_2d_3x3", "conv_2d_3x3.cl"},
+  {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
+  {"pooling", "pooling.cl"},
+  {"relu", "relu.cl"},
+  {"resize_bilinear", "resize_bilinear.cl"},
+  {"space_to_batch", "space_to_batch.cl"},
+};
+
+void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
+                                 const std::string &build_options,
+                                 cl::Program *program) {
+  MACE_CHECK_NOTNULL(program);
+
+  cl::Program::Sources sources;
+  std::string filename = kernel_path_ + program_file_name;
+  std::string kernel_source;
+  MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+  sources.push_back({kernel_source.c_str(), kernel_source.length()});
+
+  *program = cl::Program(this->context(), sources);
+  std::string build_options_str = build_options +
+      " -Werror -cl-mad-enable -cl-fast-relaxed-math -I" + kernel_path_;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  cl_int ret = program->build({device()}, build_options_str.c_str());
+  if (ret != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed: " << ret;
+  }
+}
+
+cl::Kernel OpenCLRuntime::BuildKernel(const std::string &program_name,
+                                      const std::string &kernel_name,
+                                      const std::set<std::string> &build_options) {
+  auto kernel_program_it = program_map_.find(program_name);
+  if (kernel_program_it == program_map_.end()) {
+    MACE_CHECK(false, program_name, " opencl kernel doesn't exist.");
+  }
+  std::string program_file_name = kernel_program_it->second;
+  std::string build_options_str;
+  for (auto &option : build_options) {
+    build_options_str += " " + option;
+  }
+  std::string built_program_key = program_name + build_options_str;
+
+  std::lock_guard<std::mutex> lock(program_build_mutex_);
+  auto built_program_it = built_program_map_.find(built_program_key);
+  cl::Program program;
+  if (built_program_it != built_program_map_.end()) {
+    program = built_program_it->second;
+  } else {
+    this->BuildProgram(program_file_name, build_options_str, &program);
+    built_program_map_.emplace(built_program_key, program);
+  }
+  return cl::Kernel(program, kernel_name.c_str());
 }

 uint32_t OpenCLRuntime::GetDeviceMaxWorkGroupSize() {
......
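As a rough usage sketch of the new interface (the op, options, and function name here are illustrative, not taken from the patch): a kernel implementation asks the runtime for a per-option program build, with MACE_KERNEL_PATH pointing at the directory that holds the .cl sources.

#include <set>
#include <string>
#include "mace/core/runtime/opencl/opencl_runtime.h"

// Sketch only: "relu" is one of the entries registered in program_map_ above;
// in real callers the DATA_TYPE value comes from DataTypeToCLType(...).
void LaunchReluExample() {
  // The runtime appends '/' itself, so MACE_KERNEL_PATH is a plain directory
  // (e.g. MACE_KERNEL_PATH=/data/local/tmp/cl) set before the process starts.
  auto runtime = mace::OpenCLRuntime::Get();

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=float");

  // The first call compiles relu.cl with these options and caches the result
  // under the key "relu -DDATA_TYPE=float"; later calls with the same options
  // reuse the cached cl::Program and only create a new cl::Kernel handle.
  cl::Kernel relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
  (void)relu_kernel;  // argument setup and enqueue omitted in this sketch
}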
@@ -7,6 +7,7 @@
 #include <map>
 #include <mutex>
+#include <set>
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -17,12 +18,16 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
-  uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
   cl::Program &program();
+  uint32_t GetDeviceMaxWorkGroupSize();
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  cl::Kernel BuildKernel(const std::string &program_name,
+                         const std::string &kernel_name,
+                         const std::set<std::string> &build_options);
 private:
   OpenCLRuntime(cl::Context context,
                 cl::Device device,
@@ -31,12 +36,21 @@ class OpenCLRuntime {
   OpenCLRuntime(const OpenCLRuntime&) = delete;
   OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
+  void BuildProgram(const std::string &kernel_name,
+                    const std::string &build_options,
+                    cl::Program *program);
 private:
   cl::Context context_;
   cl::Device device_;
   cl::CommandQueue command_queue_;
   cl::Program program_;
-  std::once_flag build_flag_;
+  std::mutex program_build_mutex_;
+  std::string kernel_path_;
+  static const std::map<std::string,
+                        std::string> program_map_;
+  mutable std::map<std::string,
+                   cl::Program> built_program_map_;
 };

 }  // namespace mace
......
@@ -12,6 +12,7 @@ bool DataTypeCanUseMemcpy(DataType dt) {
     case DT_DOUBLE:
     case DT_INT32:
     case DT_INT64:
+    case DT_UINT32:
     case DT_UINT16:
     case DT_UINT8:
     case DT_INT16:
@@ -23,4 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
+
+std::string DataTypeToCLType(const DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+      return "float";
+    case DT_HALF:
+      return "half";
+    case DT_UINT8:
+      return "uchar";
+    case DT_INT8:
+      return "char";
+    case DT_DOUBLE:
+      return "double";
+    case DT_INT32:
+      return "int";
+    case DT_UINT32:
+      return "int";
+    case DT_UINT16:
+      return "ushort";
+    case DT_INT16:
+      return "short";
+    case DT_INT64:
+      return "long";
+    default:
+      LOG(FATAL) << "Unsupported data type";
+      return "";
+  }
+}
+
 }  // namespace mace
\ No newline at end of file
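A small illustrative check (not part of the patch; the include path is an assumption) of how these mappings are meant to be consumed. Note that both DT_INT32 and DT_UINT32 deliberately map to the OpenCL type "int".

#include <cassert>
#include <string>
#include "mace/core/types.h"  // assumed header path for DataType / DataTypeToCLType

void DataTypeToCLTypeExample() {
  using namespace mace;
  // The returned string is spliced into a build option such as
  // "-DDATA_TYPE=" + DataTypeToCLType(tensor->dtype()).
  assert(DataTypeToCLType(DT_FLOAT) == "float");
  assert(DataTypeToCLType(DT_HALF) == "half");
  assert(DataTypeToCLType(DT_UINT32) == "int");  // unsigned 32-bit reuses "int"
}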
@@ -12,6 +12,8 @@ namespace mace {
 bool DataTypeCanUseMemcpy(DataType dt);

+std::string DataTypeToCLType(const DataType dt);
+
 template <class T>
 struct IsValidDataType;
@@ -50,6 +52,7 @@ MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
 MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
 MATCH_TYPE_AND_ENUM(string, DT_STRING);
 MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
+MATCH_TYPE_AND_ENUM(uint32_t, DT_UINT32);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);

 static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
......
@@ -15,9 +15,9 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
   const uint32_t gws = blocks;

   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto addn_kernel = cl::Kernel(program, "add2");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(output->dtype()));
+  auto addn_kernel = runtime->BuildKernel("addn", "add2", built_options);

   const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
......
@@ -27,10 +27,10 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
                            static_cast<uint32_t>(input->dim(1)),
                            static_cast<uint32_t>(blocks)};
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "batch_norm");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   const std::vector<uint32_t> lws = {1, 1, kwg_size};
@@ -63,7 +63,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
         cl::NDRange(gws[0], gws[1], gws[2]),
         cl::NDRange(params[0], params[1], params[2]));
-    MACE_CHECK(error == CL_SUCCESS);
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
     return error;
   };
   std::stringstream ss;
......
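The added "Error code: " message prints the raw cl_int. A hedged sketch (hypothetical helper, not part of this change) of translating the most common enqueue failures into readable names:

#include <string>
#include "mace/core/runtime/opencl/cl2_header.h"  // assumed to pull in the CL headers

// Hypothetical helper: covers only a few frequent enqueueNDRangeKernel errors.
std::string OpenCLErrorName(cl_int error) {
  switch (error) {
    case CL_SUCCESS:                 return "CL_SUCCESS";
    case CL_INVALID_KERNEL_ARGS:     return "CL_INVALID_KERNEL_ARGS";
    case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE";
    case CL_OUT_OF_RESOURCES:        return "CL_OUT_OF_RESOURCES";
    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
      return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    default:                         return "UNKNOWN(" + std::to_string(error) + ")";
  }
}
// Usage: MACE_CHECK(error == CL_SUCCESS) << "Error: " << OpenCLErrorName(error);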
-__kernel void add2(__global const float *input0,
-                   __global const float *input1,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void add2(__global const DATA_TYPE *input0,
+                   __global const DATA_TYPE *input1,
                    __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
   int idx = get_global_id(0);
   if (idx + 4 > size) {
@@ -9,8 +12,8 @@ __kernel void add2(__global const float *input0,
       *(output+idx) = *(input0+idx) + *(input1+idx);
     }
   } else {
-    float4 in_data0 = vload4(idx, input0);
-    float4 in_data1 = vload4(idx, input1);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data0 = vload4(idx, input0);
+    VEC_DATA_TYPE(DATA_TYPE,4) in_data1 = vload4(idx, input1);
     vstore4(in_data0+in_data1, idx, output);
   }
 }
......
-void kernel batch_norm(global const float *input,
-                       global const float *scale,
-                       global const float *offset,
-                       global const float *mean,
-                       global const float *var,
-                       global const float *epsilon,
+#include <common.h>
+// Supported data types: half/float
+void kernel batch_norm(global const DATA_TYPE *input,
+                       global const DATA_TYPE *scale,
+                       global const DATA_TYPE *offset,
+                       global const DATA_TYPE *mean,
+                       global const DATA_TYPE *var,
+                       global const DATA_TYPE *epsilon,
                        private const int pixels,
-                       global float *output,
-                       __local float4 *new_scale,
-                       __local float4 *new_offset) {
+                       global DATA_TYPE *output,
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_scale,
+                       __local VEC_DATA_TYPE(DATA_TYPE, 4) *new_offset) {
   const int batch = get_global_id(0);
   const int channel = get_global_id(1);
   const int channels = get_global_size(1);
@@ -23,8 +25,8 @@ void kernel batch_norm(global const float *input,
   barrier(CLK_LOCAL_MEM_FENCE);
   const int image_offset = (batch * channels + channel) * pixels + pixel_offset*4;
-  const float *input_ptr = input + image_offset;
-  float *output_ptr = output + image_offset;
+  const DATA_TYPE *input_ptr = input + image_offset;
+  DATA_TYPE *output_ptr = output + image_offset;
   const int end = (batch * channels + channel + 1) * pixels;
   if ((image_offset+4) > end) {
     for (int i = image_offset; i < end; ++i) {
@@ -33,7 +35,7 @@ void kernel batch_norm(global const float *input,
       ++output_ptr;
     }
   } else {
-    float4 values = vload4(0, input_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input_ptr);
     values = values * new_scale[local_channel] + new_offset[local_channel];
     vstore4(values, 0, output_ptr);
   }
......
@@ -8,4 +8,7 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
+#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
+
 #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
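For readers unfamiliar with the two-level macro, a minimal host-side sketch (not part of the patch; the float4 struct is a stand-in for the OpenCL built-in vector type so the expansion can be checked with an ordinary C/C++ compiler) shows why the indirection through VEC_DATA_TYPE_STR is needed:

// Illustrative only.
struct float4 { float s0, s1, s2, s3; };

#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)

// DATA_TYPE is normally injected by the runtime via "-DDATA_TYPE=float".
#define DATA_TYPE float

// VEC_DATA_TYPE(DATA_TYPE, 4) expands DATA_TYPE to float first, then pastes
// the tokens, yielding the type name float4.  Pasting in a single macro
// (data_type##size) would instead produce the non-existent name DATA_TYPE4.
VEC_DATA_TYPE(DATA_TYPE, 4) v = {0.f, 0.f, 0.f, 0.f};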
...@@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */ ...@@ -25,31 +25,31 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
} }
#define vec_conv_2d_1x1_s1 \ #define vec_conv_2d_1x1_s1 \
float4 in0 = vload4(0, input_ptr); \ VEC_DATA_TYPE(DATA_TYPE,4) in0 = vload4(0, input_ptr); \
float4 in1 = vload4(0, input_ptr + in_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) in1 = vload4(0, input_ptr + in_pixel); \
float4 in2 = vload4(0, input_ptr + 2 * in_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) in2 = vload4(0, input_ptr + 2 * in_pixel); \
float4 in3 = vload4(0, input_ptr + 3 * in_pixel); VEC_DATA_TYPE(DATA_TYPE,4) in3 = vload4(0, input_ptr + 3 * in_pixel);
#define vec_conv_2d_1x1_s2 \ #define vec_conv_2d_1x1_s2 \
float4 in00 = vload4(0, input_ptr); \ VEC_DATA_TYPE(DATA_TYPE,4) in00 = vload4(0, input_ptr); \
float3 in01 = vload3(0, input_ptr + 4); \ VEC_DATA_TYPE(DATA_TYPE,3) in01 = vload3(0, input_ptr + 4); \
float4 in10 = vload4(0, input_ptr + in_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) in10 = vload4(0, input_ptr + in_pixel); \
float3 in11 = vload3(0, input_ptr + in_pixel + 4); \ VEC_DATA_TYPE(DATA_TYPE,3) in11 = vload3(0, input_ptr + in_pixel + 4); \
float4 in20 = vload4(0, input_ptr + 2 * in_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) in20 = vload4(0, input_ptr + 2 * in_pixel); \
float3 in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\ VEC_DATA_TYPE(DATA_TYPE,3) in21 = vload3(0, input_ptr + 2 * in_pixel + 4);\
float4 in30 = vload4(0, input_ptr + 3 * in_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) in30 = vload4(0, input_ptr + 3 * in_pixel); \
float3 in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \ VEC_DATA_TYPE(DATA_TYPE,3) in31 = vload3(0, input_ptr + 3 * in_pixel + 4); \
float4 in0 = (float4)(in00.s02, in01.s02); \ VEC_DATA_TYPE(DATA_TYPE,4) in0 = (VEC_DATA_TYPE(DATA_TYPE,4))(in00.s02, in01.s02); \
float4 in1 = (float4)(in10.s02, in11.s02); \ VEC_DATA_TYPE(DATA_TYPE,4) in1 = (VEC_DATA_TYPE(DATA_TYPE,4))(in10.s02, in11.s02); \
float4 in2 = (float4)(in20.s02, in21.s02); \ VEC_DATA_TYPE(DATA_TYPE,4) in2 = (VEC_DATA_TYPE(DATA_TYPE,4))(in20.s02, in21.s02); \
float4 in3 = (float4)(in30.s02, in31.s02); VEC_DATA_TYPE(DATA_TYPE,4) in3 = (VEC_DATA_TYPE(DATA_TYPE,4))(in30.s02, in31.s02);
#define vec_conv_2d_1x1_compute_loop \ #define vec_conv_2d_1x1_compute_loop \
for (int oc = 0; oc < 4; ++oc) { \ for (int oc = 0; oc < 4; ++oc) { \
float4 weights = vload4(0, filter_ptr + oc * in_chan_num); \ VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr + oc * in_chan_num); \
float4 out = vload4(0, output_ptr + oc * out_pixel); \ VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr + oc * out_pixel); \
out += in0 * weights.x; \ out += in0 * weights.x; \
out += in1 * weights.y; \ out += in1 * weights.y; \
out += in2 * weights.z; \ out += in2 * weights.z; \
...@@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */ ...@@ -58,25 +58,27 @@ __kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
} }
#define vec_conv_2d_1x1_compute \ #define vec_conv_2d_1x1_compute \
float4 weights = vload4(0, filter_ptr); \ VEC_DATA_TYPE(DATA_TYPE,4) weights = vload4(0, filter_ptr); \
float4 out = vload4(0, output_ptr); \ VEC_DATA_TYPE(DATA_TYPE,4) out = vload4(0, output_ptr); \
out += in0 * weights.x; \ out += in0 * weights.x; \
out += in1 * weights.y; \ out += in1 * weights.y; \
out += in2 * weights.z; \ out += in2 * weights.z; \
out += in3 * weights.w; \ out += in3 * weights.w; \
vstore4(out, 0, output_ptr); vstore4(out, 0, output_ptr);
__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ // Supported data type: half/float
__global const float *filter, /* o, i, kh, kw */ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
__global const float *bias, /* o */ __global const DATA_TYPE *filter, /* o, i, kh, kw */
__global float *output, /* n, c, h, w */ #ifdef BIAS
__global const DATA_TYPE *bias, /* o */
#endif /* defined(BIAS) */
__global DATA_TYPE *output, /* n, c, h, w */
__private const int in_chan_num, __private const int in_chan_num,
__private const int out_chan_num, __private const int out_chan_num,
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_height, __private const int out_height,
__private const int out_width, __private const int out_width) {
__private const int stride) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_blk = get_global_id(1); int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2); int out_pixel_blk = get_global_id(2);
...@@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ ...@@ -92,20 +94,30 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel; const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel; const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin; const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin; DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
int out_chan_len = out_chan_end - out_chan_begin; int out_chan_len = out_chan_end - out_chan_begin;
int pixel_len = out_pixel_end - out_pixel_begin; int pixel_len = out_pixel_end - out_pixel_begin;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float *output_ptr = output_base + out_chan * out_pixel; DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
float bias_value = bias == NULL ? 0 : bias[out_chan]; #ifdef BIAS
DATA_TYPE bias_value = bias[out_chan];
#else
DATA_TYPE bias_value = 0;
#endif
for (int p = 0; p < pixel_len; ++p) { for (int p = 0; p < pixel_len; ++p) {
output_ptr[p] = bias_value; output_ptr[p] = bias_value;
} }
...@@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ ...@@ -113,48 +125,37 @@ __kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
int in_chan = 0; int in_chan = 0;
if (pixel_len == 4) { if (pixel_len == 4) {
if (stride == 1) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) { for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const float *input_ptr = input_base + in_chan * in_pixel; const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin; int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) { for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * out_pixel; DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
vec_conv_2d_1x1_s1; #ifdef STRIDE_1
vec_conv_2d_1x1_compute_loop;
}
for (; out_chan < out_chan_end; ++out_chan) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * out_pixel;
vec_conv_2d_1x1_s1; vec_conv_2d_1x1_s1;
vec_conv_2d_1x1_compute; #else
}
}
} else if (stride == 2) {
for (; in_chan + 3 < in_chan_num; in_chan += 4) {
const float *input_ptr = input_base + in_chan * in_pixel;
int out_chan = out_chan_begin;
for (; out_chan + 3 < out_chan_end; out_chan += 4) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * out_pixel;
vec_conv_2d_1x1_s2; vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute_loop; vec_conv_2d_1x1_compute_loop;
} }
for (; out_chan < out_chan_end; ++out_chan) { for (; out_chan < out_chan_end; ++out_chan) {
const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; const DATA_TYPE* filter_ptr = filter + out_chan * in_chan_num + in_chan;
float *output_ptr = output_base + out_chan * out_pixel; DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
#ifdef STRIDE_1
vec_conv_2d_1x1_s1;
#else
vec_conv_2d_1x1_s2; vec_conv_2d_1x1_s2;
#endif
vec_conv_2d_1x1_compute; vec_conv_2d_1x1_compute;
} }
} }
} }
}
for (; in_chan < in_chan_num; ++in_chan) { for (; in_chan < in_chan_num; ++in_chan) {
const float *input_ptr = input_base + in_chan * in_pixel; const DATA_TYPE *input_ptr = input_base + in_chan * in_pixel;
for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
float weights = filter[out_chan * in_chan_num + in_chan]; DATA_TYPE weights = filter[out_chan * in_chan_num + in_chan];
float *output_ptr = output_base + out_chan * out_pixel; DATA_TYPE *output_ptr = output_base + out_chan * out_pixel;
for (int p = 0; p < pixel_len; ++p) { for (int p = 0; p < pixel_len; ++p) {
float in = input_ptr[p*stride]; float in = input_ptr[p*stride];
......
#include <conv_helper.h> #include <common.h>
void kernel conv_2d_3x3(global const float *input,
global const float *filter, VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
global const float *bias, const DATA_TYPE *filter_ptr) {
global float *output, VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr,
const int row_width) {
VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(1, filter_ptr);
res += input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(2, filter_ptr);
res += input_value * filter_value;
return res.s0 + res.s1 + res.s2;
}
void kernel conv_2d_3x3(global const DATA_TYPE *input,
global const DATA_TYPE *filter,
#ifdef BIAS
global const DATA_TYPE *bias,
#endif
global DATA_TYPE *output,
private const int in_chan_num, private const int in_chan_num,
private const int out_chan_num, private const int out_chan_num,
private const int in_height, private const int in_height,
private const int in_width, private const int in_width,
private const int out_height, private const int out_height,
private const int out_width, private const int out_width) {
private const int stride_h,
private const int stride_w) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_blk = get_global_id(1); int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2); int out_pixel_blk = get_global_id(2);
...@@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input, ...@@ -26,46 +70,54 @@ void kernel conv_2d_3x3(global const float *input,
const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; #ifdef STRIDE_1
const int stride = 1;
#else
const int stride = 2;
#endif
const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4;
const int in_offset = batch * in_chan_num * in_pixel; const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel; const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin; const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin; DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) { for (int i = out_chan_begin; i < out_chan_end; ++i) {
float *output_ptr = output_base + i * out_pixel; DATA_TYPE *output_ptr = output_base + i * out_pixel;
const float *filter_base = filter + i * in_chan_num * 9; const DATA_TYPE *filter_base = filter + i * in_chan_num * 9;
if (pixels == 4) { if (pixels == 4) {
#ifdef BIAS
VEC_DATA_TYPE(DATA_TYPE, 4) res = (VEC_DATA_TYPE(DATA_TYPE, 4))bias[i];
#else
VEC_DATA_TYPE(DATA_TYPE, 4) res = 0;
#endif
float4 res = bias == NULL ? 0 : (float4)bias[i];
if (stride_w == 1) {
for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel; const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel;
const float* filter_ptr = filter_base + in_chan_idx * 9; const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
#ifdef STRIDE_1
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
} #else
} else {
for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel;
const float* filter_ptr = filter_base + in_chan_idx * 9;
res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
} #endif
} }
vstore4(res, 0, output_ptr); vstore4(res, 0, output_ptr);
} else { } else {
for (int p = 0; p < pixels; ++p) { for (int p = 0; p < pixels; ++p) {
float res = bias == NULL ? 0 : bias[i]; #ifdef BIAS
DATA_TYPE res = bias[i];
#else
DATA_TYPE res = 0;
#endif
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w; const DATA_TYPE *input_ptr = input_base + in_chan_idx * in_pixel + p * stride;
const float* filter_ptr = filter_base + in_chan_idx * 9; const DATA_TYPE *filter_ptr = filter_base + in_chan_idx * 9;
res += conv3x3(input_ptr, filter_ptr, in_width); res += conv3x3(input_ptr, filter_ptr, in_width);
} }
output_ptr[p] = res; output_ptr[p] = res;
......
float4 conv1x3_s1(const float *input_ptr,
const float *filter_ptr) {
float4 row0 = vload4(0, input_ptr);
float2 input1 = vload2(0, input_ptr+4);
float4 row1 = (float4)(row0.s123, input1.s0);
float4 row2 = (float4)(row0.s23, input1.s01);
float3 filter_values = vload3(0, filter_ptr);
return (float4)filter_values.s0 * row0 +
(float4)filter_values.s1 * row1 +
(float4)filter_values.s2 * row2;
}
float4 conv1x3_s2(const float *input_ptr,
const float *filter_ptr) {
float8 input = vload8(0, input_ptr);
float4 row0 = input.even;
float4 row1 = input.odd;
float4 row2 = (float4)(row0.s123, input_ptr[8]);
float3 filter_values = vload3(0, filter_ptr);
return (float4)filter_values.s0 * row0 +
(float4)filter_values.s1 * row1 +
(float4)filter_values.s2 * row2;
}
float conv3x3(const float *input_ptr,
const float *filter_ptr,
const int row_width) {
float3 input_value = vload3(0, input_ptr);
float3 filter_value = vload3(0, filter_ptr);
float3 res = input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(1, filter_ptr);
res += input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(2, filter_ptr);
res += input_value * filter_value;
return res.s0 + res.s1 + res.s2;
}
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
float4 conv1x3_s1(const float *input_ptr,
const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr,
const float *filter_ptr);
float conv3x3(const float *input_ptr,
const float *filter_ptr,
const int row_width);
#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
#include <conv_helper.h> #include <common.h>
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr) {
VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
(VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}
// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
const DATA_TYPE *filter_ptr,
const int row_width) {
VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(1, filter_ptr);
res += input_value * filter_value;
input_ptr += row_width;
input_value = vload3(0, input_ptr);
filter_value = vload3(2, filter_ptr);
res += input_value * filter_value;
return res.s0 + res.s1 + res.s2;
}
//TODO merge the depthwise with conv 3x3 to remove duplicate code. //TODO merge the depthwise with conv 3x3 to remove duplicate code.
void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ __kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */ __global const DATA_TYPE *filter, /* m, i, kh, kw */
global const float *bias, /* o */ #ifdef BIAS
global float *output, /* n, c, h, w */ __global const DATA_TYPE *bias, /* o */
private const int in_chan_num, #endif
private const int out_chan_num, __global DATA_TYPE *output, /* n, c, h, w */
private const int in_height, __private const int in_chan_num,
private const int in_width, __private const int out_chan_num,
private const int out_height, __private const int in_height,
private const int out_width, __private const int in_width,
private const int stride_h, __private const int out_height,
private const int stride_w) { __private const int out_width) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_blk = get_global_id(1); int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2); int out_pixel_blk = get_global_id(2);
...@@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ ...@@ -28,38 +71,54 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; #ifdef STRIDE_1
const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
#else
const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
#endif
const int in_offset = batch * in_chan_num * in_pixel; const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel; const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin; const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin; DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) { for (int i = out_chan_begin; i < out_chan_end; ++i) {
float bias_value = bias[i]; const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
const float *input_ptr = input_base + (i / multiplier) * in_pixel; const DATA_TYPE *filter_ptr = filter + i * 9;
const float *filter_ptr = filter + i * 9; DATA_TYPE *output_ptr = output_base + i * out_pixel;
float *output_ptr = output_base + i * out_pixel;
if (pixels == 4) { if (pixels == 4) {
float4 res = (float4)bias[i]; #ifdef BIAS
if (stride_w == 1) { VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
#else
VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
#endif /* defined(BIAS) */
#ifdef STRIDE_1
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
} else { #else
res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
} #endif
vstore4(res, 0, output_ptr); vstore4(res, 0, output_ptr);
} else { } else {
for (int p = 0; p < pixels; ++p) { for (int p = 0; p < pixels; ++p) {
float res = bias[i]; #ifdef BIAS
DATA_TYPE res = bias[i];
#else
DATA_TYPE res = 0;
#endif
res += conv3x3(input_ptr, filter_ptr, in_width); res += conv3x3(input_ptr, filter_ptr, in_width);
output_ptr[p] = res; output_ptr[p] = res;
input_ptr += stride_w; #ifdef STRIDE_1
input_ptr += 1;
#else
input_ptr += 2;
#endif
} }
} }
} }
......
float4 vec_pooling_3_s1(const float *input_ptr, const int in_width) { #include <common.h>
float4 row00 = vload4(0, input_ptr);
float2 row01 = vload2(0, input_ptr + 4); VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) {
float4 row10 = vload4(0, input_ptr + in_width); VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr);
float2 row11 = vload2(0, input_ptr + in_width + 4); VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4);
float4 row20 = vload4(0, input_ptr + in_width * 2); VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width);
float2 row21 = vload2(0, input_ptr + in_width * 2 + 4); VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4);
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2);
float8 data00 = (float8)(row00.s01212323); VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4);
float4 data01 = (float4)(row01.s0, row00.s3, row01.s01);
float8 data10 = (float8)(row10.s01212323); VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323);
float4 data11 = (float4)(row11.s0, row10.s3, row11.s01); VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01);
float8 data20 = (float8)(row20.s01212323); VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323);
float4 data21 = (float4)(row21.s0, row20.s3, row21.s01); VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01);
VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323);
float8 left = fmax(fmax(data00, data10), data20); VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01);
float4 right = fmax(fmax(data01, data11), data21);
VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2)); VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
res = fmax(res, (float4)(left.s25, right.s03));
VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res; return res;
} }
float4 vec_pooling_3_s2(const float *input_ptr, const int in_width) {
float8 row00 = vload8(0, input_ptr); VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) {
float row01 = *(input_ptr + 8); VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr);
float8 row10 = vload8(0, input_ptr + in_width); DATA_TYPE row01 = *(input_ptr + 8);
float row11 = *(input_ptr + in_width + 8); VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width);
float8 row20 = vload8(0, input_ptr + in_width * 2); DATA_TYPE row11 = *(input_ptr + in_width + 8);
float row21 = *(input_ptr + in_width * 2 + 8); VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2);
DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8);
float8 data00 = (float8)(row00.s01223445);
float4 data01 = (float4)(row00.s667, row01); VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445);
float8 data10 = (float8)(row10.s01223445); VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01);
float4 data11 = (float4)(row10.s667, row11); VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445);
float8 data20 = (float8)(row20.s01223445); VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11);
float4 data21 = (float4)(row20.s667, row21); VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445);
VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21);
float8 left = fmax(fmax(data00, data10), data20);
float4 right = fmax(fmax(data01, data11), data21); VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20);
VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21);
float4 res = fmax((float4)(left.s036, right.s1), (float4)(left.s147, right.s2));
res = fmax(res, (float4)(left.s25, right.s03)); VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1),
(VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2));
res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03));
return res; return res;
} }
float inner_pooling_3(const float *input_ptr, const int in_width) { DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) {
float3 row0 = vload3(0, input_ptr); VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr);
float3 row1 = vload3(0, input_ptr + in_width); VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width);
float3 row2 = vload3(0, input_ptr + in_width * 2); VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2);
float3 data = fmax(fmax(row0, row1), row2); VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2);
float res = fmax(fmax(data.s0, data.s1), data.s2); DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2);
return res; return res;
} }
__kernel void pooling3(__global const float *input, /* n, c, h, w */ // Supported data type: half/float
__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_chan_num, __private const int out_chan_num,
__private const int out_height, __private const int out_height,
__private const int out_width, __private const int out_width,
__private const int stride, __private const int stride,
__global float *output) { __global DATA_TYPE *output) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_blk = get_global_id(1); int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2); int out_pixel_blk = get_global_id(2);
...@@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */ ...@@ -83,21 +89,21 @@ __kernel void pooling3(__global const float *input, /* n, c, h, w */
const int in_offset = batch * out_chan_num * in_pixel; const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel; const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin; const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin; DATA_TYPE *output_base = output + out_offset + out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin; const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) { for (int i = out_chan_begin; i < out_chan_end; ++i) {
const float *input_ptr = input_base + i * in_pixel; const DATA_TYPE *input_ptr = input_base + i * in_pixel;
float *output_ptr = output_base + i * out_pixel; DATA_TYPE *output_ptr = output_base + i * out_pixel;
if (pixels == 4) { if (pixels == 4) {
float4 res; VEC_DATA_TYPE(DATA_TYPE,4) res;
if (stride == 1) { #ifdef STRIDE_1
res = vec_pooling_3_s1(input_ptr, in_width); res = vec_pooling_3_s1(input_ptr, in_width);
} else { #else
res = vec_pooling_3_s2(input_ptr, in_width); res = vec_pooling_3_s2(input_ptr, in_width);
} #endif
vstore4(res, 0, output_ptr); vstore4(res, 0, output_ptr);
} else { } else {
for (int p = 0; p < pixels; ++p) { for (int p = 0; p < pixels; ++p) {
...@@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h, ...@@ -122,7 +128,8 @@ int calculate_avg_block_size(const int pos_h,
return (h_end - h_start) * (w_end - w_start); return (h_end - h_start) * (w_end - w_start);
} }
__kernel void poolingn(__global const float *input, /* n, c, h, w */ // Supported data type: half/float
__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */
__private const int in_height, __private const int in_height,
__private const int in_width, __private const int in_width,
__private const int out_chan_num, __private const int out_chan_num,
...@@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ ...@@ -132,7 +139,7 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
__private const int pad_h, __private const int pad_h,
__private const int pad_w, __private const int pad_w,
__private const int pooling_size, __private const int pooling_size,
__global float *output) { __global DATA_TYPE *output) {
int batch = get_global_id(0); int batch = get_global_id(0);
int out_chan_idx = get_global_id(1); int out_chan_idx = get_global_id(1);
int out_pixel_idx = get_global_id(2); int out_pixel_idx = get_global_id(2);
...@@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ ...@@ -150,8 +157,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
const int in_offset = batch * out_chan_num * in_pixel; const int in_offset = batch * out_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel; const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_idx; const DATA_TYPE *input_base = input + in_offset + in_pixel_idx;
float *output_base = output + out_offset + out_pixel_idx; DATA_TYPE *output_base = output + out_offset + out_pixel_idx;
const int block_size = calculate_avg_block_size( const int block_size = calculate_avg_block_size(
out_pixel_height * stride, out_pixel_height * stride,
...@@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ ...@@ -162,14 +169,14 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
in_height - pad_h, in_height - pad_h,
in_width - pad_w); in_width - pad_w);
for (int i = out_chan_begin; i < out_chan_end; ++i) { for (int i = out_chan_begin; i < out_chan_end; ++i) {
float8 sum8 = 0.0f; VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f;
float sum1 = 0.0f; DATA_TYPE sum1 = 0.0f;
float *output_ptr = output_base + i * out_pixel; DATA_TYPE *output_ptr = output_base + i * out_pixel;
for (int y = 0; y < pooling_size; ++y) { for (int y = 0; y < pooling_size; ++y) {
const float *input_ptr = input_base + i * in_pixel + y * in_width; const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width;
int x = 0; int x = 0;
for (; x < (pooling_size-8); x += 8) { for (; x < (pooling_size-8); x += 8) {
float8 data = vload8(0, input_ptr); VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr);
sum8 += data; sum8 += data;
input_ptr += 8; input_ptr += 8;
} }
...@@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */ ...@@ -178,8 +185,8 @@ __kernel void poolingn(__global const float *input, /* n, c, h, w */
input_ptr++; input_ptr++;
} }
} }
float4 sum4 = sum8.s0123 + sum8.s4567; VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567;
float2 sum2 = sum4.s01 + sum4.s23; VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23;
*output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size;
} }
......
-__kernel void relu(__global const float *input,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void relu(__global const DATA_TYPE *input,
                    __private const int size,
-                   __global float *output) {
+                   __global DATA_TYPE *output) {
   int idx = get_global_id(0);
   if (idx + 4 > size) {
@@ -8,16 +11,16 @@ __kernel void relu(__global const float *input,
       *(output+idx) = fmax(*(input+idx), 0);
     }
   } else {
-    float4 data = vload4(idx, input);
-    data = fmax(data, (float4)0);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
+    data = fmax(data, (VEC_DATA_TYPE(DATA_TYPE,4))0);
     vstore4(data, idx, output);
   }
 }

-__kernel void relux(__global const float *input,
-                    __private const float max_limit,
+__kernel void relux(__global const DATA_TYPE *input,
+                    __private const DATA_TYPE max_limit,
                     __private const int size,
-                    __global float *output) {
+                    __global DATA_TYPE *output) {
   int idx = get_global_id(0);
   if (idx + 4 > size) {
@@ -25,8 +28,8 @@ __kernel void relux(__global const float *input,
       *(output+idx) = clamp(*(input+idx), 0.0f, max_limit);
     }
   } else {
-    float4 data = vload4(idx, input);
-    data = clamp(data, (float4)0, (float4)max_limit);
+    VEC_DATA_TYPE(DATA_TYPE,4) data = vload4(idx, input);
+    data = clamp(data, (VEC_DATA_TYPE(DATA_TYPE,4))0, (VEC_DATA_TYPE(DATA_TYPE,4))max_limit);
     vstore4(data, idx, output);
   }
 }
-__kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h, w */
-                                      __global float *output /* n * c, h, w */,
+#include <common.h>
+
+// Supported data type: half/float
+__kernel void resize_bilinear_nocache(__global const DATA_TYPE *input, /* n * c, h, w */
+                                      __global DATA_TYPE *output /* n * c, h, w */,
                                       __private const float height_scale,
                                       __private const float width_scale,
                                       __private const int in_height,
@@ -21,16 +24,16 @@ __kernel void resize_bilinear_nocache(__global const float *input, /* n * c, h,
   const float h_lerp = h_in - h_lower;
   const float w_lerp = w_in - w_lower;

-  const float *input_base = input + c * in_height * in_width;
-  float *output_base = output + c * height * width;
+  const DATA_TYPE *input_base = input + c * in_height * in_width;
+  DATA_TYPE *output_base = output + c * height * width;

-  float top_left = input_base[h_lower * in_width + w_lower];
-  float top_right = input_base[h_lower * in_width + w_upper];
-  float bottom_left = input_base[h_upper * in_width + w_lower];
-  float bottom_right = input_base[h_upper * in_width + w_upper];
+  DATA_TYPE top_left = input_base[h_lower * in_width + w_lower];
+  DATA_TYPE top_right = input_base[h_lower * in_width + w_upper];
+  DATA_TYPE bottom_left = input_base[h_upper * in_width + w_lower];
+  DATA_TYPE bottom_right = input_base[h_upper * in_width + w_upper];

-  const float top = top_left + (top_right - top_left) * w_lerp;
-  const float bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+  const DATA_TYPE top = top_left + (top_right - top_left) * w_lerp;
+  const DATA_TYPE bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
   output_base[h * width + w] = top + (bottom - top) * h_lerp;
 }
-void kernel space_to_batch(global float *space_data_ptr,
-                           global const int *block_shape_ptr,
-                           global const int *paddings_ptr,
-                           private const int space_batch,
-                           private const int space_channel,
-                           private const int space_height,
-                           private const int space_width,
-                           private const int batch_height,
-                           private const int batch_width,
-                           private const int b2s,
-                           global float* batch_data_ptr) {
+#include <common.h>
+
+// Supported data type: all
+__kernel void space_to_batch(__global DATA_TYPE *space_data_ptr,
+                             __global const int *block_shape_ptr,
+                             __global const int *paddings_ptr,
+                             __private const int space_batch,
+                             __private const int space_channel,
+                             __private const int space_height,
+                             __private const int space_width,
+                             __private const int batch_height,
+                             __private const int batch_width,
+                             __private const int b2s,
+                             __global DATA_TYPE* batch_data_ptr) {
   int batch_idx = get_global_id(0);
   int batch_channel_idx = get_global_id(1);
   int batch_pixel_idx = get_global_id(2);
......
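The `// Supported data type: all` comment holds because every element-type reference in the kernel goes through the DATA_TYPE macro injected at build time. A standalone toy kernel in the same style, compilable with e.g. -DDATA_TYPE=half or -DDATA_TYPE=float, illustrates the pattern (illustration only, not part of this change):

// Toy kernel, not from this merge request.
#ifndef DATA_TYPE
#define DATA_TYPE float  // fallback so the snippet also compiles without -D
#endif

// Copies size elements; the element type is fixed when the program is built.
__kernel void copy_example(__global const DATA_TYPE *src,
                           __global DATA_TYPE *dst,
                           __private const int size) {
  int idx = get_global_id(0);
  if (idx < size) {
    dst[idx] = src[idx];
  }
}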
@@ -61,16 +61,19 @@ void Conv1x1V2(const Tensor *input,
   // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
   // TODO check wired clReleaseCommandQueue latency
   // The KernelFunctor can cause segment faults in cb_retain_event
-  auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1_v2", built_options);
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   uint32_t idx = 0;
   conv_2d_kernel.setArg(idx++,
                         *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_2d_kernel.setArg(idx++,
                         *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == NULL) {
-    conv_2d_kernel.setArg(idx++, NULL);
-  } else {
+  if (bias != nullptr) {
     conv_2d_kernel.setArg(idx++,
                           *(static_cast<const cl::Buffer *>(bias->buffer())));
   }
@@ -81,7 +84,6 @@ void Conv1x1V2(const Tensor *input,
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, stride);
   auto command_queue = runtime->command_queue();
   cl_int error = command_queue.enqueueNDRangeKernel(
......
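The call sites fix the shape of the new API: runtime->BuildKernel(program_name, kernel_name, built_options) replaces the old cl::Kernel(program, name) lookup, with the -DDATA_TYPE/-DSTRIDE_1/-DBIAS defines carried in the per-call options. The implementation is collapsed in this view; a hedged sketch of what it plausibly does with kernel_path_ and ReadSourceFile, written as a free function for readability (program caching, member access, and error handling are assumptions):

// Sketch only: a free-function approximation of runtime->BuildKernel(...).
// The real member function is not shown in this merge request; this assumes
// the includes already present in opencl_runtime.cc (<set>, <string>, CL C++ bindings).
cl::Kernel BuildKernelSketch(OpenCLRuntime *runtime,
                             const std::string &kernel_path,    // e.g. getenv("MACE_KERNEL_PATH") + "/"
                             const std::string &program_name,   // "conv_2d_1x1" -> conv_2d_1x1.cl
                             const std::string &kernel_name,    // "conv_2d_1x1_v2"
                             const std::set<std::string> &built_options) {
  std::string source;
  MACE_CHECK(ReadSourceFile(kernel_path + program_name + ".cl", &source));

  cl::Program::Sources sources;
  sources.push_back({source.c_str(), source.length()});
  cl::Program program(runtime->context(), sources);

  // -I lets the .cl sources resolve #include <common.h>; the empty strings
  // produced by the `condition ? "-DX" : ""` ternaries are skipped here.
  std::string options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + kernel_path;
  for (const std::string &opt : built_options) {
    if (!opt.empty()) options += " " + opt;
  }

  cl_int ret = program.build({runtime->device()}, options.c_str());
  if (ret != CL_SUCCESS) {
    LOG(FATAL) << "Build program failed: " << ret << ", log: "
               << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
  }
  return cl::Kernel(program, kernel_name.c_str());
}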
@@ -22,14 +22,17 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options);
   uint32_t idx = 0;
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  if (bias == nullptr) {
-    conv_kernel.setArg(idx++, NULL);
-  } else {
+  if (bias != nullptr) {
     conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
   }
   conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
@@ -39,8 +42,6 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(height));
   conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(pixel_blocks)};
......
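Removing the stride setArg here (and in the 1x1 variant above and the depthwise variant below) means stride is no longer a runtime kernel argument; together with the now-optional bias argument it has to be baked in at compile time via -DSTRIDE_1 and -DBIAS. The matching .cl changes are collapsed above, so the following is only an illustrative kernel showing the specialization pattern, not the real conv_2d_3x3 code:

// Toy kernel showing -DSTRIDE_1 / -DBIAS specialization (illustration only).
#ifndef DATA_TYPE
#define DATA_TYPE float
#endif

__kernel void strided_bias_copy(__global const DATA_TYPE *input,
#ifdef BIAS
                                __global const DATA_TYPE *bias,
#endif
                                __global DATA_TYPE *output,
                                __private const int out_size) {
#ifdef STRIDE_1
  const int stride = 1;  // constant-folded; no stride argument is passed
#else
  const int stride = 2;  // the K3x3S12 paths only ever see stride 1 or 2
#endif
  int idx = get_global_id(0);
  if (idx >= out_size) return;
  DATA_TYPE value = input[idx * stride];
#ifdef BIAS
  value += bias[0];      // the real kernels index bias by output channel
#endif
  output[idx] = value;
}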
@@ -30,13 +30,18 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   const index_t pixel_blocks = (width + 3) / 4 * height;
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", "depthwise_conv_3x3", built_options);
   uint32_t idx = 0;
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
   conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  if (bias != nullptr) {
     conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  }
   conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
@@ -44,8 +49,6 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
   conv_kernel.setArg(idx++, static_cast<int32_t>(height));
   conv_kernel.setArg(idx++, static_cast<int32_t>(width));
-  conv_kernel.setArg(idx++, stride);
-  conv_kernel.setArg(idx++, stride);
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
......
@@ -30,24 +30,26 @@ static void Pooling3(const Tensor *input,
   };
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto max_pooling_kernel = cl::Kernel(program, "pooling3");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : "");
+  auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options);
   const uint32_t lws[3] = {1, 8, 128};
   uint32_t idx = 0;
-  max_pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  max_pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
-  max_pooling_kernel.setArg(idx++, stride[0]);
-  max_pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  pooling_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(channels));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
+  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_width));
+  pooling_kernel.setArg(idx++, stride[0]);
+  pooling_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      max_pooling_kernel, cl::NullRange,
+      pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
@@ -75,9 +77,9 @@ static void PoolingN(const Tensor *input,
   };
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto pooling_kernel = cl::Kernel(program, "poolingn");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options);
   const uint32_t lws[3] = {1, 8, 128};
......
@@ -21,9 +21,10 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
   if (max_limit_ < 0) {
-    auto relu_kernel = cl::Kernel(program, "relu");
+    auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
     const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
     uint32_t idx = 0;
@@ -37,7 +38,7 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
         cl::NDRange(lws));
     MACE_CHECK(error == CL_SUCCESS);
   } else {
-    auto relu_kernel = cl::Kernel(program, "relux");
+    auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
     const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
......
@@ -29,9 +29,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
   float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto rb_kernel = cl::Kernel(program, "resize_bilinear_nocache");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
......
@@ -18,9 +18,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
                                                                 const Tensor *paddings_tensor,
                                                                 Tensor *batch_tensor) {
   auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto s2b_kernel = cl::Kernel(program, "space_to_batch");
+  std::set<std::string> built_options;
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(space_tensor->dtype()));
+  auto s2b_kernel = runtime->BuildKernel("space_to_batch", "space_to_batch", built_options);
   uint32_t idx = 0;
   s2b_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(space_tensor->buffer())));
......
@@ -23,6 +23,8 @@ enum DataType {
   DT_INT64 = 8;
   DT_UINT16 = 9;
   DT_BOOL = 10;
+  DT_HALF = 19;
+  DT_UINT32 = 22;
 }
 message TensorProto {
......
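The new DT_HALF value is what allows "-DDATA_TYPE=half" to be produced for the kernel builds above. DataTypeToCLType itself is not part of this merge request; a plausible sketch of the mapping it needs, assuming the enum's pre-existing DT_FLOAT and DT_INT32 members (the function name here is hypothetical):

// Sketch only; the real DataTypeToCLType is not shown in this diff.
#include <string>

std::string DataTypeToCLTypeSketch(DataType dt) {
  switch (dt) {
    case DT_FLOAT:  return "float";
    case DT_HALF:   return "half";   // newly expressible with DT_HALF = 19
    case DT_INT32:  return "int";
    case DT_UINT32: return "uint";   // newly expressible with DT_UINT32 = 22
    default:
      LOG(FATAL) << "Unsupported data type for OpenCL kernels";
      return "";
  }
}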