提交 6982006f 编写于 作者: Y yejianwu

add compatible method to space2depth and depth2space ops

上级 96452de2
...@@ -3,11 +3,28 @@ ...@@ -3,11 +3,28 @@
__kernel void depth_to_space(__read_only image2d_t input, __kernel void depth_to_space(__read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int output_depth, __private const int output_depth,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) { __write_only image2d_t output) {
#endif
const int out_d = get_global_id(0); const int out_d = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_h = get_global_id(2); const int out_h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
if (out_d >= global_size_dim0 || out_w >= global_size_dim1
|| out_h >= global_size_dim2) {
return;
}
const int output_width = global_size_dim1;
#else
const int output_width = get_global_size(1); const int output_width = get_global_size(1);
#endif
const int out_pos = mad24(out_d, output_width, out_w); const int out_pos = mad24(out_d, output_width, out_w);
...@@ -30,11 +47,29 @@ __kernel void depth_to_space(__read_only image2d_t input, ...@@ -30,11 +47,29 @@ __kernel void depth_to_space(__read_only image2d_t input,
__kernel void space_to_depth(__read_only image2d_t input, __kernel void space_to_depth(__read_only image2d_t input,
__private const int block_size, __private const int block_size,
__private const int input_depth, __private const int input_depth,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) { __write_only image2d_t output) {
#endif
const int d = get_global_id(0); const int d = get_global_id(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int h = get_global_id(2); const int h = get_global_id(2);
#ifndef USE_QUALCOMM_OPENCL_2_0
if (d >= global_size_dim0 || w >= global_size_dim1
|| h >= global_size_dim2) {
return;
}
const int input_width = global_size_dim1;
#else
const int input_width = get_global_size(1); const int input_width = get_global_size(1);
#endif
const int in_pos = mad24(d, input_width, w); const int in_pos = mad24(d, input_width, w);
const int output_width = input_width / block_size; const int output_width = input_width / block_size;
......
...@@ -45,8 +45,11 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,8 +45,11 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output->ResizeImage(output_shape, image_shape); output->ResizeImage(output_shape, image_shape);
auto runtime = OpenCLRuntime::Global();
const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
std::stringstream kernel_name_ss; std::stringstream kernel_name_ss;
...@@ -55,38 +58,47 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -55,38 +58,47 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
if (is_qualcomm_opencl200) {
built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
}
kernel_ = kernel_ =
runtime->BuildKernel("depth_to_space", kernel_name, built_options); runtime->BuildKernel("depth_to_space", kernel_name, built_options);
} }
uint32_t gws[3];
std::stringstream ss;
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, block_size_); kernel_.setArg(idx++, block_size_);
kernel_.setArg(idx++, depth_blocks); kernel_.setArg(idx++, depth_blocks);
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
if (d2s_) {
gws[0] = static_cast<uint32_t>(depth_blocks);
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
} else {
gws[0] = static_cast<uint32_t>(depth_blocks);
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
}
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
if (d2s_) { const uint32_t kwg_size =
const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks), static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
static_cast<uint32_t>(output_width), const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
static_cast<uint32_t>(output_height * batch)}; TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} else {
const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
static_cast<uint32_t>(input_width),
static_cast<uint32_t>(input_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
} }
template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>; template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册