提交 eefc9d01 编写于 作者: Y yejianwu

update cl source code format, fix pooling op core dump

上级 08a362e7
......@@ -71,6 +71,11 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
......@@ -78,9 +83,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
......
......@@ -70,12 +70,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensors[0]->shape();
......
......@@ -75,6 +75,11 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
kernel_.setArg(idx++, *(offset->opencl_image()));
......@@ -85,9 +90,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
......
......@@ -45,12 +45,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
......
......@@ -87,6 +87,10 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
obfuscated_kernel_name, built_options);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported) {
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
}
b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
......@@ -112,8 +116,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
b2f_kernel.setArg(idx++, *(image->opencl_image()));
b2f_kernel.setArg(idx++, gws[0]);
b2f_kernel.setArg(idx++, gws[1]);
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
......
......@@ -54,13 +54,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
......
#include <common.h>
__kernel void activation(__read_only image2d_t input,
__kernel void activation(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
#ifdef USE_PRELU
__read_only image2d_t alpha,
#endif
__private const float relux_max_limit,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
#endif
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
......
#include <common.h>
__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__kernel void addn(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#if INPUT_NUM > 2
__read_only image2d_t input2,
......@@ -8,14 +13,7 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
#if INPUT_NUM > 3
__read_only image2d_t input3,
#endif
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
const int w = get_global_id(0);
const int hb = get_global_id(1);
......
#include <common.h>
// Supported data types: half/float
__kernel void batch_norm(__read_only image2d_t input,
__kernel void batch_norm(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__read_only image2d_t scale,
__read_only image2d_t offset,
#ifndef FOLDED_CONSTANT
......@@ -9,15 +15,7 @@ __kernel void batch_norm(__read_only image2d_t input,
__private const float epsilon,
#endif
__write_only image2d_t output,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const float relux_max_limit,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const float relux_max_limit) {
#endif
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
......
#include <common.h>
// Supported data types: half/float
__kernel void bias_add(__read_only image2d_t input,
__read_only image2d_t bias,
__kernel void bias_add(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t output) {
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
......
#include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
__kernel void filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_h,
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -58,19 +56,17 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
__kernel void filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
#ifndef USE_QUALCOMM_OPENCL_2_0
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__read_only image2d_t input) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -112,19 +108,17 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
}
}
__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
__kernel void dw_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
__private const int in_channel,
__private const int multiplier,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
#else
__write_only image2d_t output) {
#endif
__write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
const int w = get_global_id(0);
const int h = get_global_id(1);
......@@ -175,19 +169,17 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__kernel void in_out_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -222,18 +214,16 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__kernel void in_out_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
#ifndef USE_QUALCOMM_OPENCL_2_0
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__read_only image2d_t input) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -267,17 +257,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
}
}
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
__kernel void arg_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -308,16 +296,14 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int count,
__kernel void arg_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__read_only image2d_t input) {
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -347,19 +333,17 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
}
__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
__kernel void in_out_height_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -395,18 +379,16 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
__kernel void in_out_height_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
__private const int channels,
#ifndef USE_QUALCOMM_OPENCL_2_0
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__read_only image2d_t input) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -439,19 +421,17 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
}
__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__kernel void in_out_width_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -487,19 +467,17 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
__kernel void winograd_filter_buffer_to_image(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
int w = get_global_id(0);
int h = get_global_id(1);
......@@ -584,18 +562,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
__kernel void winograd_filter_image_to_buffer(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
__private const int channel,
#ifndef USE_QUALCOMM_OPENCL_2_0
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__read_only image2d_t input) {
#endif
const int w = get_global_id(0);
const int h = get_global_id(1);
......
#include <common.h>
// assume channes_per_group mod 4 = 0 && groups mod 4 == 0
__kernel void channel_shuffle(__read_only image2d_t input,
__private const int groups,
__private const int channels_per_group,
__kernel void channel_shuffle(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int groups,
__private const int channels_per_group,
__write_only image2d_t output) {
const int group_chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
......
......@@ -22,18 +22,16 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
// Supported data type: half/float
__kernel void concat_channel(__read_only image2d_t input0,
__read_only image2d_t input1,
__private const int input0_chan,
__kernel void concat_channel(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input0,
__read_only image2d_t input1,
__private const int input0_chan,
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
......@@ -90,17 +88,15 @@ __kernel void concat_channel(__read_only image2d_t input0,
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(__read_only image2d_t input,
__private const int chan_blk_offset,
__kernel void concat_channel_multi(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
......
#include <common.h>
__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void conv_2d(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -18,15 +24,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private const int padding_top,
__private const int padding_left,
__private const int dilation_h,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int dilation_w,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int dilation_w) {
#endif
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
......
#include <common.h>
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void conv_2d_1x1(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -12,15 +18,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private const int in_ch_blks,
__private const int height,
__private const int width,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int stride,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int stride) {
#endif
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
......
#include <common.h>
__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void conv_2d_3x3(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -16,15 +22,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private const int padding_top,
__private const int padding_left,
__private const int dilation_h,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int dilation_w,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int dilation_w) {
#endif
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_hb = get_global_id(2);
......
#include <common.h>
__kernel void depth_to_space(__read_only image2d_t input,
__private const int block_size,
__private const int output_depth,
__kernel void depth_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int block_size,
__private const int output_depth,
__write_only image2d_t output) {
const int out_d = get_global_id(0);
const int out_w = get_global_id(1);
const int out_h = get_global_id(2);
......@@ -44,17 +42,16 @@ __kernel void depth_to_space(__read_only image2d_t input,
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
}
__kernel void space_to_depth(__read_only image2d_t input,
__private const int block_size,
__private const int input_depth,
__kernel void space_to_depth(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int block_size,
__private const int input_depth,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
......
#include <common.h>
// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void depthwise_conv2d(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -18,15 +24,7 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
__private const short padding_top,
__private const short padding_left,
__private const short dilation_h,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const short dilation_w,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const short dilation_w) {
#endif
const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1);
const short out_hb = get_global_id(2);
......@@ -144,7 +142,13 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void depthwise_conv2d_s1(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -159,15 +163,7 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
__private const short filter_height,
__private const short filter_width,
__private const short padding_top,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const short padding_left,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const short padding_left) {
#endif
const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2);
......
#include <common.h>
__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__kernel void eltwise(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t input1,
#ifdef COEFF_SUM
__private const float coeff0,
__private const float coeff1,
#endif
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__write_only image2d_t output) {
#endif
const int w = get_global_id(0);
const int hb = get_global_id(1);
......
#include <common.h>
// C = A * B
__kernel void matmul(__read_only image2d_t A,
__kernel void matmul(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t A,
__read_only image2d_t B,
__write_only image2d_t C,
__private const int M,
__private const int N,
__private const int K,
__private const int height_blocks,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int k_blocks,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__private const int k_blocks) {
#endif
const int gx = get_global_id(0) << 2;
const int hb = get_global_id(1);
......
......@@ -19,7 +19,13 @@ inline int calculate_avg_block_size(const int pool_size,
}
// Supported data type: half/float
__kernel void pooling(__read_only image2d_t input,
__kernel void pooling(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int in_height,
__private const int in_width,
__private const int out_height,
......@@ -27,14 +33,7 @@ __kernel void pooling(__read_only image2d_t input,
__private const int pad_left,
__private const int stride,
__private const int pooling_size,
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
#endif
const int out_chan_idx = get_global_id(0);
const int out_width_idx = get_global_id(1);
......
#include <common.h>
__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void resize_bilinear_nocache(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__write_only image2d_t output,
__private const float height_scale,
__private const float width_scale,
__private const int in_height,
__private const int in_width,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int out_height,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int out_height) {
#endif
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
......
#include <common.h>
__kernel void slice(__read_only image2d_t input,
__private const int chan_blk_offset,
__kernel void slice(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
......
#include <common.h>
__kernel void softmax(__read_only image2d_t input,
__private const int channels,
__private const int remain_channels,
__kernel void softmax(
#ifndef USE_QUALCOMM_OPENCL_2_0
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__write_only image2d_t output) {
__private const int global_size_dim2,
#endif
__read_only image2d_t input,
__private const int channels,
__private const int remain_channels,
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int hb_idx = get_global_id(2);
......
#include <common.h>
__kernel void space_to_batch(__read_only image2d_t space_data,
__kernel void space_to_batch(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t space_data,
__write_only image2d_t batch_data,
__private const int block_height,
__private const int block_width,
......@@ -9,15 +15,7 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int batch_width,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int batch_width) {
#endif
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
......@@ -54,7 +52,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
WRITE_IMAGET(batch_data, batch_coord, value);
}
__kernel void batch_to_space(__read_only image2d_t batch_data,
__kernel void batch_to_space(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
#endif
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
__private const int block_width,
......@@ -63,15 +67,7 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int batch_width,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
#else
__private const int batch_width) {
#endif
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
......
#include <common.h>
__kernel void winograd_transform_2x2(__read_only image2d_t input,
__kernel void winograd_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
......@@ -8,14 +13,7 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
__private const int round_hw,
__private const int round_w,
__private const int padding_top,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int padding_left,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__private const int padding_left) {
#endif
int out_width_idx = get_global_id(0);
int chan_blk_idx = get_global_id(1);
......@@ -121,7 +119,12 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
}
}
__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__kernel void winograd_inverse_transform_2x2(
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const int global_size_dim0,
__private const int global_size_dim1,
#endif
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
......@@ -130,14 +133,7 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__private const int out_width,
__private const int round_hw,
__private const int round_w,
#ifndef USE_QUALCOMM_OPENCL_2_0
__private const float relux_max_limit,
__private const int global_size_dim0,
__private const int global_size_dim1) {
#else
__private const float relux_max_limit) {
#endif
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
......
......@@ -56,6 +56,11 @@ static void Concat2(cl::Kernel *kernel,
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
kernel->setArg(idx++,
......@@ -63,9 +68,6 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input0->shape();
......@@ -119,12 +121,14 @@ static void ConcatN(cl::Kernel *kernel,
};
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
chan_blk_offset += input_channel_blk;
*kwg_size =
......
......@@ -84,6 +84,11 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -98,9 +103,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(height));
kernel->setArg(idx++, static_cast<int>(width));
kernel->setArg(idx++, stride);
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
......
......@@ -79,6 +79,11 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -96,9 +101,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
......
......@@ -79,6 +79,11 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -98,9 +103,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
......
......@@ -68,12 +68,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
uint32_t gws[3];
std::stringstream ss;
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, block_size_);
kernel_.setArg(idx++, depth_blocks);
kernel_.setArg(idx++, *(output->opencl_image()));
if (d2s_) {
gws[0] = static_cast<uint32_t>(depth_blocks);
gws[1] = static_cast<uint32_t>(output_width);
......@@ -88,9 +82,16 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
}
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, block_size_);
kernel_.setArg(idx++, depth_blocks);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
......
......@@ -97,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels);
uint32_t idx = 0;
if (!(*is_non_uniform_work_groups_supported)) {
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
}
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
if (bias != nullptr) {
......@@ -117,9 +122,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
}
kernel->setArg(idx++, gws[0]);
kernel->setArg(idx++, gws[1]);
kernel->setArg(idx++, gws[2]);
*prev_input_shape = input->shape();
......
......@@ -47,6 +47,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
if (!coeff_.empty()) {
......@@ -54,8 +58,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
input_shape_ = input0->shape();
......
......@@ -48,6 +48,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
}
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
......@@ -56,8 +60,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
......
......@@ -81,6 +81,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
};
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
......@@ -90,14 +95,23 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_.setArg(idx++, strides_[0]);
kernel_.setArg(idx++, kernels_[0]);
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
kwg_size_ =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
} else {
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
gws = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
};
}
std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
......
......@@ -60,6 +60,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale(in_width, out_width, align_corners_);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, height_scale);
......@@ -67,9 +72,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = input->shape();
......
......@@ -65,12 +65,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<< outputs_count;
for (int i = 0; i < outputs_count; ++i) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
......
......@@ -45,13 +45,15 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
input_shape_ = logits->shape();
......
......@@ -57,6 +57,11 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
}
if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
kernel_.setArg(idx++, *(space_tensor->opencl_image()));
......@@ -72,9 +77,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
kernel_.setArg(idx++, gws[2]);
space_shape_ = space_tensor->shape();
......
......@@ -61,6 +61,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
......@@ -70,8 +74,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape();
......@@ -151,6 +153,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
uint32_t idx = 0;
if (!is_non_uniform_work_groups_supported_) {
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
}
kernel_.setArg(
idx++,
*(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
......@@ -165,8 +171,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, gws[0]);
kernel_.setArg(idx++, gws[1]);
input_shape_ = input_tensor->shape();
......
......@@ -376,7 +376,8 @@ def main(unused_args):
build_run_throughput_test(target_soc, FLAGS.run_seconds,
merged_lib_file, FLAGS.output_dir)
packaging_lib_file(FLAGS.output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
packaging_lib_file(FLAGS.output_dir)
if __name__ == "__main__":
......
......@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
LIBMACE_BUILD_DIR=$1
TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
if [ -f $TAR_PACKAGE_NAME ]; then
rm -f $TAR_PACKAGE_NAME
fi
ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
popd
echo "Packaging done!"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册