提交 2d79d5c3 编写于 作者: W wuchenghui

multi-input concat opencl kernel

上级 a7ee7f0f
......@@ -71,6 +71,22 @@ __kernel void concat_channel(__read_only image2d_t input0,
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int width = get_global_size(1);
const int hb_idx = get_global_id(2);
DATA_TYPE4 data = 0;
data = READ_IMAGET(input,
SAMPLER,
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data);
}
//__kernel void concat_width(__read_only image2d_t input0,
// __read_only image2d_t input1,
// __private const int input0_width,
......
......@@ -63,21 +63,71 @@ static void Concat2(cl::Kernel *kernel,
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
static void ConcatN(cl::Kernel *kernel,
const std::vector<const Tensor *> &input_list,
const DataType dt,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
const index_t width = output->dim(2);
const index_t channel = output->dim(3);
const int channel_blk = RoundUpDiv4(channel);
if (kernel->get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
}
const int inputs_count = input_list.size();
index_t chan_blk_offset = 0;
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
uint32_t idx = 0;
kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
index_t input_channel_blk = input->dim(3) / 4;
chan_blk_offset += input_channel_blk;
const uint32_t gws[3] = {
static_cast<uint32_t>(input_channel_blk),
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = {8, 16, 8, 1};
std::stringstream ss;
ss << "concat_n_opencl_kernel_"
<< input_channel_blk << "_"
<< width << "_"
<< batch * height;
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
}
}
template<typename T>
void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
StatsFuture *future) {
const int inputs_count = input_list.size();
MACE_CHECK(inputs_count == 2 && axis_ == 3)
<< "Concat opencl kernel only support two elements with axis == 3";
MACE_CHECK(inputs_count >= 2 && axis_ == 3)
<< "Concat opencl kernel only support >=2 elements with axis == 3";
const Tensor *input0 = input_list[0];
bool divisible_four = input0->dim(axis_) % 4 == 0;
std::vector<index_t> output_shape(input0->shape());
for (int i = 1; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
MACE_CHECK(input->dim_size() == input0->dim_size(),
"Ranks of all input tensors must be same.");
divisible_four &= input->dim(axis_) % 4 == 0;
for (int j = 0; j < input->dim_size(); ++j) {
if (j == axis_) {
continue;
......@@ -87,6 +137,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
}
output_shape[axis_] += input->dim(axis_);
}
MACE_CHECK(inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output->ResizeImage(output_shape, image_shape);
......@@ -96,7 +148,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
break;
default:MACE_NOT_IMPLEMENTED;
default:
if (divisible_four) {
ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
}
else {
MACE_NOT_IMPLEMENTED;
}
}
};
......
......@@ -143,7 +143,7 @@ template <typename T>
void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
const int axis) {
srand(time(nullptr));
int num_inputs = 2;
int num_inputs = shapes.size();
int concat_axis_size = 0;
// Construct graph
OpsTestNet net;
......@@ -212,3 +212,8 @@ TEST_F(ConcatOpTest, OPENCLHalfAligned) {
TEST_F(ConcatOpTest, OPENCLUnAligned) {
OpenclRandomTest<float>({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3);
}
TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32},
{3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册