提交 5e152a77 编写于 作者: L Liangliang He

Merge branch 'conv3x3_opencl' into 'master'

Finish depthwise 3x3 conv with stride 2.

See merge request !88
......@@ -4,7 +4,7 @@ void kernel batch_norm(global const float *input,
global const float *mean,
global const float *var,
global const float *epsilon,
private const uint pixels,
private const int pixels,
global float *output,
__local float4 *new_scale,
__local float4 *new_offset) {
......@@ -12,7 +12,7 @@ void kernel batch_norm(global const float *input,
const int channel = get_global_id(1);
const int channels = get_global_size(1);
const int pixel_offset = get_global_id(2);
const unsigned int local_channel = get_local_id(1);
const int local_channel = get_local_id(1);
const int local_pixel_idx = get_local_id(2);
if(local_pixel_idx == 0) {
......
float4 conv1x3_s1(const float *input_ptr,
const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr,
const float *filter_ptr);
float conv3x3(const float *input_ptr,
const float *filter_ptr,
const int row_width);
#include <conv_helper.h>
void kernel conv_2d_3x3(global const float *input,
global const float *filter,
global const float *bias,
global float *output,
private const uint in_chan_num,
private const uint out_chan_num,
private const uint in_height,
private const uint in_width,
private const uint out_height,
private const uint out_width,
private const uint stride_h,
private const uint stride_w) {
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width,
private const int stride_h,
private const int stride_w) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const uint in_pixel = in_height * in_width;
const uint out_pixel = out_height * out_width;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const uint round_out_width = (out_width + 3) / 4;
const uint out_pixel_height = out_pixel_blk / round_out_width;
const uint out_pixel_width = out_pixel_blk % round_out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const uint out_chan_begin = out_chan_blk * 4;
const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const uint in_offset = batch * in_chan_num * in_pixel;
const uint out_offset = batch * out_chan_num * out_pixel;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
uint pixels = out_pixel_end - out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (uint i = out_chan_begin; i < out_chan_end; ++i) {
float4 res = (float4)bias[i];
for (int i = out_chan_begin; i < out_chan_end; ++i) {
float *output_ptr = output_base + i * out_pixel;
const float *filter_base = filter + i * in_chan_num * 9;
if (pixels == 4) {
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
float4 res = (float4)bias[i];
for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel;
const float* filter_ptr = filter_base + in_chan_idx * 9;
if (stride_w == 1) {
......@@ -62,7 +55,7 @@ void kernel conv_2d_3x3(global const float *input,
}
vstore4(res, 0, output_ptr);
} else {
for (uint p = 0; p < pixels; ++p) {
for (int p = 0; p < pixels; ++p) {
float res = bias[i];
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
float4 conv1x3_s1(const float *input_ptr,
const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr,
const float *filter_ptr);
float conv3x3(const float *input_ptr,
const float *filter_ptr,
const int row_width);
#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
float4 conv1x3_s1(const float *input_ptr,
const float *filter_ptr);
float conv3x3(const float *input_ptr,
const float *filter_ptr,
const int row_width);
void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */
global const float *bias, /* o */
global float *output, /* n, c, h, w */
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width) {
#include <conv_helper.h>
//TODO merge the depthwise with conv 3x3 to remove duplicate code.
void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */
global const float *bias, /* o */
global float *output, /* n, c, h, w */
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width,
private const int stride_h,
private const int stride_w) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
......@@ -30,32 +28,39 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
int pixels = out_pixel_end - out_pixel_begin;
const int pixels = out_pixel_end - out_pixel_begin;
for (int i = out_chan_begin; i < out_chan_end; ++i) {
float bias_value = bias[i];
const float *input_ptr = input_base + (i / multiplier) * in_pixel;
const float *filter_ptr = filter + i * 9;
float *output_ptr = output_base + i * out_pixel;
if (pixels < 4) {
for (int out_idx = 0; out_idx < pixels; ++out_idx) {
output_ptr[out_idx] = bias_value;
output_ptr[out_idx] += conv3x3(input_ptr, filter_ptr, in_width);
input_ptr += 1;
if (pixels == 4) {
float4 res = (float4)bias[i];
if (stride_w == 1) {
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
} else {
res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
}
} else {
float4 res = (float4)bias_value;
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
vstore4(res, 0, output_ptr);
} else {
for (int p = 0; p < pixels; ++p) {
float res = bias[i];
res += conv3x3(input_ptr, filter_ptr, in_width);
output_ptr[p] = res;
input_ptr += stride_w;
}
}
}
......
......@@ -22,29 +22,29 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");
auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
bm_kernel.setArg(idx++, stride);
bm_kernel.setArg(idx++, stride);
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
conv_kernel.setArg(idx++, stride);
conv_kernel.setArg(idx++, stride);
const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(pixel_blocks)};
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
......
......@@ -10,6 +10,8 @@ namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
template <>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
......@@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
static const Conv2dOpenclFunction selector[5][2] = {
{nullptr, nullptr},
{nullptr, nullptr},
{DepthwiseConvOpenclK3x3S1, nullptr},
{DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
......
......@@ -9,10 +9,11 @@
namespace mace {
namespace kernels {
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const uint32_t stride,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
......@@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const index_t input_width = input->dim(3);
MACE_CHECK(input_batch == batch);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
int, int, int, int, int, int, int>(program, "depthwise_conv_3x3_s1");
const index_t pixels = height * width;
const index_t channel_blocks = (channels + 3) / 4;
const index_t pixel_blocks = (width + 3) / 4 * height;
cl_int error;
conv_2d(cl::EnqueueArgs(runtime->command_queue(),
cl::NDRange(static_cast<int>(batch),
static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 1, 256)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels),
static_cast<int>(channels),
static_cast<int>(input_height),
static_cast<int>(input_width),
static_cast<int>(height),
static_cast<int>(width),
error);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
uint32_t idx = 0;
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
conv_kernel.setArg(idx++, stride);
conv_kernel.setArg(idx++, stride);
const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(pixel_blocks)};
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
}
extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
};
extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
};
} // namespace kernels
......
......@@ -3,7 +3,6 @@
//
#include <algorithm>
#include <sstream>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册