提交 0d5bf6ce 编写于 作者: L liuqi

Change the data type of conv kernel params from uint32_t to int32_t.

上级 b469a945
......@@ -4,7 +4,7 @@ void kernel batch_norm(global const float *input,
global const float *mean,
global const float *var,
global const float *epsilon,
private const uint pixels,
private const int pixels,
global float *output,
__local float4 *new_scale,
__local float4 *new_offset) {
......@@ -12,7 +12,7 @@ void kernel batch_norm(global const float *input,
const int channel = get_global_id(1);
const int channels = get_global_size(1);
const int pixel_offset = get_global_id(2);
const unsigned int local_channel = get_local_id(1);
const int local_channel = get_local_id(1);
const int local_pixel_idx = get_local_id(2);
if(local_pixel_idx == 0) {
......
......@@ -3,44 +3,44 @@ void kernel conv_2d_3x3(global const float *input,
global const float *filter,
global const float *bias,
global float *output,
private const uint in_chan_num,
private const uint out_chan_num,
private const uint in_height,
private const uint in_width,
private const uint out_height,
private const uint out_width,
private const uint stride_h,
private const uint stride_w) {
const int batch = get_global_id(0);
const int out_chan_blk = get_global_id(1);
const int out_pixel_blk = get_global_id(2);
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width,
private const int stride_h,
private const int stride_w) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const uint in_pixel = in_height * in_width;
const uint out_pixel = out_height * out_width;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const uint round_out_width = (out_width + 3) / 4;
const uint out_pixel_height = out_pixel_blk / round_out_width;
const uint out_pixel_width = out_pixel_blk % round_out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const uint out_chan_begin = out_chan_blk * 4;
const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const uint in_offset = batch * in_chan_num * in_pixel;
const uint out_offset = batch * out_chan_num * out_pixel;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
uint pixels = out_pixel_end - out_pixel_begin;
for (uint i = out_chan_begin; i < out_chan_end; ++i) {
for (int i = out_chan_begin; i < out_chan_end; ++i) {
float *output_ptr = output_base + i * out_pixel;
const float *filter_base = filter + i * in_chan_num * 9;
if (pixels == 4) {
float4 res = (float4)bias[i];
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel;
const float* filter_ptr = filter_base + in_chan_idx * 9;
if (stride_w == 1) {
......@@ -55,7 +55,7 @@ void kernel conv_2d_3x3(global const float *input,
}
vstore4(res, 0, output_ptr);
} else {
for (uint p = 0; p < pixels; ++p) {
for (int p = 0; p < pixels; ++p) {
float res = bias[i];
for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
......
......@@ -4,40 +4,40 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */
global const float *bias, /* o */
global float *output, /* n, c, h, w */
private const uint in_chan_num,
private const uint out_chan_num,
private const uint in_height,
private const uint in_width,
private const uint out_height,
private const uint out_width,
private const uint stride_h,
private const uint stride_w) {
const int batch = get_global_id(0);
const int out_chan_blk = get_global_id(1);
const int out_pixel_blk = get_global_id(2);
private const int in_chan_num,
private const int out_chan_num,
private const int in_height,
private const int in_width,
private const int out_height,
private const int out_width,
private const int stride_h,
private const int stride_w) {
int batch = get_global_id(0);
int out_chan_blk = get_global_id(1);
int out_pixel_blk = get_global_id(2);
const uint in_pixel = in_height * in_width;
const uint out_pixel = out_height * out_width;
const uint multiplier = out_chan_num / in_chan_num;
const int in_pixel = in_height * in_width;
const int out_pixel = out_height * out_width;
const int multiplier = out_chan_num / in_chan_num;
const uint round_out_width = (out_width + 3) / 4;
const uint out_pixel_height = out_pixel_blk / round_out_width;
const uint out_pixel_width = out_pixel_blk % round_out_width;
const int round_out_width = (out_width + 3) / 4;
const int out_pixel_height = out_pixel_blk / round_out_width;
const int out_pixel_width = out_pixel_blk % round_out_width;
const uint out_chan_begin = out_chan_blk * 4;
const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const int out_chan_begin = out_chan_blk * 4;
const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
const uint in_offset = batch * in_chan_num * in_pixel;
const uint out_offset = batch * out_chan_num * out_pixel;
const int in_offset = batch * in_chan_num * in_pixel;
const int out_offset = batch * out_chan_num * out_pixel;
const float *input_base = input + in_offset + in_pixel_begin;
float *output_base = output + out_offset + out_pixel_begin;
uint pixels = out_pixel_end - out_pixel_begin;
for (uint i = out_chan_begin; i < out_chan_end; ++i) {
for (int i = out_chan_begin; i < out_chan_end; ++i) {
float bias_value = bias[i];
const float *input_ptr = input_base + (i / multiplier) * in_pixel;
const float *filter_ptr = filter + i * 9;
......@@ -55,7 +55,7 @@ void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
}
vstore4(res, 0, output_ptr);
} else {
for (uint p = 0; p < pixels; ++p) {
for (int p = 0; p < pixels; ++p) {
float res = bias[i];
res += conv3x3(input_ptr, filter_ptr, in_width);
output_ptr[p] = res;
......
......@@ -29,20 +29,20 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
conv_kernel.setArg(idx++, stride);
conv_kernel.setArg(idx++, stride);
const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(pixel_blocks)};
const uint32_t lws[3] = {static_cast<uint32_t>(1),
static_cast<uint32_t>(1),
static_cast<uint32_t>(256)};
static_cast<uint32_t>(8),
static_cast<uint32_t>(128)};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
......
......@@ -38,12 +38,12 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
conv_kernel.setArg(idx++, static_cast<int32_t>(height));
conv_kernel.setArg(idx++, static_cast<int32_t>(width));
conv_kernel.setArg(idx++, stride);
conv_kernel.setArg(idx++, stride);
......
......@@ -3,7 +3,6 @@
//
#include <algorithm>
#include <sstream>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
......
......@@ -296,7 +296,7 @@ static void TestUnalignedConvNxNS12() {
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
for (int kernel_size : {1, 3, 5}) {
for (int kernel_size : {3}) {
for (int stride : {1, 2}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册