提交 2460a946 编写于 作者: L liuqi

Support arbitrary input size.

上级 9bda14f3
...@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> { ...@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_; T relux_max_limit_;
cl::Kernel kernel_; cl::Kernel kernel_;
std::string tuning_key_prefix_; std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> { ...@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> { ...@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { ...@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_; cl::Kernel kernel_;
const int groups_; const int groups_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { ...@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase { ...@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T> ...@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase { ...@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase { ...@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<uint32_t> gws_; std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_; std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_; LOG(FATAL) << "Unknown activation type: " << activation_;
} }
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options); kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0; int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) { if (activation_ == PRELU) {
...@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_)); kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
MACE_CHECK(channels == input_tensors[i]->dim(3)); MACE_CHECK(channels == input_tensors[i]->dim(3));
} }
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) { if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
...@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
for (auto input : input_tensors) { for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
} }
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
} }
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels), const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
...@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options); kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image()));
...@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image())); kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -13,7 +13,8 @@ namespace mace { ...@@ -13,7 +13,8 @@ namespace mace {
namespace kernels { namespace kernels {
template <typename T> template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
output->ResizeLike(input); output->ResizeLike(input);
...@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu ...@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options); kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_); kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group)); kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
......
...@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
const Tensor *input0, const Tensor *input0,
const Tensor *input1, const Tensor *input1,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel, ...@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image()))); *(static_cast<const cl::Image2D *>(input0->opencl_image())));
...@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel, ...@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3))); kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++, kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image()))); *(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
} }
const uint32_t gws[3] = { const uint32_t gws[3] = {
...@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) { switch (inputs_count) {
case 2: case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value, Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future); &input_shape_, output, future);
break; break;
default: default:
if (divisible_four) { if (divisible_four) {
......
...@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
...@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
...@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
...@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl::Kernel * kernel, const Tensor *input, const Tensor *filter, cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding, const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation, const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, Tensor *output, const float relux_max_limit, const DataType dt,
StatsFuture *future); std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size // Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = { static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
...@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1]; auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future);
} else { } else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future);
} }
} }
......
...@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
...@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(height)); kernel->setArg(idx++, static_cast<int>(height));
kernel->setArg(idx++, static_cast<int>(width)); kernel->setArg(idx++, static_cast<int>(width));
kernel->setArg(idx++, stride); kernel->setArg(idx++, stride);
*prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
...@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, ...@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2); kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]); kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]); kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image())); kernel->setArg(idx++, *(filter->opencl_image()));
...@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel, ...@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2); kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]); kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]); kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const ActivationType activation, const ActivationType activation,
const float relux_max_limit, const float relux_max_limit,
const DataType dt, const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels); const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width); const index_t width_blocks = RoundUpDiv4(width);
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options; std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
...@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel = *kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
...@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<short>(dilations[0])); kernel->setArg(idx++, static_cast<short>(dilations[0]));
kernel->setArg(idx++, static_cast<short>(dilations[1])); kernel->setArg(idx++, static_cast<short>(dilations[1]));
} }
*prev_input_shape = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
...@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input,
const Tensor *filter, const Tensor *bias,
Tensor *output, StatsFuture *future);
index_t kernel_h = filter->dim(2); index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3); index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) { if (strides_[0] != strides_[1]) {
...@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_, dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future); DataTypeToEnum<T>::value, &input_shape_, output, future);
} }
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>; template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
...@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input0->opencl_image())); kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image())); kernel_.setArg(idx++, *(input1->opencl_image()));
...@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, coeff_[1]); kernel_.setArg(idx++, coeff_[1]);
} }
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
} }
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels), const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
...@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> &gws, std::vector<uint32_t> &gws,
...@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]); const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
lws = {gws[0], gws[1], inter_local_blks}; lws = {gws[0], gws[1], inter_local_blks};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image()));
...@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3)))); kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel->setArg(idx++, static_cast<int>(output_blocks)); kernel->setArg(idx++, static_cast<int>(output_blocks));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
gws[2] = static_cast<uint32_t>(batch * output_blocks);
*prev_input_shape = input->shape();
} }
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
...@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
const Tensor *input, const Tensor *input,
const Tensor *weight, const Tensor *weight,
const Tensor *bias, const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> &gws, std::vector<uint32_t> &gws,
...@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
*kernel = *kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options); runtime->BuildKernel("fully_connected", kernel_name, built_options);
lws = {16, 64, 1};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image())); kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image())); kernel->setArg(idx++, *(weight->opencl_image()));
...@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t output_size = output->dim(3); const index_t output_blocks = RoundUpDiv4(output->dim(3));
const index_t output_blocks = RoundUpDiv4(output_size);
gws = { gws = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks), static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
}; };
lws = {16, 64, 1};
*prev_input_shape = input->shape();
} }
std::stringstream ss; std::stringstream ss;
...@@ -185,10 +198,10 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -185,10 +198,10 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
if (weight_type_ == BufferType::WEIGHT_HEIGHT) { if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, output, FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future); activation_, gws_, lws_, relux_max_limit_, future);
} else { } else {
FCWXKernel<T>(&kernel_, input, weight, bias, output, FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future); activation_, gws_, lws_, relux_max_limit_, future);
} }
}; };
......
...@@ -71,6 +71,13 @@ inline bool LimitKernelTime() { ...@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
} }
// Reports whether two vectors are element-wise identical.
//
// Returns true only when both vectors have the same number of elements and
// every pair of corresponding elements compares equal with operator==.
// Two empty vectors are considered equal.
template <typename T>
bool IsVecEqual(const std::vector<T> &input0,
                const std::vector<T> &input1) {
  // std::vector's operator== compares the sizes first and then the elements
  // pairwise, which is the same test as a size check followed by std::equal.
  return input0 == input1;
}
namespace { namespace {
template <typename T> template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
......
...@@ -36,7 +36,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -36,7 +36,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
}
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image())); kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image())); kernel_.setArg(idx++, *(B->opencl_image()));
...@@ -46,7 +46,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -46,7 +46,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(A->dim(2))); kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks)); kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2)))); kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
}
const uint32_t gws[2] = { const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
......
...@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet"; << "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value; const DataType dt = DataTypeToEnum<T>::value;
...@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(out_height)); kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2); kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2); kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides_[0]); kernel_.setArg(idx++, strides_[0]);
kernel_.setArg(idx++, kernels_[0]); kernel_.setArg(idx++, kernels_[0]);
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
} }
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
const uint32_t gws[3] = { const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width), static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height), static_cast<uint32_t>(batch * out_height),
......
...@@ -24,6 +24,19 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -24,6 +24,19 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const index_t out_height = out_height_; const index_t out_height = out_height_;
const index_t out_width = out_width_; const index_t out_width = out_width_;
if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels}; std::vector<index_t> output_shape{batch, out_height, out_width, channels};
...@@ -32,23 +45,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -32,23 +45,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
output_image_shape); output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
if (kernel_.get() == nullptr) {
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale = float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_); CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
...@@ -57,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -57,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(in_height)); kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width)); kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height)); kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(logits->opencl_image())); kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels)); kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels); kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
......
...@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_ = kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options); runtime->BuildKernel("space_to_batch", kernel_name, built_options);
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
if (b2s_) { if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image())); kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
...@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1))); kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2))); kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
} }
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3)); const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
......
...@@ -14,6 +14,21 @@ namespace kernels { ...@@ -14,6 +14,21 @@ namespace kernels {
template <typename T> template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2); std::vector<int> paddings(2);
...@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(), paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data()); RoundType::FLOOR, output_shape.data());
} }
const index_t round_h = (output_shape[1] + 1) / 2; const index_t round_h = (output_shape[1] + 1) / 2;
const index_t round_w = (output_shape[2] + 1) / 2; const index_t round_w = (output_shape[2] + 1) / 2;
const index_t out_width = input_tensor->dim(0) * round_h * round_w; const index_t out_width = input_tensor->dim(0) * round_h * round_w;
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1}; output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
uint32_t idx = 0; uint32_t idx = 0;
kernel_.setArg(idx++, *(input_tensor->opencl_image())); kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image())); kernel_.setArg(idx++, *(output_tensor->opencl_image()));
...@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2)); kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {
...@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
...@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options); built_options);
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2; const uint32_t round_w = (width_ + 1) / 2;
...@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(round_w)); kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, relux_max_limit_); kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
} }
const uint32_t gws[2] = { const uint32_t gws[2] = {
......
...@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> ...@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> { ...@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { ...@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> space_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> ...@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
struct WinogradInverseTransformFunctorBase { struct WinogradInverseTransformFunctorBase {
...@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> ...@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
std::vector<index_t> input_shape_;
}; };
} // namespace kernels } // namespace kernels
......
...@@ -96,7 +96,7 @@ def output_shape(input_shape, filter_shape): ...@@ -96,7 +96,7 @@ def output_shape(input_shape, filter_shape):
return out_shape return out_shape
def winog_conv(m, r, input, filter): def winograd_conv(m, r, input, filter):
alpha = m + r - 1 alpha = m + r - 1
print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha) print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha)
alpha_square = alpha * alpha alpha_square = alpha * alpha
...@@ -194,14 +194,14 @@ def main(): ...@@ -194,14 +194,14 @@ def main():
# filter.tofile("filter_in") # filter.tofile("filter_in")
for i in [2, 4, 6]: for i in [2, 4, 6]:
print "==========f(%d,3)==========" % i print "==========f(%d,3)==========" % i
winog_out = winog_conv(i, 3, input, filter) winograd_out = winograd_conv(i, 3, input, filter)
res = np.allclose(tf_out, winog_out) res = np.allclose(tf_out, winograd_out)
if res: if res:
print "=========Pass=========" print "=========Pass========="
else: else:
print "=========Failed=======" print "=========Failed======="
print "TF: ", tf_out print "TF: ", tf_out
print "Winograd: ", winog_out print "Winograd: ", winograd_out
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册