提交 16703420 编写于 作者: W wuchenghui

fix cpplint for mace/kernels

上级 564833a5
......@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_ACTIVATION_H_
#define MACE_KERNELS_ACTIVATION_H_
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......
......@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <vector>
#include "mace/core/future.h"
......@@ -17,9 +18,7 @@
namespace mace {
namespace kernels {
namespace {
constexpr int kCostPerGroup = 1024;
} // namespace
template <DeviceType D, typename T>
struct AddNFunctor {
......
......@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -159,7 +160,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
std::vector<index_t> input_shape_;
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BATCH_NORM_H_
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -65,7 +67,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
std::vector<index_t> input_shape_;
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BIAS_ADD_H_
......@@ -13,13 +13,14 @@ namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
bool i2b_;
};
template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
......@@ -30,14 +31,15 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output,
StatsFuture *future);
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
......@@ -13,7 +15,7 @@ namespace kernels {
template <DeviceType D, typename T>
struct ChannelShuffleFunctor {
ChannelShuffleFunctor(const int groups) : groups_(groups) {}
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input,
Tensor *output,
......@@ -49,7 +51,7 @@ struct ChannelShuffleFunctor {
template <typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
ChannelShuffleFunctor(const int groups) : groups_(groups) {}
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -15,14 +17,14 @@ namespace mace {
namespace kernels {
struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_;
};
template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -77,7 +79,7 @@ struct ConcatFunctor : ConcatFunctorBase {
template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output,
......@@ -86,7 +88,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
std::vector<index_t> input_shape_;
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CONCAT_H_
......@@ -8,6 +8,8 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -18,7 +20,6 @@
namespace mace {
namespace kernels {
namespace {
template <typename T,
int inc_tile_size,
......@@ -61,9 +62,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// AArch64 NEON has 32 128-bit general purpose registers
static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t in[h_count * w_count];
float32x4_t in[h_count * w_count]; // NOLINT(runtime/arrays)
#else
T in[h_count * w_count * inc_tile_size];
T in[h_count * w_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
......@@ -86,9 +87,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t weights[c_count];
float32x4_t weights[c_count]; // NOLINT(runtime/arrays)
#else
T weights[c_count * inc_tile_size];
T weights[c_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
......@@ -126,7 +127,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
}
// handling the remaining input channels
for (; inc < input_channels; ++inc) {
T in[h_count * w_count];
T in[h_count * w_count]; // NOLINT(runtime/arrays)
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
const int in_idx = hi * w_count + wi;
......@@ -138,7 +139,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
}
}
T weights[c_count];
T weights[c_count]; // NOLINT(runtime/arrays)
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
const int filter_offset =
......@@ -173,7 +174,6 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
}
}
}
}; // namespace
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
......@@ -331,7 +331,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
auto output_data = output->mutable_data<T>();
constexpr int inc_tile_size = 4;
// TODO Auto tuning these parameters
// TODO(heliangliang) Auto tuning these parameters
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
const int c_tile_size = 4;
const int h_tile_size = 2;
......
......@@ -4,6 +4,8 @@
#include "mace/kernels/conv_pool_2d_util.h"
#include <vector>
namespace mace {
namespace kernels {
......@@ -56,7 +58,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
}
// Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to
// TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
......@@ -120,7 +122,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
}
// Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to
// TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
......@@ -219,7 +221,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
}
// Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to
// TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size[0] = std::max<int>(
......
......@@ -8,6 +8,8 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -17,8 +19,6 @@
namespace mace {
namespace kernels {
namespace {
template <typename T>
void DepthwiseConv2dKernel(const T *input_ptr,
const T *filter_ptr,
......@@ -233,8 +233,6 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
}
}
} // namespace
struct DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctorBase(const int *strides,
const Padding padding_type,
......
......@@ -4,6 +4,9 @@
#ifndef MACE_KERNELS_ELTWISE_H_
#define MACE_KERNELS_ELTWISE_H_
#include <algorithm>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_FULLY_CONNECTED_H_
#define MACE_KERNELS_FULLY_CONNECTED_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......
......@@ -21,7 +21,6 @@
namespace mace {
namespace kernels {
namespace {
template<typename T,
int register_tile_size,
int h_count,
......@@ -87,7 +86,6 @@ inline void MatMulKernelFunc(const T *A,
}
}
}
} // namespace
#define MACE_DO_MATMUL(HC, WC, KC) \
MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \
......@@ -118,7 +116,6 @@ switch (k_count) { \
LOG(FATAL) << "Unsupported k tile: " << k_count; \
}
#define MACE_CASE_W_MATMUL(HC) \
switch (w_count) { \
case 1: \
......
......@@ -78,7 +78,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
}
}
}
};
}
} // namespace kernels
} // namespace mace
......@@ -296,7 +296,7 @@ void Conv2dNeonK1x1S1(const float *input, // NCHW
}
}
}
};
}
void Conv2dNeonPixelK1x1S1(
const float *input, // NCHW
......@@ -321,7 +321,7 @@ void Conv2dNeonPixelK1x1S1(
const index_t total_pixels = height * width;
// Process 4 * 2 = 8 pixels for each innermost loop
// TODO Does 64 bit v.s. 32 bit index matters? need benchmark
// TODO(heliangliang): Does 64 bit v.s. 32 bit index matters? need benchmark
const index_t total_loops = total_pixels >> 3;
const index_t loop_remaining = total_pixels & 7;
......@@ -329,7 +329,7 @@ void Conv2dNeonPixelK1x1S1(
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < channels; ++c) {
const float *filter_ptr = filter + c * input_channels;
// TODO Will GCC opt these out?
// TODO(heliangliang): Will GCC opt these out?
float *channel_output_start =
output + n * channels * height * width + c * height * width;
const float *input_ptr =
......@@ -469,7 +469,7 @@ void Conv2dNeonPixelK1x1S1(
}
}
}
};
}
} // namespace kernels
} // namespace mace
......@@ -45,7 +45,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
......@@ -56,7 +55,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
......@@ -75,7 +75,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
}
template struct AddNFunctor<DeviceType::OPENCL, float>;
......
......@@ -32,7 +32,6 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
......
......@@ -14,7 +14,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape);
......
......@@ -39,7 +39,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
......@@ -61,7 +62,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
template
......
......@@ -41,7 +41,6 @@ static void Concat2(cl::Kernel *kernel,
built_options.emplace("-DDIVISIBLE_FOUR");
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
......@@ -140,7 +139,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output->ResizeImage(output_shape, image_shape);
switch (inputs_count) {
......@@ -155,7 +154,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
MACE_NOT_IMPLEMENTED;
}
}
};
}
template struct ConcatFunctor<DeviceType::OPENCL, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>;
......
......@@ -92,7 +92,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
if (kernel_h == kernel_w && kernel_h <= 5 &&
......
......@@ -68,7 +68,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
......
......@@ -91,18 +91,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width));
kernel->setArg(idx++, static_cast<short>(input_channel_blocks));
kernel->setArg(idx++, static_cast<short>(height));
kernel->setArg(idx++, static_cast<short>(width));
kernel->setArg(idx++, static_cast<short>(filter_height));
kernel->setArg(idx++, static_cast<short>(filter_width));
kernel->setArg(idx++, static_cast<short>(paddings[0] / 2));
kernel->setArg(idx++, static_cast<short>(paddings[1] / 2));
kernel->setArg(idx++, static_cast<int16_t>(input_height));
kernel->setArg(idx++, static_cast<int16_t>(input_width));
kernel->setArg(idx++, static_cast<int16_t>(input_channel_blocks));
kernel->setArg(idx++, static_cast<int16_t>(height));
kernel->setArg(idx++, static_cast<int16_t>(width));
kernel->setArg(idx++, static_cast<int16_t>(filter_height));
kernel->setArg(idx++, static_cast<int16_t>(filter_width));
kernel->setArg(idx++, static_cast<int16_t>(paddings[0] / 2));
kernel->setArg(idx++, static_cast<int16_t>(paddings[1] / 2));
if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) {
kernel->setArg(idx++, static_cast<short>(dilations[0]));
kernel->setArg(idx++, static_cast<short>(dilations[1]));
kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
}
*prev_input_shape = input->shape();
}
......@@ -159,7 +159,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
......
......@@ -35,7 +35,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
......
......@@ -16,12 +16,14 @@ void FCWXKernel(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
std::vector<uint32_t> &lws,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future) {
MACE_CHECK(input->dim(3) % 4 == 0)
<< "FC width kernel only support input with 4x channel.";
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) {
......@@ -62,12 +64,11 @@ void FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size);
const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel);
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
*gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel);
const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
lws = {gws[0], gws[1], inter_local_blks};
const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
*lws = {(*gws)[0], (*gws)[1], inter_local_blks};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0);
......@@ -80,21 +81,22 @@ void FCWXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, *(bias->opencl_image()));
}
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, (lws[0] * lws[1] * lws[2] * sizeof(float)), nullptr);
kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
nullptr);
kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2)));
kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel->setArg(idx++, static_cast<int>(output_blocks));
kernel->setArg(idx++, relux_max_limit);
gws[2] = static_cast<uint32_t>(batch * output_blocks);
(*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
*prev_input_shape = input->shape();
}
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
*kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) {
......@@ -105,7 +107,6 @@ void FCWXKernel(cl::Kernel *kernel,
}
};
}
}
template <typename T>
......@@ -116,10 +117,12 @@ void FCWTXKernel(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
std::vector<uint32_t> &lws,
std::vector<uint32_t> *gws,
std::vector<uint32_t> *lws,
const float relux_max_limit,
StatsFuture *future) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
if (kernel->get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
......@@ -152,7 +155,7 @@ void FCWTXKernel(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
lws = {16, 64, 1};
*lws = {16, 64, 1};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
......@@ -171,18 +174,16 @@ void FCWTXKernel(cl::Kernel *kernel,
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws = {
*gws = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
};
*prev_input_shape = input->shape();
}
std::stringstream ss;
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(*kernel, ss.str(), gws.data(), lws, future);
TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
}
template <typename T>
......@@ -194,17 +195,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
activation_, &gws_, &lws_, relux_max_limit_, future);
} else {
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
activation_, &gws_, &lws_, relux_max_limit_, future);
}
};
}
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
......
......@@ -3,6 +3,11 @@
//
#include "mace/kernels/opencl/helper.h"
#include <algorithm>
#include <string>
#include <vector>
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
......@@ -11,91 +16,92 @@ namespace kernels {
// [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
image_shape[1] = shape[0] * shape[1];
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
(*image_shape)[1] = shape[0] * shape[1];
}
// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
image_shape[1] = RoundUpDiv4(shape[2]);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
(*image_shape)[1] = RoundUpDiv4(shape[2]);
}
// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3];
image_shape[1] = RoundUpDiv4(shape[2]);
image_shape->resize(2);
(*image_shape)[0] = shape[0] * shape[1] * shape[3];
(*image_shape)[1] = RoundUpDiv4(shape[2]);
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[0]);
image_shape[1] = 1;
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[0]);
(*image_shape)[1] = 1;
}
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]);
image_shape[1] = (shape[0] << 4);
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = (shape[0] << 4);
}
// [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[2] * shape[3];
image_shape[1] = shape[0] * RoundUpDiv4(shape[1]);
image_shape->resize(2);
(*image_shape)[0] = shape[2] * shape[3];
(*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
}
// [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[2]) * shape[3];
image_shape[1] = shape[0] * shape[1];
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
(*image_shape)[1] = shape[0] * shape[1];
}
// [W, (H + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 2);
image_shape.resize(2);
image_shape[0] = shape[1];
image_shape[1] = RoundUpDiv4(shape[0]);
image_shape->resize(2);
(*image_shape)[0] = shape[1];
(*image_shape)[1] = RoundUpDiv4(shape[0]);
}
// [(W + 3) / 4, H]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 2);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[1]);
image_shape[1] = shape[0];
image_shape->resize(2);
(*image_shape)[0] = RoundUpDiv4(shape[1]);
(*image_shape)[1] = shape[0];
}
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape) {
std::vector<size_t> *image_shape) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) {
case CONV2D_FILTER:
CalConv2dFilterImageShape(shape, image_shape);
......@@ -188,7 +194,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
void TuningOrRun3DKernel(cl::Kernel &kernel,
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
......@@ -202,7 +208,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
// TODO tuning these magic numbers
// TODO(heliangliang): tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
......@@ -291,7 +297,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
}
}
void TuningOrRun2DKernel(cl::Kernel &kernel,
void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
......
......@@ -5,6 +5,9 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include <string>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
......@@ -30,7 +33,7 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
std::vector<size_t> *image_shape);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type);
......@@ -43,13 +46,13 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
void TuningOrRun3DKernel(cl::Kernel &kernel,
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel,
void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
const std::vector<uint32_t> &lws,
......@@ -78,7 +81,6 @@ bool IsVecEqual(const std::vector<T> &input0,
(std::equal(input0.begin(), input0.end(), input1.begin())));
}
namespace {
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v;
......@@ -92,7 +94,6 @@ void AppendToStream(std::stringstream *ss,
(*ss) << first << delimiter;
AppendToStream(ss, delimiter, args...);
}
} // namespace
template <typename... Args>
std::string Concat(Args... args) {
......
......@@ -17,7 +17,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
C->ResizeImage(c_shape, c_image_shape);
const index_t batch = C->dim(0);
......@@ -56,7 +56,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};
}
template struct MatMulFunctor<DeviceType::OPENCL, float>;
......
......@@ -36,12 +36,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DPOOL_AVG");
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
input->dim(3), input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
......@@ -50,12 +49,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
CalcOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
......
......@@ -34,7 +34,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
......@@ -42,7 +41,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
output_image_shape);
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
float height_scale =
......@@ -60,7 +59,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -24,7 +24,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
input->dim(2), output_channels});
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) {
output_list[i]->ResizeImage(output_shape, image_shape);
}
......
......@@ -33,7 +33,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
......
......@@ -22,7 +22,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
StatsFuture *future) {
const char *kernel_name = nullptr;
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
if (b2s_) {
space_tensor->ResizeImage(output_shape, output_image_shape);
kernel_name = "batch_to_space";
......@@ -42,7 +43,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
......
......@@ -27,7 +27,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
......@@ -49,7 +48,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0;
......@@ -83,7 +82,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
......@@ -125,7 +123,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
const uint32_t round_h = (height_ + 1) / 2;
......
......@@ -2,10 +2,13 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_POOLING_H
#define MACE_KERNELS_POOLING_H
#ifndef MACE_KERNELS_POOLING_H_
#define MACE_KERNELS_POOLING_H_
#include <algorithm>
#include <limits>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -188,4 +191,4 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_POOLING_H
#endif // MACE_KERNELS_POOLING_H_
......@@ -4,6 +4,8 @@
#ifndef MACE_KERNELS_RESHAPE_H_
#define MACE_KERNELS_RESHAPE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -20,7 +22,7 @@ struct ReshapeFunctor {
Tensor *output,
StatsFuture *future) {
output->Resize(out_shape);
// TODO copy on write to avoid this copy.
// TODO(liuqi): copy on write to avoid this copy.
output->CopyBytes(input->raw_data(), input->size() * sizeof(T));
}
};
......
......@@ -4,6 +4,9 @@
#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include <algorithm>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -11,7 +14,6 @@
namespace mace {
namespace kernels {
namespace {
struct CachedInterpolation {
index_t lower; // Lower source index used in the interpolation
index_t upper; // Upper source index used in the interpolation
......@@ -101,7 +103,6 @@ void ResizeImage(const T *images,
}
}
}
}
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
......
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_SLICE_H_
#define MACE_KERNELS_SLICE_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -16,7 +18,6 @@ namespace kernels {
template<DeviceType D, typename T>
struct SliceFunctor {
void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future) {
......@@ -56,15 +57,13 @@ struct SliceFunctor {
template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list,
StatsFuture *future);
cl::Kernel kernel_;
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_SLICE_H_
......@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_SOFTMAX_H_
#define MACE_KERNELS_SOFTMAX_H_
#include <algorithm>
#include <functional>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......@@ -38,7 +42,7 @@ struct SoftmaxFunctor {
for (index_t c = 1; c < num_classes; ++c) {
max_value = std::max(max_value, logits_ptr[pos + c]);
}
// TODO: check overflow?
// TODO(liuqi): check overflow?
T sum = 0;
for (index_t c = 0; c < num_classes; ++c) {
exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
......@@ -60,7 +64,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
std::vector<index_t> input_shape_;
};
} // namepsace kernels
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_SOFTMAX_H_
......@@ -2,8 +2,10 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_CONV_2D_H_
#define MACE_KERNELS_CONV_2D_H_
#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
#define MACE_KERNELS_SPACE_TO_BATCH_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -60,4 +62,4 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CONV_2D_H_
#endif // MACE_KERNELS_SPACE_TO_BATCH_H_
......@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册