提交 16703420 编写于 作者: W wuchenghui

fix cpplint for mace/kernels

上级 564833a5
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_ACTIVATION_H_ #ifndef MACE_KERNELS_ACTIVATION_H_
#define MACE_KERNELS_ACTIVATION_H_ #define MACE_KERNELS_ACTIVATION_H_
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <algorithm>
#include <vector> #include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
...@@ -17,9 +18,7 @@ ...@@ -17,9 +18,7 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
constexpr int kCostPerGroup = 1024; constexpr int kCostPerGroup = 1024;
} // namespace
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct AddNFunctor { struct AddNFunctor {
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -159,7 +160,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase { ...@@ -159,7 +160,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_BATCH_NORM_H_ #endif // MACE_KERNELS_BATCH_NORM_H_
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_ #ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_ #define MACE_KERNELS_BIAS_ADD_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -65,7 +67,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> { ...@@ -65,7 +67,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_BIAS_ADD_H_ #endif // MACE_KERNELS_BIAS_ADD_H_
...@@ -13,13 +13,14 @@ namespace mace { ...@@ -13,13 +13,14 @@ namespace mace {
namespace kernels { namespace kernels {
struct BufferToImageFunctorBase { struct BufferToImageFunctorBase {
BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {} explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
bool i2b_; bool i2b_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase { struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {} explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
...@@ -30,14 +31,15 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { ...@@ -30,14 +31,15 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
template <typename T> template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase { struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {} explicit BufferToImageFunctor(bool i2b = false)
: BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input, void operator()(Tensor *input,
const BufferType type, const BufferType type,
Tensor *output, Tensor *output,
StatsFuture *future); StatsFuture *future);
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_ #endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_ #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_ #define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -13,7 +15,7 @@ namespace kernels { ...@@ -13,7 +15,7 @@ namespace kernels {
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ChannelShuffleFunctor { struct ChannelShuffleFunctor {
ChannelShuffleFunctor(const int groups) : groups_(groups) {} explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, void operator()(const Tensor *input,
Tensor *output, Tensor *output,
...@@ -49,7 +51,7 @@ struct ChannelShuffleFunctor { ...@@ -49,7 +51,7 @@ struct ChannelShuffleFunctor {
template <typename T> template <typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> { struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
ChannelShuffleFunctor(const int groups) : groups_(groups) {} explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future); void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_CONCAT_H_ #ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_ #define MACE_KERNELS_CONCAT_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -15,14 +17,14 @@ namespace mace { ...@@ -15,14 +17,14 @@ namespace mace {
namespace kernels { namespace kernels {
struct ConcatFunctorBase { struct ConcatFunctorBase {
ConcatFunctorBase(const int32_t axis) : axis_(axis) {} explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
int32_t axis_; int32_t axis_;
}; };
template <DeviceType D, typename T> template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase { struct ConcatFunctor : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -77,7 +79,7 @@ struct ConcatFunctor : ConcatFunctorBase { ...@@ -77,7 +79,7 @@ struct ConcatFunctor : ConcatFunctorBase {
template <typename T> template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {} explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
void operator()(const std::vector<const Tensor *> &input_list, void operator()(const std::vector<const Tensor *> &input_list,
Tensor *output, Tensor *output,
...@@ -86,7 +88,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase { ...@@ -86,7 +88,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_CONCAT_H_ #endif // MACE_KERNELS_CONCAT_H_
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <algorithm>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -18,7 +20,6 @@ ...@@ -18,7 +20,6 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
template <typename T, template <typename T,
int inc_tile_size, int inc_tile_size,
...@@ -61,9 +62,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -61,9 +62,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// AArch64 NEON has 32 128-bit general purpose registers // AArch64 NEON has 32 128-bit general purpose registers
static_assert(inc_tile_size == 4, "input channels tile size must be 4"); static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t in[h_count * w_count]; float32x4_t in[h_count * w_count]; // NOLINT(runtime/arrays)
#else #else
T in[h_count * w_count * inc_tile_size]; T in[h_count * w_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif #endif
for (int hi = 0; hi < h_count; ++hi) { for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) { for (int wi = 0; wi < w_count; ++wi) {
...@@ -86,9 +87,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -86,9 +87,9 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(inc_tile_size == 4, "input channels tile size must be 4"); static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t weights[c_count]; float32x4_t weights[c_count]; // NOLINT(runtime/arrays)
#else #else
T weights[c_count * inc_tile_size]; T weights[c_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif #endif
for (int ci = 0; ci < c_count; ++ci) { for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci; const int weights_idx = ci;
...@@ -126,7 +127,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -126,7 +127,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
} }
// handling the remaining input channels // handling the remaining input channels
for (; inc < input_channels; ++inc) { for (; inc < input_channels; ++inc) {
T in[h_count * w_count]; T in[h_count * w_count]; // NOLINT(runtime/arrays)
for (int hi = 0; hi < h_count; ++hi) { for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) { for (int wi = 0; wi < w_count; ++wi) {
const int in_idx = hi * w_count + wi; const int in_idx = hi * w_count + wi;
...@@ -138,7 +139,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -138,7 +139,7 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
} }
} }
T weights[c_count]; T weights[c_count]; // NOLINT(runtime/arrays)
for (int ci = 0; ci < c_count; ++ci) { for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci; const int weights_idx = ci;
const int filter_offset = const int filter_offset =
...@@ -173,7 +174,6 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start ...@@ -173,7 +174,6 @@ void Conv2dKernelFunc(const T *input_ptr, // batch start
} }
} }
} }
}; // namespace
struct Conv2dFunctorBase { struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides, Conv2dFunctorBase(const int *strides,
...@@ -331,7 +331,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -331,7 +331,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
auto output_data = output->mutable_data<T>(); auto output_data = output->mutable_data<T>();
constexpr int inc_tile_size = 4; constexpr int inc_tile_size = 4;
// TODO Auto tuning these parameters // TODO(heliangliang) Auto tuning these parameters
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
const int c_tile_size = 4; const int c_tile_size = 4;
const int h_tile_size = 2; const int h_tile_size = 2;
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/conv_pool_2d_util.h"
#include <vector>
namespace mace { namespace mace {
namespace kernels { namespace kernels {
...@@ -56,7 +58,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW ...@@ -56,7 +58,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
} }
// Note: TensorFlow may padded one more on the right/bottom side // Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to // TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark // utilize the more centered features. We need to benchmark
// based on the model accuracy. // based on the model accuracy.
...@@ -120,7 +122,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC ...@@ -120,7 +122,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NHWC
} }
// Note: TensorFlow may padded one more on the right/bottom side // Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to // TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark // utilize the more centered features. We need to benchmark
// based on the model accuracy. // based on the model accuracy.
...@@ -219,7 +221,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW ...@@ -219,7 +221,7 @@ void CalPaddingSize(const index_t *input_shape, // NCHW
} }
// Note: TensorFlow may padded one more on the right/bottom side // Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to // TODO(liuqi): may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark // utilize the more centered features. We need to benchmark
// based on the model accuracy. // based on the model accuracy.
padding_size[0] = std::max<int>( padding_size[0] = std::max<int>(
......
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <algorithm>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -17,8 +19,6 @@ ...@@ -17,8 +19,6 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
template <typename T> template <typename T>
void DepthwiseConv2dKernel(const T *input_ptr, void DepthwiseConv2dKernel(const T *input_ptr,
const T *filter_ptr, const T *filter_ptr,
...@@ -233,8 +233,6 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr, ...@@ -233,8 +233,6 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
} }
} }
} // namespace
struct DepthwiseConv2dFunctorBase { struct DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctorBase(const int *strides, DepthwiseConv2dFunctorBase(const int *strides,
const Padding padding_type, const Padding padding_type,
......
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#ifndef MACE_KERNELS_ELTWISE_H_ #ifndef MACE_KERNELS_ELTWISE_H_
#define MACE_KERNELS_ELTWISE_H_ #define MACE_KERNELS_ELTWISE_H_
#include <algorithm>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_FULLY_CONNECTED_H_ #ifndef MACE_KERNELS_FULLY_CONNECTED_H_
#define MACE_KERNELS_FULLY_CONNECTED_H_ #define MACE_KERNELS_FULLY_CONNECTED_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
......
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
template<typename T, template<typename T,
int register_tile_size, int register_tile_size,
int h_count, int h_count,
...@@ -87,7 +86,6 @@ inline void MatMulKernelFunc(const T *A, ...@@ -87,7 +86,6 @@ inline void MatMulKernelFunc(const T *A,
} }
} }
} }
} // namespace
#define MACE_DO_MATMUL(HC, WC, KC) \ #define MACE_DO_MATMUL(HC, WC, KC) \
MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \ MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \
...@@ -118,7 +116,6 @@ switch (k_count) { \ ...@@ -118,7 +116,6 @@ switch (k_count) { \
LOG(FATAL) << "Unsupported k tile: " << k_count; \ LOG(FATAL) << "Unsupported k tile: " << k_count; \
} }
#define MACE_CASE_W_MATMUL(HC) \ #define MACE_CASE_W_MATMUL(HC) \
switch (w_count) { \ switch (w_count) { \
case 1: \ case 1: \
......
...@@ -78,7 +78,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()( ...@@ -78,7 +78,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
} }
} }
} }
}; }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -296,7 +296,7 @@ void Conv2dNeonK1x1S1(const float *input, // NCHW ...@@ -296,7 +296,7 @@ void Conv2dNeonK1x1S1(const float *input, // NCHW
} }
} }
} }
}; }
void Conv2dNeonPixelK1x1S1( void Conv2dNeonPixelK1x1S1(
const float *input, // NCHW const float *input, // NCHW
...@@ -321,7 +321,7 @@ void Conv2dNeonPixelK1x1S1( ...@@ -321,7 +321,7 @@ void Conv2dNeonPixelK1x1S1(
const index_t total_pixels = height * width; const index_t total_pixels = height * width;
// Process 4 * 2 = 8 pixels for each innermost loop // Process 4 * 2 = 8 pixels for each innermost loop
// TODO Does 64 bit v.s. 32 bit index matters? need benchmark // TODO(heliangliang): Does 64 bit v.s. 32 bit index matters? need benchmark
const index_t total_loops = total_pixels >> 3; const index_t total_loops = total_pixels >> 3;
const index_t loop_remaining = total_pixels & 7; const index_t loop_remaining = total_pixels & 7;
...@@ -329,7 +329,7 @@ void Conv2dNeonPixelK1x1S1( ...@@ -329,7 +329,7 @@ void Conv2dNeonPixelK1x1S1(
for (index_t n = 0; n < batch; ++n) { for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < channels; ++c) { for (index_t c = 0; c < channels; ++c) {
const float *filter_ptr = filter + c * input_channels; const float *filter_ptr = filter + c * input_channels;
// TODO Will GCC opt these out? // TODO(heliangliang): Will GCC opt these out?
float *channel_output_start = float *channel_output_start =
output + n * channels * height * width + c * height * width; output + n * channels * height * width + c * height * width;
const float *input_ptr = const float *input_ptr =
...@@ -469,7 +469,7 @@ void Conv2dNeonPixelK1x1S1( ...@@ -469,7 +469,7 @@ void Conv2dNeonPixelK1x1S1(
} }
} }
} }
}; }
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
...@@ -45,7 +45,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -45,7 +45,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options); kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
} }
std::vector<index_t> output_shape = input_tensors[0]->shape(); std::vector<index_t> output_shape = input_tensors[0]->shape();
...@@ -56,7 +55,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -56,7 +55,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) { if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
...@@ -75,7 +75,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -75,7 +75,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3]; << "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; }
template struct AddNFunctor<DeviceType::OPENCL, float>; template struct AddNFunctor<DeviceType::OPENCL, float>;
......
...@@ -32,7 +32,6 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -32,7 +32,6 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options); kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -14,7 +14,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -14,7 +14,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, &image_shape);
if (type == WINOGRAD_FILTER) { if (type == WINOGRAD_FILTER) {
std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type); std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
image->ResizeImage(new_shape, image_shape); image->ResizeImage(new_shape, image_shape);
......
...@@ -39,7 +39,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -39,7 +39,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto dt = DataTypeToEnum<T>::value; auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options); kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
built_options);
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
...@@ -61,7 +62,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -61,7 +62,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
<< output->dim(2) << "_" << output->dim(2) << "_"
<< output->dim(3); << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
} }
template template
......
...@@ -41,7 +41,6 @@ static void Concat2(cl::Kernel *kernel, ...@@ -41,7 +41,6 @@ static void Concat2(cl::Kernel *kernel,
built_options.emplace("-DDIVISIBLE_FOUR"); built_options.emplace("-DDIVISIBLE_FOUR");
} }
*kernel = runtime->BuildKernel("concat", kernel_name, built_options); *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
} }
if (!IsVecEqual(*prev_input_shape, input0->shape())) { if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
...@@ -140,7 +139,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -140,7 +139,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
inputs_count == 2 || divisible_four, inputs_count == 2 || divisible_four,
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."); "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output->ResizeImage(output_shape, image_shape); output->ResizeImage(output_shape, image_shape);
switch (inputs_count) { switch (inputs_count) {
...@@ -155,7 +154,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -155,7 +154,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
} }
}; }
template struct ConcatFunctor<DeviceType::OPENCL, float>; template struct ConcatFunctor<DeviceType::OPENCL, float>;
template struct ConcatFunctor<DeviceType::OPENCL, half>; template struct ConcatFunctor<DeviceType::OPENCL, half>;
......
...@@ -92,7 +92,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -92,7 +92,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
if (kernel_h == kernel_w && kernel_h <= 5 && if (kernel_h == kernel_w && kernel_h <= 5 &&
......
...@@ -68,7 +68,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, ...@@ -68,7 +68,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options); *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -91,18 +91,18 @@ void DepthwiseConv2d(cl::Kernel *kernel, ...@@ -91,18 +91,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
} }
kernel->setArg(idx++, *(output->opencl_image())); kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, static_cast<short>(input_height)); kernel->setArg(idx++, static_cast<int16_t>(input_height));
kernel->setArg(idx++, static_cast<short>(input_width)); kernel->setArg(idx++, static_cast<int16_t>(input_width));
kernel->setArg(idx++, static_cast<short>(input_channel_blocks)); kernel->setArg(idx++, static_cast<int16_t>(input_channel_blocks));
kernel->setArg(idx++, static_cast<short>(height)); kernel->setArg(idx++, static_cast<int16_t>(height));
kernel->setArg(idx++, static_cast<short>(width)); kernel->setArg(idx++, static_cast<int16_t>(width));
kernel->setArg(idx++, static_cast<short>(filter_height)); kernel->setArg(idx++, static_cast<int16_t>(filter_height));
kernel->setArg(idx++, static_cast<short>(filter_width)); kernel->setArg(idx++, static_cast<int16_t>(filter_width));
kernel->setArg(idx++, static_cast<short>(paddings[0] / 2)); kernel->setArg(idx++, static_cast<int16_t>(paddings[0] / 2));
kernel->setArg(idx++, static_cast<short>(paddings[1] / 2)); kernel->setArg(idx++, static_cast<int16_t>(paddings[1] / 2));
if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) { if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) {
kernel->setArg(idx++, static_cast<short>(dilations[0])); kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
kernel->setArg(idx++, static_cast<short>(dilations[1])); kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
} }
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
...@@ -159,7 +159,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -159,7 +159,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
......
...@@ -35,7 +35,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0, ...@@ -35,7 +35,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options); kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
} }
if (!IsVecEqual(input_shape_, input0->shape())) { if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -16,12 +16,14 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -16,12 +16,14 @@ void FCWXKernel(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> &gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> &lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(input->dim(3) % 4 == 0) MACE_CHECK(input->dim(3) % 4 == 0)
<< "FC width kernel only support input with 4x channel."; << "FC width kernel only support input with 4x channel.";
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
...@@ -62,12 +64,11 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -62,12 +64,11 @@ void FCWXKernel(cl::Kernel *kernel,
const index_t output_blocks = RoundUpDiv4(output_size); const index_t output_blocks = RoundUpDiv4(output_size);
const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel); const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel);
gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)}; *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel);
const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]); const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
lws = {gws[0], gws[1], inter_local_blks}; *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
...@@ -80,21 +81,22 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -80,21 +81,22 @@ void FCWXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, *(bias->opencl_image())); kernel->setArg(idx++, *(bias->opencl_image()));
} }
kernel->setArg(idx++, *(output->opencl_image())); kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, (lws[0] * lws[1] * lws[2] * sizeof(float)), nullptr); kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
nullptr);
kernel->setArg(idx++, static_cast<int>(input->dim(1))); kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2))); kernel->setArg(idx++, static_cast<int>(input->dim(2)));
kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3)))); kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel->setArg(idx++, static_cast<int>(output_blocks)); kernel->setArg(idx++, static_cast<int>(output_blocks));
kernel->setArg(idx++, relux_max_limit); kernel->setArg(idx++, relux_max_limit);
gws[2] = static_cast<uint32_t>(batch * output_blocks); (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
cl::Event event; cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
if (future != nullptr) { if (future != nullptr) {
...@@ -105,7 +107,6 @@ void FCWXKernel(cl::Kernel *kernel, ...@@ -105,7 +107,6 @@ void FCWXKernel(cl::Kernel *kernel,
} }
}; };
} }
} }
template <typename T> template <typename T>
...@@ -116,10 +117,12 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -116,10 +117,12 @@ void FCWTXKernel(cl::Kernel *kernel,
std::vector<index_t> *prev_input_shape, std::vector<index_t> *prev_input_shape,
Tensor *output, Tensor *output,
const ActivationType activation, const ActivationType activation,
std::vector<uint32_t> &gws, std::vector<uint32_t> *gws,
std::vector<uint32_t> &lws, std::vector<uint32_t> *lws,
const float relux_max_limit, const float relux_max_limit,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK_NOTNULL(gws);
MACE_CHECK_NOTNULL(lws);
if (kernel->get() == nullptr) { if (kernel->get() == nullptr) {
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options; std::set<std::string> built_options;
...@@ -152,7 +155,7 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -152,7 +155,7 @@ void FCWTXKernel(cl::Kernel *kernel,
*kernel = *kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options); runtime->BuildKernel("fully_connected", kernel_name, built_options);
lws = {16, 64, 1}; *lws = {16, 64, 1};
} }
if (!IsVecEqual(*prev_input_shape, input->shape())) { if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
...@@ -171,18 +174,16 @@ void FCWTXKernel(cl::Kernel *kernel, ...@@ -171,18 +174,16 @@ void FCWTXKernel(cl::Kernel *kernel,
const index_t batch = output->dim(0); const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3)); const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws = { *gws = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks), static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
}; };
*prev_input_shape = input->shape(); *prev_input_shape = input->shape();
} }
std::stringstream ss; std::stringstream ss;
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3); << output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(*kernel, ss.str(), gws.data(), lws, future); TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
} }
template <typename T> template <typename T>
...@@ -194,17 +195,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -194,17 +195,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
if (weight_type_ == BufferType::WEIGHT_HEIGHT) { if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future); activation_, &gws_, &lws_, relux_max_limit_, future);
} else { } else {
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output, FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future); activation_, &gws_, &lws_, relux_max_limit_, future);
} }
}; }
template struct FullyConnectedFunctor<DeviceType::OPENCL, float>; template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
......
...@@ -3,6 +3,11 @@ ...@@ -3,6 +3,11 @@
// //
#include "mace/kernels/opencl/helper.h" #include "mace/kernels/opencl/helper.h"
#include <algorithm>
#include <string>
#include <vector>
#include "mace/utils/tuner.h" #include "mace/utils/tuner.h"
#include "mace/utils/utils.h" #include "mace/utils/utils.h"
...@@ -11,91 +16,92 @@ namespace kernels { ...@@ -11,91 +16,92 @@ namespace kernels {
// [(C + 3) / 4 * W, N * H] // [(C + 3) / 4 * W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2]; (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
image_shape[1] = shape[0] * shape[1]; (*image_shape)[1] = shape[0] * shape[1];
} }
// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4] // [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4); (*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
image_shape[1] = RoundUpDiv4(shape[2]); (*image_shape)[1] = RoundUpDiv4(shape[2]);
} }
// [H * W * M, (Ic + 3) / 4] // [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape( void CalDepthwiseConv2dFilterImageShape(
const std::vector<index_t> &shape, /* HWIM */ const std::vector<index_t> &shape, /* HWIM */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = shape[0] * shape[1] * shape[3]; (*image_shape)[0] = shape[0] * shape[1] * shape[3];
image_shape[1] = RoundUpDiv4(shape[2]); (*image_shape)[1] = RoundUpDiv4(shape[2]);
} }
// [(size + 3) / 4, 1] // [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape, void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 1); MACE_CHECK(shape.size() == 1);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = RoundUpDiv4(shape[0]); (*image_shape)[0] = RoundUpDiv4(shape[0]);
image_shape[1] = 1; (*image_shape)[1] = 1;
} }
// Only support 3x3 now // Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc] // [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape( void CalWinogradFilterImageShape(
const std::vector<index_t> &shape, /* Oc, Ic, H, W*/ const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = RoundUpDiv4(shape[1]); (*image_shape)[0] = RoundUpDiv4(shape[1]);
image_shape[1] = (shape[0] << 4); (*image_shape)[1] = (shape[0] << 4);
} }
// [W * C, N * RoundUp<4>(H)] // [W * C, N * RoundUp<4>(H)]
void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = shape[2] * shape[3]; (*image_shape)[0] = shape[2] * shape[3];
image_shape[1] = shape[0] * RoundUpDiv4(shape[1]); (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
} }
// [RoundUp<4>(W) * C, N * H] // [RoundUp<4>(W) * C, N * H]
void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */ void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 4); MACE_CHECK(shape.size() == 4);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = RoundUpDiv4(shape[2]) * shape[3]; (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
image_shape[1] = shape[0] * shape[1]; (*image_shape)[1] = shape[0] * shape[1];
} }
// [W, (H + 3) / 4] // [W, (H + 3) / 4]
void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */ void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 2); MACE_CHECK(shape.size() == 2);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = shape[1]; (*image_shape)[0] = shape[1];
image_shape[1] = RoundUpDiv4(shape[0]); (*image_shape)[1] = RoundUpDiv4(shape[0]);
} }
// [(W + 3) / 4, H] // [(W + 3) / 4, H]
void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */ void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK(shape.size() == 2); MACE_CHECK(shape.size() == 2);
image_shape.resize(2); image_shape->resize(2);
image_shape[0] = RoundUpDiv4(shape[1]); (*image_shape)[0] = RoundUpDiv4(shape[1]);
image_shape[1] = shape[0]; (*image_shape)[1] = shape[0];
} }
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape) { std::vector<size_t> *image_shape) {
MACE_CHECK_NOTNULL(image_shape);
switch (type) { switch (type) {
case CONV2D_FILTER: case CONV2D_FILTER:
CalConv2dFilterImageShape(shape, image_shape); CalConv2dFilterImageShape(shape, image_shape);
...@@ -188,7 +194,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { ...@@ -188,7 +194,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
} }
} }
void TuningOrRun3DKernel(cl::Kernel &kernel, void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
...@@ -202,7 +208,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -202,7 +208,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
local_ws[2] = local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1])); std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return { return {
// TODO tuning these magic numbers // TODO(heliangliang): tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 1}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1}, {kwg_size / 32, 4, 8, 1},
...@@ -291,7 +297,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, ...@@ -291,7 +297,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
} }
} }
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
......
...@@ -5,6 +5,9 @@ ...@@ -5,6 +5,9 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_ #ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_ #define MACE_KERNELS_OPENCL_HELPER_H_
#include <string>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/runtime/opencl/opencl_runtime.h"
...@@ -30,7 +33,7 @@ enum BufferType { ...@@ -30,7 +33,7 @@ enum BufferType {
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type, const BufferType type,
std::vector<size_t> &image_shape); std::vector<size_t> *image_shape);
std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
const BufferType type); const BufferType type);
...@@ -43,13 +46,13 @@ std::string DtToCLDt(const DataType dt); ...@@ -43,13 +46,13 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt);
void TuningOrRun3DKernel(cl::Kernel &kernel, void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
StatsFuture *future); StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(const cl::Kernel &kernel,
const std::string tuning_key, const std::string tuning_key,
const uint32_t *gws, const uint32_t *gws,
const std::vector<uint32_t> &lws, const std::vector<uint32_t> &lws,
...@@ -78,7 +81,6 @@ bool IsVecEqual(const std::vector<T> &input0, ...@@ -78,7 +81,6 @@ bool IsVecEqual(const std::vector<T> &input0,
(std::equal(input0.begin(), input0.end(), input1.begin()))); (std::equal(input0.begin(), input0.end(), input1.begin())));
} }
namespace {
template <typename T> template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v; (*ss) << v;
...@@ -92,7 +94,6 @@ void AppendToStream(std::stringstream *ss, ...@@ -92,7 +94,6 @@ void AppendToStream(std::stringstream *ss,
(*ss) << first << delimiter; (*ss) << first << delimiter;
AppendToStream(ss, delimiter, args...); AppendToStream(ss, delimiter, args...);
} }
} // namespace
template <typename... Args> template <typename... Args>
std::string Concat(Args... args) { std::string Concat(Args... args) {
......
...@@ -17,7 +17,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -17,7 +17,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
StatsFuture *future) { StatsFuture *future) {
std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
std::vector<size_t> c_image_shape; std::vector<size_t> c_image_shape;
CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape); CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
C->ResizeImage(c_shape, c_image_shape); C->ResizeImage(c_shape, c_image_shape);
const index_t batch = C->dim(0); const index_t batch = C->dim(0);
...@@ -56,7 +56,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A, ...@@ -56,7 +56,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3); << C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}; }
template struct MatMulFunctor<DeviceType::OPENCL, float>; template struct MatMulFunctor<DeviceType::OPENCL, float>;
......
...@@ -36,12 +36,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -36,12 +36,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DPOOL_AVG"); built_options.emplace("-DPOOL_AVG");
} }
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3), std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
input->dim(3)}; input->dim(3), input->dim(3)};
std::vector<int> paddings(2); std::vector<int> paddings(2);
if (paddings_.empty()) { if (paddings_.empty()) {
...@@ -50,12 +49,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -50,12 +49,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
padding_type_, output_shape.data(), paddings.data()); padding_type_, output_shape.data(), paddings.data());
} else { } else {
paddings = paddings_; paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(), CalcOutputSize(input->shape().data(), filter_shape.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data()); paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
} }
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -34,7 +34,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -34,7 +34,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options); runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
} }
if (!IsVecEqual(input_shape_, input->shape())) { if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
...@@ -42,7 +41,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -42,7 +41,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
output_image_shape); &output_image_shape);
output->ResizeImage(output_shape, output_image_shape); output->ResizeImage(output_shape, output_image_shape);
float height_scale = float height_scale =
...@@ -60,7 +59,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -60,7 +59,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(out_height)); kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape(); input_shape_ = input->shape();
} }
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
...@@ -24,7 +24,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -24,7 +24,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
input->dim(2), output_channels}); input->dim(2), output_channels});
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
for (size_t i= 0; i < outputs_count; ++i) { for (size_t i= 0; i < outputs_count; ++i) {
output_list[i]->ResizeImage(output_shape, image_shape); output_list[i]->ResizeImage(output_shape, image_shape);
} }
......
...@@ -33,7 +33,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -33,7 +33,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
} }
if (!IsVecEqual(input_shape_, logits->shape())) { if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -22,7 +22,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -22,7 +22,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
StatsFuture *future) { StatsFuture *future) {
const char *kernel_name = nullptr; const char *kernel_name = nullptr;
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
&output_image_shape);
if (b2s_) { if (b2s_) {
space_tensor->ResizeImage(output_shape, output_image_shape); space_tensor->ResizeImage(output_shape, output_image_shape);
kernel_name = "batch_to_space"; kernel_name = "batch_to_space";
...@@ -42,7 +43,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -42,7 +43,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
DtToCLCMDDt(DataTypeToEnum<T>::value)); DtToCLCMDDt(DataTypeToEnum<T>::value));
kernel_ = kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options); runtime->BuildKernel("space_to_batch", kernel_name, built_options);
} }
if (!IsVecEqual(space_shape_, space_tensor->shape())) { if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0; uint32_t idx = 0;
......
...@@ -27,7 +27,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -27,7 +27,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global(); auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options); built_options);
} }
std::vector<index_t> output_shape(4); std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1}; std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
...@@ -49,7 +48,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -49,7 +48,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
if (!IsVecEqual(input_shape_, input_tensor->shape())) { if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1}; output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
uint32_t idx = 0; uint32_t idx = 0;
...@@ -83,7 +82,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -83,7 +82,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias, const Tensor *bias,
Tensor *output_tensor, Tensor *output_tensor,
StatsFuture *future) { StatsFuture *future) {
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name = std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
...@@ -125,7 +123,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -125,7 +123,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
std::vector<index_t> output_shape = {batch_, height_, width_, std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)}; input_tensor->dim(1)};
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
output_tensor->ResizeImage(output_shape, image_shape); output_tensor->ResizeImage(output_shape, image_shape);
const uint32_t round_h = (height_ + 1) / 2; const uint32_t round_h = (height_ + 1) / 2;
......
...@@ -2,10 +2,13 @@ ...@@ -2,10 +2,13 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#ifndef MACE_KERNELS_POOLING_H #ifndef MACE_KERNELS_POOLING_H_
#define MACE_KERNELS_POOLING_H #define MACE_KERNELS_POOLING_H_
#include <algorithm>
#include <limits> #include <limits>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -188,4 +191,4 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase { ...@@ -188,4 +191,4 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_POOLING_H #endif // MACE_KERNELS_POOLING_H_
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#ifndef MACE_KERNELS_RESHAPE_H_ #ifndef MACE_KERNELS_RESHAPE_H_
#define MACE_KERNELS_RESHAPE_H_ #define MACE_KERNELS_RESHAPE_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -20,7 +22,7 @@ struct ReshapeFunctor { ...@@ -20,7 +22,7 @@ struct ReshapeFunctor {
Tensor *output, Tensor *output,
StatsFuture *future) { StatsFuture *future) {
output->Resize(out_shape); output->Resize(out_shape);
// TODO copy on write to avoid this copy. // TODO(liuqi): copy on write to avoid this copy.
output->CopyBytes(input->raw_data(), input->size() * sizeof(T)); output->CopyBytes(input->raw_data(), input->size() * sizeof(T));
} }
}; };
......
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#ifndef MACE_KERNELS_RESIZE_BILINEAR_H_ #ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
#define MACE_KERNELS_RESIZE_BILINEAR_H_ #define MACE_KERNELS_RESIZE_BILINEAR_H_
#include <algorithm>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -11,7 +14,6 @@ ...@@ -11,7 +14,6 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
namespace {
struct CachedInterpolation { struct CachedInterpolation {
index_t lower; // Lower source index used in the interpolation index_t lower; // Lower source index used in the interpolation
index_t upper; // Upper source index used in the interpolation index_t upper; // Upper source index used in the interpolation
...@@ -101,7 +103,6 @@ void ResizeImage(const T *images, ...@@ -101,7 +103,6 @@ void ResizeImage(const T *images,
} }
} }
} }
}
struct ResizeBilinearFunctorBase { struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size, ResizeBilinearFunctorBase(const std::vector<index_t> &size,
......
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_SLICE_H_ #ifndef MACE_KERNELS_SLICE_H_
#define MACE_KERNELS_SLICE_H_ #define MACE_KERNELS_SLICE_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -16,7 +18,6 @@ namespace kernels { ...@@ -16,7 +18,6 @@ namespace kernels {
template<DeviceType D, typename T> template<DeviceType D, typename T>
struct SliceFunctor { struct SliceFunctor {
void operator()(const Tensor *input, void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future) { StatsFuture *future) {
...@@ -56,15 +57,13 @@ struct SliceFunctor { ...@@ -56,15 +57,13 @@ struct SliceFunctor {
template<typename T> template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> { struct SliceFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input, void operator()(const Tensor *input,
const std::vector<Tensor *> &output_list, const std::vector<Tensor *> &output_list,
StatsFuture *future); StatsFuture *future);
cl::Kernel kernel_; cl::Kernel kernel_;
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_SLICE_H_ #endif // MACE_KERNELS_SLICE_H_
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
#ifndef MACE_KERNELS_SOFTMAX_H_ #ifndef MACE_KERNELS_SOFTMAX_H_
#define MACE_KERNELS_SOFTMAX_H_ #define MACE_KERNELS_SOFTMAX_H_
#include <algorithm>
#include <functional>
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -38,7 +42,7 @@ struct SoftmaxFunctor { ...@@ -38,7 +42,7 @@ struct SoftmaxFunctor {
for (index_t c = 1; c < num_classes; ++c) { for (index_t c = 1; c < num_classes; ++c) {
max_value = std::max(max_value, logits_ptr[pos + c]); max_value = std::max(max_value, logits_ptr[pos + c]);
} }
// TODO: check overflow? // TODO(liuqi): check overflow?
T sum = 0; T sum = 0;
for (index_t c = 0; c < num_classes; ++c) { for (index_t c = 0; c < num_classes; ++c) {
exp_data[c] = ::exp((logits_ptr[pos + c] - max_value)); exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
...@@ -60,7 +64,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> { ...@@ -60,7 +64,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
std::vector<index_t> input_shape_; std::vector<index_t> input_shape_;
}; };
} // namepsace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_SOFTMAX_H_ #endif // MACE_KERNELS_SOFTMAX_H_
...@@ -2,8 +2,10 @@ ...@@ -2,8 +2,10 @@
// Copyright (c) 2017 XiaoMi All rights reserved. // Copyright (c) 2017 XiaoMi All rights reserved.
// //
#ifndef MACE_KERNELS_CONV_2D_H_ #ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
#define MACE_KERNELS_CONV_2D_H_ #define MACE_KERNELS_SPACE_TO_BATCH_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
...@@ -60,4 +62,4 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase { ...@@ -60,4 +62,4 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
} // namespace kernels } // namespace kernels
} // namespace mace } // namespace mace
#endif // MACE_KERNELS_CONV_2D_H_ #endif // MACE_KERNELS_SPACE_TO_BATCH_H_
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_ #ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include <vector>
#include "mace/core/future.h" #include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册