提交 2944863e 编写于 作者: Y yejianwu

merge with origin master, fix mace/ops code format

......@@ -62,6 +62,8 @@ std::unique_ptr<OperatorBase> OperatorRegistry::CreateOperator(
}
}
namespace ops {
extern void Register_Activation(OperatorRegistry *op_registry);
extern void Register_AddN(OperatorRegistry *op_registry);
extern void Register_BatchNorm(OperatorRegistry *op_registry);
......@@ -88,32 +90,34 @@ extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_Slice(OperatorRegistry *op_registry);
} // namespace ops
// Builds the registry and populates it with every built-in operator by
// passing `this` to each per-operator registration function.
//
// The Register_* functions are declared inside namespace ops, so every call
// is qualified with ops::; an unqualified call would not be found by name
// lookup from namespace mace. Each operator is registered exactly once.
OperatorRegistry::OperatorRegistry() {
  ops::Register_Activation(this);
  ops::Register_AddN(this);
  ops::Register_BatchNorm(this);
  ops::Register_BatchToSpaceND(this);
  ops::Register_BiasAdd(this);
  ops::Register_BufferToImage(this);
  ops::Register_ChannelShuffle(this);
  ops::Register_Concat(this);
  ops::Register_Conv2D(this);
  ops::Register_DepthwiseConv2d(this);
  ops::Register_FoldedBatchNorm(this);
  ops::Register_FusedConv2D(this);
  ops::Register_GlobalAvgPooling(this);
  ops::Register_ImageToBuffer(this);
  ops::Register_Pooling(this);
  ops::Register_ResizeBilinear(this);
  ops::Register_Softmax(this);
  ops::Register_SpaceToBatchND(this);
  ops::Register_MatMul(this);
  ops::Register_WinogradTransform(this);
  ops::Register_WinogradInverseTransform(this);
  ops::Register_Reshape(this);
  ops::Register_Eltwise(this);
  ops::Register_FullyConnected(this);
  ops::Register_Slice(this);
}
} // namespace mace
......@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_;
cl::Kernel kernel_;
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
const int groups_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
cl::Kernel kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
......@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
......@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
......@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
......@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -13,9 +13,10 @@ namespace mace {
namespace kernels {
template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
output->ResizeLike(input);
const index_t batch = input->dim(0);
......@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
......
......@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
......@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
}
const uint32_t gws[3] = {
......@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) {
case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
&input_shape_, output, future);
break;
default:
if (divisible_four) {
......
......@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, Tensor *output,
StatsFuture *future);
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
......@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
}
}
......
......@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(height));
kernel->setArg(idx++, static_cast<int>(width));
kernel->setArg(idx++, stride);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
......@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
......@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<short>(dilations[0]));
kernel->setArg(idx++, static_cast<short>(dilations[1]));
}
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input,
const Tensor *filter, const Tensor *bias,
Tensor *output, StatsFuture *future);
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) {
......@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
}
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
......@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
......@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
......@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
......@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
lws = {gws[0], gws[1], inter_local_blks};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
......@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel->setArg(idx++, static_cast<int>(output_blocks));
kernel->setArg(idx++, relux_max_limit);
gws[2] = static_cast<uint32_t>(batch * output_blocks);
*prev_input_shape = input->shape();
}
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
......@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
lws = {16, 64, 1};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
......@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, relux_max_limit);
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
};
lws = {16, 64, 1};
*prev_input_shape = input->shape();
}
std::stringstream ss;
......@@ -185,11 +198,11 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
output->ResizeImage(output_shape, output_image_shape);
if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, output,
FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
} else {
FCWXKernel<T>(&kernel_, input, weight, bias, output,
activation_, gws_, lws_, relux_max_limit_, future);
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
}
};
......
......@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
}
// Returns true when both vectors hold the same number of elements and every
// pair of corresponding elements compares equal with operator==.
//
// @param input0  first vector to compare
// @param input1  second vector to compare
// @return true if the vectors are element-wise equal, false otherwise
//         (including when their sizes differ).
template <typename T>
bool IsVecEqual(const std::vector<T> &input0,
                const std::vector<T> &input1) {
  // std::vector's operator== already performs the size check followed by the
  // element-wise comparison, so no explicit std::equal call is required.
  return input0 == input1;
}
namespace {
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
......
......@@ -36,17 +36,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
}
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
......
......@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
......@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides_[0]);
kernel_.setArg(idx++, kernels_[0]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
......
......@@ -25,6 +25,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const index_t out_width = out_width_;
if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
......@@ -38,16 +50,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
......@@ -56,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
......
......@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
......@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
......
......@@ -14,6 +14,21 @@ namespace kernels {
template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2);
......@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
const index_t round_w = (output_shape[2] + 1) / 2;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
if (kernel_.get() == nullptr) {
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
......@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
const uint32_t gws[2] = {
......@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
......@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
......@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const uint32_t gws[2] = {
......
......@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> space_shape_;
};
} // namespace kernels
......
......@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
struct WinogradInverseTransformFunctorBase {
......@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#include "mace/ops/activation.h"
namespace mace {
namespace ops {
void Register_Activation(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
......@@ -26,4 +27,5 @@ void Register_Activation(OperatorRegistry *op_registry) {
ActivationOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_ACTIVATION_H_
#define MACE_OPS_ACTIVATION_H_
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class ActivationOp : public Operator<D, T> {
......@@ -36,6 +39,7 @@ class ActivationOp : public Operator<D, T> {
kernels::ActivationFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ACTIVATION_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void ReluBenchmark(
int iters, int batch, int channels, int height, int width) {
......@@ -316,4 +320,6 @@ BM_SIGMOID(1, 3, 512, 512);
BM_SIGMOID(1, 32, 112, 112);
BM_SIGMOID(1, 64, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class ActivationOpTest : public OpsTestBase {};
......@@ -365,4 +367,6 @@ TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
TestSimpleSigmoid<DeviceType::OPENCL>();
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/addn.h"
namespace mace {
namespace ops {
void Register_AddN(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
......@@ -26,4 +27,5 @@ void Register_AddN(OperatorRegistry *op_registry) {
AddNOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/addn.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class AddNOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class AddNOp : public Operator<D, T> {
kernels::AddNFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ADDN_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming();
......@@ -75,4 +79,6 @@ BM_ADDN(4, 1, 128, 128, 3);
BM_ADDN(2, 1, 256, 256, 3);
BM_ADDN(2, 1, 512, 512, 3);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class AddnOpTest : public OpsTestBase {};
......@@ -62,15 +64,15 @@ TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
template <DeviceType D>
void RandomTest() {
testing::internal::LogToStderr();
srand(time(NULL));
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
// generate random input
index_t n = 1 + (rand() % 5);
index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
index_t c = 1 + (rand() % 32);
int input_num = 2 + rand() % 3;
index_t n = 1 + (rand_r(&seed) % 5);
index_t h = 1 + (rand_r(&seed) % 100);
index_t w = 1 + (rand_r(&seed) % 100);
index_t c = 1 + (rand_r(&seed) % 32);
int input_num = 2 + rand_r(&seed) % 3;
// Construct graph
OpsTestNet net;
auto op_def = OpDefBuilder("AddN", "AddNTest");
......@@ -117,4 +119,6 @@ void RandomTest() {
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/batch_norm.h"
namespace mace {
namespace ops {
void Register_BatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
......@@ -26,4 +27,5 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
BatchNormOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -10,6 +10,7 @@
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> {
......@@ -55,6 +56,7 @@ class BatchNormOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_BATCH_NORM_H_
......@@ -8,6 +8,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BatchNorm(
int iters, int batch, int channels, int height, int width) {
......@@ -101,4 +104,6 @@ BM_BATCH_NORM(1, 1024, 7, 7);
BM_BATCH_NORM(32, 1, 256, 256);
BM_BATCH_NORM(32, 3, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class BatchNormOpTest : public OpsTestBase {};
......@@ -75,11 +77,10 @@ TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -147,11 +148,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
}
TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -220,11 +220,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
}
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -292,11 +291,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
}
TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -363,4 +361,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/batch_to_space.h"
namespace mace {
namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
......@@ -19,4 +20,5 @@ void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
BatchToSpaceNDOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,15 +2,17 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_SPACE_TO_BATCH_H_
#define MACE_OPS_SPACE_TO_BATCH_H_
#ifndef MACE_OPS_BATCH_TO_SPACE_H_
#define MACE_OPS_BATCH_TO_SPACE_H_
#include <memory>
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_batch.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> {
......@@ -68,6 +70,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_SPACE_TO_BATCH_H_
#endif // MACE_OPS_BATCH_TO_SPACE_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BMBatchToSpace(
int iters, int batch, int channels, int height, int width, int arg) {
......@@ -53,4 +56,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/bias_add.h"
namespace mace {
namespace ops {
void Register_BiasAdd(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
......@@ -26,4 +27,5 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
BiasAddOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,13 +2,14 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_BIAS_ADD_H_
#define MACE_BIAS_ADD_H_
#ifndef MACE_OPS_BIAS_ADD_H_
#define MACE_OPS_BIAS_ADD_H_
#include "mace/core/operator.h"
#include "mace/kernels/bias_add.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BiasAddOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class BiasAddOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_BIAS_ADD_H_
#endif // MACE_OPS_BIAS_ADD_H_
......@@ -8,6 +8,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BiasAdd(int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
......@@ -77,4 +80,7 @@ BM_BIAS_ADD(1, 512, 14, 14);
BM_BIAS_ADD(1, 1024, 7, 7);
BM_BIAS_ADD(32, 1, 256, 256);
BM_BIAS_ADD(32, 3, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class BiasAddOpTest : public OpsTestBase {};
......@@ -60,13 +62,12 @@ TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
}
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64 + rand() % 50;
index_t width = 64 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64 + rand_r(&seed) % 50;
index_t width = 64 + rand_r(&seed) % 50;
// Construct graph
OpsTestNet net;
......@@ -110,13 +111,12 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
}
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103 + rand() % 100;
index_t width = 113 + rand() % 100;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103 + rand_r(&seed) % 100;
index_t width = 113 + rand_r(&seed) % 100;
// Construct graph
OpsTestNet net;
......@@ -158,4 +158,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/buffer_to_image.h"
namespace mace {
namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
......@@ -20,4 +21,5 @@ void Register_BufferToImage(OperatorRegistry *op_registry) {
BufferToImageOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/buffer_to_image.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> {
......@@ -36,5 +37,6 @@ class BufferToImageOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
......@@ -5,7 +5,9 @@
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
......@@ -188,3 +190,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
kernels::ARGUMENT, {2}, input_data);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/channel_shuffle.h"
namespace mace {
namespace ops {
void Register_ChannelShuffle(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
......@@ -24,4 +25,5 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
ChannelShuffleOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/channel_shuffle.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ChannelShuffleOp : public Operator<D, T> {
......@@ -42,6 +43,7 @@ class ChannelShuffleOp : public Operator<D, T> {
kernels::ChannelShuffleFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CHANNEL_SHUFFLE_H_
......@@ -7,10 +7,12 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
static void ChannelShuffle(
int iters, int batch, int channels, int height, int width, int group) {
int iters, int batch, int channels, int height, int width, int group) {
mace::testing::StopTiming();
OpsTestNet net;
......@@ -23,15 +25,15 @@ static void ChannelShuffle(
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
// NOTE(review): this branch of the ChannelShuffle benchmark builds a
// "Softmax" op (and passes no "group" argument) instead of "ChannelShuffle",
// so it would time a different operator than the other branch. This looks
// like a copy/paste leftover - confirm and change the op type if unintended.
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// Warm-up
......@@ -47,18 +49,19 @@ static void ChannelShuffle(
net.Sync();
}
#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
// Declares one ChannelShuffle benchmark function and registers it with
// BENCHMARK. The generated function name encodes N, C, H, W, G, TYPE and
// DEVICE, so every distinct argument combination yields a distinct symbol.
// The body reports iters * N * C * H * W to MaccProcessed, the same count
// multiplied by sizeof(TYPE) to BytesProcessed, and then calls
// ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G), i.e. N/C/H/W/G are
// forwarded as that helper's batch/channels/height/width/group arguments.
#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
// Instantiates BM_CHANNEL_SHUFFLE_MACRO for one (N, C, H, W, G) configuration
// on three type/device combinations: float on CPU, float on OPENCL and half on
// OPENCL.
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL);
......@@ -66,4 +69,6 @@ BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
} // namespace mace
} // namespace test
} // namespace ops
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
class ChannelShuffleOpTest : public OpsTestBase {};
......@@ -38,30 +41,34 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
// Add input data
net.AddInputFromArray<DeviceType::OPENCL, float>(
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
// Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = CreateTensor<float>(
{1, 1, 2, 16}, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
{1, 1, 2, 16},
{0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/concat.h"
namespace mace {
namespace ops {
void Register_Concat(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
......@@ -25,4 +26,5 @@ void Register_Concat(OperatorRegistry *op_registry) {
ConcatOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,9 +5,13 @@
#ifndef MACE_OPS_CONCAT_H_
#define MACE_OPS_CONCAT_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/concat.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ConcatOp : public Operator<D, T> {
......@@ -41,6 +45,7 @@ class ConcatOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONCAT_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void ConcatHelper(int iters, int concat_dim, int dim1) {
mace::testing::StopTiming();
......@@ -106,4 +109,6 @@ BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -2,11 +2,16 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/concat.h"
#include <string>
#include <functional>
#include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
class ConcatOpTest : public OpsTestBase {};
......@@ -87,10 +92,10 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) {
}
TEST_F(ConcatOpTest, CPURandom) {
srand(time(nullptr));
static unsigned int seed = time(NULL);
int dim = 5;
int num_inputs = 2 + rand() % 10;
int axis = rand() % dim;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -108,7 +113,7 @@ TEST_F(ConcatOpTest, CPURandom) {
std::vector<float *> input_ptrs(num_inputs, nullptr);
index_t concat_axis_size = 0;
for (int i = 0; i < num_inputs; ++i) {
input_shapes[i][axis] = 1 + rand() % dim;
input_shapes[i][axis] = 1 + rand_r(&seed) % dim;
concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
......@@ -217,3 +222,7 @@ TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>(
{{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/conv_2d.h"
namespace mace {
namespace ops {
void Register_Conv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
......@@ -26,4 +27,5 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
Conv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -12,6 +12,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -44,6 +45,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONV_2D_H_
......@@ -10,6 +10,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void Conv2d(int iters,
......@@ -80,30 +82,32 @@ static void Conv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
// Declares one Conv2d benchmark function and registers it with BENCHMARK.
// The generated function name encodes every macro argument (shape, kernel
// size, stride, dilation, padding, output channels, type and device).
//
// The body derives an output size oh x ow from H/W, KH/KW, STRIDE and
// DILATION, using a per-side pad of KH / 2 and KW / 2 when P is SAME and 0
// otherwise. It then reports
//   iters * N * OC * oh * ow * (KH * KW * C + 1)   to MaccProcessed
//   iters * N * C * H * W * sizeof(TYPE)           to BytesProcessed
// (the "+ 1" presumably accounts for the bias add - TODO confirm), and finally
// runs Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION,
// mace::Padding::P, OC).
//
// The long function name is split over two lines with a trailing backslash;
// the line splice is performed before the ## operators are evaluated, so the
// two pieces still paste into a single identifier.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
......@@ -139,4 +143,6 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -2,11 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/conv_2d.h"
#include <fstream>
#include <vector>
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
class Conv2dOpTest : public OpsTestBase {};
......@@ -347,14 +351,13 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
index_t input_channels = shape[2] + (rand_r(&seed) % 10);
index_t output_channels = shape[3] + (rand_r(&seed) % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -729,3 +732,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_
#define MACE_OPS_CONV_POOL_2D_BASE_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class ConvPool2dOpBase : public Operator<D, T> {
......@@ -29,6 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
std::vector<int> dilations_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONV_POOL_2D_BASE_H_
......@@ -5,6 +5,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs;
......@@ -56,4 +58,6 @@ TEST(CoreTest, INIT_MODE) {
1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/depthwise_conv2d.h"
namespace mace {
namespace ops {
void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
......@@ -26,4 +27,5 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -6,6 +6,7 @@
#define MACE_OPS_DEPTHWISE_CONV2D_H_
#include <memory>
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h"
......@@ -13,6 +14,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -48,6 +50,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_DEPTHWISE_CONV2D_H_
......@@ -10,6 +10,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void DepthwiseConv2d(int iters,
......@@ -75,31 +77,33 @@ static void DepthwiseConv2d(int iters,
}
}
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
// Declares one DepthwiseConv2d benchmark function and registers it with
// BENCHMARK. The generated function name encodes every macro argument (shape,
// kernel size, stride, padding, M, type and device).
//
// The body fixes dilation at 1 and derives an output size oh x ow from H/W,
// KH/KW and STRIDE, using a per-side pad of KH / 2 and KW / 2 when P is SAME
// and 0 otherwise. It then reports
//   iters * N * C * M * oh * ow * (KH * KW + 1)   to MaccProcessed
//   iters * N * C * H * W * sizeof(TYPE)          to BytesProcessed
// (the "+ 1" presumably accounts for the bias add - TODO confirm), and finally
// runs DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE,
// mace::Padding::P, M). M is presumably the channel multiplier - TODO confirm
// against the DepthwiseConv2d helper's signature.
//
// The long function name is split over two lines with a trailing backslash;
// the line splice is performed before the ## operators are evaluated, so the
// two pieces still paste into a single identifier.
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
......@@ -121,4 +125,6 @@ BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,9 +5,9 @@
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace {
namespace mace {
namespace ops {
namespace test {
class DepthwiseConv2dOpTest : public OpsTestBase {};
......@@ -207,11 +207,10 @@ void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 5;
index_t input_channels = 3 + rand() % 16;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 5;
index_t input_channels = 3 + rand_r(&seed) % 16;
index_t multiplier = 1;
// Construct graph
OpsTestNet net;
......@@ -316,4 +315,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(107, 113);
}
} // namespace
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/eltwise.h"
namespace mace {
namespace ops {
void Register_Eltwise(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
......@@ -26,4 +27,5 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
EltwiseOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,13 +2,14 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_RESHAPE_H_
#define MACE_OPS_RESHAPE_H_
#ifndef MACE_OPS_ELTWISE_H_
#define MACE_OPS_ELTWISE_H_
#include "mace/core/operator.h"
#include "mace/kernels/eltwise.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class EltwiseOp : public Operator<D, T> {
......@@ -44,6 +45,7 @@ class EltwiseOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESHAPE_H_
#endif // MACE_OPS_ELTWISE_H_
......@@ -2,13 +2,17 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/eltwise.h"
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/eltwise.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void EltwiseBenchmark(
int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
......@@ -81,4 +85,6 @@ BM_ELTWISE(0, 1, 240, 240, 256);
BM_ELTWISE(1, 1, 240, 240, 256);
BM_ELTWISE(2, 1, 240, 240, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -7,6 +7,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(EltwiseOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class EltwiseOpTest : public OpsTestBase {};
......@@ -170,4 +172,6 @@ TEST_F(EltwiseOpTest, OPENCLRandomHalf) {
{13, 32, 32, 64});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/folded_batch_norm.h"
namespace mace {
namespace ops {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
......@@ -26,4 +27,5 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
FoldedBatchNormOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_
#define MACE_OPS_FOLDED_BATCH_NORM_H_
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> {
......@@ -48,6 +51,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FOLDED_BATCH_NORM_H_
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(FoldedBatchNormOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class FoldedBatchNormOpTest : public OpsTestBase {};
......@@ -14,12 +16,12 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
const std::vector<float> &mean,
const std::vector<float> &var,
const float epsilon,
std::vector<float> &scale,
std::vector<float> &offset) {
std::vector<float> *scale,
std::vector<float> *offset) {
size_t size = gamma.size();
for (int i = 0; i < size; ++i) {
scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
offset[i] = offset[i] - mean[i] * scale[i];
(*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
(*offset)[i] = (*offset)[i] - mean[i] * (*scale)[i];
}
}
......@@ -32,7 +34,7 @@ void Simple() {
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
std::vector<float> scale(1);
std::vector<float> offset(1);
CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, scale, offset);
CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, &scale, &offset);
net.AddInputFromArray<D, float>("Scale", {1}, scale);
net.AddInputFromArray<D, float>("Offset", {1}, offset);
......@@ -172,11 +174,10 @@ width});
*/
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -227,11 +228,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -283,11 +283,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -337,11 +336,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -390,4 +388,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/fully_connected.h"
namespace mace {
namespace ops {
void Register_FullyConnected(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
......@@ -26,4 +27,5 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
FullyConnectedOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/fully_connected.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> {
......@@ -48,6 +49,7 @@ class FullyConnectedOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FULLY_CONNECTED_H_
......@@ -3,11 +3,14 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void FCBenchmark(
......@@ -84,4 +87,7 @@ BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000);
BM_FC(1, 2, 2, 512, 2);
BM_FC(1, 7, 7, 512, 4096);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,10 +3,13 @@
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(FullyConnectedOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class FullyConnectedOpTest : public OpsTestBase {};
......@@ -263,4 +266,6 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
TestWXFormat<half>(1, 16, 32, 32, 32);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/fused_conv_2d.h"
namespace mace {
namespace ops {
void Register_FusedConv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
......@@ -26,4 +27,5 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
FusedConv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -13,6 +13,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -47,6 +48,7 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
......@@ -8,6 +8,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(FusedConv2dOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class FusedConv2dOpTest : public OpsTestBase {};
......@@ -276,9 +278,8 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
unsigned int seed = time(NULL);
// generate random input
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
......@@ -352,9 +353,8 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
unsigned int seed = time(NULL);
// generate random input
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
......@@ -679,4 +679,6 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
{2, 2});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/global_avg_pooling.h"
namespace mace {
namespace ops {
void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("GlobalAvgPooling")
......@@ -14,4 +15,5 @@ void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
GlobalAvgPoolingOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/global_avg_pooling.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class GlobalAvgPoolingOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_GLOBAL_AVG_POOLING_H_
......@@ -8,7 +8,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace kernels {
namespace ops {
namespace test {
template <DeviceType D>
static void GlobalAvgPooling(
......@@ -54,5 +55,6 @@ BM_GLOBAL_AVG_POOLING(1, 3, 7, 7);
BM_GLOBAL_AVG_POOLING(1, 3, 64, 64);
BM_GLOBAL_AVG_POOLING(1, 3, 256, 256);
} // namespace kernels
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(GlobalAvgPoolingOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class GlobalAvgPoolingOpTest : public OpsTestBase {};
......@@ -32,4 +34,6 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/image_to_buffer.h"
namespace mace {
namespace ops {
void Register_ImageToBuffer(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
......@@ -20,4 +21,5 @@ void Register_ImageToBuffer(OperatorRegistry *op_registry) {
ImageToBufferOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/buffer_to_image.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> {
......@@ -35,5 +36,7 @@ class ImageToBufferOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
......@@ -5,6 +5,7 @@
#include "mace/ops/matmul.h"
namespace mace {
namespace ops {
void Register_MatMul(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
......@@ -26,4 +27,5 @@ void Register_MatMul(OperatorRegistry *op_registry) {
MatMulOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/matmul.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> {
......@@ -35,6 +36,7 @@ class MatMulOp : public Operator<D, T> {
kernels::MatMulFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_MATMUL_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void MatMulBenchmark(
int iters, int batch, int height, int channels, int out_width) {
......@@ -72,4 +76,7 @@ BM_MATMUL(16, 32, 128, 3969);
BM_MATMUL(16, 128, 128, 49);
BM_MATMUL(16, 128, 128, 961);
BM_MATMUL(16, 128, 128, 3969);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,10 +3,13 @@
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture for the TEST_F(MatMulOpTest, ...) cases; it derives from
// OpsTestBase and declares no members of its own.
class MatMulOpTest : public OpsTestBase {};
......@@ -170,4 +173,7 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67);
}
} // namespace test
} // namespace ops
} // namespace mace
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册