提交 2944863e 编写于 作者: Y yejianwu

merge with origin master, fix mace/ops code format

......@@ -62,6 +62,8 @@ std::unique_ptr<OperatorBase> OperatorRegistry::CreateOperator(
}
}
namespace ops {
extern void Register_Activation(OperatorRegistry *op_registry);
extern void Register_AddN(OperatorRegistry *op_registry);
extern void Register_BatchNorm(OperatorRegistry *op_registry);
......@@ -88,32 +90,34 @@ extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_Slice(OperatorRegistry *op_registry);
} // namespace ops
// Constructs the registry and registers every built-in operator with it.
//
// The Register_* functions are declared inside namespace `ops`, so each call
// has to be qualified with `ops::`; an unqualified call is not found from the
// enclosing namespace. Each operator is registered exactly once, in the order
// listed below.
OperatorRegistry::OperatorRegistry() {
  ops::Register_Activation(this);
  ops::Register_AddN(this);
  ops::Register_BatchNorm(this);
  ops::Register_BatchToSpaceND(this);
  ops::Register_BiasAdd(this);
  ops::Register_BufferToImage(this);
  ops::Register_ChannelShuffle(this);
  ops::Register_Concat(this);
  ops::Register_Conv2D(this);
  ops::Register_DepthwiseConv2d(this);
  ops::Register_FoldedBatchNorm(this);
  ops::Register_FusedConv2D(this);
  ops::Register_GlobalAvgPooling(this);
  ops::Register_ImageToBuffer(this);
  ops::Register_Pooling(this);
  ops::Register_ResizeBilinear(this);
  ops::Register_Softmax(this);
  ops::Register_SpaceToBatchND(this);
  ops::Register_MatMul(this);
  ops::Register_WinogradTransform(this);
  ops::Register_WinogradInverseTransform(this);
  ops::Register_Reshape(this);
  ops::Register_Eltwise(this);
  ops::Register_FullyConnected(this);
  ops::Register_Slice(this);
}
} // namespace mace
......@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T relux_max_limit_;
cl::Kernel kernel_;
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl::Kernel kernel_;
const int groups_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
cl::Kernel kernel_;
std::vector<uint32_t> gws_;
std::vector<uint32_t> lws_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG(FATAL) << "Unknown activation type: " << activation_;
}
kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
int idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
if (activation_ == PRELU) {
......@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
MACE_CHECK(channels == input_tensors[i]->dim(3));
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (kernel_.get() == nullptr) {
if (input_tensors.size() > 4) {
MACE_NOT_IMPLEMENTED;
......@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
}
std::vector<index_t> output_shape = input_tensors[0]->shape();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t width_pixels = channel_blocks * width;
const index_t batch_height_pixels = batch * height;
if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output_tensor->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
for (auto input : input_tensors) {
kernel_.setArg(idx++, *(input->opencl_image()));
}
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
input_shape_ = input_tensors[0]->shape();
}
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
......@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(scale->opencl_image()));
......@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(bias->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -13,9 +13,10 @@ namespace mace {
namespace kernels {
template <typename T>
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
Tensor *output,
StatsFuture *future) {
output->ResizeLike(input);
const index_t batch = input->dim(0);
......@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, groups_);
kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
static_cast<uint32_t>(width),
......
......@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
}
*kernel = runtime->BuildKernel("concat", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input0->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++,
*(static_cast<const cl::Image2D *>(input0->opencl_image())));
......@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
kernel->setArg(idx++,
*(static_cast<cl::Image2D *>(output->opencl_image())));
*prev_input_shape = input0->shape();
}
const uint32_t gws[3] = {
......@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch (inputs_count) {
case 2:
Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
output, future);
&input_shape_, output, future);
break;
default:
if (divisible_four) {
......
......@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future);
......@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
const Tensor *bias, const int stride, const int *padding,
const int *dilations, const ActivationType activation,
const float relux_max_limit, const DataType dt, Tensor *output,
StatsFuture *future);
const float relux_max_limit, const DataType dt,
std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5] = {
Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
......@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto conv2d_func = selector[kernel_h - 1];
conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
} else {
Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
}
}
......
......@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(height));
kernel->setArg(idx++, static_cast<int>(width));
kernel->setArg(idx++, stride);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto runtime = OpenCLRuntime::Global();
*kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(filter->opencl_image()));
......@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel->setArg(idx++, padding[1] / 2);
kernel->setArg(idx++, dilations[0]);
kernel->setArg(idx++, dilations[1]);
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const ActivationType activation,
const float relux_max_limit,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
StatsFuture *future) {
const index_t batch = output->dim(0);
......@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
if (kernel->get() == nullptr) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
......@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t input_batch = input->dim(0);
const index_t input_height = input->dim(1);
const index_t input_width = input->dim(2);
const index_t filter_height = filter->dim(0);
const index_t filter_width = filter->dim(1);
MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
MACE_CHECK(multiplier * input_channels == channels);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
input_channels);
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
......@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<short>(dilations[0]));
kernel->setArg(idx++, static_cast<short>(dilations[1]));
}
*prev_input_shape = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input,
const Tensor *filter, const Tensor *bias,
Tensor *output, StatsFuture *future);
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
if (strides_[0] != strides_[1]) {
......@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
dilations_, activation_, relux_max_limit_,
DataTypeToEnum<T>::value, output, future);
DataTypeToEnum<T>::value, &input_shape_, output, future);
}
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
......
......@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input0->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(input0->opencl_image()));
kernel_.setArg(idx++, *(input1->opencl_image()));
......@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_.setArg(idx++, coeff_[1]);
}
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input0->shape();
}
const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
......
......@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
......@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
lws = {gws[0], gws[1], inter_local_blks};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
const index_t batch = output->dim(0);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
......@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel->setArg(idx++, static_cast<int>(output_blocks));
kernel->setArg(idx++, relux_max_limit);
gws[2] = static_cast<uint32_t>(batch * output_blocks);
*prev_input_shape = input->shape();
}
cl::Event event;
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
......@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
const Tensor *input,
const Tensor *weight,
const Tensor *bias,
std::vector<index_t> *prev_input_shape,
Tensor *output,
const ActivationType activation,
std::vector<uint32_t> &gws,
......@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
*kernel =
runtime->BuildKernel("fully_connected", kernel_name, built_options);
lws = {16, 64, 1};
}
if (!IsVecEqual(*prev_input_shape, input->shape())) {
uint32_t idx = 0;
kernel->setArg(idx++, *(input->opencl_image()));
kernel->setArg(idx++, *(weight->opencl_image()));
......@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel->setArg(idx++, relux_max_limit);
const index_t batch = output->dim(0);
const index_t output_size = output->dim(3);
const index_t output_blocks = RoundUpDiv4(output_size);
const index_t output_blocks = RoundUpDiv4(output->dim(3));
gws = {
static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
};
lws = {16, 64, 1};
*prev_input_shape = input->shape();
}
std::stringstream ss;
......@@ -185,11 +198,11 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
output->ResizeImage(output_shape, output_image_shape);
if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
FCWTXKernel<T>(&kernel_, input, weight, bias, output,
FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
} else {
FCWXKernel<T>(&kernel_, input, weight, bias, output,
activation_, gws_, lws_, relux_max_limit_, future);
FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
activation_, gws_, lws_, relux_max_limit_, future);
}
};
......
......@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
}
// Reports whether two vectors are equal: they must hold the same number of
// elements, and the elements at each index must compare equal with ==.
template <typename T>
bool IsVecEqual(const std::vector<T> &input0,
                const std::vector<T> &input1) {
  // std::vector's operator== compares the sizes first and then the elements
  // pairwise, which is exactly the check required here.
  return input0 == input1;
}
namespace {
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
......
......@@ -36,17 +36,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
}
uint32_t idx = 0;
kernel_.setArg(idx++, *(A->opencl_image()));
kernel_.setArg(idx++, *(B->opencl_image()));
kernel_.setArg(idx++, *(C->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(height));
kernel_.setArg(idx++, static_cast<int>(width));
kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
kernel_.setArg(idx++, static_cast<int>(height_blocks));
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const uint32_t gws[2] = {
static_cast<uint32_t>(width_blocks),
......
......@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture *future) {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
<< "Pooling opencl kernel not support dilation yet";
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
if (kernel_.get() == nullptr) {
const DataType dt = DataTypeToEnum<T>::value;
......@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
input->dim(3)};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
dilations_, strides_, RoundType::CEIL, output_shape.data());
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
kernel_.setArg(idx++, paddings[0] / 2);
kernel_.setArg(idx++, paddings[1] / 2);
kernel_.setArg(idx++, strides_[0]);
kernel_.setArg(idx++, kernels_[0]);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
}
index_t batch = output->dim(0);
index_t out_height = output->dim(1);
index_t out_width = output->dim(2);
index_t channels = output->dim(3);
index_t channel_blocks = (channels + 3) / 4;
const uint32_t gws[3] = {
static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(batch * out_height),
......
......@@ -25,6 +25,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const index_t out_width = out_width_;
if (kernel_.get() == nullptr) {
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, input->shape())) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
......@@ -38,16 +50,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
float width_scale =
CalculateResizeScale(in_width, out_width, align_corners_);
auto runtime = OpenCLRuntime::Global();
std::set<std::string> built_options;
std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ =
runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input->opencl_image()));
kernel_.setArg(idx++, *(output->opencl_image()));
......@@ -56,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(in_height));
kernel_.setArg(idx++, static_cast<int32_t>(in_width));
kernel_.setArg(idx++, static_cast<int32_t>(out_height));
input_shape_ = input->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
......
......@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
}
if (!IsVecEqual(input_shape_, logits->shape())) {
uint32_t idx = 0;
kernel_.setArg(idx++, *(logits->opencl_image()));
kernel_.setArg(idx++, static_cast<int>(channels));
kernel_.setArg(idx++, remain_channels);
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = logits->shape();
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
......
......@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_ =
runtime->BuildKernel("space_to_batch", kernel_name, built_options);
}
if (!IsVecEqual(space_shape_, space_tensor->shape())) {
uint32_t idx = 0;
if (b2s_) {
kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
......@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
space_shape_ = space_tensor->shape();
}
const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
......
......@@ -14,6 +14,21 @@ namespace kernels {
template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
std::vector<int> paddings(2);
......@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
paddings_.data(), dilations_.data(), strides_.data(),
RoundType::FLOOR, output_shape.data());
}
const index_t round_h = (output_shape[1] + 1) / 2;
const index_t round_w = (output_shape[2] + 1) / 2;
const index_t out_width = input_tensor->dim(0) * round_h * round_w;
if (kernel_.get() == nullptr) {
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
output_shape = {16, input_tensor->dim(3), out_width, 1};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
std::string obfuscated_kernel_name =
MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
std::set<std::string> built_options;
built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
built_options.emplace("-DDATA_TYPE=" +
DtToUpstreamCLDt(DataTypeToEnum<T>::value));
built_options.emplace("-DCMD_DATA_TYPE=" +
DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
uint32_t idx = 0;
kernel_.setArg(idx++, *(input_tensor->opencl_image()));
kernel_.setArg(idx++, *(output_tensor->opencl_image()));
......@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
input_shape_ = input_tensor->shape();
}
const uint32_t gws[2] = {
......@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *bias,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
if (kernel_.get() == nullptr) {
std::string obfuscated_kernel_name =
......@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto runtime = OpenCLRuntime::Global();
kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
built_options);
}
if (!IsVecEqual(input_shape_, input_tensor->shape())) {
std::vector<index_t> output_shape = {batch_, height_, width_,
input_tensor->dim(1)};
std::vector<size_t> image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
output_tensor->ResizeImage(output_shape, image_shape);
const uint32_t round_h = (height_ + 1) / 2;
const uint32_t round_w = (width_ + 1) / 2;
......@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
kernel_.setArg(idx++, relux_max_limit_);
input_shape_ = input_tensor->shape();
}
const uint32_t gws[2] = {
......
......@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> space_shape_;
};
} // namespace kernels
......
......@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
struct WinogradInverseTransformFunctorBase {
......@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture *future);
cl::Kernel kernel_;
std::vector<index_t> input_shape_;
};
} // namespace kernels
......
......@@ -5,6 +5,7 @@
#include "mace/ops/activation.h"
namespace mace {
namespace ops {
void Register_Activation(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
......@@ -26,4 +27,5 @@ void Register_Activation(OperatorRegistry *op_registry) {
ActivationOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_ACTIVATION_H_
#define MACE_OPS_ACTIVATION_H_
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/activation.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class ActivationOp : public Operator<D, T> {
......@@ -36,6 +39,7 @@ class ActivationOp : public Operator<D, T> {
kernels::ActivationFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ACTIVATION_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void ReluBenchmark(
int iters, int batch, int channels, int height, int width) {
......@@ -316,4 +320,6 @@ BM_SIGMOID(1, 3, 512, 512);
BM_SIGMOID(1, 32, 112, 112);
BM_SIGMOID(1, 64, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(ActivationOpTest, ...) cases. The body is
// empty, so everything the tests use comes from OpsTestBase.
class ActivationOpTest : public OpsTestBase {};
......@@ -365,4 +367,6 @@ TEST_F(ActivationOpTest, OPENCLSimpleSigmoid) {
TestSimpleSigmoid<DeviceType::OPENCL>();
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/addn.h"
namespace mace {
namespace ops {
void Register_AddN(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
......@@ -26,4 +27,5 @@ void Register_AddN(OperatorRegistry *op_registry) {
AddNOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/addn.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class AddNOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class AddNOp : public Operator<D, T> {
kernels::AddNFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ADDN_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
mace::testing::StopTiming();
......@@ -75,4 +79,6 @@ BM_ADDN(4, 1, 128, 128, 3);
BM_ADDN(2, 1, 256, 256, 3);
BM_ADDN(2, 1, 512, 512, 3);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(AddnOpTest, ...) cases. It declares no
// members of its own; it only derives from OpsTestBase.
class AddnOpTest : public OpsTestBase {};
......@@ -62,15 +64,15 @@ TEST_F(AddnOpTest, CPUSimpleAdd3) { SimpleAdd3<DeviceType::CPU>(); }
template <DeviceType D>
void RandomTest() {
testing::internal::LogToStderr();
srand(time(NULL));
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
// generate random input
index_t n = 1 + (rand() % 5);
index_t h = 1 + (rand() % 100);
index_t w = 1 + (rand() % 100);
index_t c = 1 + (rand() % 32);
int input_num = 2 + rand() % 3;
index_t n = 1 + (rand_r(&seed) % 5);
index_t h = 1 + (rand_r(&seed) % 100);
index_t w = 1 + (rand_r(&seed) % 100);
index_t c = 1 + (rand_r(&seed) % 32);
int input_num = 2 + rand_r(&seed) % 3;
// Construct graph
OpsTestNet net;
auto op_def = OpDefBuilder("AddN", "AddNTest");
......@@ -117,4 +119,6 @@ void RandomTest() {
// Runs the shared RandomTest<> template with the device fixed to
// DeviceType::OPENCL; the test body contains nothing else.
TEST_F(AddnOpTest, OPENCLRandom) { RandomTest<DeviceType::OPENCL>(); }
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/batch_norm.h"
namespace mace {
namespace ops {
void Register_BatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
......@@ -26,4 +27,5 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
BatchNormOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -10,6 +10,7 @@
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BatchNormOp : public Operator<D, T> {
......@@ -55,6 +56,7 @@ class BatchNormOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_BATCH_NORM_H_
......@@ -8,6 +8,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BatchNorm(
int iters, int batch, int channels, int height, int width) {
......@@ -101,4 +104,6 @@ BM_BATCH_NORM(1, 1024, 7, 7);
BM_BATCH_NORM(32, 1, 256, 256);
BM_BATCH_NORM(32, 3, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(BatchNormOpTest, ...) cases. The body is
// empty, so everything the tests use comes from OpsTestBase.
class BatchNormOpTest : public OpsTestBase {};
......@@ -75,11 +77,10 @@ TEST_F(BatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
// Runs the shared Simple<> template with the device fixed to
// DeviceType::OPENCL; the test body contains nothing else.
TEST_F(BatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -147,11 +148,10 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
}
TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -220,11 +220,10 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
}
TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -292,11 +291,10 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
}
TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -363,4 +361,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/batch_to_space.h"
namespace mace {
namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
......@@ -19,4 +20,5 @@ void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
BatchToSpaceNDOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,15 +2,17 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_SPACE_TO_BATCH_H_
#define MACE_OPS_SPACE_TO_BATCH_H_
#ifndef MACE_OPS_BATCH_TO_SPACE_H_
#define MACE_OPS_BATCH_TO_SPACE_H_
#include <memory>
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/space_to_batch.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> {
......@@ -68,6 +70,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_SPACE_TO_BATCH_H_
#endif // MACE_OPS_BATCH_TO_SPACE_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BMBatchToSpace(
int iters, int batch, int channels, int height, int width, int arg) {
......@@ -53,4 +56,7 @@ static void BMBatchToSpace(
BM_BATCH_TO_SPACE(128, 8, 8, 128, 2);
BM_BATCH_TO_SPACE(4, 128, 128, 32, 2);
BM_BATCH_TO_SPACE(16, 64, 64, 32, 4);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/bias_add.h"
namespace mace {
namespace ops {
void Register_BiasAdd(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
......@@ -26,4 +27,5 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
BiasAddOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,13 +2,14 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_BIAS_ADD_H_
#define MACE_BIAS_ADD_H_
#ifndef MACE_OPS_BIAS_ADD_H_
#define MACE_OPS_BIAS_ADD_H_
#include "mace/core/operator.h"
#include "mace/kernels/bias_add.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class BiasAddOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class BiasAddOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_BIAS_ADD_H_
#endif // MACE_OPS_BIAS_ADD_H_
......@@ -8,6 +8,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BiasAdd(int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
......@@ -77,4 +80,7 @@ BM_BIAS_ADD(1, 512, 14, 14);
BM_BIAS_ADD(1, 1024, 7, 7);
BM_BIAS_ADD(32, 1, 256, 256);
BM_BIAS_ADD(32, 3, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(BiasAddOpTest, ...) cases. It declares no
// members of its own; it only derives from OpsTestBase.
class BiasAddOpTest : public OpsTestBase {};
......@@ -60,13 +62,12 @@ TEST_F(BiasAddOpTest, BiasAddSimpleOPENCL) {
}
TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64 + rand() % 50;
index_t width = 64 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64 + rand_r(&seed) % 50;
index_t width = 64 + rand_r(&seed) % 50;
// Construct graph
OpsTestNet net;
......@@ -110,13 +111,12 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
}
TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103 + rand() % 100;
index_t width = 113 + rand() % 100;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103 + rand_r(&seed) % 100;
index_t width = 113 + rand_r(&seed) % 100;
// Construct graph
OpsTestNet net;
......@@ -158,4 +158,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-2);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/buffer_to_image.h"
namespace mace {
namespace ops {
void Register_BufferToImage(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferToImage")
......@@ -20,4 +21,5 @@ void Register_BufferToImage(OperatorRegistry *op_registry) {
BufferToImageOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/buffer_to_image.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> {
......@@ -36,5 +37,6 @@ class BufferToImageOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
......@@ -5,7 +5,9 @@
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
......@@ -188,3 +190,7 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
kernels::ARGUMENT, {2}, input_data);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/channel_shuffle.h"
namespace mace {
namespace ops {
void Register_ChannelShuffle(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
......@@ -24,4 +25,5 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
ChannelShuffleOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/channel_shuffle.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ChannelShuffleOp : public Operator<D, T> {
......@@ -42,6 +43,7 @@ class ChannelShuffleOp : public Operator<D, T> {
kernels::ChannelShuffleFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CHANNEL_SHUFFLE_H_
......@@ -7,10 +7,12 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template<DeviceType D, typename T>
template <DeviceType D, typename T>
static void ChannelShuffle(
int iters, int batch, int channels, int height, int width, int group) {
int iters, int batch, int channels, int height, int width, int group) {
mace::testing::StopTiming();
OpsTestNet net;
......@@ -23,15 +25,15 @@ static void ChannelShuffle(
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("Output")
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
}
// Warm-up
......@@ -47,18 +49,19 @@ static void ChannelShuffle(
net.Sync();
}
#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
#define BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, TYPE, DEVICE) \
static void \
BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
ChannelShuffle<DEVICE, TYPE>(iters, N, C, H, W, G); \
} \
BENCHMARK(BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##TYPE##_##DEVICE)
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
#define BM_CHANNEL_SHUFFLE(N, C, H, W, G) \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, CPU); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, float, OPENCL); \
BM_CHANNEL_SHUFFLE_MACRO(N, C, H, W, G, half, OPENCL);
......@@ -66,4 +69,6 @@ BM_CHANNEL_SHUFFLE(1, 64, 64, 64, 8);
BM_CHANNEL_SHUFFLE(1, 64, 128, 128, 8);
BM_CHANNEL_SHUFFLE(1, 64, 256, 256, 8);
} // namespace mace
} // namespace test
} // namespace ops
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(ChannelShuffleOpTest, ...) cases. The body
// is empty, so everything the tests use comes from OpsTestBase.
class ChannelShuffleOpTest : public OpsTestBase {};
......@@ -38,30 +41,34 @@ TEST_F(ChannelShuffleOpTest, C16G4_OPENCL) {
// Add input data
net.AddInputFromArray<DeviceType::OPENCL, float>(
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
"Input", {1, 1, 2, 16},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
BufferToImage<DeviceType::OPENCL, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(DeviceType::OPENCL);
// Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
// Check
auto expected = CreateTensor<float>(
{1, 1, 2, 16}, {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
{1, 1, 2, 16},
{0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/concat.h"
namespace mace {
namespace ops {
void Register_Concat(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
......@@ -25,4 +26,5 @@ void Register_Concat(OperatorRegistry *op_registry) {
ConcatOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,9 +5,13 @@
#ifndef MACE_OPS_CONCAT_H_
#define MACE_OPS_CONCAT_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/concat.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ConcatOp : public Operator<D, T> {
......@@ -41,6 +45,7 @@ class ConcatOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONCAT_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void ConcatHelper(int iters, int concat_dim, int dim1) {
mace::testing::StopTiming();
......@@ -106,4 +109,6 @@ BM_CONCAT_OPENCL_MACRO(3, 32, 32, 64, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 128, half);
BM_CONCAT_OPENCL_MACRO(3, 32, 32, 256, half);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -2,11 +2,16 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/concat.h"
#include <string>
#include <functional>
#include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(ConcatOpTest, ...) cases. It declares no
// members of its own; it only derives from OpsTestBase.
class ConcatOpTest : public OpsTestBase {};
......@@ -87,10 +92,10 @@ TEST_F(ConcatOpTest, CPUSimpleVertical) {
}
TEST_F(ConcatOpTest, CPURandom) {
srand(time(nullptr));
static unsigned int seed = time(NULL);
int dim = 5;
int num_inputs = 2 + rand() % 10;
int axis = rand() % dim;
int num_inputs = 2 + rand_r(&seed) % 10;
int axis = rand_r(&seed) % dim;
// Construct graph
OpsTestNet net;
auto builder = OpDefBuilder("Concat", "ConcatTest");
......@@ -108,7 +113,7 @@ TEST_F(ConcatOpTest, CPURandom) {
std::vector<float *> input_ptrs(num_inputs, nullptr);
index_t concat_axis_size = 0;
for (int i = 0; i < num_inputs; ++i) {
input_shapes[i][axis] = 1 + rand() % dim;
input_shapes[i][axis] = 1 + rand_r(&seed) % dim;
concat_axis_size += input_shapes[i][axis];
GenerateRandomRealTypeData(input_shapes[i], &inputs[i]);
input_ptrs[i] = inputs[i].data();
......@@ -217,3 +222,7 @@ TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
OpenclRandomTest<float>(
{{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/conv_2d.h"
namespace mace {
namespace ops {
void Register_Conv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
......@@ -26,4 +27,5 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
Conv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -12,6 +12,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -44,6 +45,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONV_2D_H_
......@@ -10,6 +10,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void Conv2d(int iters,
......@@ -80,30 +82,32 @@ static void Conv2d(int iters,
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION##_##P##_##OC##_##TYPE##_##DEVICE)
// Defines one Conv2D benchmark function and hands it to BENCHMARK().
//
// The generated function is named
//   BM_CONV_2D_<N>_<C>_<H>_<W>_K<KH>x<KW>S<STRIDE>D<DILATION>_<P>_<OC>_<TYPE>_<DEVICE>
// The `##` chain that builds this name is split over two physical lines with
// a backslash-newline, so the second line must keep starting with `##`.
//
// Inside the generated function:
//   * tot  = iters * N * C * H * W, reported to BytesProcessed() scaled by
//            sizeof(TYPE).
//   * pad_h / pad_w are KH / 2 and KW / 2 when P is SAME, and 0 otherwise.
//   * oh / ow are derived from H / W, the padding, the kernel size, DILATION
//     and STRIDE (presumably the output height/width -- confirm against the
//     Conv2D kernel's own shape calculation).
//   * macc = iters * N * OC * oh * ow * (KH * KW * C + 1), reported to
//            MaccProcessed().
//   * Finally Conv2d<DEVICE, TYPE>() is called with mace::Padding::P.
//
// P is pasted both into the comparison `P == SAME` and into
// `mace::Padding::P`, so it has to be a bare enumerator name.
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, DILATION, P, OC, TYPE, \
DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (DILATION - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (DILATION - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * OC * oh * ow * (KH * KW * C + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, DILATION, \
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##D##DILATION\
##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
......@@ -139,4 +143,6 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -2,11 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/conv_2d.h"
#include <fstream>
#include <vector>
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(Conv2dOpTest, ...) cases. The body is
// empty, so everything the tests use comes from OpsTestBase.
class Conv2dOpTest : public OpsTestBase {};
......@@ -347,14 +351,13 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 3 + (rand() % 10);
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + (rand() % 10);
index_t output_channels = shape[3] + (rand() % 10);
index_t input_channels = shape[2] + (rand_r(&seed) % 10);
index_t output_channels = shape[3] + (rand_r(&seed) % 10);
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -729,3 +732,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
// Calls TestArbitraryPadConvNxN for DeviceType::OPENCL with float data,
// passing {107, 113, 5, 7} as the first argument and {4, 4} as the second.
// NOTE(review): judging by the test name these are presumably an unaligned
// shape and a padding of 4 per spatial dimension -- confirm against the
// helper's signature, which is not visible here.
TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_
#define MACE_OPS_CONV_POOL_2D_BASE_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class ConvPool2dOpBase : public Operator<D, T> {
......@@ -29,6 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
std::vector<int> dilations_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_CONV_POOL_2D_BASE_H_
......@@ -5,6 +5,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
TEST(CoreTest, INIT_MODE) {
std::vector<OperatorDef> op_defs;
......@@ -56,4 +58,6 @@ TEST(CoreTest, INIT_MODE) {
1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/depthwise_conv2d.h"
namespace mace {
namespace ops {
void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
......@@ -26,4 +27,5 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -6,6 +6,7 @@
#define MACE_OPS_DEPTHWISE_CONV2D_H_
#include <memory>
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/conv_2d.h"
......@@ -13,6 +14,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -48,6 +50,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_DEPTHWISE_CONV2D_H_
......@@ -10,6 +10,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void DepthwiseConv2d(int iters,
......@@ -75,31 +77,33 @@ static void DepthwiseConv2d(int iters,
}
}
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##M##_##TYPE##_##DEVICE)
// Defines one DepthwiseConv2d benchmark function and hands it to BENCHMARK().
//
// The generated function is named
//   BM_DEPTHWISE_CONV_2D_<N>_<C>_<H>_<W>_K<KH>x<KW>S<STRIDE>_<P>_<M>_<TYPE>_<DEVICE>
// The `##` chain that builds this name is split over two physical lines with
// a backslash-newline, so the second line must keep starting with `##`.
//
// Inside the generated function:
//   * dilation is fixed at 1 (this macro has no dilation parameter).
//   * tot  = iters * N * C * H * W, reported to BytesProcessed() scaled by
//            sizeof(TYPE).
//   * pad_h / pad_w are KH / 2 and KW / 2 when P is SAME, and 0 otherwise.
//   * oh / ow are derived from H / W, the padding, the kernel size and STRIDE
//     (presumably the output height/width -- confirm against the kernel's
//     own shape calculation).
//   * macc = iters * N * C * M * oh * ow * (KH * KW + 1), reported to
//            MaccProcessed().
//   * Finally DepthwiseConv2d<DEVICE, TYPE>() is called with mace::Padding::P
//     and M as its last argument.
//
// P is pasted both into the comparison `P == SAME` and into
// `mace::Padding::P`, so it has to be a bare enumerator name.
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, M, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t dilation = 1; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
int64_t pad_h = 0, pad_w = 0; \
if (P == SAME) { \
pad_h = KH / 2; \
pad_w = KW / 2; \
} \
int64_t oh = \
(H + 2 * pad_h - KH - (KH - 1) * (dilation - 1)) / STRIDE + 1; \
int64_t ow = \
(W + 2 * pad_w - KW - (KW - 1) * (dilation - 1)) / STRIDE + 1; \
const int64_t macc = \
static_cast<int64_t>(iters) * N * C * M * oh * ow * (KH * KW + 1); \
mace::testing::MaccProcessed(macc); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
DepthwiseConv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, M); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_\
##P##_##M##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
......@@ -121,4 +125,6 @@ BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1);
BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,9 +5,9 @@
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
namespace {
namespace mace {
namespace ops {
namespace test {
// Fixture type named by the TEST_F(DepthwiseConv2dOpTest, ...) cases. It
// declares no members of its own; it only derives from OpsTestBase.
class DepthwiseConv2dOpTest : public OpsTestBase {};
......@@ -207,11 +207,10 @@ void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 5;
index_t input_channels = 3 + rand() % 16;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 5;
index_t input_channels = 3 + rand_r(&seed) % 16;
index_t multiplier = 1;
// Construct graph
OpsTestNet net;
......@@ -316,4 +315,6 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(107, 113);
}
} // namespace
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/eltwise.h"
namespace mace {
namespace ops {
void Register_Eltwise(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
......@@ -26,4 +27,5 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
EltwiseOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -2,13 +2,14 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_RESHAPE_H_
#define MACE_OPS_RESHAPE_H_
#ifndef MACE_OPS_ELTWISE_H_
#define MACE_OPS_ELTWISE_H_
#include "mace/core/operator.h"
#include "mace/kernels/eltwise.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class EltwiseOp : public Operator<D, T> {
......@@ -44,6 +45,7 @@ class EltwiseOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESHAPE_H_
#endif // MACE_OPS_ELTWISE_H_
......@@ -2,13 +2,17 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/eltwise.h"
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/eltwise.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void EltwiseBenchmark(
int iters, kernels::EltwiseType type, int n, int h, int w, int c) {
......@@ -81,4 +85,6 @@ BM_ELTWISE(0, 1, 240, 240, 256);
BM_ELTWISE(1, 1, 240, 240, 256);
BM_ELTWISE(2, 1, 240, 240, 256);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -7,6 +7,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class EltwiseOpTest : public OpsTestBase {};
......@@ -170,4 +172,6 @@ TEST_F(EltwiseOpTest, OPENCLRandomHalf) {
{13, 32, 32, 64});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/folded_batch_norm.h"
namespace mace {
namespace ops {
void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
......@@ -26,4 +27,5 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
FoldedBatchNormOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -5,10 +5,13 @@
#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_
#define MACE_OPS_FOLDED_BATCH_NORM_H_
#include <string>
#include "mace/core/operator.h"
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class FoldedBatchNormOp : public Operator<D, T> {
......@@ -48,6 +51,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FOLDED_BATCH_NORM_H_
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class FoldedBatchNormOpTest : public OpsTestBase {};
......@@ -14,12 +16,12 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
const std::vector<float> &mean,
const std::vector<float> &var,
const float epsilon,
std::vector<float> &scale,
std::vector<float> &offset) {
std::vector<float> *scale,
std::vector<float> *offset) {
size_t size = gamma.size();
for (int i = 0; i < size; ++i) {
scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
offset[i] = offset[i] - mean[i] * scale[i];
(*scale)[i] = gamma[i] / std::sqrt(var[i] + epsilon);
(*offset)[i] = (*offset)[i] - mean[i] * (*scale)[i];
}
}
......@@ -32,7 +34,7 @@ void Simple() {
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
std::vector<float> scale(1);
std::vector<float> offset(1);
CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, scale, offset);
CalculateScaleOffset({4.0f}, {2.0}, {10}, {11.67f}, 1e-3, &scale, &offset);
net.AddInputFromArray<D, float>("Scale", {1}, scale);
net.AddInputFromArray<D, float>("Offset", {1}, offset);
......@@ -172,11 +174,10 @@ width});
*/
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -227,11 +228,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
......@@ -283,11 +283,10 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -337,11 +336,10 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
static unsigned int seed = time(NULL);
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 103;
index_t width = 113;
......@@ -390,4 +388,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::IN_OUT_CHANNEL);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.5);
}
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/fully_connected.h"
namespace mace {
namespace ops {
void Register_FullyConnected(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
......@@ -26,4 +27,5 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
FullyConnectedOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/fully_connected.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> {
......@@ -48,6 +49,7 @@ class FullyConnectedOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FULLY_CONNECTED_H_
......@@ -3,11 +3,14 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void FCBenchmark(
......@@ -84,4 +87,7 @@ BM_FC(1, 16, 16, 32, 32);
BM_FC(1, 8, 8, 32, 1000);
BM_FC(1, 2, 2, 512, 2);
BM_FC(1, 7, 7, 512, 4096);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,10 +3,13 @@
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class FullyConnectedOpTest : public OpsTestBase {};
......@@ -263,4 +266,6 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
TestWXFormat<half>(1, 16, 32, 32, 32);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/fused_conv_2d.h"
namespace mace {
namespace ops {
void Register_FusedConv2D(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
......@@ -26,4 +27,5 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
FusedConv2dOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -13,6 +13,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
......@@ -47,6 +48,7 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_FUSED_CONV_2D_H_
......@@ -8,6 +8,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class FusedConv2dOpTest : public OpsTestBase {};
......@@ -276,9 +278,8 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
unsigned int seed = time(NULL);
// generate random input
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
......@@ -352,9 +353,8 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
unsigned int seed = time(NULL);
// generate random input
static unsigned int seed = time(NULL);
index_t batch = 3 + (rand_r(&seed) % 10);
index_t height = shape[0];
index_t width = shape[1];
......@@ -679,4 +679,6 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
{2, 2});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/global_avg_pooling.h"
namespace mace {
namespace ops {
void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("GlobalAvgPooling")
......@@ -14,4 +15,5 @@ void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
GlobalAvgPoolingOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/global_avg_pooling.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class GlobalAvgPoolingOp : public Operator<D, T> {
......@@ -40,6 +41,7 @@ class GlobalAvgPoolingOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_GLOBAL_AVG_POOLING_H_
......@@ -8,7 +8,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace kernels {
namespace ops {
namespace test {
template <DeviceType D>
static void GlobalAvgPooling(
......@@ -54,5 +55,6 @@ BM_GLOBAL_AVG_POOLING(1, 3, 7, 7);
BM_GLOBAL_AVG_POOLING(1, 3, 64, 64);
BM_GLOBAL_AVG_POOLING(1, 3, 256, 256);
} // namespace kernels
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class GlobalAvgPoolingOpTest : public OpsTestBase {};
......@@ -32,4 +34,6 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/image_to_buffer.h"
namespace mace {
namespace ops {
void Register_ImageToBuffer(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ImageToBuffer")
......@@ -20,4 +21,5 @@ void Register_ImageToBuffer(OperatorRegistry *op_registry) {
ImageToBufferOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/buffer_to_image.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ImageToBufferOp : public Operator<D, T> {
......@@ -35,5 +36,7 @@ class ImageToBufferOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
......@@ -5,6 +5,7 @@
#include "mace/ops/matmul.h"
namespace mace {
namespace ops {
void Register_MatMul(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
......@@ -26,4 +27,5 @@ void Register_MatMul(OperatorRegistry *op_registry) {
MatMulOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/matmul.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class MatMulOp : public Operator<D, T> {
......@@ -35,6 +36,7 @@ class MatMulOp : public Operator<D, T> {
kernels::MatMulFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_MATMUL_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void MatMulBenchmark(
int iters, int batch, int height, int channels, int out_width) {
......@@ -72,4 +76,7 @@ BM_MATMUL(16, 32, 128, 3969);
BM_MATMUL(16, 128, 128, 49);
BM_MATMUL(16, 128, 128, 961);
BM_MATMUL(16, 128, 128, 3969);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,10 +3,13 @@
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class MatMulOpTest : public OpsTestBase {};
......@@ -170,4 +173,7 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
Complex<half>(16, 32, 64, 64);
Complex<half>(31, 31, 61, 67);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -22,6 +22,8 @@
#include "mace/utils/utils.h"
namespace mace {
namespace ops {
namespace test {
class OpDefBuilder {
public:
......@@ -423,6 +425,8 @@ void ImageToBuffer(OpsTestNet *net,
net->Sync();
}
} // namespace test
} // namespace ops
} // namespace mace
#endif // MACE_OPS_OPS_TEST_UTIL_H_
......@@ -5,6 +5,7 @@
#include "mace/ops/pooling.h"
namespace mace {
namespace ops {
void Register_Pooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
......@@ -30,4 +31,5 @@ void Register_Pooling(OperatorRegistry *op_registry) {
PoolingOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -12,6 +12,7 @@
#include "mace/ops/conv_pool_2d_base.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class PoolingOp : public ConvPool2dOpBase<D, T> {
......@@ -46,6 +47,7 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_POOLING_H_
......@@ -9,7 +9,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace kernels {
namespace ops {
namespace test {
template <DeviceType D>
static void Pooling(int iters,
......@@ -73,5 +74,6 @@ BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 513, 513, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 1025, 1025, 2, 2, SAME, MAX);
} // namespace kernels
} // namespace test
} // namespace ops
} // namespace mace
......@@ -10,6 +10,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class PoolingOpTest : public OpsTestBase {};
......@@ -394,4 +396,6 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
Padding::SAME);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/reshape.h"
namespace mace {
namespace ops {
void Register_Reshape(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape")
......@@ -14,4 +15,5 @@ void Register_Reshape(OperatorRegistry *op_registry) {
ReshapeOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/reshape.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class ReshapeOp : public Operator<D, T> {
......@@ -63,6 +64,7 @@ class ReshapeOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESHAPE_H_
......@@ -7,6 +7,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class ReshapeTest : public OpsTestBase {};
......@@ -54,4 +56,6 @@ TEST_F(ReshapeTest, Complex) {
TestReshape({1, 2, 3, 4}, {1, 3, 8}, {1, 3, 8});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/resize_bilinear.h"
namespace mace {
namespace ops {
void Register_ResizeBilinear(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
......@@ -26,4 +27,5 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
ResizeBilinearOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/resize_bilinear.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class ResizeBilinearOp : public Operator<D, T> {
......@@ -34,6 +35,7 @@ class ResizeBilinearOp : public Operator<D, T> {
kernels::ResizeBilinearFunctor<D, T> functor_;
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_RESIZE_BILINEAR_H_
......@@ -8,6 +8,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void ResizeBilinearBenchmark(int iters,
int batch,
......@@ -86,4 +89,6 @@ BM_RESIZE_BILINEAR(1, 128, 240, 240, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 4032, 3016, 480, 480);
BM_RESIZE_BILINEAR(1, 3, 480, 480, 4032, 3016);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -4,11 +4,13 @@
#include <vector>
#include "mace/ops/resize_bilinear.h"
#include "mace/core/operator.h"
#include "mace/ops/resize_bilinear.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class ResizeBilinearTest : public OpsTestBase {};
......@@ -63,9 +65,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
template <DeviceType D>
void TestRandomResizeBilinear() {
unsigned int seed = time(nullptr);
testing::internal::LogToStderr();
static unsigned int seed = time(NULL);
for (int round = 0; round < 10; ++round) {
int batch = 1 + rand_r(&seed) % 5;
int channels = 1 + rand_r(&seed) % 100;
......@@ -108,7 +109,7 @@ void TestRandomResizeBilinear() {
ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
// TODO(yejianwu) support NEON
// TODO(someone): support NEON
}
// Check
ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.001);
......@@ -125,4 +126,6 @@ TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::OPENCL>();
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/slice.h"
namespace mace {
namespace ops {
void Register_Slice(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
......@@ -25,4 +26,5 @@ void Register_Slice(OperatorRegistry *op_registry) {
SliceOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,7 +9,9 @@
#include "mace/core/operator.h"
#include "mace/kernels/slice.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class SliceOp : public Operator<D, T> {
......@@ -36,6 +38,7 @@ class SliceOp : public Operator<D, T> {
OP_INPUT_TAGS(INPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_SLICE_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template<DeviceType D, typename T>
static void BMSliceHelper(int iters,
const std::vector<index_t> &input_shape,
......@@ -79,5 +82,6 @@ BM_SLICE(1, 32, 32, 256, 2);
BM_SLICE(1, 128, 128, 32, 2);
BM_SLICE(1, 128, 128, 128, 2);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,17 +5,19 @@
#include <functional>
#include <vector>
#include "gmock/gmock.h"
#include "mace/ops/slice.h"
#include "mace/ops/ops_test_util.h"
#include "gmock/gmock.h"
namespace mace {
namespace ops {
namespace test {
class SliceOpTest : public OpsTestBase {};
template<DeviceType D, typename T>
void RandomTest(const int num_outputs) {
unsigned int seed = time(nullptr);
static unsigned int seed = time(NULL);
const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
const index_t input_channels = num_outputs * output_channels;
const index_t batch = 3 + (rand_r(&seed) % 10);
......@@ -108,4 +110,6 @@ TEST_F(SliceOpTest, OPENCLHalf) {
RandomTest<DeviceType::OPENCL, half>(11);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/softmax.h"
namespace mace {
namespace ops {
void Register_Softmax(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
......@@ -26,4 +27,5 @@ void Register_Softmax(OperatorRegistry *op_registry) {
SoftmaxOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -9,6 +9,7 @@
#include "mace/kernels/softmax.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class SoftmaxOp : public Operator<D, T> {
......@@ -34,6 +35,7 @@ class SoftmaxOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_SOFTMAX_H_
......@@ -3,11 +3,15 @@
//
#include <string>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void SoftmaxBenchmark(
int iters, int batch, int channels, int height, int width) {
......@@ -66,4 +70,7 @@ BM_SOFTMAX(1, 3, 512, 512);
BM_SOFTMAX(1, 4, 512, 512);
BM_SOFTMAX(1, 10, 256, 256);
BM_SOFTMAX(1, 1024, 7, 7);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -6,6 +6,8 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class SoftmaxOpTest : public OpsTestBase {};
......@@ -102,4 +104,6 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
Complex<DeviceType::OPENCL>({5, 211, 107, 1});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/space_to_batch.h"
namespace mace {
namespace ops {
void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
......@@ -19,4 +20,5 @@ void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
SpaceToBatchNDOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -12,6 +12,7 @@
#include "mace/kernels/space_to_batch.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class SpaceToBatchNDOp : public Operator<D, T> {
......@@ -72,6 +73,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_SPACE_TO_BATCH_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BMSpaceToBatch(
int iters, int batch, int height, int width, int channels, int shape) {
......@@ -55,4 +58,7 @@ static void BMSpaceToBatch(
BM_SPACE_TO_BATCH(128, 16, 16, 128, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 2);
BM_SPACE_TO_BATCH(1, 256, 256, 32, 4);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,10 +3,13 @@
//
#include <fstream>
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D>
void RunSpaceToBatch(const std::vector<index_t> &input_shape,
......@@ -217,4 +220,6 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
// space_tensor.get());
//}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -3,11 +3,14 @@
//
#include <fstream>
#include "mace/core/operator.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class WinogradConvlutionTest : public OpsTestBase {};
......@@ -40,7 +43,7 @@ void WinogradConvolution(const index_t batch,
const index_t in_channels,
const index_t out_channels,
const Padding padding) {
srand(time(NULL));
// srand(time(NULL));
// Construct graph
OpsTestNet net;
......@@ -157,7 +160,7 @@ void WinogradConvolutionWithPad(const index_t batch,
const index_t in_channels,
const index_t out_channels,
const int padding) {
srand(time(NULL));
// srand(time(NULL));
// Construct graph
OpsTestNet net;
......@@ -246,9 +249,6 @@ void WinogradConvolutionWithPad(const index_t batch,
}
}
// Runs WinogradConvolutionWithPad on OpenCL with float data for two
// configurations. In each call the first argument is the batch, the fourth
// and fifth are in_channels and out_channels, and the last (2) is the padding.
// NOTE(review): the second and third arguments are presumably height and
// width -- confirm against the helper's full signature.
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionPad2) {
WinogradConvolutionWithPad<DeviceType::OPENCL, float>(1, 64, 64, 40, 19, 2);
WinogradConvolutionWithPad<DeviceType::OPENCL, float>(1, 32, 32, 96, 109, 2);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -5,6 +5,7 @@
#include "mace/ops/winograd_inverse_transform.h"
namespace mace {
namespace ops {
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
......@@ -19,4 +20,5 @@ void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
WinogradInverseTransformOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -13,6 +13,7 @@
#include "mace/kernels/winograd_transform.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class WinogradInverseTransformOp : public Operator<D, T> {
......@@ -43,6 +44,7 @@ class WinogradInverseTransformOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_
......@@ -5,6 +5,7 @@
#include "mace/ops/winograd_transform.h"
namespace mace {
namespace ops {
void Register_WinogradTransform(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
......@@ -19,4 +20,5 @@ void Register_WinogradTransform(OperatorRegistry *op_registry) {
WinogradTransformOp<DeviceType::OPENCL, half>);
}
} // namespace ops
} // namespace mace
......@@ -11,6 +11,7 @@
#include "mace/kernels/winograd_transform.h"
namespace mace {
namespace ops {
template <DeviceType D, typename T>
class WinogradTransformOp : public Operator<D, T> {
......@@ -37,6 +38,7 @@ class WinogradTransformOp : public Operator<D, T> {
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_WINOGRAD_TRANSFORM_H_
......@@ -7,6 +7,9 @@
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
template <DeviceType D, typename T>
static void BMWinogradTransform(
int iters, int batch, int height, int width, int channels) {
......@@ -105,4 +108,6 @@ BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32);
BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32);
} // namespace test
} // namespace ops
} // namespace mace
......@@ -18,7 +18,13 @@ OPTION_ARGS=$3
echo $OPTION_ARGS
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
if [ -f "$MODEL_OUTPUT_DIR/benchmark_model" ]; then
rm -rf $MODEL_OUTPUT_DIR/benchmark_model
......
......@@ -17,7 +17,13 @@ RUN_SECONDS=$2
MERGED_LIB_FILE=$3
MODEL_INPUT_DIR=$4
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
if [ "$CPU_MODEL_TAG" != '' ]; then
CPU_MODEL_TAG_BUILD_FLAGS="--copt=-DMACE_CPU_MODEL_TAG=${CPU_MODEL_TAG}"
......
......@@ -13,7 +13,13 @@ CURRENT_DIR=`dirname $0`
source ${CURRENT_DIR}/env.sh
TARGET_SOC=$1
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
if [ x"$TARGET_ABI" != x"host" ]; then
adb -s $DEVICE_ID shell rm -rf $PHONE_DATA_DIR || exit 1
......
......@@ -40,6 +40,10 @@ echo_device_id_by_soc()
device_soc=`adb -s ${device} shell getprop | grep ro.board.platform | cut -d [ -f3 | cut -d ] -f1`
if [ x"$TARGET_SOC" = x"$device_soc" ]; then
echo "$device"
return 0
fi
done
echo "MACE ERROR: Not found device with soc ${TARGET_SOC}"
return 1
}
......@@ -16,7 +16,13 @@ TARGET_SOC=$1
CL_BIN_DIRS=$2
PULL_OR_NOT=$3
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
if [ "$PULL_OR_NOT" = 1 ]; then
CL_BIN_DIR=${CL_BIN_DIRS}
......
......@@ -22,7 +22,13 @@ OPTION_ARGS=$7
echo $OPTION_ARGS
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
if [ x"$TARGET_ABI" = x"host" ]; then
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
......
......@@ -16,7 +16,13 @@ TARGET_SOC=$1
MODEL_OUTPUT_DIR=$2
GENERATE_DATA_OR_NOT=$3
DEVICE_ID=`echo_device_id_by_soc $TARGET_SOC`
# Resolve the adb device id for TARGET_SOC.
# echo_device_id_by_soc prints the id and returns 0 on success; on failure it
# prints an error message and returns non-zero, so whatever was captured is
# shown to the user and the script aborts.
# Expansions are quoted so the SoC name and the captured message are not
# subject to word splitting or glob expansion.
if ! RESULT_VALUE=$(echo_device_id_by_soc "$TARGET_SOC"); then
  echo "$RESULT_VALUE"
  exit 1
fi
DEVICE_ID=$RESULT_VALUE
IFS=',' read -r -a INPUT_NAMES <<< "${INPUT_NODES}"
IFS=',' read -r -a OUTPUT_NAMES <<< "${OUTPUT_NODES}"
......
......@@ -2,22 +2,89 @@ import numpy as np
import math
import tensorflow as tf
A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
A = np.transpose(A_T)
B_T = np.array([
A_T = {}
A = {}
B_T = {}
B = {}
G = {}
G_T = {}
# f(2, 3)
A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
A[4] = np.transpose(A_T[4])
B_T[4] = np.array([
[1, 0, -1, 0],
[0, 1, 1, 0],
[0, -1, 1, 0],
[0, 1, 0, -1]
]).astype(np.float32)
B = np.transpose(B_T)
G = np.array([
B[4] = np.transpose(B_T[4])
G[4] = np.array([
[1, 0, 0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0, 0, 1],
]).astype(np.float32)
G_T = np.transpose(G)
G_T[4] = np.transpose(G[4])
# f(4, 3)
# Transform matrices for output tile size m = 4 and filter size r = 3.
# They are stored under key 6 because the convolution routine in this script
# looks tables up by alpha = m + r - 1.
# Shapes as written: A_T is 4x6, B_T is 6x6, G is 6x3; A, B and G_T are the
# transposes.
# A_T / A: presumably the output (inverse) transform -- their use is not
# visible next to these tables; confirm in the convolution routine.
A_T[6] = np.array([
[1, 1, 1, 1, 1, 0],
[0, 1, -1, 2, -2, 0],
[0, 1, 1, 4, 4, 0],
[0, 1, -1, 8, -8, 1],
]).astype(np.float32)
A[6] = np.transpose(A_T[6])
# B_T / B: applied to each alpha x alpha input tile d as B_T . d . B.
B_T[6] = np.array([
[4, 0, -5, 0, 1, 0],
[0, -4, -4, 1, 1, 0],
[0, 4, -4, -1, 1, 0],
[0, -2, -1, 2, 1, 0],
[0, 2, -1, -2, 1, 0],
[0, 4, 0, -5, 0, 1],
]).astype(np.float32)
B[6] = np.transpose(B_T[6])
# G / G_T: applied to each 3x3 filter slice g as G . g . G_T.
G[6] = np.array([
[1/4.0 , 0 , 0 ],
[-1/6.0, -1/6.0 , -1/6.0],
[-1/6.0, 1/6.0 , -1/6.0],
[1/24.0, 1/12.0 , 1/6.0 ],
[1/24.0, -1/12.0, 1/6.0 ],
[ 0 , 0 , 1 ],
]).astype(np.float32)
G_T[6] = np.transpose(G[6])
# f(6, 3)
# Transform matrices for output tile size m = 6 and filter size r = 3.
# They are stored under key 8 because the convolution routine in this script
# looks tables up by alpha = m + r - 1.
# Shapes as written: A_T is 6x8, B_T is 8x8, G is 8x3; A, B and G_T are the
# transposes.
# Fractions use a float operand (e.g. 1/2.) so they are not truncated by
# integer division.
# A_T / A: presumably the output (inverse) transform -- their use is not
# visible next to these tables; confirm in the convolution routine.
A_T[8] = np.array([
[1, 1, 1 , 1 , 1 , 1 , 1 , 0],
[0, 1, -1, 2 , -2 , 1/2. , -1/2. , 0],
[0, 1, 1 , 4 , 4 , 1/4. , 1/4. , 0],
[0, 1, -1, 8 , -8 , 1/8. , -1/8. , 0],
[0, 1, 1 , 16, 16 , 1/16., 1/16. , 0],
[0, 1, -1, 32, -32, 1/32., -1/32., 1],
]).astype(np.float32)
A[8] = np.transpose(A_T[8])
# B_T / B: applied to each alpha x alpha input tile d as B_T . d . B.
B_T[8] = np.array([
[1, 0 , -21/4., 0 , 21/4., 0 , -1, 0],
[0, 1 , 1 , -17/4., -17/4., 1 , 1 , 0],
[0, -1 , 1 , 17/4. , -17/4., -1 , 1 , 0],
[0, 1/2. , 1/4. , -5/2. , -5/4., 2 , 1 , 0],
[0, -1/2., 1/4. , 5/2. , -5/4., -2 , 1 , 0],
[0, 2 , 4 , -5/2. , -5 , 1/2. , 1 , 0],
[0, -2 , 4 , 5/2. , -5 , -1/2. , 1 , 0],
[0, -1 , 0 , 21/4. , 0 , -21/4., 0 , 1],
]).astype(np.float32)
B[8] = np.transpose(B_T[8])
# G / G_T: applied to each 3x3 filter slice g as G . g . G_T.
G[8] = np.array([
[ 1 , 0 , 0 ],
[-2/9. , -2/9. , -2/9.],
[-2/9. , 2/9. , -2/9.],
[1/90. , 1/45. , 2/45.],
[1/90. , -1/45. , 2/45.],
[32/45., 16/45. , 8/45.],
[32/45., -16/45., 8/45.],
[ 0 , 0 , 1 ],
]).astype(np.float32)
G_T[8] = np.transpose(G[8])
def output_shape(input_shape, filter_shape):
......@@ -29,55 +96,54 @@ def output_shape(input_shape, filter_shape):
return out_shape
def winog_conv(input, filter):
m = 2
r = 3
def winograd_conv(m, r, input, filter):
alpha = m + r - 1
print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha)
alpha_square = alpha * alpha
input_shape = input.shape
filter_shape = filter.shape
out_shape = output_shape(input_shape, filter_shape)
K = filter_shape[0]
C = input_shape[1]
U = np.zeros((K * 16, C))
U = np.zeros((K * alpha_square, C))
for k in range(K):
for c in range(C):
u = np.dot(np.dot(G, filter[k, c, :, :]), G_T)
for i in range(4):
for j in range(4) :
U[(i * 4 + j) * K + k, c] = u[i, j]
u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha])
for i in range(alpha):
for j in range(alpha) :
U[(i * alpha + j) * K + k, c] = u[i, j]
print 'filter out: ', U.shape
print U[0, 0]
U.astype(np.float32).tofile("filter_out")
rounded_h = int(math.ceil(out_shape[2] / 2.0))
rounded_w = int(math.ceil(out_shape[3] / 2.0))
rounded_h = int(math.ceil(out_shape[2] / (m * 1.0)))
rounded_w = int(math.ceil(out_shape[3] / (m * 1.0)))
P = input_shape[0] * rounded_h * rounded_w
V = np.zeros((C * 16, P))
V = np.zeros((C * alpha_square, P))
for p in range(P):
for c in range(C):
n = p / (rounded_w * rounded_h)
t = p % (rounded_h * rounded_w)
h_idx = t / rounded_w
w_idx = t % rounded_w
h_start = h_idx * 2
w_start = w_idx * 2
h_end = min(h_start+4, input_shape[2])
w_end = min(w_start+4, input_shape[3])
d = np.zeros((4, 4))
d[0:h_end-h_start, 0:w_end-w_start] = input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T, d), B)
for i in range(4):
for j in range(4):
V[(i*4+j)*C + c, p] = v[i, j]
tmp = V.reshape(16, C, P, 1)
h_start = h_idx * m
w_start = w_idx * m
h_end = min(h_start+alpha, input_shape[2])
w_end = min(w_start+alpha, input_shape[3])
d = np.zeros((alpha, alpha))
d[0:h_end-h_start, 0:w_end-w_start] = \
input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T[alpha], d), B[alpha])
for i in range(alpha):
for j in range(alpha):
V[(i*alpha+j)*C + c, p] = v[i, j]
tmp = V.reshape(alpha_square, C, P, 1)
print 'input out: ', tmp.shape
tmp.astype(np.float32).tofile("C")
M = np.zeros((16 * K, P))
for i in range(alpha * alpha):
M = np.zeros((alpha_square * K, P))
for i in range(alpha_square):
u = U[i * K : (i+1) * K, :]
v = V[i * C : (i+1) * C, :]
M[i * K : (i+1) * K, :] = np.dot(u, v)
......@@ -87,17 +153,17 @@ def winog_conv(input, filter):
res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
for k in range(K):
for b in range(P):
m = np.zeros((4, 4))
for i in range(4):
for j in range(4):
m[i][j] = M[(i*4+j) * K + k, b]
y = np.dot(np.dot(A_T, m), A)
for i in range(2):
for j in range(2):
tm = np.zeros((alpha, alpha))
for i in range(alpha):
for j in range(alpha):
tm[i][j] = M[(i*alpha+j) * K + k, b]
y = np.dot(np.dot(A_T[alpha], tm), A[alpha])
for i in range(m):
for j in range(m):
n = b / (rounded_h * rounded_w)
t = b % (rounded_h * rounded_w)
p = (t / rounded_w) * 2 + i
q = (t % rounded_w) * 2 + j
p = (t / rounded_w) * m + i
q = (t % rounded_w) * m + j
if p >= out_shape[2] or q >= out_shape[3]:
continue
res[n, p, q, k] = y[i, j]
......@@ -115,25 +181,27 @@ def tf_conv(input, filter):
def main():
input = np.random.random([7, 61, 71, 31]).astype(np.float32)
input = np.random.random([5, 23, 29, 15]).astype(np.float32)
# input = np.fromfile(file="A", dtype=np.float32)
# input = input.reshape(1, 3, 3, 5)
print 'input shape: ', input.shape
input.tofile("A")
filter = np.random.random([3, 3, 31, 31]).astype(np.float32)
# input.tofile("A")
filter = np.random.random([3, 3, 15, 13]).astype(np.float32)
tf_out = tf_conv(input, filter)
input = input.transpose((0, 3, 1, 2))
filter = filter.transpose((3, 2, 0, 1))
print 'filter shape: ', filter.shape
filter.tofile("filter_in")
winog_out = winog_conv(input, filter)
res = np.allclose(tf_out, winog_out)
if res:
print "=========Pass========="
else:
print "=========Failed========="
print "TF: ", tf_out
print "Winograd: ", winog_out
# filter.tofile("filter_in")
for i in [2, 4, 6]:
print "==========f(%d,3)==========" % i
winograd_out = winograd_conv(i, 3, input, filter)
res = np.allclose(tf_out, winograd_out)
if res:
print "=========Pass========="
else:
print "=========Failed======="
print "TF: ", tf_out
print "Winograd: ", winograd_out
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册