diff --git a/mace/core/BUILD b/mace/core/BUILD index 9d337bc4b8f19755dbc71f1f2a1120981b64e628..63a30357240f99df964d1c23d2884739f61827db 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -12,11 +12,11 @@ load("//mace:mace.bzl", "if_android") cc_library( name = "opencl_runtime", srcs = glob([ - "runtime/opencl/cl.hpp", - "runtime/opencl/cl2.hpp", "runtime/opencl/*.cc", ]), hdrs = glob([ + "runtime/opencl/cl.hpp", + "runtime/opencl/cl2.hpp", "runtime/opencl/*.h", ]), copts = ["-std=c++11"], diff --git a/mace/core/runtime/opencl/cl2_header.h b/mace/core/runtime/opencl/cl2_header.h new file mode 100644 index 0000000000000000000000000000000000000000..f7c4af4b1cfee051e8afd869d945700eefa1cf20 --- /dev/null +++ b/mace/core/runtime/opencl/cl2_header.h @@ -0,0 +1,12 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_ +#define MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_ + +#define CL_HPP_TARGET_OPENCL_VERSION 200 + +#include "mace/core/runtime/opencl/cl2.hpp" + +#endif // MACE_CORE_RUNTIME_OPENCL_CL2_HEADER_H_ diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index f1338cf2bb05bf08b436c38c76216e5a0a97bed2..d40432e2a2bb18b264d641d55f66868a26dafc22 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -2,8 +2,8 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_allocator.h" -#include "mace/core/runtime/opencl/cl2.hpp" #include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index b85b9e48074f0d354aa5c85a12ad2a03e0a4f0a8..26bc60d8471cdbddffc5eea9aee8f836ae15491b 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) { std::string line; while (std::getline(ifs, line)) { *content += line; + *content += "\n"; } ifs.close(); return true; @@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime, *program = cl::Program(runtime->context(), sources); std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path; // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math - if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) { + cl_int ret = program->build({runtime->device()}, build_options.c_str()); + if (ret != CL_SUCCESS) { if (program->getBuildInfo(runtime->device()) == CL_BUILD_ERROR) { std::string build_log = program->getBuildInfo(runtime->device()); LOG(INFO) << "Program build log: " << build_log; } - LOG(FATAL) << "Build program failed"; + LOG(FATAL) << "Build program failed: " << ret; } return true; diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index e7c7b180e43ba13dfdfedec9a63b5973bdfcb55a..f21ade57fa73cabd48a50e6baf4f7284cc51e40f 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -5,14 +5,10 @@ #ifndef MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_ #define MACE_CORE_RUNTIME_OPENCL_OPENCL_RUNTIME_H_ -#ifndef CL_HPP_TARGET_OPENCL_VERSION -#define CL_HPP_TARGET_OPENCL_VERSION 200 -#endif - #include #include -#include "mace/core/runtime/opencl/cl2.hpp" +#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_wrapper.h" namespace mace { diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc index 49f9293429f5e7e3e0b7531a4301cfd448495777..ad8a98f856a5f7dc661a1f048224a27f7ee5270f 100644 --- a/mace/core/runtime/opencl/opencl_wrapper.cc +++ b/mace/core/runtime/opencl/opencl_wrapper.cc @@ -126,6 +126,12 @@ class OpenCLLibraryImpl final { using clRetainKernelFunc = cl_int (*)(cl_kernel kernel); using clCreateBufferFunc = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); + using clCreateImageFunc = cl_mem (*)(cl_context, + cl_mem_flags, + const cl_image_format *, + const cl_image_desc *, + void *, + cl_int *); using clCreateProgramWithSourceFunc = cl_program (*)( cl_context, cl_uint, const char **, const size_t *, cl_int *); using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel); @@ -136,8 +142,12 @@ class OpenCLLibraryImpl final { using clRetainDeviceFunc = cl_int (*)(cl_device_id); using clReleaseDeviceFunc = cl_int (*)(cl_device_id); using clRetainEventFunc = cl_int (*)(cl_event); - using clGetKernelWorkGroupInfoFunc = - cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); + using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel, + cl_device_id, + cl_kernel_work_group_info, + size_t, + void *, + size_t *); #define DEFINE_FUNC_PTR(func) func##Func func = nullptr @@ -149,6 +159,7 @@ class OpenCLLibraryImpl final { DEFINE_FUNC_PTR(clReleaseKernel); DEFINE_FUNC_PTR(clCreateProgramWithSource); DEFINE_FUNC_PTR(clCreateBuffer); + DEFINE_FUNC_PTR(clCreateImage); DEFINE_FUNC_PTR(clRetainKernel); DEFINE_FUNC_PTR(clCreateKernel); DEFINE_FUNC_PTR(clGetProgramInfo); @@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) { ASSIGN_FROM_DLSYM(clReleaseKernel); ASSIGN_FROM_DLSYM(clCreateProgramWithSource); ASSIGN_FROM_DLSYM(clCreateBuffer); + ASSIGN_FROM_DLSYM(clCreateImage); ASSIGN_FROM_DLSYM(clRetainKernel); ASSIGN_FROM_DLSYM(clCreateKernel); ASSIGN_FROM_DLSYM(clGetProgramInfo); @@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context, } } +cl_mem clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) { + auto func = mace::OpenCLLibraryImpl::Get().clCreateImage; + if (func != nullptr) { + return func(context, flags, image_format, image_desc, host_ptr, + errcode_ret); + } else { + if (errcode_ret != nullptr) { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, @@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, size_t *param_value_size_ret) { auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo; if (func != nullptr) { - return func(kernel, device, param_name, param_value_size, - param_value, param_value_size_ret); + return func(kernel, device, param_name, param_value_size, param_value, + param_value_size_ret); } else { return CL_OUT_OF_RESOURCES; } diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 82efa4595ae6e4f091ce9618dc4e1b16199f382d..86f15164a1f9bd99d09b591cff705eccc62446d9 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -3,7 +3,7 @@ // #include "mace/kernels/batch_norm.h" -#include "mace/core/runtime/opencl/cl2.hpp" +#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/utils/tuner.h" @@ -79,4 +79,4 @@ void BatchNormFunctor::operator()( } } // namespace kernels -} // namespace mace \ No newline at end of file +} // namespace mace diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h new file mode 100644 index 0000000000000000000000000000000000000000..a6be3c53e52e9a68879f697d6cf994b4126450d4 --- /dev/null +++ b/mace/kernels/opencl/cl/common.h @@ -0,0 +1,2 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 7b856f89d5742c3424c4b3f59eb7dbf3fc2073f3..dc5b1f816e99a6466b8183160c1fa7104c58aaaf 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -1,21 +1,22 @@ -void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */ - global const float *filter, /* o, i, kh, kw */ - global const float *bias, /* o */ - global float *output, /* n, c, h, w */ - private const int input_channels) { +#include + +__kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */ + __global const float *filter, /* o, i, kh, kw */ + __global const float *bias, /* o */ + __global float *output, /* n, c, h, w */ + __private const int in_chan_num) { const int batch = get_global_id(0); const int channel = get_global_id(1); const int channels = get_global_size(1); const int pixel = get_global_id(2); const int pixels = get_global_size(2); - float *output_ptr = output + (batch * channels + channel) * pixels; output_ptr[pixel] = bias[channel]; - for (int inc = 0; inc < input_channels; ++inc) { - const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel; - const float weights = filter[channel * input_channels + inc]; + for (int inc = 0; inc < in_chan_num; ++inc) { + const float *input_ptr = input + (batch * in_chan_num + inc) * pixels + pixel; + const float weights = filter[channel * in_chan_num + inc]; float in = input_ptr[0]; float out = output_ptr[0]; out += in * weights; @@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */ } } -void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ - global const float *filter, /* o, i, kh, kw */ - global const float *bias, /* o */ - global float *output, /* n, c, h, w */ - private const int in_chan_num, - private const int out_chan_num, - private const int pixel_num) { +__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */ + __global const float *filter, /* o, i, kh, kw */ + __global const float *bias, /* o */ + __global float *output, /* n, c, h, w */ + __private const int in_chan_num, + __private const int out_chan_num, + __private const int pixel_num) { int batch = get_global_id(0); int out_chan_blk = get_global_id(1); int out_pixel_blk = get_global_id(2); + __private float output_slice[4 * 4]; + const int out_chan_begin = out_chan_blk * 4; const int out_chan_end = min(out_chan_begin + 4, out_chan_num); const int out_pixel_begin = out_pixel_blk * 4; @@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ const int in_offset = batch * in_chan_num * pixel_num; const int out_offset = batch * out_chan_num * pixel_num; + const float *input_base = input + in_offset + out_pixel_begin; float *output_base = output + out_offset + out_pixel_begin; - int pixels = out_pixel_end - out_pixel_begin; + int out_chan_len = out_chan_end - out_chan_begin; + int pixel_len = out_pixel_end - out_pixel_begin; for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { float bias_value = bias[out_chan]; - float *output_ptr = output_base + out_chan * pixel_num; - for (int p = 0; p < pixels; ++p) { - output_ptr[p] = bias_value; + int out_chan_offset = out_chan - out_chan_begin; + for (int p = 0; p < pixel_len; ++p) { + output_slice[out_chan_offset * 4 + p] = bias_value; } } int in_chan = 0; - if (pixels == 4) { + if (pixel_len == 4) { for (; in_chan + 3 < in_chan_num; in_chan += 4) { const float *input_ptr = input_base + in_chan * pixel_num; int out_chan = out_chan_begin; @@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ float4 in1 = vload4(0, input_ptr + pixel_num); float4 in2 = vload4(0, input_ptr + 2 * pixel_num); float4 in3 = vload4(0, input_ptr + 3 * pixel_num); + #pragma unroll for (int oc = 0; oc < 4; ++oc) { float4 weights = vload4(0, filter_ptr + oc * in_chan_num); - float4 out = vload4(0, output_ptr + oc * pixel_num); + float4 out = vload4(oc, output_slice); out += in0 * weights.x; out += in1 * weights.y; out += in2 * weights.z; out += in3 * weights.w; - vstore4(out, 0, output_ptr + oc * pixel_num); + vstore4(out, oc, output_slice); } } for (; out_chan < out_chan_end; ++out_chan) { const float* filter_ptr = filter + out_chan * in_chan_num + in_chan; float *output_ptr = output_base + out_chan * pixel_num; + int out_chan_offset = out_chan - out_chan_begin; float4 weights = vload4(0, filter_ptr); float4 in0 = vload4(0, input_ptr); float4 in1 = vload4(0, input_ptr + pixel_num); float4 in2 = vload4(0, input_ptr + 2 * pixel_num); float4 in3 = vload4(0, input_ptr + 3 * pixel_num); - float4 out = vload4(0, output_ptr); + float4 out = vload4(out_chan_offset, output_slice); out += in0 * weights.x; out += in1 * weights.y; out += in2 * weights.z; out += in3 * weights.w; - vstore4(out, 0, output_ptr); + vstore4(out, out_chan_offset, output_slice); } } } @@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */ for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) { float weights = filter[out_chan * in_chan_num + in_chan]; float *output_ptr = output_base + out_chan * pixel_num; + int out_chan_offset = out_chan - out_chan_begin; - for (int p = 0; p < pixels; ++p) { + for (int p = 0; p < pixel_len; ++p) { float in = input_ptr[p]; - float out = output_ptr[p]; - out += in * weights; - output_ptr[p] = out; + output_slice[out_chan_offset * 4 + p] += in * weights; } } } + + for (int out_chan_offset = 0; out_chan_offset < out_chan_len; ++out_chan_offset) { + int out_chan = out_chan_begin + out_chan_offset; + float *output_ptr = output_base + out_chan * pixel_num; + if (pixel_len == 4) { + float4 out = vload4(out_chan_offset, output_slice); + vstore4(out, 0, output_ptr); + } else { + int offset = out_chan_offset << 2; + for (int p = 0; p < pixel_len; ++p) { + output_ptr[p] = output_slice[offset + p]; + } + } + } +} + +/* FIXME this is incomplete */ +__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */ + __global const float *filter, /* o, i, kh, kw */ + __global const float *bias, /* o */ + __write_only image3d_t output, /* n, c/4, h, w, 4 */ + __private const int batch_num, + __private const int in_chan_num, + __private const int out_chan_num, + __private const int height, + __private const int width) { + int out_chan_blk = get_global_id(0); + int h = get_global_id(1); + int w = get_global_id(2); + + + int in_chan_blk_num = (in_chan_num + 3) / 4; + int out_chan_blk_num = (out_chan_num + 3) / 4; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + for (int batch = 0; batch < batch_num; ++batch) { + float4 bias_value = vload4(out_chan_blk, bias); + __private float4 out = bias_value; + + for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) { + int in_d = batch * in_chan_blk_num + in_chan_blk; + float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0)); + + const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num; + float4 weights = vload4(in_chan_blk, filter_base); + out.x += dot(in, weights); + weights = vload4(in_chan_blk, filter_base + in_chan_num); + out.y += dot(in, weights); + weights = vload4(in_chan_blk, filter_base + in_chan_num * 2); + out.z += dot(in, weights); + weights = vload4(in_chan_blk, filter_base + in_chan_num * 3); + out.w += dot(in, weights); + } + + int out_d = batch * out_chan_blk_num + out_chan_blk; + int4 out_coord = (int4)(out_d, h, w, 0); + write_imagef(output, out_coord, out); + } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 636bdd79525abb01f3a84cd21f492e156004feed..130ca4b7ab166d238c36484a5452565c119dde94 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -2,9 +2,10 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/kernels/conv_2d.h" #include "mace/core/common.h" +#include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" #include "mace/utils/utils.h" namespace mace { @@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - auto conv_2d = cl::KernelFunctor(program, "conv_2d_1x1_naive"); + auto conv_2d = + cl::KernelFunctor(program, "conv_2d_1x1_naive"); const index_t pixels = height * width; cl_int error; - conv_2d(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(static_cast(batch), - static_cast(channels), - static_cast(pixels)), - cl::NDRange(1, 1, 128)), + conv_2d(cl::EnqueueArgs( + runtime->command_queue(), + cl::NDRange(static_cast(batch), static_cast(channels), + static_cast(pixels)), + cl::NDRange(1, 1, 128)), *(static_cast(input->buffer())), *(static_cast(filter->buffer())), *(static_cast(bias->buffer())), *(static_cast(output->buffer())), - static_cast(input_channels), - error); + static_cast(input_channels), error); MACE_CHECK(error == CL_SUCCESS); } @@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - auto conv_2d = cl::KernelFunctor(program, "conv_2d_1x1_v2"); const index_t pixels = height * width; const index_t channel_blocks = (channels + 3) / 4; const index_t pixel_blocks = (pixels + 3) / 4; + // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy + // TODO check wired clReleaseCommandQueue latency + // The KernelFunctor can cause segment faults in cb_retain_event + auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2"); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + uint32_t idx = 0; + conv_2d_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(filter->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(bias->buffer()))); + conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input_channels)); + conv_2d_kernel.setArg(idx++, static_cast(channels)); + conv_2d_kernel.setArg(idx++, static_cast(pixels)); + + auto command_queue = runtime->command_queue(); + cl_int error = command_queue.enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(static_cast(batch), static_cast(channel_blocks), + static_cast(pixel_blocks)), + cl::NDRange(1, 2, kwg_size / 2)); + MACE_CHECK(error == CL_SUCCESS, error); +} + +void Conv1x1V3(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + const index_t batch = output->shape()[0]; + const index_t channels = output->shape()[1]; + const index_t height = output->shape()[2]; + const index_t width = output->shape()[3]; + const index_t input_channels = input->shape()[1]; + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + + const index_t pixels = height * width; + const index_t pixel_blocks = (pixels + 3) / 4; + + const index_t channel_blocks = (channels + 3) / 4; + const index_t input_channel_blocks = (input_channels + 3) / 4; + + // FIXME temp hacking + static std::map input_image_map; + static std::map output_image_map; + cl::Image3D input_image; + cl::Image3D output_image; + auto input_iter = + input_image_map.find(reinterpret_cast(input->buffer())); + if (input_iter != input_image_map.end()) { + input_image = input_iter->second; + } else { + // The batch dimension is collapsed with channel + cl_int error; + cl::Image3D image = + cl::Image3D(OpenCLRuntime::Get()->context(), + CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width, + batch * input_channel_blocks, 0, 0, nullptr, &error); + MACE_CHECK(error == CL_SUCCESS); + input_image = image; + input_image_map.clear(); + input_image_map.emplace(reinterpret_cast(input->buffer()), + image); + } + auto output_iter = + output_image_map.find(reinterpret_cast(output->buffer())); + if (output_iter != output_image_map.end()) { + output_image = output_iter->second; + } else { + cl_int error; + cl::Image3D image = + cl::Image3D(OpenCLRuntime::Get()->context(), + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, + cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width, + batch * channel_blocks, 0, 0, nullptr, &error); + MACE_CHECK(error == CL_SUCCESS); + output_image = image; + output_image_map.clear(); + output_image_map.emplace(reinterpret_cast(output->buffer()), + image); + } + + auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3"); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + + uint32_t idx = 0; + conv_2d_kernel.setArg(idx++, input_image); + conv_2d_kernel.setArg(idx++, + *(static_cast(filter->buffer()))); + conv_2d_kernel.setArg(idx++, + *(static_cast(bias->buffer()))); + conv_2d_kernel.setArg(idx++, output_image); + conv_2d_kernel.setArg(idx++, static_cast(batch)); + conv_2d_kernel.setArg(idx++, static_cast(input_channels)); + conv_2d_kernel.setArg(idx++, static_cast(channels)); + conv_2d_kernel.setArg(idx++, static_cast(height)); + conv_2d_kernel.setArg(idx++, static_cast(width)); + + auto command_queue = runtime->command_queue(); cl_int error; - conv_2d(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(static_cast(batch), - static_cast(channel_blocks), - static_cast(pixel_blocks)), - cl::NDRange(1, 1, 256)), - *(static_cast(input->buffer())), - *(static_cast(filter->buffer())), - *(static_cast(bias->buffer())), - *(static_cast(output->buffer())), - static_cast(input_channels), - static_cast(channels), - static_cast(pixels), - error); - MACE_CHECK(error == CL_SUCCESS); + error = command_queue.enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(static_cast(channel_blocks), static_cast(height), + static_cast(width)), + cl::NDRange(1, 2, kwg_size / 2)); + MACE_CHECK(error == CL_SUCCESS, error); } -extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, Tensor *output) { +extern void Conv2dOpenclK1x1S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { const index_t batch = output->shape()[0]; const index_t height = output->shape()[2]; const index_t width = output->shape()[3]; @@ -89,7 +185,6 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, MACE_CHECK(input_batch == batch && input_height == height && input_width == width); - // Conv1x1Naive(input, filter, bias, output); Conv1x1V2(input, filter, bias, output); }; diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 801e9426294e26aef30b89f81ece3f3327f9c26f..ad48f4458e570f826b8d9caaf5c75f45d74dbaa1 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -17,7 +17,7 @@ static void AddNBenchmark(int iters, int n, int size) { for (int i = 0; i < n; ++i) { op_def_builder.Input(internal::MakeString("Input", i).c_str()); } - op_def_builder.Output("Output").Finalize(net.operator_def()); + op_def_builder.Output("Output").Finalize(net.NewOperatorDef()); // Add input data for (int i = 0; i < n; ++i) { diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 76a46355c574151997451df974cb62eb46b28393..a48d066235eec33f1465ffe6f74fce6bb97e0d37 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -17,7 +17,7 @@ TEST_F(AddnOpTest, AddnOp) { .Input("Input2") .Input("Input3") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input1", {1, 2, 3, 4}); diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index c1ac84ef60e7c89ab2042f3815f18f1fbaf63da4..499af6f29c5f1918f8233ef1e11ba155e35cc869 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -21,7 +21,7 @@ static void BatchNorm( .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 39b9ff1c9358fd07ec3c775ecced3b1c8ffd0228..e13df29c33aad74ea730d39696e9cfa66a3f0aac 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -21,7 +21,7 @@ void Simple() { .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", {1, 1, 6, 2}, @@ -73,7 +73,7 @@ TEST_F(BatchNormOpTest, SimpleRandomNeon) { .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); @@ -114,7 +114,7 @@ TEST_F(BatchNormOpTest, ComplexRandomNeon) { .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); @@ -155,7 +155,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); @@ -201,7 +201,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { .Input("Var") .Input("Epsilon") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 112e5fef0279d6be3fc95e93930da6c2f47cd2b1..d9f63f3acffaf8861a120bec381ed175db2963fb 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -19,10 +19,10 @@ static void ChannelShuffle( OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .AddIntArg("group", group) + .Finalize(net.NewOperatorDef()); // Add input data - net.AddIntArg("group", group); net.AddRandomInput("Input", {batch, channels, height, width}); // Warm-up diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index c862e516de5677ae92193a99391ddc61d32f8b65..ca30029d2053c41e702cefe39baccc3d72293dfb 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -14,9 +14,9 @@ TEST_F(ChannelShuffleOpTest, C8G4) { OpDefBuilder("ChannelShuffle", "ChannelShuffleTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .AddIntArg("group", 4) + .Finalize(net.NewOperatorDef()); - net.AddIntArg("group", 4); // Add input data net.AddInputFromArray( diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index c871c20d600f11238226247e53bae5161307c84a..275886a6d345293b5a8a965ef442ea99932a8fba 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -17,7 +17,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) { .Input("Input1") .Input("Axis") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data const int kDim0 = 100; diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 7e910d21d3bf3379c2932320f16cf772f1497726..1d94d4679a5f9292b744444f1a24e3719f35f3c1 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -18,7 +18,7 @@ TEST_F(ConcatOpTest, Simple_Horizon) { .Input("Input1") .Input("Axis") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); std::vector input_shape = {4, 4}; std::vector input0; @@ -56,7 +56,7 @@ TEST_F(ConcatOpTest, Simple_Vertical) { .Input("Input1") .Input("Axis") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); std::vector input_shape = {4, 4}; std::vector input0; @@ -99,7 +99,7 @@ TEST_F(ConcatOpTest, Random) { for (int i = 0; i < num_inputs; ++i) { builder = builder.Input(("Input" + ToString(i)).c_str()); } - builder.Input("Axis").Output("Output").Finalize(net.operator_def()); + builder.Input("Axis").Output("Output").Finalize(net.NewOperatorDef()); std::vector shape_data; GenerateRandomIntTypeData({dim}, shape_data, 1, dim); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index caeac58a6052af80cf89e4dec4a087fc3f804f75..ca3ccf72b96e642e34fc1f6c8b45d787e000dfa9 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -3,6 +3,7 @@ // #include +#include #include "mace/core/operator.h" #include "mace/core/testing/test_benchmark.h" @@ -13,6 +14,7 @@ namespace mace { template static void Conv2d(int iters, + int iters_to_sync, int batch, int channels, int height, @@ -30,17 +32,15 @@ static void Conv2d(int iters, .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {stride, stride}); - net.AddIntArg("padding", padding); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); net.AddRandomInput("Filter", - {output_channels, channels, kernel_h, kernel_w}); + {output_channels, channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); // Warm-up @@ -52,10 +52,17 @@ static void Conv2d(int iters, mace::testing::StartTiming(); while (iters--) { net.RunOp(D); - net.Sync(); + if (iters % iters_to_sync == 0) { + net.Sync(); + } } } +// In common network, there are usually more than 1 layers, this is used to +// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is +// in-order. +constexpr int kItersToSync = 10; + #define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \ static void \ BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ @@ -63,8 +70,8 @@ static void Conv2d(int iters, const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::ItemsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - Conv2d(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \ - OC); \ + Conv2d(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE, \ + mace::Padding::P, OC); \ } \ BENCHMARK( \ BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 149711950eb0c29e69d715a74ed8e5b7dca5579b..a8e9f189c255e09178b5b6aa02f0e9ab28606786 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -18,12 +18,12 @@ TEST_F(Conv2dOpTest, Simple_VALID) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add args - net.AddIntsArg("strides", {1, 1}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); // Add input data net.AddInputFromArray( @@ -52,12 +52,10 @@ TEST_F(Conv2dOpTest, Simple_SAME) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {1, 1}); - net.AddIntArg("padding", Padding::SAME); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -88,12 +86,10 @@ TEST_F(Conv2dOpTest, Combined) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::SAME); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -127,12 +123,10 @@ void TestConv1x1() { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {1, 1}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -190,12 +184,10 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {stride_h, stride_w}); - net.AddIntArg("padding", type); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, input_channels, height, width}); @@ -241,12 +233,10 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {stride_h, stride_w}); - net.AddIntArg("padding", type); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, input_channels, height, width}); diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 5a588950cde2c3cebb5983bb2e6c98872f673c05..05cd5d7a204016c76b9f571e6c4b8581a8c32e21 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -19,12 +19,10 @@ void SimpleValidTest() { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {1, 1}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", {1, 2, 2, 3}, @@ -68,12 +66,10 @@ void TestNxNS12(const index_t height, const index_t width) { .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {stride_h, stride_w}); - net.AddIntArg("padding", type); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, input_channels, height, width}); diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc index 2534cdad9504bb67075fd947fa7e532d25900734..13f64ddfbcdeb62378fd6b9bfc43a647910c0fdb 100644 --- a/mace/ops/depthwise_conv_2d_benchmark.cc +++ b/mace/ops/depthwise_conv_2d_benchmark.cc @@ -30,12 +30,10 @@ static void DepthwiseConv2d(int iters, .Input("Filter") .Input("Bias") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("strides", {stride, stride}); - net.AddIntArg("padding", padding); - net.AddIntsArg("dilations", {1, 1}); + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc index a00634961b2b930b8023d6c5200284550333e056..3638243fd067c55d3d36147c28187d2ec635410d 100644 --- a/mace/ops/global_avg_pooling_benchmark.cc +++ b/mace/ops/global_avg_pooling_benchmark.cc @@ -19,7 +19,7 @@ static void GlobalAvgPooling( OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); diff --git a/mace/ops/global_avg_pooling_test.cc b/mace/ops/global_avg_pooling_test.cc index 540f874a0a6e4a5d48d856d3749c78f4b988d552..da82e53435c043da7e2d6ad618201374bc9a5568 100644 --- a/mace/ops/global_avg_pooling_test.cc +++ b/mace/ops/global_avg_pooling_test.cc @@ -14,7 +14,7 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) { OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data std::vector input(147); @@ -38,7 +38,7 @@ TEST_F(GlobalAvgPoolingOpTest, 3x7x7_NEON) { OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data std::vector input(147); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 4abde486559c3140820d5d144303d4b72ee73b82..dcadf566a49ebe8deea86991d971522898fa3a0c 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -17,21 +17,70 @@ namespace mace { class OpDefBuilder { public: - OpDefBuilder(const char *type, const char *name) { + OpDefBuilder(const char *type, const std::string &name) { op_def_.set_type(type); op_def_.set_name(name); } - OpDefBuilder &Input(const char *input_name) { + OpDefBuilder &Input(const std::string &input_name) { op_def_.add_input(input_name); return *this; } - OpDefBuilder &Output(const char *output_name) { + OpDefBuilder &Output(const std::string &output_name) { op_def_.add_output(output_name); return *this; } + OpDefBuilder AddIntArg(const std::string &name, const int value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_i(value); + return *this; + } + + OpDefBuilder AddFloatArg(const std::string &name, const float value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_f(value); + return *this; + } + + OpDefBuilder AddStringArg(const std::string &name, const char *value) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + arg->set_s(value); + return *this; + } + + OpDefBuilder AddIntsArg(const std::string &name, const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_ints(value); + } + return *this; + } + + OpDefBuilder AddFloatsArg(const std::string &name, const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_floats(value); + } + return *this; + } + + OpDefBuilder AddStringsArg(const std::string &name, + const std::vector &values) { + auto arg = op_def_.add_arg(); + arg->set_name(name); + for (auto value : values) { + arg->add_strings(value); + } + return *this; + } + void Finalize(OperatorDef *op_def) const { MACE_CHECK(op_def != nullptr, "input should not be null."); *op_def = op_def_; @@ -45,7 +94,7 @@ class OpsTestNet { OpsTestNet() {} template - void AddInputFromArray(const char *name, + void AddInputFromArray(const std::string &name, const std::vector &shape, const std::vector &data) { Tensor *input = @@ -58,7 +107,7 @@ class OpsTestNet { } template - void AddRepeatedInput(const char *name, + void AddRepeatedInput(const std::string &name, const std::vector &shape, const T data) { Tensor *input = @@ -70,7 +119,7 @@ class OpsTestNet { } template - void AddRandomInput(const char *name, + void AddRandomInput(const std::string &name, const std::vector &shape, bool positive = false) { Tensor *input = @@ -89,56 +138,18 @@ class OpsTestNet { }); } - void AddIntArg(const char *name, const int value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_i(value); - } - - void AddFloatArg(const char *name, const float value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_f(value); - } - - void AddStringArg(const char *name, const char *value) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - arg->set_s(value); - } - - void AddIntsArg(const char *name, const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - for (auto value : values) { - arg->add_ints(value); - } - } - - void AddFloatsArg(const char *name, const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - for (auto value : values) { - arg->add_floats(value); - } - } - - void AddStringsArg(const char *name, - const std::vector &values) { - auto arg = op_def_.add_arg(); - arg->set_name(name); - for (auto value : values) { - arg->add_strings(value); - } + OperatorDef *NewOperatorDef() { + op_defs_.emplace_back(OperatorDef()); + return &op_defs_[op_defs_.size() - 1]; } - OperatorDef *operator_def() { return &op_def_; } - Workspace *ws() { return &ws_; } bool RunOp(DeviceType device) { NetDef net_def; - net_def.add_op()->CopyFrom(op_def_); + for (auto &op_def_ : op_defs_) { + net_def.add_op()->CopyFrom(op_def_); + } VLOG(3) << net_def.DebugString(); net_ = CreateNet(net_def, &ws_, device); device_ = device; @@ -159,7 +170,7 @@ class OpsTestNet { public: Workspace ws_; - OperatorDef op_def_; + std::vector op_defs_; std::unique_ptr net_; DeviceType device_; }; diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 5282bff73b1af7939b1595b89e8279bc55a15446..479563ece6c82bd47f5a22bdbe3f801c5553582c 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -27,14 +27,12 @@ static void Pooling(int iters, OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntArg("pooling_type", pooling_type); - net.AddIntsArg("kernels", {kernel, kernel}); - net.AddIntsArg("strides", {stride, stride}); - net.AddIntArg("padding", padding); - net.AddIntsArg("dilations", {1, 1}); + .AddIntArg("pooling_type", pooling_type) + .AddIntsArg("kernels", {kernel, kernel}) + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 75096f5dfc6e17582a67b7741cd910c59cf5f536..cd2dd609d653d2d35cf71b808a2c00a25fa55be5 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -19,14 +19,12 @@ TEST_F(PoolingOpTest, MAX_VALID) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); - net.AddIntArg("pooling_type", PoolingType::MAX); + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -50,14 +48,12 @@ TEST_F(PoolingOpTest, AVG_VALID) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); - net.AddIntArg("pooling_type", PoolingType::AVG); + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -81,14 +77,12 @@ TEST_F(PoolingOpTest, MAX_SAME) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::SAME); - net.AddIntsArg("dilations", {1, 1}); - net.AddIntArg("pooling_type", PoolingType::MAX); + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", {1, 1, 3, 3}, @@ -109,14 +103,12 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {1, 1}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {2, 2}); - net.AddIntArg("pooling_type", PoolingType::MAX); + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {2, 2}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -138,14 +130,12 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntArg("pooling_type", PoolingType::MAX); - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::SAME); - net.AddIntsArg("dilations", {1, 1}); + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -166,14 +156,12 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntArg("pooling_type", PoolingType::MAX); - net.AddIntsArg("kernels", {3, 3}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::VALID); - net.AddIntsArg("dilations", {1, 1}); + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( @@ -195,14 +183,12 @@ TEST_F(PoolingOpTest, AVG_k2x2s2x2) { OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); - - // Add args - net.AddIntArg("pooling_type", PoolingType::AVG); - net.AddIntsArg("kernels", {2, 2}); - net.AddIntsArg("strides", {2, 2}); - net.AddIntArg("padding", Padding::SAME); - net.AddIntsArg("dilations", {1, 1}); + .AddIntArg("pooling_type", PoolingType::AVG) + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( diff --git a/mace/ops/relu_benchmark.cc b/mace/ops/relu_benchmark.cc index 1a2be2ca6edebcfa3a856ce244201caa05b3337f..a1fc6ed42cd1c4c8ab1102ea0c8cbde7faba9968 100644 --- a/mace/ops/relu_benchmark.cc +++ b/mace/ops/relu_benchmark.cc @@ -16,7 +16,7 @@ static void ReluBenchmark(int iters, int size) { OpDefBuilder("Relu", "ReluBM") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {size}); diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc index 91964b725e6e27f3c2d11627e9cee42d651dcf7f..5a6eb7cac7c2839496b7c7f85ab6628d40b571b3 100644 --- a/mace/ops/relu_test.cc +++ b/mace/ops/relu_test.cc @@ -15,7 +15,7 @@ TEST_F(ReluOpTest, ReluOp) { OpDefBuilder("Relu", "ReluTest") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {1, 2, 3, 5}); @@ -38,11 +38,11 @@ TEST_F(ReluOpTest, ReluOpWithMax) { OpDefBuilder("Relu", "ReluTestWithMax") .Input("Input") .Output("Output") - .Finalize(net.operator_def()); + .AddFloatArg("max_limit", 0.5) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {1, 2, 3, 5}); - net.AddFloatArg("max_limit", 0.5); // Run net.RunOp(); diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index dc05c5efaa38ea5ae499ab5e6fe84f1ff56e39a4..9d95564b132b93ac89904fd2b14ad68dae6e2649 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -18,7 +18,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWOAlignCorners) { .Input("Input") .Input("OutSize") .Output("Output") - .Finalize(net.operator_def()); + .Finalize(net.NewOperatorDef()); // Add input data vector input(24); @@ -43,9 +43,8 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) { .Input("Input") .Input("OutSize") .Output("Output") - .Finalize(net.operator_def()); - - net.AddIntArg("align_corners", 1); + .AddIntArg("align_corners", 1) + .Finalize(net.NewOperatorDef()); // Add input data vector input(24);