diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index b501e42b7eed1c85ea2d66b11a3cf3eea12700a0..f1338cf2bb05bf08b436c38c76216e5a0a97bed2 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -31,7 +31,7 @@ void OpenCLAllocator::Delete(void *buffer) {
 void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
   auto queue = OpenCLRuntime::Get()->command_queue();
-  // TODO (heliangliang) Non-blocking call
+  // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 0e1b1bfd67b31de91bdf0f9a5a51b2281768f279..a2b4c5cbe34a550d40224afb9e6c2d77aeacb980 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -4,8 +4,12 @@
 
 #include <cstdlib>
 #include <fstream>
+#include <dirent.h>
 #include <string>
+#include <memory>
+#include <mutex>
+
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -13,8 +17,7 @@
 namespace mace {
 
 namespace {
-bool ReadSourceFile(const char *filename, std::string *content) {
-  MACE_CHECK_NOTNULL(filename);
+bool ReadSourceFile(const std::string &filename, std::string *content) {
   MACE_CHECK_NOTNULL(content);
   *content = "";
   std::ifstream ifs(filename, std::ifstream::in);
@@ -31,26 +34,50 @@ bool ReadSourceFile(const char *filename, std::string *content) {
 }
 
 bool BuildProgram(OpenCLRuntime *runtime,
-                  const char *filename,
+                  const std::string &path,
                   cl::Program *program) {
-  MACE_CHECK_NOTNULL(filename);
   MACE_CHECK_NOTNULL(program);
 
-  std::string kernel_code;
-  if (!ReadSourceFile(filename, &kernel_code)) {
-    LOG(ERROR) << "Failed to read kernel source " << filename;
-    return false;
-  }
+  auto closer = [](DIR *d) {
+    if (d != nullptr) closedir(d);
+  };
+  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
+  MACE_CHECK_NOTNULL(dir.get());
+  const std::string kSourceSuffix = ".cl";
 
   cl::Program::Sources sources;
-  sources.push_back({kernel_code.c_str(), kernel_code.length()});
+  errno = 0;
+  dirent *entry = readdir(dir.get());
+  MACE_CHECK(errno == 0);
+  while (entry != nullptr) {
+    if (entry->d_type == DT_REG) {
+      std::string d_name(entry->d_name);
+      if (d_name.size() > kSourceSuffix.size() &&
+          d_name.compare(d_name.size() - kSourceSuffix.size(),
+                         kSourceSuffix.size(), kSourceSuffix) == 0) {
+        std::string filename = path + d_name;
+        std::string kernel_source;
+        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+        sources.push_back({kernel_source.c_str(), kernel_source.length()});
+      }
+    }
+    entry = readdir(dir.get());
+    MACE_CHECK(errno == 0);
+  };
 
   *program = cl::Program(runtime->context(), sources);
-  if (program->build({runtime->device()}) != CL_SUCCESS) {
-    LOG(INFO) << "Error building: "
-              << program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-    return false;
+  std::string build_options = "-Werror -cl-mad-enable -I" + path;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed";
   }
+
   return true;
 }
@@ -123,24 +150,16 @@ cl::Device &OpenCLRuntime::device() { return device_; }
 
 cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
 
-cl::Program OpenCLRuntime::GetProgram(const std::string &name) {
-  static const char *kernel_source_path = getenv("MACE_KERNEL_SOURCE_PATH");
-  std::string filename = name;
-  if (kernel_source_path != nullptr) {
-    filename = kernel_source_path + name;
-  }
+cl::Program &OpenCLRuntime::program() {
+  // TODO(heliangliang) Support binary format
+  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  std::string path(kernel_path == nullptr ? "" : kernel_path);
 
-  std::lock_guard<std::mutex> lock(program_lock_);
-  // TODO (heliangliang) Support binary format
-  auto iter = programs_.find(name);
-  if (iter != programs_.end()) {
-    return iter->second;
-  } else {
-    cl::Program program;
-    MACE_CHECK(BuildProgram(this, filename.c_str(), &program));
-    programs_.emplace(name, program);
-    return program;
-  }
+  std::call_once(build_flag_, [this, &path]() {
+    MACE_CHECK(BuildProgram(this, path, &program_));
+  });
+
+  return program_;
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 7d559f68c49600c55847e9c5a18e5a59101ce01d..057b2a80320130d322a1146698cfcb40334ca0a4 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -28,14 +28,14 @@ class OpenCLRuntime {
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
-  cl::Program GetProgram(const std::string &name);
+  cl::Program &program();
 
  private:
   cl::Context context_;
   cl::CommandQueue command_queue_;
   cl::Device device_;
-  std::map<std::string, cl::Program> programs_;
-  std::mutex program_lock_;
+  cl::Program program_;
+  std::once_flag build_flag_;
 };
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_smoketest.cc b/mace/core/runtime/opencl/opencl_smoketest.cc
index f9eb8e6814f9c961b7c99577262b2aebffcd2e10..ab32a81d89c462e9c15e100ff00aa9ebb382556e 100644
--- a/mace/core/runtime/opencl/opencl_smoketest.cc
+++ b/mace/core/runtime/opencl/opencl_smoketest.cc
@@ -40,7 +40,7 @@ int main() {
     step[0] = step_size;
   }
 
-  auto program = runtime->GetProgram("simple_add.cl");
+  auto program = runtime->program();
   auto simple_add = cl::KernelFunctor(
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index d39d0d0ff8f6f8be552ee93507bf7247ad5e9df4..e1d9c12284301b2ad4533287a12a3ab9ddfe4009 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -195,8 +195,8 @@ OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
 bool OpenCLLibraryImpl::Load() {
   if (loaded()) return true;
 
-  // TODO (heliangliang) Make this configurable
-  // TODO (heliangliang) Benchmark 64 bit overhead
+  // TODO(heliangliang) Make this configurable
+  // TODO(heliangliang) Benchmark 64 bit overhead
   static const std::vector<std::string> paths = {
 #if defined(__aarch64__)
       // Qualcomm Adreno
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 6de5a8acc6bcec318b723afc333194ed4f5cf6e4..5862b84f306199b60613d89dff60bfe01a40605d 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -88,7 +88,7 @@ class Tensor {
    * Map the device buffer as CPU buffer to access the data, unmap must be
    * called later
    */
-  inline void Map() {
+  inline void Map() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
       data_ = alloc_->Map(buffer_, size_ * SizeOfType());
@@ -98,7 +98,7 @@ class Tensor {
   /*
    * Unmap the device buffer
   */
-  inline void Unmap() {
+  inline void Unmap() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
       alloc_->Unmap(buffer_, data_);
@@ -187,7 +187,7 @@ class Tensor {
     LOG(INFO) << os.str();
   }
 
-  inline size_t SizeOfType() {
+  inline size_t SizeOfType() const {
     size_t type_size = 0;
     CASES(dtype_, type_size = sizeof(T));
     return type_size;
@@ -203,14 +203,15 @@ class Tensor {
 
   class MappingGuard {
    public:
-    MappingGuard(Tensor *tensor) : tensor_(tensor) {
-      MACE_ASSERT(tensor_ != nullptr);
-      tensor_->Map();
+    MappingGuard(const Tensor *tensor) : tensor_(tensor) {
+      if (tensor_ != nullptr) tensor_->Map();
+    }
+    ~MappingGuard() {
+      if (tensor_ != nullptr) tensor_->Unmap();
     }
-    ~MappingGuard() { tensor_->Unmap(); }
 
    private:
-    Tensor *tensor_;
+    const Tensor *tensor_;
   };
 
  private:
@@ -233,7 +234,7 @@ class Tensor {
   // read or write
   void *buffer_;
   // Mapped buffer
-  void *data_;
+  mutable void *data_;
   vector<index_t> shape_;
 
   DISABLE_COPY_AND_ASSIGN(Tensor);
diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc
index 102f862ecf4839b63622a49e5a4e5b439a0f458d..d75b7b9483883f5dc192068c5831d7bb3b61d6ee 100644
--- a/mace/examples/mace_run.cc
+++ b/mace/examples/mace_run.cc
@@ -84,7 +84,7 @@ int main(int argc, char **argv) {
   Workspace ws;
   ws.LoadModelTensor(net_def, DeviceType::CPU);
 
   Tensor *input_tensor =
-      ws.CreateTensor(input_node + ":0", cpu_allocator(), DT_FLOAT);
+      ws.CreateTensor(input_node + ":0", GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
   input_tensor->Resize(shape);
   float *input_data = input_tensor->mutable_data<float>();
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 8f86e6eb7809f52a810faa93321c5b1ab15efe89..b6711a0f8f8dcc7b3bc9a885b578703b93f5f8b0 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -11,8 +11,8 @@ load("//mace:mace.bzl", "if_android")
 
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc", "opencl/*.cc"])),
+    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h", "opencl/*.h"])),
     copts = [
         "-std=c++11",
         "-fopenmp",
@@ -20,6 +20,7 @@ cc_library(
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
+        "//mace/core:opencl_runtime",
         "//mace/utils",
     ],
 )
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index d520baf77bea3503abaa38f2d4ca981f3eb4405f..8a5cff2a0354d8ab467fc02905d49d9d9f1e42c9 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -19,27 +19,26 @@ struct Conv2dFunctor {
                 const int *dilations)
       : strides_(strides), paddings_(paddings), dilations_(dilations) {}
 
-  void operator()(const T *input,  // NCHW
-                  const index_t *input_shape,
-                  const T *filter,  // c_out, c_in, kernel_h, kernel_w
-                  const index_t *filter_shape,
-                  const T *bias,  // c_out
-                  T *output,  // NCHW
-                  const index_t *output_shape) {
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+    MACE_CHECK_NOTNULL(input);
+    MACE_CHECK_NOTNULL(filter);
     MACE_CHECK_NOTNULL(output);
 
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t batch = output->shape()[0];
+    index_t channels = output->shape()[1];
+    index_t height = output->shape()[2];
+    index_t width = output->shape()[3];
 
-    index_t input_batch = input_shape[0];
-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_batch = input->shape()[0];
+    index_t input_channels = input->shape()[1];
+    index_t input_height = input->shape()[2];
+    index_t input_width = input->shape()[3];
 
-    index_t kernel_h = filter_shape[2];
-    index_t kernel_w = filter_shape[3];
+    index_t kernel_h = filter->shape()[2];
+    index_t kernel_w = filter->shape()[3];
 
     int stride_h = strides_[0];
     int stride_w = strides_[1];
@@ -57,17 +56,26 @@ struct Conv2dFunctor {
 
     index_t kernel_size = input_channels * kernel_h * kernel_w;
 
+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard bias_mapper(bias);
+    Tensor::MappingGuard output_mapper(output);
+    auto input_data = input->data<T>();
+    auto filter_data = filter->data<T>();
+    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
+    auto output_data = output->mutable_data<T>();
+
 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
       for (int c = 0; c < channels; ++c) {
-        T bias_channel = bias ? bias[c] : 0;
+        T bias_channel = bias_data ? bias_data[c] : 0;
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             index_t offset = n * channels * height * width +
                              c * height * width + h * width + w;
-            output[offset] = bias_channel;
+            output_data[offset] = bias_channel;
             T sum = 0;
-            const T *filter_ptr = filter + c * kernel_size;
+            const T *filter_ptr = filter_data + c * kernel_size;
             for (int inc = 0; inc < input_channels; ++inc) {
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
@@ -86,13 +94,13 @@ struct Conv2dFunctor {
                       n * input_channels * input_height * input_width +
                       inc * input_height * input_width + inh * input_width +
                       inw;
-                  sum += input[input_offset] * *filter_ptr;
+                  sum += input_data[input_offset] * *filter_ptr;
                 }
                 ++filter_ptr;
               }
             }
-            output[offset] += sum;
+            output_data[offset] += sum;
           }
         }
       }
@@ -105,16 +113,12 @@ struct Conv2dFunctor {
 };
 
 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape);
-
-}  //  namespace kernels
-}  //  namespace mace
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output);
+
+}  // namespace kernels
+}  // namespace mace
 
 #endif  // MACE_KERNELS_CONV_2D_H_
diff --git a/mace/kernels/neon/conv_2d_neon.cc b/mace/kernels/neon/conv_2d_neon.cc
index c135cb8cc5df9ebefd01db606fd92f531ff3bccd..7f912c3d60a6f22b2817f1dfd0ae1bc4b8cdefad 100644
--- a/mace/kernels/neon/conv_2d_neon.cc
+++ b/mace/kernels/neon/conv_2d_neon.cc
@@ -41,14 +41,14 @@ extern void Conv2dNeonK5x5S1(const float *input,
                              const index_t *output_shape);
 
 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape) {
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output) {
+  MACE_CHECK_NOTNULL(input);
+  MACE_CHECK_NOTNULL(filter);
+  MACE_CHECK_NOTNULL(output);
+
   typedef void (*Conv2dNeonFunction)(
       const float *input, const index_t *input_shape, const float *filter,
       const index_t *filter_shape, const float *bias, float *output,
@@ -61,8 +61,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
       {nullptr, nullptr},
      {Conv2dNeonK5x5S1, nullptr}};  // not implement yet
 
-  index_t kernel_h = filter_shape[2];
-  index_t kernel_w = filter_shape[3];
+  index_t kernel_h = filter->shape()[2];
+  index_t kernel_w = filter->shape()[3];
   if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
       strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
       selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
@@ -71,21 +71,32 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
                  << " stride " << strides_[0] << "x" << strides_[1]
                  << " is not implemented yet, using slow version";
     Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
-        input, input_shape, filter, filter_shape, bias, output, output_shape);
+        input, filter, bias, output);
     return;
   }
 
+  Tensor::MappingGuard input_mapper(input);
+  Tensor::MappingGuard filter_mapper(filter);
+  Tensor::MappingGuard bias_mapper(bias);
+  Tensor::MappingGuard output_mapper(output);
+  auto input_data = input->data<float>();
+  auto input_shape = input->shape().data();
+  auto filter_data = filter->data<float>();
+  auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
+  auto output_data = output->mutable_data<float>();
+  auto output_shape = output->shape().data();
+
   // Keep this alive during kernel execution
   Tensor padded_input;
   if (paddings_[0] > 0 || paddings_[1] > 0) {
-    ConstructInputWithPadding(input, input_shape, paddings_.data(),
-                              &padded_input);
-    input = padded_input.data();
+    ConstructInputWithPadding(input_data, input->shape().data(),
+                              paddings_.data(), &padded_input);
+    input_data = padded_input.data<float>();
     input_shape = padded_input.shape().data();
   }
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input, input_shape, filter, nullptr, bias, output,
-                   output_shape);
+  conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
+                   bias_data, output_data, output_shape);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3aca41d0076e93bd846b6670509b0c14f256dfc8
--- /dev/null
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -0,0 +1,51 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/conv_2d.h"
+#include "mace/kernels/conv_pool_2d_util.h"
+
+namespace mace {
+namespace kernels {
+
+extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
+                               const Tensor *bias, Tensor *output);
+
+template <>
+void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+                                                          const Tensor *filter,
+                                                          const Tensor *bias,
+                                                          Tensor *output) {
+  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
+                                       const Tensor *bias, Tensor *output);
+  // Selection matrix: kernel_size x stride_size
+  static const Conv2dOpenclFunction selector[5][2] = {
+      {Conv2dOpenclK1x1S1, nullptr},
+      {nullptr, nullptr},
+      {nullptr, nullptr},
+      {nullptr, nullptr},
+      {nullptr, nullptr}};
+
+  index_t kernel_h = filter->shape()[2];
+  index_t kernel_w = filter->shape()[3];
+  if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
+      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
+      selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
+    LOG(WARNING) << "OpenCL conv2d kernel with "
+                 << "filter" << kernel_h << "x" << kernel_w << ","
+                 << " stride " << strides_[0] << "x" << strides_[1]
+                 << " is not implemented yet, using slow version";
+    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
+    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
+        input, filter, bias, output);
+    return;
+  }
+
+  MACE_CHECK(paddings_[0] == 1 && paddings_[1] == 1, "Padding not supported");
+
+  auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
+  conv2d_func(input, filter, bias, output);
+}
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af75a259d91bce105d803c69126908255abbccfc
--- /dev/null
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -0,0 +1,36 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/common.h"
+#include "mace/kernels/conv_2d.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+namespace kernels {
+
+static constexpr index_t kInputChannelBlockSize = 2;
+static constexpr index_t kOutputChannelBlockSize = 4;
+
+extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
+                               const Tensor *bias, Tensor *output) {
+  const index_t batch = output->shape()[0];
+  const index_t channels = output->shape()[1];
+  const index_t height = output->shape()[2];
+  const index_t width = output->shape()[3];
+
+  const index_t input_batch = input->shape()[0];
+  const index_t input_channels = input->shape()[1];
+  const index_t input_height = input->shape()[2];
+  const index_t input_width = input->shape()[3];
+
+  MACE_CHECK(input_batch == batch && input_height == height &&
+             input_width == width);
+
+  const index_t total_pixels = height * width;
+  const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
+
+};
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index 092a488cdf7c4d4a17e546564a4ddec5da5333f5..b3886b296d6b01e21bcc414475ae0f03534df5b8 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
 
+REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
+
 }  // namespace mace
diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h
index a223514a47cc0b52bbe92556290cffcc9d37042b..bc4388598141b0a3ad4ccb80300173f960d11ef8 100644
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -25,12 +25,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
   bool Run() override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
-    const T *bias_data = nullptr;
-    if (this->InputSize() >= 3) {
-      const Tensor *bias = this->Input(BIAS);
-      bias_data = bias->data<T>();
-    }
-
+    const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);
 
     std::vector<index_t> output_shape(4);
@@ -42,9 +37,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
     output->Resize(output_shape);
 
     functor_.paddings_ = paddings;
-    functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
-             filter->shape().data(), bias_data, output->mutable_data<T>(),
-             output->shape().data());
+    functor_(input, filter, bias, output);
 
     return true;
   }
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index d682f7091100b160d261b3b73f67e2635e70173c..8a78041c305154c1a296b1f8e92b1e31e22dd0dd 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -67,9 +67,10 @@ static void Conv2d(int iters,
   BENCHMARK(                                                                 \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
 
-#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);
+#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)         \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);   \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);  \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 480e7de0e0260c1f83f9e75bdb8e4153dd7975b3..252cb5d6950df2668b9951b2b1c5b2729731a412 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -48,7 +48,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   const std::vector<T> &data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
@@ -60,7 +60,7 @@ class OpsTestNet {
                       const std::vector<index_t> &shape,
                       const T data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     std::fill(input_data, input_data + input->size(), data);
@@ -71,7 +71,7 @@ class OpsTestNet {
                        const std::vector<index_t> &shape,
                        bool positive = false) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<float>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<float>::v());
     input->Resize(shape);
     float *input_data = input->mutable_data<float>();
@@ -206,7 +206,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
 template <typename T>
 unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                 const std::vector<T> &data) {
-  unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
+  unique_ptr<Tensor> res(new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
   res->Resize(shape);
   T *input_data = res->mutable_data<T>();
   memcpy(input_data, data.data(), data.size() * sizeof(T));
diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc
index 3cd644023a1269acf14a7e8a566d5b426e146941..6ecfc1f4ed417e13d91c161a1bf9149be147684d 100644
--- a/mace/tools/benchmark/benchmark_model.cc
+++ b/mace/tools/benchmark/benchmark_model.cc
@@ -269,7 +269,7 @@ int Main(int argc, char **argv) {
   // Load inputs
   for (size_t i = 0; i < inputs_count; ++i) {
     Tensor *input_tensor =
-        ws.CreateTensor(input_layers[i], cpu_allocator(), DT_FLOAT);
+        ws.CreateTensor(input_layers[i], GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
     vector<index_t> shapes;
     str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes);
     input_tensor->Resize(shapes);
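
Usage sketch (illustrative, not part of this change): a minimal example of how the reworked interfaces above fit together, assuming the OpenCLRuntime::Get() singleton used in opencl_allocator.cc; the kernel name "simple_add" is hypothetical, standing in for any kernel defined in a .cl file under MACE_KERNEL_PATH.

    #include "mace/core/runtime/opencl/opencl_runtime.h"
    #include "mace/core/tensor.h"

    using namespace mace;

    void Example(const Tensor *bias) {
      // program() compiles every *.cl file under MACE_KERNEL_PATH exactly once
      // (guarded by std::call_once) into one shared cl::Program; kernels are
      // then looked up by name instead of via the removed per-file GetProgram().
      cl::Program &program = OpenCLRuntime::Get()->program();
      cl::Kernel kernel(program, "simple_add");  // kernel name is illustrative

      // MappingGuard maps a device buffer on construction and unmaps it on
      // destruction; a nullptr tensor (e.g. an optional bias) is now a no-op.
      Tensor::MappingGuard bias_guard(bias);
    }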