Commit 8dc58b60 authored by 李寅

Merge branch 'opencl' into 'master'

Add conv2d opencl kernel placeholder

See merge request !72
@@ -31,7 +31,7 @@ void OpenCLAllocator::Delete(void *buffer) {
 void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
   auto cl_buffer = static_cast<cl::Buffer *>(buffer);
   auto queue = OpenCLRuntime::Get()->command_queue();
-  // TODO (heliangliang) Non-blocking call
+  // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
       queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
...
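The map above is blocking (CL_TRUE), which is the simple, safe default. A sketch of the non-blocking variant the TODO refers to, assuming the standard OpenCL C++ bindings; the event handling is illustrative and not part of this change:

    cl::Event map_event;
    cl_int error;
    void *mapped_ptr = queue.enqueueMapBuffer(
        *cl_buffer, CL_FALSE /* non-blocking */, CL_MAP_READ | CL_MAP_WRITE,
        0, nbytes, nullptr, &map_event, &error);
    // The mapping is only valid once the event completes; wait here, or make
    // later enqueued commands depend on map_event instead.
    map_event.wait();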
@@ -4,8 +4,12 @@
 #include <cstdlib>
 #include <fstream>
+#include <memory>
 #include <mutex>
+#include <dirent.h>
+#include <errno.h>
 #include "mace/core/logging.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/runtime/opencl/opencl_wrapper.h"
@@ -13,8 +17,7 @@
 namespace mace {
 namespace {

-bool ReadSourceFile(const char *filename, std::string *content) {
-  MACE_CHECK_NOTNULL(filename);
+bool ReadSourceFile(const std::string &filename, std::string *content) {
   MACE_CHECK_NOTNULL(content);
   *content = "";
   std::ifstream ifs(filename, std::ifstream::in);
@@ -31,26 +34,50 @@ bool ReadSourceFile(const char *filename, std::string *content) {
 }

 bool BuildProgram(OpenCLRuntime *runtime,
-                  const char *filename,
+                  const std::string &path,
                   cl::Program *program) {
-  MACE_CHECK_NOTNULL(filename);
   MACE_CHECK_NOTNULL(program);

-  std::string kernel_code;
-  if (!ReadSourceFile(filename, &kernel_code)) {
-    LOG(ERROR) << "Failed to read kernel source " << filename;
-    return false;
-  }
+  auto closer = [](DIR *d) {
+    if (d != nullptr) closedir(d);
+  };
+  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
+  MACE_CHECK_NOTNULL(dir.get());
+
+  const std::string kSourceSuffix = ".cl";
   cl::Program::Sources sources;
-  sources.push_back({kernel_code.c_str(), kernel_code.length()});
+  errno = 0;
+  dirent *entry = readdir(dir.get());
+  MACE_CHECK(errno == 0);
+  while (entry != nullptr) {
+    if (entry->d_type == DT_REG) {
+      std::string d_name(entry->d_name);
+      if (d_name.size() > kSourceSuffix.size() &&
+          d_name.compare(d_name.size() - kSourceSuffix.size(),
+                         kSourceSuffix.size(), kSourceSuffix) == 0) {
+        std::string filename = path + d_name;
+        std::string kernel_source;
+        MACE_CHECK(ReadSourceFile(filename, &kernel_source));
+        sources.push_back({kernel_source.c_str(), kernel_source.length()});
+      }
+    }
+    entry = readdir(dir.get());
+    MACE_CHECK(errno == 0);
+  }

   *program = cl::Program(runtime->context(), sources);
-  if (program->build({runtime->device()}) != CL_SUCCESS) {
-    LOG(INFO) << "Error building: "
-              << program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
-    return false;
+  std::string build_options = "-Werror -cl-mad-enable -I" + path;
+  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
+  if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
+        CL_BUILD_ERROR) {
+      std::string build_log =
+          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
+      LOG(INFO) << "Program build log: " << build_log;
+    }
+    LOG(FATAL) << "Build program failed";
   }
   return true;
 }
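One subtlety worth noting: `cl::Program::Sources` stores raw `(const char *, size_t)` pairs, while `kernel_source` is destroyed at the end of each loop iteration, so the pairs can dangle before `cl::Program` copies the text. A lifetime-safe sketch (the `source_storage` name is hypothetical) collects the strings first and builds the pairs only once the storage is stable:

    std::vector<std::string> source_storage;  // must outlive `sources`
    // ... read the contents of every *.cl file into source_storage ...
    cl::Program::Sources sources;
    for (const std::string &src : source_storage) {
      sources.push_back({src.c_str(), src.length()});  // pointers stay valid
    }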
@@ -123,24 +150,16 @@ cl::Device &OpenCLRuntime::device() { return device_; }

 cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }

-cl::Program OpenCLRuntime::GetProgram(const std::string &name) {
-  static const char *kernel_source_path = getenv("MACE_KERNEL_SOURCE_PATH");
-  std::string filename = name;
-  if (kernel_source_path != nullptr) {
-    filename = kernel_source_path + name;
-  }
+cl::Program &OpenCLRuntime::program() {
+  // TODO(heliangliang) Support binary format
+  static const char *kernel_path = getenv("MACE_KERNEL_PATH");
+  std::string path(kernel_path == nullptr ? "" : kernel_path);

-  std::lock_guard<std::mutex> lock(program_lock_);
-  // TODO (heliangliang) Support binary format
-  auto iter = programs_.find(name);
-  if (iter != programs_.end()) {
-    return iter->second;
-  } else {
-    cl::Program program;
-    MACE_CHECK(BuildProgram(this, filename.c_str(), &program));
-    programs_.emplace(name, program);
-    return program;
-  }
+  std::call_once(build_flag_, [this, &path]() {
+    MACE_CHECK(BuildProgram(this, path, &program_));
+  });
+
+  return program_;
 }

 } // namespace mace
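The `std::call_once` above replaces the map-plus-mutex program cache: all kernel sources are now compiled into one program, built lazily and exactly once even under concurrent callers. A minimal standalone illustration of the pattern, with a dummy payload:

    #include <mutex>
    #include <string>

    std::once_flag flag;
    std::string cached;

    const std::string &GetCached() {
      // The lambda runs at most once; concurrent callers block until it
      // finishes, and every later call just returns the cached value.
      std::call_once(flag, []() { cached = "built"; });
      return cached;
    }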
@@ -28,14 +28,14 @@ class OpenCLRuntime {
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
-  cl::Program GetProgram(const std::string &name);
+  cl::Program &program();

  private:
   cl::Context context_;
   cl::CommandQueue command_queue_;
   cl::Device device_;
-  std::map<std::string, cl::Program> programs_;
-  std::mutex program_lock_;
+  cl::Program program_;
+  std::once_flag build_flag_;
 };

 } // namespace mace
...
@@ -40,7 +40,7 @@ int main() {
     step[0] = step_size;
   }

-  auto program = runtime->GetProgram("simple_add.cl");
+  auto program = runtime->program();
   auto simple_add =
       cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
...
@@ -195,8 +195,8 @@ OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
 bool OpenCLLibraryImpl::Load() {
   if (loaded()) return true;

-  // TODO (heliangliang) Make this configurable
-  // TODO (heliangliang) Benchmark 64 bit overhead
+  // TODO(heliangliang) Make this configurable
+  // TODO(heliangliang) Benchmark 64 bit overhead
   static const std::vector<std::string> paths = {
 #if defined(__aarch64__)
     // Qualcomm Adreno
...
@@ -88,7 +88,7 @@ class Tensor {
    * Map the device buffer as CPU buffer to access the data, unmap must be
    * called later
    */
-  inline void Map() {
+  inline void Map() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
       data_ = alloc_->Map(buffer_, size_ * SizeOfType());
@@ -98,7 +98,7 @@ class Tensor {
   /*
    * Unmap the device buffer
    */
-  inline void Unmap() {
+  inline void Unmap() const {
     if (!OnHost()) {
       MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
       alloc_->Unmap(buffer_, data_);
@@ -187,7 +187,7 @@ class Tensor {
     LOG(INFO) << os.str();
   }

-  inline size_t SizeOfType() {
+  inline size_t SizeOfType() const {
     size_t type_size = 0;
     CASES(dtype_, type_size = sizeof(T));
     return type_size;
@@ -203,14 +203,15 @@ class Tensor {
   class MappingGuard {
    public:
-    MappingGuard(Tensor *tensor) : tensor_(tensor) {
-      MACE_ASSERT(tensor_ != nullptr);
-      tensor_->Map();
+    MappingGuard(const Tensor *tensor) : tensor_(tensor) {
+      if (tensor_ != nullptr) tensor_->Map();
+    }
+    ~MappingGuard() {
+      if (tensor_ != nullptr) tensor_->Unmap();
     }
-    ~MappingGuard() { tensor_->Unmap(); }

    private:
-    Tensor *tensor_;
+    const Tensor *tensor_;
   };

  private:
@@ -233,7 +234,7 @@ class Tensor {
   // read or write
   void *buffer_;
   // Mapped buffer
-  void *data_;
+  mutable void *data_;
   vector<index_t> shape_;

   DISABLE_COPY_AND_ASSIGN(Tensor);
...
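Making `Map()`/`Unmap()` const with a `mutable` mapped pointer lets kernels map const input tensors, and the null-tolerant guard matches optional inputs such as bias. A usage sketch (function and variable names hypothetical):

    void ReadBack(const Tensor *tensor) {
      Tensor::MappingGuard guard(tensor);         // maps the device buffer
      const float *data = tensor->data<float>();  // valid while guard lives
      // ... consume data ...
    }                                             // ~MappingGuard unmaps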
@@ -84,7 +84,7 @@ int main(int argc, char **argv) {
   Workspace ws;
   ws.LoadModelTensor(net_def, DeviceType::CPU);

   Tensor *input_tensor =
-      ws.CreateTensor(input_node + ":0", cpu_allocator(), DT_FLOAT);
+      ws.CreateTensor(input_node + ":0", GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
   input_tensor->Resize(shape);
   float *input_data = input_tensor->mutable_data<float>();
...
@@ -11,8 +11,8 @@ load("//mace:mace.bzl", "if_android")
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc", "opencl/*.cc"])),
+    hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h", "opencl/*.h"])),
     copts = [
         "-std=c++11",
         "-fopenmp",
@@ -20,6 +20,7 @@ cc_library(
     linkopts = if_android(["-lm"]),
     deps = [
         "//mace/core",
+        "//mace/core:opencl_runtime",
         "//mace/utils",
     ],
 )
...
@@ -19,27 +19,26 @@ struct Conv2dFunctor {
                 const int *dilations)
       : strides_(strides), paddings_(paddings), dilations_(dilations) {}

-  void operator()(const T *input,  // NCHW
-                  const index_t *input_shape,
-                  const T *filter,  // c_out, c_in, kernel_h, kernel_w
-                  const index_t *filter_shape,
-                  const T *bias,  // c_out
-                  T *output,  // NCHW
-                  const index_t *output_shape) {
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+    MACE_CHECK_NOTNULL(input);
+    MACE_CHECK_NOTNULL(filter);
     MACE_CHECK_NOTNULL(output);

-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t height = output_shape[2];
-    index_t width = output_shape[3];
+    index_t batch = output->shape()[0];
+    index_t channels = output->shape()[1];
+    index_t height = output->shape()[2];
+    index_t width = output->shape()[3];

-    index_t input_batch = input_shape[0];
-    index_t input_channels = input_shape[1];
-    index_t input_height = input_shape[2];
-    index_t input_width = input_shape[3];
+    index_t input_batch = input->shape()[0];
+    index_t input_channels = input->shape()[1];
+    index_t input_height = input->shape()[2];
+    index_t input_width = input->shape()[3];

-    index_t kernel_h = filter_shape[2];
-    index_t kernel_w = filter_shape[3];
+    index_t kernel_h = filter->shape()[2];
+    index_t kernel_w = filter->shape()[3];

     int stride_h = strides_[0];
     int stride_w = strides_[1];
@@ -57,17 +56,26 @@ struct Conv2dFunctor {
     index_t kernel_size = input_channels * kernel_h * kernel_w;

+    Tensor::MappingGuard input_mapper(input);
+    Tensor::MappingGuard filter_mapper(filter);
+    Tensor::MappingGuard bias_mapper(bias);
+    Tensor::MappingGuard output_mapper(output);
+
+    auto input_data = input->data<T>();
+    auto filter_data = filter->data<T>();
+    auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
+    auto output_data = output->mutable_data<T>();
+
 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
       for (int c = 0; c < channels; ++c) {
-        T bias_channel = bias ? bias[c] : 0;
+        T bias_channel = bias_data ? bias_data[c] : 0;
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             index_t offset = n * channels * height * width +
                              c * height * width + h * width + w;
-            output[offset] = bias_channel;
+            output_data[offset] = bias_channel;
             T sum = 0;
-            const T *filter_ptr = filter + c * kernel_size;
+            const T *filter_ptr = filter_data + c * kernel_size;
             for (int inc = 0; inc < input_channels; ++inc) {
               for (int kh = 0; kh < kernel_h; ++kh) {
                 for (int kw = 0; kw < kernel_w; ++kw) {
@@ -86,13 +94,13 @@ struct Conv2dFunctor {
                       n * input_channels * input_height * input_width +
                       inc * input_height * input_width + inh * input_width +
                       inw;
-                  sum += input[input_offset] * *filter_ptr;
+                  sum += input_data[input_offset] * *filter_ptr;
                   }
                   ++filter_ptr;
                 }
               }
             }
-            output[offset] += sum;
+            output_data[offset] += sum;
           }
         }
       }
@@ -105,16 +113,12 @@ struct Conv2dFunctor {
 };

 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape);
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output);

 } // namespace kernels
 } // namespace mace

 #endif // MACE_KERNELS_CONV_2D_H_
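For reference, the functor's NCHW offset arithmetic, extracted into a tiny self-contained check (helper name hypothetical):

    #include <cassert>
    #include <cstdint>

    using index_t = int64_t;

    // NCHW offset as computed in Conv2dFunctor:
    // n * C * H * W + c * H * W + h * W + w == ((n * C + c) * H + h) * W + w
    inline index_t Offset(index_t n, index_t c, index_t h, index_t w,
                          index_t C, index_t H, index_t W) {
      return ((n * C + c) * H + h) * W + w;
    }

    int main() {
      // Element (n=1, c=2, h=3, w=4) of a 2x8x16x16 tensor.
      assert(Offset(1, 2, 3, 4, 8, 16, 16) ==
             1 * 8 * 16 * 16 + 2 * 16 * 16 + 3 * 16 + 4);
      return 0;
    }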
@@ -41,14 +41,14 @@ extern void Conv2dNeonK5x5S1(const float *input,
                              const index_t *output_shape);

 template <>
-void Conv2dFunctor<DeviceType::NEON, float>::operator()(
-    const float *input,
-    const index_t *input_shape,
-    const float *filter,
-    const index_t *filter_shape,
-    const float *bias,
-    float *output,
-    const index_t *output_shape) {
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
+                                                        const Tensor *filter,
+                                                        const Tensor *bias,
+                                                        Tensor *output) {
+  MACE_CHECK_NOTNULL(input);
+  MACE_CHECK_NOTNULL(filter);
+  MACE_CHECK_NOTNULL(output);
   typedef void (*Conv2dNeonFunction)(
       const float *input, const index_t *input_shape, const float *filter,
       const index_t *filter_shape, const float *bias, float *output,
@@ -61,8 +61,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
       {nullptr, nullptr},
       {Conv2dNeonK5x5S1, nullptr}};
   // not implemented yet
-  index_t kernel_h = filter_shape[2];
-  index_t kernel_w = filter_shape[3];
+  index_t kernel_h = filter->shape()[2];
+  index_t kernel_w = filter->shape()[3];
   if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
       strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
       selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
@@ -71,21 +71,32 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
                  << " stride " << strides_[0] << "x" << strides_[1]
                  << " is not implemented yet, using slow version";
     Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
-        input, input_shape, filter, filter_shape, bias, output, output_shape);
+        input, filter, bias, output);
     return;
   }

+  Tensor::MappingGuard input_mapper(input);
+  Tensor::MappingGuard filter_mapper(filter);
+  Tensor::MappingGuard bias_mapper(bias);
+  Tensor::MappingGuard output_mapper(output);
+
+  auto input_data = input->data<float>();
+  auto input_shape = input->shape().data();
+  auto filter_data = filter->data<float>();
+  auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
+  auto output_data = output->mutable_data<float>();
+  auto output_shape = output->shape().data();
+
   // Keep this alive during kernel execution
   Tensor padded_input;
   if (paddings_[0] > 0 || paddings_[1] > 0) {
-    ConstructInputWithPadding(input, input_shape, paddings_.data(),
-                              &padded_input);
-    input = padded_input.data<float>();
+    ConstructInputWithPadding(input_data, input->shape().data(),
+                              paddings_.data(), &padded_input);
+    input_data = padded_input.data<float>();
     input_shape = padded_input.shape().data();
   }
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input, input_shape, filter, nullptr, bias, output,
-                   output_shape);
+  conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
+                   bias_data, output_data, output_shape);
 }

 } // namespace kernels
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"

namespace mace {
namespace kernels {

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, Tensor *output);

template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                          const Tensor *filter,
                                                          const Tensor *bias,
                                                          Tensor *output) {
  typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, Tensor *output);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5][2] = {
      {Conv2dOpenclK1x1S1, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr},
      {nullptr, nullptr}};

  index_t kernel_h = filter->shape()[2];
  index_t kernel_w = filter->shape()[3];
  if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
      strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
      selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
    LOG(WARNING) << "OpenCL conv2d kernel with "
                 << "filter " << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
        input, filter, bias, output);
    return;
  }

  MACE_CHECK(paddings_[0] == 1 && paddings_[1] == 1, "Padding not supported");
  auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
  conv2d_func(input, filter, bias, output);
}

} // namespace kernels
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/common.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

static constexpr index_t kInputChannelBlockSize = 2;
static constexpr index_t kOutputChannelBlockSize = 4;

extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, Tensor *output) {
  const index_t batch = output->shape()[0];
  const index_t channels = output->shape()[1];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];

  const index_t input_batch = input->shape()[0];
  const index_t input_channels = input->shape()[1];
  const index_t input_height = input->shape()[2];
  const index_t input_width = input->shape()[3];

  MACE_CHECK(input_batch == batch && input_height == height &&
             input_width == width);

  const index_t total_pixels = height * width;
  const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
}

} // namespace kernels
} // namespace mace
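For context on the block-size constants: a 1x1, stride-1 convolution over NCHW reduces to a per-batch matrix multiply of the [C_out x C_in] filter with the [C_in x H*W] input, which is why the placeholder precomputes `total_pixels` and rounds channels up to a block multiple. A naive CPU reference of that reduction (a sketch for orientation, not the planned OpenCL kernel):

    // output[n][co][p] = bias[co] + sum_ci filter[co][ci] * input[n][ci][p],
    // where p indexes the H*W pixels of an NCHW tensor.
    void Conv1x1Reference(const float *input, const float *filter,
                          const float *bias, float *output,
                          int batch, int in_ch, int out_ch, int pixels) {
      for (int n = 0; n < batch; ++n) {
        for (int co = 0; co < out_ch; ++co) {
          for (int p = 0; p < pixels; ++p) {
            float sum = bias ? bias[co] : 0.f;
            for (int ci = 0; ci < in_ch; ++ci) {
              sum += filter[co * in_ch + ci] *
                     input[(n * in_ch + ci) * pixels + p];
            }
            output[(n * out_ch + co) * pixels + p] = sum;
          }
        }
      }
    }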
@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
 REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON

+REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
+
 } // namespace mace
@@ -25,12 +25,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
   bool Run() override {
     const Tensor *input = this->Input(INPUT);
     const Tensor *filter = this->Input(FILTER);
-    const T *bias_data = nullptr;
-    if (this->InputSize() >= 3) {
-      const Tensor *bias = this->Input(BIAS);
-      bias_data = bias->data<T>();
-    }
+    const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
     Tensor *output = this->Output(OUTPUT);

     std::vector<index_t> output_shape(4);
@@ -42,9 +37,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
     output->Resize(output_shape);
     functor_.paddings_ = paddings;

-    functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
-             filter->shape().data(), bias_data, output->mutable_data<T>(),
-             output->shape().data());
+    functor_(input, filter, bias, output);
     return true;
   }
...
@@ -67,9 +67,10 @@ static void Conv2d(int iters,
   BENCHMARK(                                                                 \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON); \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
...
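With the extra OPENCL row, each `BM_CONV_2D(...)` invocation now registers three benchmarks instead of two; for example, the first line above expands to `BM_CONV_2D_1_64_32_32_K1x1S1_VALID_128_float_CPU`, `..._NEON`, and `..._OPENCL`.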
@@ -48,7 +48,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   const std::vector<T> &data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
@@ -60,7 +60,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   const T data) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
     std::fill(input_data, input_data + input->size(), data);
@@ -71,7 +71,7 @@ class OpsTestNet {
                   const std::vector<index_t> &shape,
                   bool positive = false) {
     Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
+        ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
     input->Resize(shape);
     float *input_data = input->mutable_data<T>();
@@ -206,7 +206,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
 template <typename T>
 unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                 const std::vector<T> &data) {
-  unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
+  unique_ptr<Tensor> res(new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
   res->Resize(shape);
   T *input_data = res->mutable_data<T>();
   memcpy(input_data, data.data(), data.size() * sizeof(T));
...
@@ -269,7 +269,7 @@ int Main(int argc, char **argv) {
   // Load inputs
   for (size_t i = 0; i < inputs_count; ++i) {
     Tensor *input_tensor =
-        ws.CreateTensor(input_layers[i], cpu_allocator(), DT_FLOAT);
+        ws.CreateTensor(input_layers[i], GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
     vector<index_t> shapes;
     str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes);
     input_tensor->Resize(shapes);
...