Commit 8dc58b60 authored by 李寅

Merge branch 'opencl' into 'master'

Add conv2d opencl kernel placeholder

See merge request !72
......@@ -31,7 +31,7 @@ void OpenCLAllocator::Delete(void *buffer) {
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
// TODO (heliangliang) Non-blocking call
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
......
......@@ -4,8 +4,12 @@
#include <cstdlib>
#include <fstream>
#include <memory>
#include <mutex>
#include <dirent.h>
#include <errno.h>
#include "mace/core/logging.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/runtime/opencl/opencl_wrapper.h"
......@@ -13,8 +17,7 @@
namespace mace {
namespace {
bool ReadSourceFile(const char *filename, std::string *content) {
MACE_CHECK_NOTNULL(filename);
bool ReadSourceFile(const std::string &filename, std::string *content) {
MACE_CHECK_NOTNULL(content);
*content = "";
std::ifstream ifs(filename, std::ifstream::in);
......@@ -31,26 +34,50 @@ bool ReadSourceFile(const char *filename, std::string *content) {
}
bool BuildProgram(OpenCLRuntime *runtime,
const char *filename,
const std::string &path,
cl::Program *program) {
MACE_CHECK_NOTNULL(filename);
MACE_CHECK_NOTNULL(program);
std::string kernel_code;
if (!ReadSourceFile(filename, &kernel_code)) {
LOG(ERROR) << "Failed to read kernel source " << filename;
return false;
}
auto closer = [](DIR *d) {
if (d != nullptr) closedir(d);
};
std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
MACE_CHECK_NOTNULL(dir.get());
const std::string kSourceSuffix = ".cl";
cl::Program::Sources sources;
sources.push_back({kernel_code.c_str(), kernel_code.length()});
errno = 0;
dirent *entry = readdir(dir.get());
MACE_CHECK(errno == 0);
while (entry != nullptr) {
if (entry->d_type == DT_REG) {
std::string d_name(entry->d_name);
if (d_name.size() > kSourceSuffix.size() &&
d_name.compare(d_name.size() - kSourceSuffix.size(),
kSourceSuffix.size(), kSourceSuffix) == 0) {
std::string filename = path + d_name;
std::string kernel_source;
MACE_CHECK(ReadSourceFile(filename, &kernel_source));
sources.push_back({kernel_source.c_str(), kernel_source.length()});
}
}
entry = readdir(dir.get());
MACE_CHECK(errno == 0);
};
*program = cl::Program(runtime->context(), sources);
if (program->build({runtime->device()}) != CL_SUCCESS) {
LOG(INFO) << "Error building: "
<< program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
return false;
std::string build_options = "-Werror -cl-mad-enable -I" + path;
// TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
CL_BUILD_ERROR) {
std::string build_log =
program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
LOG(INFO) << "Program build log: " << build_log;
}
LOG(FATAL) << "Build program failed";
}
return true;
}
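The rewritten BuildProgram above no longer takes a single filename; it walks the kernel directory and compiles every `.cl` source it finds. Below is a minimal standalone sketch of that opendir/readdir loop, with the same RAII closer and suffix check; `ListKernelSources` and `main` are illustrative names, not part of MACE.

```cpp
#include <dirent.h>

#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Collect regular files ending in ".cl" from a directory, mirroring the
// scanning loop added to BuildProgram; this helper name is hypothetical.
std::vector<std::string> ListKernelSources(const std::string &path) {
  auto closer = [](DIR *d) {
    if (d != nullptr) closedir(d);
  };
  std::unique_ptr<DIR, decltype(closer)> dir(opendir(path.c_str()), closer);
  std::vector<std::string> sources;
  if (dir == nullptr) return sources;

  const std::string kSourceSuffix = ".cl";
  for (dirent *entry = readdir(dir.get()); entry != nullptr;
       entry = readdir(dir.get())) {
    std::string d_name(entry->d_name);
    if (entry->d_type == DT_REG && d_name.size() > kSourceSuffix.size() &&
        d_name.compare(d_name.size() - kSourceSuffix.size(),
                       kSourceSuffix.size(), kSourceSuffix) == 0) {
      sources.push_back(path + d_name);  // path is expected to end with '/'
    }
  }
  return sources;
}

int main() {
  for (const auto &f : ListKernelSources("./")) std::cout << f << "\n";
  return 0;
}
```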
......@@ -123,24 +150,16 @@ cl::Device &OpenCLRuntime::device() { return device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return command_queue_; }
cl::Program OpenCLRuntime::GetProgram(const std::string &name) {
static const char *kernel_source_path = getenv("MACE_KERNEL_SOURCE_PATH");
std::string filename = name;
if (kernel_source_path != nullptr) {
filename = kernel_source_path + name;
}
cl::Program &OpenCLRuntime::program() {
// TODO(heliangliang) Support binary format
static const char *kernel_path = getenv("MACE_KERNEL_PATH");
std::string path(kernel_path == nullptr ? "" : kernel_path);
std::lock_guard<std::mutex> lock(program_lock_);
// TODO (heliangliang) Support binary format
auto iter = programs_.find(name);
if (iter != programs_.end()) {
return iter->second;
} else {
cl::Program program;
MACE_CHECK(BuildProgram(this, filename.c_str(), &program));
programs_.emplace(name, program);
return program;
}
std::call_once(build_flag_, [this, &path]() {
MACE_CHECK(BuildProgram(this, path, &program_));
});
return program_;
}
} // namespace mace
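The per-name `GetProgram(name)` cache (map plus mutex) is replaced by a single `program()` accessor that builds all sources exactly once. A small sketch of the `std::call_once` pattern it now relies on, with a string standing in for the OpenCL program build:

```cpp
#include <iostream>
#include <mutex>
#include <string>

// Stand-in for the one-time program build; the real code calls
// BuildProgram(this, path, &program_) inside OpenCLRuntime::program().
class LazyProgram {
 public:
  const std::string &Get() {
    std::call_once(build_flag_, [this]() {
      // Runs exactly once, even when Get() is called from many threads.
      program_ = "built from " + path_;
    });
    return program_;
  }

 private:
  std::string path_ = "/path/to/kernels/";  // placeholder for MACE_KERNEL_PATH
  std::string program_;
  std::once_flag build_flag_;
};

int main() {
  LazyProgram p;
  std::cout << p.Get() << std::endl;  // triggers the build
  std::cout << p.Get() << std::endl;  // reuses the cached result
  return 0;
}
```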
......@@ -28,14 +28,14 @@ class OpenCLRuntime {
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
cl::Program GetProgram(const std::string &name);
cl::Program &program();
private:
cl::Context context_;
cl::CommandQueue command_queue_;
cl::Device device_;
std::map<std::string, cl::Program> programs_;
std::mutex program_lock_;
cl::Program program_;
std::once_flag build_flag_;
};
} // namespace mace
......
......@@ -40,7 +40,7 @@ int main() {
step[0] = step_size;
}
auto program = runtime->GetProgram("simple_add.cl");
auto program = runtime->program();
auto simple_add =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer>(
......
......@@ -195,8 +195,8 @@ OpenCLLibraryImpl &OpenCLLibraryImpl::Get() {
bool OpenCLLibraryImpl::Load() {
if (loaded()) return true;
// TODO (heliangliang) Make this configurable
// TODO (heliangliang) Benchmark 64 bit overhead
// TODO(heliangliang) Make this configurable
// TODO(heliangliang) Benchmark 64 bit overhead
static const std::vector<std::string> paths = {
#if defined(__aarch64__)
// Qualcomm Adreno
......
......@@ -88,7 +88,7 @@ class Tensor {
* Map the device buffer as CPU buffer to access the data, unmap must be
* called later
*/
inline void Map() {
inline void Map() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->Map(buffer_, size_ * SizeOfType());
......@@ -98,7 +98,7 @@ class Tensor {
/*
* Unmap the device buffer
*/
inline void Unmap() {
inline void Unmap() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
alloc_->Unmap(buffer_, data_);
......@@ -187,7 +187,7 @@ class Tensor {
LOG(INFO) << os.str();
}
inline size_t SizeOfType() {
inline size_t SizeOfType() const {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
......@@ -203,14 +203,15 @@ class Tensor {
class MappingGuard {
public:
MappingGuard(Tensor *tensor) : tensor_(tensor) {
MACE_ASSERT(tensor_ != nullptr);
tensor_->Map();
MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) tensor_->Map();
}
~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap();
}
~MappingGuard() { tensor_->Unmap(); }
private:
Tensor *tensor_;
const Tensor *tensor_;
};
private:
......@@ -233,7 +234,7 @@ class Tensor {
// read or write
void *buffer_;
// Mapped buffer
void *data_;
mutable void *data_;
vector<index_t> shape_;
DISABLE_COPY_AND_ASSIGN(Tensor);
......
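MappingGuard now accepts a `const Tensor *` and tolerates null, so an optional bias tensor can be guarded unconditionally, while `data_` becomes `mutable` so `Map()`/`Unmap()` can stay const. A toy sketch of this RAII pattern; `DeviceBuffer` is a stand-in, not the MACE Tensor:

```cpp
#include <iostream>

// Toy device buffer with explicit Map/Unmap. The guard pins the mapping for
// the current scope, as Tensor::MappingGuard does for OpenCL buffers.
class DeviceBuffer {
 public:
  void Map() const { mapped_ = true; std::cout << "mapped\n"; }
  void Unmap() const { mapped_ = false; std::cout << "unmapped\n"; }

 private:
  mutable bool mapped_ = false;  // mutable so const buffers can still be mapped
};

class MappingGuard {
 public:
  explicit MappingGuard(const DeviceBuffer *buf) : buf_(buf) {
    if (buf_ != nullptr) buf_->Map();  // tolerate null (e.g. optional bias)
  }
  ~MappingGuard() {
    if (buf_ != nullptr) buf_->Unmap();
  }

 private:
  const DeviceBuffer *buf_;
};

int main() {
  DeviceBuffer input;
  {
    MappingGuard guard(&input);      // mapped here
    // ... read or write host-visible data ...
  }                                  // unmapped automatically
  MappingGuard null_guard(nullptr);  // a missing bias tensor is a no-op
  return 0;
}
```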
......@@ -84,7 +84,7 @@ int main(int argc, char **argv) {
Workspace ws;
ws.LoadModelTensor(net_def, DeviceType::CPU);
Tensor *input_tensor =
ws.CreateTensor(input_node + ":0", cpu_allocator(), DT_FLOAT);
ws.CreateTensor(input_node + ":0", GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
input_tensor->Resize(shape);
float *input_data = input_tensor->mutable_data<float>();
......
......@@ -11,8 +11,8 @@ load("//mace:mace.bzl", "if_android")
cc_library(
name = "kernels",
srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc"])),
hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h"])),
srcs = glob(["*.cc"]) + if_android(glob(["neon/*.cc", "opencl/*.cc"])),
hdrs = glob(["*.h"]) + if_android(glob(["neon/*.h", "opencl/*.h"])),
copts = [
"-std=c++11",
"-fopenmp",
......@@ -20,6 +20,7 @@ cc_library(
linkopts = if_android(["-lm"]),
deps = [
"//mace/core",
"//mace/core:opencl_runtime",
"//mace/utils",
],
)
......
......@@ -19,27 +19,26 @@ struct Conv2dFunctor {
const int *dilations)
: strides_(strides), paddings_(paddings), dilations_(dilations) {}
void operator()(const T *input, // NCHW
const index_t *input_shape,
const T *filter, // c_out, c_in, kernel_h, kernel_w
const index_t *filter_shape,
const T *bias, // c_out
T *output, // NCHW
const index_t *output_shape) {
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t batch = output->shape()[0];
index_t channels = output->shape()[1];
index_t height = output->shape()[2];
index_t width = output->shape()[3];
index_t input_batch = input_shape[0];
index_t input_channels = input_shape[1];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
index_t input_batch = input->shape()[0];
index_t input_channels = input->shape()[1];
index_t input_height = input->shape()[2];
index_t input_width = input->shape()[3];
index_t kernel_h = filter_shape[2];
index_t kernel_w = filter_shape[3];
index_t kernel_h = filter->shape()[2];
index_t kernel_w = filter->shape()[3];
int stride_h = strides_[0];
int stride_w = strides_[1];
......@@ -57,17 +56,26 @@ struct Conv2dFunctor {
index_t kernel_size = input_channels * kernel_h * kernel_w;
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
#pragma omp parallel for collapse(2)
for (int n = 0; n < batch; ++n) {
for (int c = 0; c < channels; ++c) {
T bias_channel = bias ? bias[c] : 0;
T bias_channel = bias_data ? bias_data[c] : 0;
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
index_t offset = n * channels * height * width +
c * height * width + h * width + w;
output[offset] = bias_channel;
output_data[offset] = bias_channel;
T sum = 0;
const T *filter_ptr = filter + c * kernel_size;
const T *filter_ptr = filter_data + c * kernel_size;
for (int inc = 0; inc < input_channels; ++inc) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
......@@ -86,13 +94,13 @@ struct Conv2dFunctor {
n * input_channels * input_height * input_width +
inc * input_height * input_width + inh * input_width +
inw;
sum += input[input_offset] * *filter_ptr;
sum += input_data[input_offset] * *filter_ptr;
}
++filter_ptr;
}
}
}
output[offset] += sum;
output_data[offset] += sum;
}
}
}
......@@ -105,16 +113,12 @@ struct Conv2dFunctor {
};
template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(
const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape);
} // namespace kernels
} // namespace mace
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_CONV_2D_H_
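The reference functor above indexes NCHW tensors by flattening `(n, c, h, w)` exactly as the offset expressions in its loops do. A worked check of that arithmetic; `OffsetNCHW` is an illustrative helper, not a MACE function:

```cpp
#include <cassert>
#include <cstdint>

typedef int64_t index_t;

// Flat offset of element (n, c, h, w) in an NCHW tensor, matching the
// n * C * H * W + c * H * W + h * W + w indexing used by Conv2dFunctor.
inline index_t OffsetNCHW(index_t n, index_t c, index_t h, index_t w,
                          index_t channels, index_t height, index_t width) {
  return ((n * channels + c) * height + h) * width + w;
}

int main() {
  // A 1x3x4x5 tensor: element (0, 2, 3, 4) is the last one, offset 59.
  assert(OffsetNCHW(0, 2, 3, 4, 3, 4, 5) == 3 * 4 * 5 - 1);
  return 0;
}
```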
......@@ -41,14 +41,14 @@ extern void Conv2dNeonK5x5S1(const float *input,
const index_t *output_shape);
template <>
void Conv2dFunctor<DeviceType::NEON, float>::operator()(
const float *input,
const index_t *input_shape,
const float *filter,
const index_t *filter_shape,
const float *bias,
float *output,
const index_t *output_shape) {
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -61,8 +61,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
{nullptr, nullptr},
{Conv2dNeonK5x5S1, nullptr}};
// not implemented yet
index_t kernel_h = filter_shape[2];
index_t kernel_w = filter_shape[3];
index_t kernel_h = filter->shape()[2];
index_t kernel_w = filter->shape()[3];
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
......@@ -71,21 +71,32 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, input_shape, filter, filter_shape, bias, output, output_shape);
input, filter, bias, output);
return;
}
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto input_shape = input->shape().data();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
auto output_shape = output->shape().data();
// Keep this alive during kernel execution
Tensor padded_input;
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input, input_shape, paddings_.data(),
&padded_input);
input = padded_input.data<float>();
ConstructInputWithPadding(input_data, input->shape().data(),
paddings_.data(), &padded_input);
input_data = padded_input.data<float>();
input_shape = padded_input.shape().data();
}
auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_neon_func(input, input_shape, filter, nullptr, bias, output,
output_shape);
conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
bias_data, output_data, output_shape);
}
} // namespace kernels
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
// Selection matrix: kernel_size x stride_size
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, nullptr},
{nullptr, nullptr},
{nullptr, nullptr},
{nullptr, nullptr},
{nullptr, nullptr}};
index_t kernel_h = filter->shape()[2];
index_t kernel_w = filter->shape()[3];
if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< " is not implemented yet, using slow version";
// TODO(heliangliang) The CPU/NEON kernel should map the buffer
Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
input, filter, bias, output);
return;
}
MACE_CHECK(paddings_[0] == 1 && paddings_[1] == 1, "Padding not supported");
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, output);
}
} // namespace kernels
} // namespace mace
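Both the NEON specialization and this new OpenCL one pick a fast kernel out of a (kernel size x stride) table and fall back to the reference CPU functor when the slot is null. A simplified sketch of that dispatch, with stand-in kernel names rather than the MACE functors:

```cpp
#include <iostream>

// Dispatch-table pattern shared by the NEON and OpenCL Conv2dFunctor
// specializations: index by (kernel_size - 1, stride - 1), fall back to the
// slow reference path when no fast kernel is registered.
typedef void (*ConvKernel)(int kernel, int stride);

static void FastK1x1S1(int, int) { std::cout << "fast 1x1 stride-1 kernel\n"; }
static void SlowReference(int k, int s) {
  std::cout << "slow reference kernel " << k << "x" << k
            << " stride " << s << "\n";
}

void RunConv(int kernel, int stride) {
  static const ConvKernel selector[5][2] = {{FastK1x1S1, nullptr},
                                            {nullptr, nullptr},
                                            {nullptr, nullptr},
                                            {nullptr, nullptr},
                                            {nullptr, nullptr}};
  ConvKernel chosen = nullptr;
  if (kernel >= 1 && kernel <= 5 && (stride == 1 || stride == 2)) {
    chosen = selector[kernel - 1][stride - 1];
  }
  (chosen != nullptr ? chosen : SlowReference)(kernel, stride);
}

int main() {
  RunConv(1, 1);  // hits the fast path
  RunConv(3, 2);  // falls back to the reference kernel
  return 0;
}
```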
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/kernels/conv_2d.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
static constexpr index_t kInputChannelBlockSize = 2;
static constexpr index_t kOutputChannelBlockSize = 4;
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output) {
const index_t batch = output->shape()[0];
const index_t channels = output->shape()[1];
const index_t height = output->shape()[2];
const index_t width = output->shape()[3];
const index_t input_batch = input->shape()[0];
const index_t input_channels = input->shape()[1];
const index_t input_height = input->shape()[2];
const index_t input_width = input->shape()[3];
MACE_CHECK(input_batch == batch && input_height == height &&
input_width == width);
const index_t total_pixels = height * width;
const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize);
};
} // namespace kernels
} // namespace mace
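The 1x1 placeholder above rounds the output channel count up to kOutputChannelBlockSize before any work is scheduled. A sketch of that rounding, assuming MACE's RoundUp behaves like the usual round-up-to-multiple helper with a positive block size:

```cpp
#include <cassert>
#include <cstdint>

typedef int64_t index_t;

// Round value up to the next multiple of block, as the 1x1 OpenCL kernel does
// when padding the output channel count to the block size.
inline index_t RoundUp(index_t value, index_t block) {
  return (value + block - 1) / block * block;
}

int main() {
  assert(RoundUp(13, 4) == 16);  // 13 output channels processed as 16
  assert(RoundUp(16, 4) == 16);  // already aligned, unchanged
  return 0;
}
```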
......@@ -12,4 +12,6 @@ REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
REGISTER_OPENCL_OPERATOR(Conv2D, Conv2dOp<DeviceType::OPENCL, float>);
} // namespace mace
......@@ -25,12 +25,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
bool Run() override {
const Tensor *input = this->Input(INPUT);
const Tensor *filter = this->Input(FILTER);
const T *bias_data = nullptr;
if (this->InputSize() >= 3) {
const Tensor *bias = this->Input(BIAS);
bias_data = bias->data<T>();
}
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
......@@ -42,9 +37,7 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
filter->shape().data(), bias_data, output->mutable_data<T>(),
output->shape().data());
functor_(input, filter, bias, output);
return true;
}
......
......@@ -67,9 +67,10 @@ static void Conv2d(int iters,
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
......
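The benchmark change simply adds an OPENCL line to the per-device fan-out macro. A stripped-down sketch of this macro fan-out via token pasting; the `DEFINE_*`/`Run_*` names are illustrative stand-ins, not the MACE benchmark harness:

```cpp
#include <iostream>

// One top-level macro expands to one registration per device, the pattern
// behind BM_CONV_2D -> BM_CONV_2D_MACRO(..., CPU/NEON/OPENCL).
#define DEFINE_CASE(NAME, DEVICE)               \
  static void Run_##NAME##_##DEVICE() {         \
    std::cout << #NAME " on " #DEVICE "\n";     \
  }

#define DEFINE_ALL_DEVICES(NAME) \
  DEFINE_CASE(NAME, CPU)         \
  DEFINE_CASE(NAME, NEON)        \
  DEFINE_CASE(NAME, OPENCL)

DEFINE_ALL_DEVICES(Conv2d_1_64)

int main() {
  Run_Conv2d_1_64_CPU();
  Run_Conv2d_1_64_NEON();
  Run_Conv2d_1_64_OPENCL();
  return 0;
}
```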
......@@ -48,7 +48,7 @@ class OpsTestNet {
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
input->Resize(shape);
T *input_data = input->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
......@@ -60,7 +60,7 @@ class OpsTestNet {
const std::vector<index_t> &shape,
const T data) {
Tensor *input =
ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
input->Resize(shape);
T *input_data = input->mutable_data<T>();
std::fill(input_data, input_data + input->size(), data);
......@@ -71,7 +71,7 @@ class OpsTestNet {
const std::vector<index_t> &shape,
bool positive = false) {
Tensor *input =
ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
ws_.CreateTensor(name, GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v());
input->Resize(shape);
float *input_data = input->mutable_data<T>();
......@@ -206,7 +206,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
template <typename T>
unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
const std::vector<T> &data) {
unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
unique_ptr<Tensor> res(new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
res->Resize(shape);
T *input_data = res->mutable_data<T>();
memcpy(input_data, data.data(), data.size() * sizeof(T));
......
......@@ -269,7 +269,7 @@ int Main(int argc, char **argv) {
// Load inputs
for (size_t i = 0; i < inputs_count; ++i) {
Tensor *input_tensor =
ws.CreateTensor(input_layers[i], cpu_allocator(), DT_FLOAT);
ws.CreateTensor(input_layers[i], GetDeviceAllocator(DeviceType::CPU), DT_FLOAT);
vector<index_t> shapes;
str_util::SplitAndParseToInts(input_layer_shapes[i], ',', &shapes);
input_tensor->Resize(shapes);
......