Commit 3b55f153 authored by liuqi

Support the OpenCL image2d format and add buffer-to-image OpenCL kernels.

Parent 0f27434c
......@@ -10,6 +10,7 @@
#include "mace/core/common.h"
#include "mace/core/registry.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
......@@ -26,8 +27,14 @@ class Allocator {
Allocator() {}
virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0;
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0;
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
virtual bool OnHost() = 0;
......@@ -58,8 +65,19 @@ class CPUAllocator : public Allocator {
return data;
}
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override {
return nullptr;
}
void Delete(void *data) override { free(data); }
void DeleteImage(void *data) override { free(data); }
void *Map(void *buffer, size_t nbytes) override { return buffer; }
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override {
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) override {}
bool OnHost() override { return true; }
};
......
This diff is collapsed.
......@@ -51,6 +51,9 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
return false;
}
if (op_stats) {
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
......
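Note on the timing change above: OpenCL kernel enqueues are asynchronous, so per-op timestamps are only meaningful once the command queue has drained, which is why the hunk calls command_queue().finish() before reading the clock. A minimal sketch of the same pattern, assuming the cl2 C++ bindings already used elsewhere in this diff (the TimedRunMicros helper is hypothetical):

```cpp
#include <chrono>
#include <cstdint>
#include <functional>
#include "mace/core/runtime/opencl/cl2_header.h"

// Hypothetical helper: enqueue the work, then block on the queue so the
// wall-clock delta covers kernel execution, not just the enqueue call.
inline int64_t TimedRunMicros(cl::CommandQueue &queue,
                              const std::function<void()> &enqueue_work) {
  const auto start = std::chrono::high_resolution_clock::now();
  enqueue_work();   // e.g. op->Run(), which enqueues NDRange kernels
  queue.finish();   // wait for the device to finish everything queued so far
  const auto end = std::chrono::high_resolution_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start)
      .count();
}
```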
......@@ -8,6 +8,29 @@
namespace mace {
namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) {
case DT_HALF:
case DT_FLOAT:
return CL_FLOAT;
case DT_INT8:
case DT_INT16:
case DT_INT32:
return CL_SIGNED_INT32;
case DT_UINT8:
case DT_UINT16:
case DT_UINT32:
return CL_UNSIGNED_INT32;
default:
LOG(FATAL) << "Image doesn't support the data type: " << t;
return 0;
}
}
}
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
......@@ -21,6 +44,23 @@ void *OpenCLAllocator::New(size_t nbytes) {
return static_cast<void *>(buffer);
}
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << "Failed to create OpenCL image, error code: " << error;
return cl_image;
}
void OpenCLAllocator::Delete(void *buffer) {
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
......@@ -28,6 +68,13 @@ void OpenCLAllocator::Delete(void *buffer) {
}
}
void OpenCLAllocator::DeleteImage(void *buffer) {
if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
delete cl_image;
}
}
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
......@@ -40,6 +87,29 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
return mapped_ptr;
}
// TODO: there is something wrong with the half type.
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
&mapped_image_pitch[1],
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
......
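For orientation, a minimal sketch of how the new image methods of the Allocator interface fit together, based only on the signatures in this diff (the include path and the shape numbers are assumptions; an allocator could come from e.g. GetDeviceAllocator(DeviceType::OPENCL)):

```cpp
#include <vector>
#include "mace/core/allocator.h"  // assumed header of the Allocator interface above

void ImageAllocatorSketch(mace::Allocator *allocator) {
  // [image_width, image_height]; each pixel packs 4 values of the data type.
  std::vector<size_t> image_shape = {64, 32};  // hypothetical sizes
  void *image = allocator->NewImage(image_shape, mace::DT_FLOAT);

  // MapImage reports the mapping's [row_pitch, slice_pitch] in bytes.
  std::vector<size_t> pitch;
  float *host_ptr =
      static_cast<float *>(allocator->MapImage(image, image_shape, pitch));
  // ... read or write host_ptr, honoring pitch[0] bytes per image row ...

  allocator->Unmap(image, host_ptr);
  allocator->DeleteImage(image);
}
```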
......@@ -17,10 +17,25 @@ class OpenCLAllocator : public Allocator {
void *New(size_t nbytes) override;
/*
* image_shape must be 2-D: [image_width, image_height].
* The image is an Image2D with the RGBA (128-bit) channel order, so each
* pixel packs 4 consecutive values of the given data type.
*
* @ image_shape : [image_width, image_height].
*/
void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override;
void Delete(void *buffer) override;
void DeleteImage(void *buffer) override;
void *Map(void *buffer, size_t nbytes) override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() override;
......
......@@ -120,6 +120,7 @@ const std::map<std::string, std::string>
{"relu", "relu.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
};
void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
......
......@@ -101,6 +101,18 @@ class OpenCLLibraryImpl final {
const cl_event *,
cl_event *,
cl_int *);
using clEnqueueMapImageFunc = void *(*)(cl_command_queue,
cl_mem,
cl_bool,
cl_map_flags,
const size_t *,
const size_t *,
size_t *,
size_t *,
cl_uint,
const cl_event *,
cl_event *,
cl_int *);
using clCreateCommandQueueWithPropertiesFunc =
cl_command_queue (*)(cl_context /* context */,
cl_device_id /* device */,
......@@ -148,6 +160,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetImageInfoFunc = cl_int (*)(cl_mem,
cl_image_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
......@@ -172,6 +189,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
DEFINE_FUNC_PTR(clReleaseCommandQueue);
DEFINE_FUNC_PTR(clEnqueueMapBuffer);
DEFINE_FUNC_PTR(clEnqueueMapImage);
DEFINE_FUNC_PTR(clRetainProgram);
DEFINE_FUNC_PTR(clGetProgramBuildInfo);
DEFINE_FUNC_PTR(clEnqueueReadBuffer);
......@@ -191,6 +209,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetImageInfo);
#undef DEFINE_FUNC_PTR
......@@ -294,6 +313,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
ASSIGN_FROM_DLSYM(clEnqueueMapImage);
ASSIGN_FROM_DLSYM(clRetainProgram);
ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
......@@ -313,6 +333,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef ASSIGN_FROM_DLSYM
......@@ -577,6 +598,31 @@ void *clEnqueueMapBuffer(cl_command_queue command_queue,
return nullptr;
}
}
void *clEnqueueMapImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t origin[3],
const size_t region[3],
size_t *image_row_pitch,
size_t *image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapImage;
if (func != nullptr) {
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
}
cl_command_queue clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
......@@ -832,3 +878,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
cl_int clGetImageInfo(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetImageInfo;
if (func != nullptr) {
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
}
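The additions above follow the wrapper's single pattern: each OpenCL entry point is a dlsym-resolved function pointer, and the exported symbol either forwards to it or fails gracefully with CL_OUT_OF_RESOURCES when the vendor library is unavailable. A stripped-down sketch of that pattern for one function (the names here are illustrative, not MACE's):

```cpp
#include <dlfcn.h>
#include "mace/core/runtime/opencl/cl2_header.h"  // for cl_mem, cl_int, CL_OUT_OF_RESOURCES

// Function-pointer type matching the real clReleaseMemObject signature.
using clReleaseMemObjectFunc = cl_int (*)(cl_mem);

struct OpenCLStub {
  clReleaseMemObjectFunc clReleaseMemObject = nullptr;

  explicit OpenCLStub(const char *vendor_lib_path) {
    if (void *handle = dlopen(vendor_lib_path, RTLD_LAZY | RTLD_LOCAL)) {
      clReleaseMemObject = reinterpret_cast<clReleaseMemObjectFunc>(
          dlsym(handle, "clReleaseMemObject"));
    }
  }
};

// Forward if the vendor library was found, otherwise report failure
// instead of crashing, mirroring the fallback branches in this diff.
cl_int ReleaseMemObject(const OpenCLStub &stub, cl_mem mem) {
  return stub.clReleaseMemObject != nullptr ? stub.clReleaseMemObject(mem)
                                            : CL_OUT_OF_RESOURCES;
}
```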
......@@ -23,6 +23,7 @@ namespace mace {
#define CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, INVALID, DEFAULT) \
switch (TYPE_ENUM) { \
CASE(half, SINGLE_ARG(STMTS)) \
CASE(float, SINGLE_ARG(STMTS)) \
CASE(double, SINGLE_ARG(STMTS)) \
CASE(int32_t, SINGLE_ARG(STMTS)) \
......@@ -68,20 +69,26 @@ class Tensor {
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr){};
data_(nullptr),
is_image_(false){};
Tensor(Allocator *alloc, DataType type)
: alloc_(alloc),
size_(0),
dtype_(type),
buffer_(nullptr),
data_(nullptr){};
data_(nullptr),
is_image_(false){};
~Tensor() {
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before destroy");
if (buffer_ != nullptr) {
MACE_CHECK_NOTNULL(alloc_);
alloc_->Delete(buffer_);
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
}
}
......@@ -91,6 +98,10 @@ class Tensor {
inline const vector<index_t> &shape() const { return shape_; }
inline const vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
......@@ -120,6 +131,11 @@ class Tensor {
}
}
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->MapImage(buffer_, image_shape_, mapped_image_pitch);
}
/*
* Unmap the device buffer
*/
......@@ -162,17 +178,53 @@ class Tensor {
inline void Resize(const vector<index_t> &shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size) {
if (size_ != size || is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
alloc_->Delete(buffer_);
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = false;
CASES(dtype_, buffer_ = alloc_->New(size_ * sizeof(T)));
}
}
inline void ResizeLike(const Tensor &other) { Resize(other.shape()); }
inline void ResizeImage(const vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size || !is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = true;
image_shape_ = image_shape;
buffer_ = alloc_->NewImage(image_shape, dtype_);
}
}
inline void ResizeLike(const Tensor &other) {
if (other.is_image()) {
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
}
inline void ResizeLike(const Tensor *other) { Resize(other->shape()); }
inline void ResizeLike(const Tensor *other) {
if (other->is_image()) {
ResizeImage(other->shape(), other->image_shape());
} else {
Resize(other->shape());
}
}
template <typename T>
inline void Copy(const T *src, index_t size) {
......@@ -202,8 +254,6 @@ class Tensor {
for (int i : shape_) {
os << i << ", ";
}
LOG(INFO) << "Tensor shape: " << os.str()
<< " type: " << DataType_Name(dtype_);
os.str("");
os.clear();
......@@ -212,7 +262,7 @@ class Tensor {
if ( i != 0 && i % shape_[3] == 0) {
os << "\n";
}
CASES(dtype_, (os << this->data<T>()[i]) << ", ");
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << os.str();
}
......@@ -228,20 +278,33 @@ class Tensor {
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
CopyBytes(other.raw_data(), size_ * SizeOfType());
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
class MappingGuard {
public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) tensor_->Map();
if (tensor_ != nullptr) {
if (tensor_->is_image()) {
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
}
}
~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap();
}
inline const vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
private:
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
};
private:
......@@ -261,6 +324,9 @@ class Tensor {
// Mapped buffer
mutable void *data_;
vector<index_t> shape_;
// Image for opencl
bool is_image_;
std::vector<size_t> image_shape_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
......
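A short sketch of how the image-aware MappingGuard above is meant to be used: when the tensor is an image, the mapped pointer is pitched, so rows must be addressed through mapped_image_pitch() rather than assumed contiguous. This is a reading of the Tensor additions in this hunk; the data<float>() accessor and the 4-floats-per-pixel assumption come from the rest of the diff.

```cpp
#include <cstring>
#include "mace/core/tensor.h"

// Copy a mapped image tensor row by row into a contiguous host buffer.
// Assumes `tensor` was set up with ResizeImage() on the OpenCL allocator.
void ReadImageTensor(const mace::Tensor &tensor, float *dst) {
  mace::Tensor::MappingGuard guard(&tensor);
  const auto &pitch = guard.mapped_image_pitch();   // [row, slice] in bytes
  const auto &image_shape = tensor.image_shape();   // [width, height]

  const char *src = reinterpret_cast<const char *>(tensor.data<float>());
  const size_t row_bytes = image_shape[0] * 4 * sizeof(float);  // 4 floats/pixel
  for (size_t y = 0; y < image_shape[1]; ++y) {
    std::memcpy(dst + y * image_shape[0] * 4, src + y * pitch[0], row_bytes);
  }
}
```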
......@@ -24,31 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
}
}
std::string DataTypeToCLType(const DataType dt) {
size_t GetEnumTypeSize(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
return sizeof(float);
case DT_HALF:
return "half";
return sizeof(half);
case DT_UINT8:
return "uchar";
return sizeof(uint8_t);
case DT_INT8:
return "char";
return sizeof(int8_t);
case DT_DOUBLE:
return "double";
return sizeof(double);
case DT_INT32:
return "int";
return sizeof(int32_t);
case DT_UINT32:
return "int";
return sizeof(uint32_t);
case DT_UINT16:
return "ushort";
return sizeof(uint16_t);
case DT_INT16:
return "short";
return sizeof(int16_t);
case DT_INT64:
return "long";
return sizeof(int64_t);
default:
LOG(FATAL) << "Unsupported data type";
return "";
return 0;
}
}
......
......@@ -7,12 +7,16 @@
#include "mace/core/common.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/half.h"
namespace mace {
using half = half_float::half;
bool DataTypeCanUseMemcpy(DataType dt);
std::string DataTypeToCLType(const DataType dt);
size_t GetEnumTypeSize(const DataType dt);
template <class T>
struct IsValidDataType;
......@@ -43,6 +47,7 @@ struct EnumToDataType {}; // Specializations below
typedef TYPE Type; \
}
MATCH_TYPE_AND_ENUM(half, DT_HALF);
MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
bool i2b_;
};
template<DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
MACE_NOT_IMPLEMENTED;
}
};
template<typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
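The transform direction is carried by the i2b_ flag from the base struct: false turns a buffer into an image (resizing the image tensor from CalImage2DShape), true goes the other way around. A schematic of both directions, matching how the BufferToImage and ImageToBuffer ops later in this diff invoke the functor (float instantiation shown; the wrapper functions are illustrative):

```cpp
#include "mace/kernels/buffer_to_image.h"

// buffer -> image: the functor computes the image shape and resizes `image`.
void ToImage(mace::Tensor *buffer, mace::Tensor *image) {
  mace::kernels::BufferToImageFunctor<mace::DeviceType::OPENCL, float> b2i;
  b2i(buffer, mace::kernels::FILTER, image);
}

// image -> buffer: construct with i2b = true; the buffer tensor is still the
// first argument and the image the last, exactly as ImageToBufferOp calls it.
void ToBuffer(mace::Tensor *image, mace::Tensor *buffer) {
  mace::kernels::BufferToImageFunctor<mace::DeviceType::OPENCL, float> i2b(true);
  i2b(buffer, mace::kernels::FILTER, image);
}
```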
......@@ -15,9 +15,9 @@ template <DeviceType D, typename T>
struct Conv2dFunctor {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides,
const std::vector<int> &paddings,
const Padding &paddings,
const int *dilations)
: strides_(strides), paddings_(paddings), dilations_(dilations) {}
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
void operator()(const Tensor *input,
const Tensor *filter,
......@@ -27,6 +27,13 @@ struct Conv2dFunctor {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
output->Resize(output_shape);
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
......@@ -49,10 +56,10 @@ struct Conv2dFunctor {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The left-upper most offset of the padded input
int padded_h_start = 0 - paddings_[0] / 2;
int padded_w_start = 0 - paddings_[1] / 2;
index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
index_t kernel_size = input_channels * kernel_h * kernel_w;
......@@ -108,8 +115,8 @@ struct Conv2dFunctor {
}
const int *strides_; // [stride_h, stride_w]
std::vector<int> paddings_; // [padding_h, padding_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template <>
......
......@@ -49,6 +49,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
......@@ -77,8 +85,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings_[0] > 0 || paddings_[1] > 0) {
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
if (paddings[0] > 0 || paddings[1] > 0) {
ConstructInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input;
}
Tensor::MappingGuard input_mapper(input);
......
......@@ -4,6 +4,7 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/buffer_to_image.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
image->ResizeImage(buffer->shape(), image_shape);
} else {
image_shape = image->image_shape();
buffer->Resize(image->shape());
}
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOpenclCMDDataType(image->dtype()));
auto runtime = OpenCLRuntime::Get();
string kernel_name;
switch (type) {
case FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break;
case IN_OUT:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image";
break;
}
VLOG(0) << kernel_name;
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
kernel_name,
built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer())));
const size_t gws[3] = {image_shape[0],
image_shape[1],
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
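Note on the launch configuration above: the global range is exactly the 2-D image shape computed by CalImage2DShape (with a trailing 1), so each work item owns one RGBA pixel, with get_global_id(0) as the image x coordinate and get_global_id(1) as y. That is why the kernels below never bounds-check the image side and only guard the buffer tail, where fewer than four values remain to be stored.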
#include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (out_channel_idx + 4 > out_channel) {
const int diff = out_channel - out_channel_idx;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (channel_idx + 4 > channels) {
const int diff = channels - channel_idx;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* rank-1 argument */
__private const int count,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* rank-1 argument */
__private const int count,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (offset + 4 > count) {
const int diff = count - offset;
output[offset] = values.s0;
if (diff >= 2) {
output[offset+1] = values.s1;
}
if (diff >= 3) {
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
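The index arithmetic in these kernels is the whole contract between the HWIO filter buffer and its image layout [H*W*Ic, ceil(Oc/4)]: the image column enumerates (h, w, ic) in row-major order with ic fastest, the image row selects a block of four output channels, and the four RGBA lanes of a pixel are consecutive output channels. A host-side sketch of the same mapping, useful for spot-checking a single coordinate (the function name is illustrative):

```cpp
#include <cstddef>

// Buffer offset of the first element packed into image pixel (w, h) for an
// HWIO filter of shape [filter_h, filter_w, in_channel, out_channel].
// Mirrors the offset computation in filter_buffer_to_image above.
size_t FilterImagePixelToBufferOffset(int w, int h, int filter_w,
                                      int in_channel, int out_channel) {
  const int out_channel_idx = h * 4;       // 4 output channels per pixel
  const int hw_idx = w / in_channel;       // which (h_idx, w_idx) pair
  const int in_channel_idx = w % in_channel;
  const int h_idx = hw_idx / filter_w;
  const int w_idx = hw_idx % filter_w;
  return static_cast<size_t>(
      ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel +
      out_channel_idx);
}
```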
......@@ -11,4 +11,7 @@
#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
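These token-pasting macros are what let a single kernel source serve both float and half builds: with -DDATA_TYPE=float and -DCMD_DATA_TYPE=f (as emitted by DataTypeToCLType and DataTypeToOpenclCMDDataType elsewhere in this diff), VEC_DATA_TYPE(DATA_TYPE, 4) expands to float4 and CMD_TYPE(read_image, CMD_DATA_TYPE) to read_imagef; with -DDATA_TYPE=half and -DCMD_DATA_TYPE=h they expand to half4 and read_imageh.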
......@@ -165,47 +165,85 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
}
}
/* FIXME this is incomplete */
__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */
__global const float *filter, /* o, i, kh, kw */
__global const float *bias, /* o */
__write_only image3d_t output, /* n, c/4, h, w, 4 */
__private const int batch_num,
__private const int in_chan_num,
__private const int out_chan_num,
__private const int height,
__private const int width) {
int out_chan_blk = get_global_id(0);
int h = get_global_id(1);
int w = get_global_id(2);
// TODO : validation
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
__write_only image2d_t output,
__private const int in_ch_blks,
__private const int width) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int in_chan_blk_num = (in_chan_num + 3) / 4;
int out_chan_blk_num = (out_chan_num + 3) / 4;
half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
// Unrolling this loop hurts performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_x0 = in_x_base + out_w_blk;
const int in_x1 = in_x0 + out_w_blks;
const int in_x2 = in_x1 + out_w_blks;
const int in_x3 = in_x2 + out_w_blks;
in_x_base += width;
const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
// The order matters: load the input first, then the filter. Why?
const int filter_x0 = in_ch_blk << 2;
const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
// Will prefetching into L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out1 += in1.x * weights0;
out2 += in2.x * weights0;
out3 += in3.x * weights0;
out0 += in0.y * weights1;
out1 += in1.y * weights1;
out2 += in2.y * weights1;
out3 += in3.y * weights1;
out0 += in0.z * weights2;
out1 += in1.z * weights2;
out2 += in2.z * weights2;
out3 += in3.z * weights2;
out0 += in0.w * weights3;
out1 += in1.w * weights3;
out2 += in2.w * weights3;
out3 += in3.w * weights3;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int out_x_offset = out_ch_blk * width;
const int w0 = out_w_blk;
write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
for (int batch = 0; batch < batch_num; ++batch) {
float4 bias_value = vload4(out_chan_blk, bias);
__private float4 out = bias_value;
for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
int in_d = batch * in_chan_blk_num + in_chan_blk;
float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0));
const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num;
float4 weights = vload4(in_chan_blk, filter_base);
out.x += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num);
out.y += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 2);
out.z += dot(in, weights);
weights = vload4(in_chan_blk, filter_base + in_chan_num * 3);
out.w += dot(in, weights);
}
const int w1 = w0 + out_w_blks;
if (w1 >= width) return;
write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
int out_d = batch * out_chan_blk_num + out_chan_blk;
int4 out_coord = (int4)(out_d, h, w, 0);
write_imagef(output, out_coord, out);
}
const int w2 = w1 + out_w_blks;
if (w2 >= width) return;
write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
const int w3 = w2 + out_w_blks;
if (w3 >= width) return;
write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
}
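The kernel's work decomposition follows directly from the IN_OUT image layout [ceil(C/4)*W, N*H]: global dimension 0 walks output-channel blocks, dimension 1 walks groups of four output widths (each item writes out0..out3 at stride get_global_size(1)), and dimension 2 walks the fused N*H rows. A sketch of the global work size this indexing implies, derived from the kernel alone (the host-side NDRange later in this diff is hard-coded and may assume particular shapes):

```cpp
#include <array>
#include <cstddef>

// Global work size suggested by conv_2d_1x1's get_global_id usage,
// for an output tensor of shape [batch, out_channels, height, width].
std::array<size_t, 3> Conv1x1GlobalSize(size_t batch, size_t out_channels,
                                        size_t height, size_t width) {
  const size_t channel_blocks = (out_channels + 3) / 4;  // RGBA lanes per pixel
  const size_t width_blocks = (width + 3) / 4;           // 4 widths per work item
  return {channel_blocks, width_blocks, batch * height}; // dims 0, 1, 2
}
```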
......@@ -3,6 +3,7 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......@@ -47,10 +48,25 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings_[0] > 0 || paddings_[1] > 0) {
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings_.data(), &padded_input);
ConstructInputWithPadding(input, paddings.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
} else {
conv2d_func(input, filter, bias, output);
......
......@@ -3,45 +3,14 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
void Conv1x1Naive(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
int>(program, "conv_2d_1x1_naive");
const index_t pixels = height * width;
cl_int error;
conv_2d(cl::EnqueueArgs(
runtime->command_queue(),
cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels), error);
MACE_CHECK(error == CL_SUCCESS);
}
void Conv1x1V2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
......@@ -98,85 +67,44 @@ void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t pixels = height * width;
const index_t pixel_blocks = (pixels + 3) / 4;
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DSTRIDE_1");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
const index_t channel_blocks = (channels + 3) / 4;
const index_t input_channel_blocks = (input_channels + 3) / 4;
// FIXME temp hacking
static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
cl::Image3D input_image;
cl::Image3D output_image;
auto input_iter =
input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
if (input_iter != input_image_map.end()) {
input_image = input_iter->second;
} else {
// The batch dimension is collapsed with channel
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * input_channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
input_image = image;
input_image_map.clear();
input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
image);
}
auto output_iter =
output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
if (output_iter != output_image_map.end()) {
output_image = output_iter->second;
} else {
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
output_image = image;
output_image_map.clear();
output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
image);
}
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3");
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, input_image);
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(filter->buffer())));
conv_2d_kernel.setArg(idx++,
*(static_cast<const cl::Buffer *>(bias->buffer())));
conv_2d_kernel.setArg(idx++, output_image);
conv_2d_kernel.setArg(idx++, static_cast<int>(batch));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8));
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
// [(c+3)/4*W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
image_shape[1] = shape[0] * shape[1];
}
// [H * W * Ic, (Oc + 3) / 4]
void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[2];
image_shape[1] = RoundUpDiv4(shape.back());
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[0]);
image_shape[1] = 1;
}
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape) {
switch (type) {
case FILTER:
CalFilterImageShape(shape, image_shape);
break;
case IN_OUT:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
std::string DataTypeToCLType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
case DT_HALF:
return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DataTypeToOpenclCMDDataType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "f";
case DT_HALF:
return "h";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels
} // namespace mace
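As a concrete check of these layouts, using shapes from the tests later in this diff: an NHWC input of {3, 13, 17, 128} becomes an IN_OUT image of [ceil(128/4)*17, 3*13] = [544, 39]; an HWIO filter of {3, 3, 13, 17} becomes a FILTER image of [3*3*13, ceil(17/4)] = [117, 5]; and an argument vector of {11} becomes an ARGUMENT image of [ceil(11/4), 1] = [3, 1]. The divide-by-4 everywhere comes from each RGBA pixel packing four values.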
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
enum BufferType {
FILTER = 0,
IN_OUT = 1,
ARGUMENT = 2
};
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
std::string DataTypeToOpenclCMDDataType(const DataType dt);
std::string DataTypeToCLType(const DataType dt);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
......@@ -5,6 +5,7 @@
#include "mace/kernels/pooling.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/kernels/relu.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -5,6 +5,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
......@@ -8,6 +8,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/buffer_to_image.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/opencl/helper.h"
using namespace mace;
template<DeviceType D, typename T>
void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5);
}
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5});
}
TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5});
}
TEST(BufferToImageTest, Filter3x3Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
}
......@@ -17,9 +17,9 @@ template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> {
public:
Conv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws) {
functor_.strides_ = this->strides_.data();
functor_.dilations_ = this->dilations_.data();
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
......@@ -28,15 +28,6 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
return true;
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/image_to_buffer.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class ImageToBufferOp: public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
......@@ -12,6 +12,7 @@
#include "mace/core/net.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
......@@ -106,6 +107,32 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T));
}
template <DeviceType D, typename T>
void AddInputImageFromArray(const std::string &name,
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
std::vector<size_t> image_shape;
input->ResizeImage(shape, image_shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
const T *data_ptr = data.data();
const int type_size = sizeof(T);
const int row_pitch = shape[3];
auto mapped_image_pitch = input_mapper.mapped_image_pitch();
const size_t slice_size = mapped_image_pitch[1] / sizeof(T);
for (int c = 0; c < shape[0] * shape[1]; ++c) {
T *input_ptr = input_data + c * slice_size;
for (int h = 0; h < shape[2]; ++h) {
memcpy(input_ptr, data_ptr, row_pitch * type_size);
input_ptr += mapped_image_pitch[0] / sizeof(T);
data_ptr += row_pitch;
}
}
}
template <DeviceType D, typename T>
void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape,
......@@ -126,19 +153,26 @@ class OpsTestNet {
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
float *input_data = input->mutable_data<T>();
T *input_data = input->mutable_data<T>();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<T> nd(0, 1);
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen)) : nd(gen));
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
}
}
OperatorDef *NewOperatorDef() {
op_defs_.clear();
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
......@@ -258,7 +292,8 @@ inline std::string ShapeToString(const Tensor &x) {
template <typename T>
struct is_floating_point_type {
static const bool value =
std::is_same<T, float>::value || std::is_same<T, double>::value;
std::is_same<T, float>::value || std::is_same<T, double>::value
|| std::is_same<T, half>::value;
};
template <typename T>
......@@ -314,6 +349,7 @@ struct Expector<T, true> {
<< " index = " << i;
}
}
};
template <typename T>
......@@ -330,6 +366,7 @@ std::string ToString(const T &input) {
return ss.str();
}
} // namespace mace
#endif // MACE_OPS_TEST_UTIL_H_
......@@ -13,6 +13,17 @@ Integer RoundUp(Integer i, Integer factor) {
return (i + factor - 1) / factor * factor;
}
template <typename Integer, uint32_t factor>
Integer RoundUpDiv(Integer i) {
return (i + factor - 1) / factor;
}
// Partial specialization of function templates is not allowed
template <typename Integer>
Integer RoundUpDiv4(Integer i) {
return (i + 3) >> 2;
}
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
return (a + b - 1) / b;
......
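RoundUpDiv4 is the ceil-divide that all of the image-shape math above relies on: RoundUpDiv4(128) == 32, RoundUpDiv4(17) == 5, and RoundUpDiv4(11) == 3, matching the pixel counts in the worked layout example earlier. The shift form (i + 3) >> 2 is equivalent to (i + 3) / 4 for non-negative integers.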