Commit 3b55f153 authored by liuqi

Support OpenCL image2d format and add a buffer-to-image OpenCL kernel.

Parent 0f27434c
...@@ -10,6 +10,7 @@
#include "mace/core/common.h"
#include "mace/core/registry.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/types.h"
namespace mace {
...@@ -26,8 +27,14 @@ class Allocator {
Allocator() {}
virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0;
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0;
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
virtual bool OnHost() = 0;
...@@ -58,8 +65,19 @@ class CPUAllocator : public Allocator {
return data;
}
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override {
return nullptr;
}
void Delete(void *data) override { free(data); }
void DeleteImage(void *data) override { free(data); };
void *Map(void *buffer, size_t nbytes) override { return buffer; }
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override {
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) override {}
bool OnHost() override { return true; }
};
...
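For orientation, a minimal host-side sketch of how the new image entry points are meant to be called (illustrative only; the helper function and the concrete shape are made up, and it assumes an OpenCL-capable device plus the allocator obtained via GetDeviceAllocator as used elsewhere in this change):

#include <vector>
#include "mace/core/allocator.h"

// Illustrative sketch: allocate a 2-D RGBA image, map it, walk its rows
// honoring the reported row pitch, then unmap and release it.
void ImageAllocationSketch() {
  mace::Allocator *alloc = mace::GetDeviceAllocator(mace::DeviceType::OPENCL);
  std::vector<size_t> image_shape = {64, 32};  // [width, height] in RGBA pixels
  void *image = alloc->NewImage(image_shape, mace::DT_FLOAT);

  std::vector<size_t> pitch;  // filled by MapImage: [row pitch, slice pitch] in bytes
  void *mapped = alloc->MapImage(image, image_shape, pitch);
  char *row = static_cast<char *>(mapped);
  for (size_t y = 0; y < image_shape[1]; ++y, row += pitch[0]) {
    // each row holds image_shape[0] RGBA float pixels; fill as needed
  }
  alloc->Unmap(image, mapped);
  alloc->DeleteImage(image);
}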
This diff is collapsed.
...@@ -51,6 +51,9 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
return false;
}
if (op_stats) {
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
...
...@@ -8,6 +8,29 @@
namespace mace {
namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) {
case DT_HALF:
case DT_FLOAT:
return CL_FLOAT;
case DT_INT8:
case DT_INT16:
case DT_INT32:
return CL_SIGNED_INT32;
case DT_UINT8:
case DT_UINT16:
case DT_UINT32:
return CL_UNSIGNED_INT32;
default:
LOG(FATAL) << "Image doesn't support the data type: " << t;
return 0;
}
}
}
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
...@@ -21,6 +44,23 @@ void *OpenCLAllocator::New(size_t nbytes) {
return static_cast<void *>(buffer);
}
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
cl_int error;
cl::Image2D *cl_image =
new cl::Image2D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ,
img_format,
image_shape[0], image_shape[1],
0, nullptr, &error);
return cl_image;
}
void OpenCLAllocator::Delete(void *buffer) {
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
...@@ -28,6 +68,13 @@ void OpenCLAllocator::Delete(void *buffer) {
}
}
void OpenCLAllocator::DeleteImage(void *buffer) {
if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
delete cl_image;
}
}
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
...@@ -40,6 +87,29 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
return mapped_ptr;
}
// TODO : there is something wrong with half type.
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Get()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
&mapped_image_pitch[1],
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Get()->command_queue();
...
...@@ -17,10 +17,25 @@ class OpenCLAllocator : public Allocator {
void *New(size_t nbytes) override;
/*
* Create a 2D image with the RGBA (128-bit per pixel) channel format.
*
* @ image_shape : [image_width, image_height].
*/
void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override;
void Delete(void *buffer) override;
void DeleteImage(void *buffer) override;
void *Map(void *buffer, size_t nbytes) override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override;
void Unmap(void *buffer, void *mapped_ptr) override;
bool OnHost() override;
...
...@@ -120,6 +120,7 @@ const std::map<std::string, std::string>
{"relu", "relu.cl"},
{"resize_bilinear", "resize_bilinear.cl"},
{"space_to_batch", "space_to_batch.cl"},
{"buffer_to_image", "buffer_to_image.cl"},
};
void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
...
...@@ -101,6 +101,18 @@ class OpenCLLibraryImpl final {
const cl_event *,
cl_event *,
cl_int *);
using clEnqueueMapImageFunc = void *(*)(cl_command_queue,
cl_mem,
cl_bool,
cl_map_flags,
const size_t *,
const size_t *,
size_t *,
size_t *,
cl_uint,
const cl_event *,
cl_event *,
cl_int *);
using clCreateCommandQueueWithPropertiesFunc =
cl_command_queue (*)(cl_context /* context */,
cl_device_id /* device */,
...@@ -148,6 +160,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetImageInfoFunc = cl_int (*)(cl_mem,
cl_image_info,
size_t,
void *,
size_t *);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
...@@ -172,6 +189,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties);
DEFINE_FUNC_PTR(clReleaseCommandQueue);
DEFINE_FUNC_PTR(clEnqueueMapBuffer);
DEFINE_FUNC_PTR(clEnqueueMapImage);
DEFINE_FUNC_PTR(clRetainProgram);
DEFINE_FUNC_PTR(clGetProgramBuildInfo);
DEFINE_FUNC_PTR(clEnqueueReadBuffer);
...@@ -191,6 +209,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetImageInfo);
#undef DEFINE_FUNC_PTR
...@@ -294,6 +313,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
ASSIGN_FROM_DLSYM(clEnqueueMapImage);
ASSIGN_FROM_DLSYM(clRetainProgram);
ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
...@@ -313,6 +333,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetImageInfo);
#undef ASSIGN_FROM_DLSYM
...@@ -577,6 +598,31 @@ void *clEnqueueMapBuffer(cl_command_queue command_queue,
return nullptr;
}
}
void *clEnqueueMapImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t origin[3],
const size_t region[3],
size_t *image_row_pitch,
size_t *image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event,
cl_int *errcode_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clEnqueueMapImage;
if (func != nullptr) {
return func(command_queue, image, blocking_map, map_flags, origin, region,
image_row_pitch, image_slice_pitch,
num_events_in_wait_list, event_wait_list, event, errcode_ret);
} else {
if (errcode_ret != nullptr) {
*errcode_ret = CL_OUT_OF_RESOURCES;
}
return nullptr;
}
}
cl_command_queue clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
...@@ -832,3 +878,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
cl_int clGetImageInfo(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetImageInfo;
if (func != nullptr) {
return func(image, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
}
...@@ -23,6 +23,7 @@ namespace mace {
#define CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, INVALID, DEFAULT) \
switch (TYPE_ENUM) { \
CASE(half, SINGLE_ARG(STMTS)) \
CASE(float, SINGLE_ARG(STMTS)) \
CASE(double, SINGLE_ARG(STMTS)) \
CASE(int32_t, SINGLE_ARG(STMTS)) \
...@@ -68,22 +69,28 @@ class Tensor {
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr),
is_image_(false){};
Tensor(Allocator *alloc, DataType type)
: alloc_(alloc),
size_(0),
dtype_(type),
buffer_(nullptr),
data_(nullptr),
is_image_(false){};
~Tensor() {
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before destroy");
if (buffer_ != nullptr) {
MACE_CHECK_NOTNULL(alloc_);
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
}
}
inline DataType dtype() const { return dtype_; }
...@@ -91,6 +98,10 @@ class Tensor {
inline const vector<index_t> &shape() const { return shape_; }
inline const vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
...@@ -120,6 +131,11 @@ class Tensor {
}
}
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->MapImage(buffer_, image_shape_, mapped_image_pitch);
}
/*
* Unmap the device buffer
*/
...@@ -162,17 +178,53 @@ class Tensor {
inline void Resize(const vector<index_t> &shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size || is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = false;
CASES(dtype_, buffer_ = alloc_->New(size_ * sizeof(T)));
}
}
inline void ResizeImage(const vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size || !is_image_) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
if (is_image_) {
alloc_->DeleteImage(buffer_);
} else {
alloc_->Delete(buffer_);
}
is_image_ = true;
image_shape_ = image_shape;
buffer_ = alloc_->NewImage(image_shape, dtype_);
}
}
inline void ResizeLike(const Tensor &other) {
if (other.is_image()) {
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
}
inline void ResizeLike(const Tensor *other) {
if (other->is_image()) {
ResizeImage(other->shape(), other->image_shape());
} else {
Resize(other->shape());
}
}
template <typename T>
inline void Copy(const T *src, index_t size) {
...@@ -202,8 +254,6 @@ class Tensor {
for (int i : shape_) {
os << i << ", ";
}
LOG(INFO) << "Tensor shape: " << os.str()
<< " type: " << DataType_Name(dtype_);
os.str("");
os.clear();
...@@ -212,7 +262,7 @@ class Tensor {
if ( i != 0 && i % shape_[3] == 0) {
os << "\n";
}
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
}
LOG(INFO) << os.str();
}
...@@ -228,20 +278,33 @@ class Tensor {
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
class MappingGuard {
public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) {
if (tensor_->is_image()) {
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
}
}
~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap();
}
inline const vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
private:
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
};
private:
...@@ -261,6 +324,9 @@ class Tensor {
// Mapped buffer
mutable void *data_;
vector<index_t> shape_;
// Image for opencl
bool is_image_;
std::vector<size_t> image_shape_;
DISABLE_COPY_AND_ASSIGN(Tensor);
};
...
...@@ -24,31 +24,32 @@ bool DataTypeCanUseMemcpy(DataType dt) {
}
}
std::string DataTypeToCLType(const DataType dt) {
size_t GetEnumTypeSize(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return sizeof(float);
case DT_HALF:
return sizeof(half);
case DT_UINT8:
return sizeof(uint8_t);
case DT_INT8:
return sizeof(int8_t);
case DT_DOUBLE:
return sizeof(double);
case DT_INT32:
return sizeof(int32_t);
case DT_UINT32:
return sizeof(uint32_t);
case DT_UINT16:
return sizeof(uint16_t);
case DT_INT16:
return sizeof(int16_t);
case DT_INT64:
return sizeof(int64_t);
default:
LOG(FATAL) << "Unsupported data type";
return 0;
}
}
...
...@@ -7,12 +7,16 @@
#include "mace/core/common.h"
#include "mace/proto/mace.pb.h"
#include "mace/core/half.h"
namespace mace {
using half = half_float::half;
bool DataTypeCanUseMemcpy(DataType dt);
size_t GetEnumTypeSize(const DataType dt);
template <class T>
struct IsValidDataType;
...@@ -43,6 +47,7 @@ struct EnumToDataType {}; // Specializations below
typedef TYPE Type; \
}
MATCH_TYPE_AND_ENUM(half, DT_HALF);
MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
struct BufferToImageFunctorBase {
BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
bool i2b_;
};
template<DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output) {
MACE_NOT_IMPLEMENTED;
}
bool i2b_;
};
template<typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
BufferToImageFunctor(bool i2b = false) :
BufferToImageFunctorBase(i2b) {}
void operator()(Tensor *input,
const BufferType type,
Tensor *output);
};
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_BUFFER_TO_IMAGE_H_
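A hedged sketch of how the functor is intended to be driven (it mirrors the BufferToImage/ImageToBuffer ops added further below; the tensor arguments and the FILTER shape are hypothetical, and all tensors must live on the OpenCL allocator):

#include "mace/kernels/buffer_to_image.h"

// Illustrative sketch: round-trip a filter tensor (HWIO layout in a buffer)
// through the RGBA image representation and back.
void FilterRoundTripSketch(mace::Tensor *filter_buffer,   // e.g. shape {3, 3, 16, 32}
                           mace::Tensor *filter_image,
                           mace::Tensor *restored_buffer) {
  namespace k = mace::kernels;
  // buffer -> image: resizes filter_image according to CalImage2DShape(FILTER).
  k::BufferToImageFunctor<mace::DeviceType::OPENCL, float> to_image(false /* i2b */);
  to_image(filter_buffer, k::FILTER, filter_image);
  // image -> buffer: the first argument is always the buffer-side tensor.
  k::BufferToImageFunctor<mace::DeviceType::OPENCL, float> to_buffer(true /* i2b */);
  to_buffer(restored_buffer, k::FILTER, filter_image);
}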
...@@ -15,9 +15,9 @@ template <DeviceType D, typename T>
struct Conv2dFunctor {
Conv2dFunctor() {}
Conv2dFunctor(const int *strides,
const Padding &paddings,
const int *dilations)
: strides_(strides), dilations_(dilations), paddings_(paddings) {}
void operator()(const Tensor *input,
const Tensor *filter,
...@@ -27,6 +27,13 @@ struct Conv2dFunctor {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
output->Resize(output_shape);
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
...@@ -49,10 +56,10 @@ struct Conv2dFunctor {
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The left-upper most offset of the padded input
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;
index_t kernel_size = input_channels * kernel_h * kernel_w;
...@@ -108,8 +115,8 @@ struct Conv2dFunctor {
}
const int *strides_; // [stride_h, stride_w]
std::vector<int> paddings_; // [padding_h, padding_w]
const int *dilations_; // [dilation_h, dilation_w]
Padding paddings_;
};
template <>
...
...@@ -49,6 +49,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape_vec(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape_vec.data(), paddings.data());
output->Resize(output_shape_vec);
typedef void (*Conv2dNeonFunction)(
const float *input, const index_t *input_shape, const float *filter,
const index_t *filter_shape, const float *bias, float *output,
...@@ -77,8 +85,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings[0] > 0 || paddings[1] > 0) {
ConstructInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input;
}
Tensor::MappingGuard input_mapper(input);
...
...@@ -4,6 +4,7 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
...@@ -6,6 +6,7 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/buffer_to_image.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
namespace kernels {
template<typename T>
void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
image->ResizeImage(buffer->shape(), image_shape);
} else {
image_shape = image->image_shape();
buffer->Resize(image->shape());
}
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOpenclCMDDataType(image->dtype()));
auto runtime = OpenCLRuntime::Get();
string kernel_name;
switch (type) {
case FILTER:
kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
break;
case IN_OUT:
kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
break;
case ARGUMENT:
kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image";
break;
}
VLOG(0) << kernel_name;
auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
kernel_name,
built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
}
b2f_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(image->buffer())));
const size_t gws[3] = {image_shape[0],
image_shape[1],
1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel);
const std::vector<uint32_t> lws = {kwg_size, 1, 1};
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
b2f_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
template struct BufferToImageFunctor<DeviceType::OPENCL, float>;
template struct BufferToImageFunctor<DeviceType::OPENCL, half>;
} // namespace kernels
} // namespace mace
#include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
__private const int filter_w,
__private const int in_channel,
__private const int out_channel,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int out_channel_idx = h * 4;
const int hw_idx = w / in_channel;
int in_channel_idx = w % in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * in_channel + in_channel_idx) * out_channel
+ out_channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (out_channel_idx + 4 > out_channel) {
const int diff = out_channel - out_channel_idx;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (channel_idx + 4 > channels) {
const int diff = channels - channel_idx;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int count,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = vload4(0, input + offset);
int2 coord = (int2)(w, h);
CMD_TYPE(write_image, CMD_DATA_TYPE)(output, coord, values);
}
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
if (offset + 4 > count) {
const int diff = count - offset;
output[offset] = values.s0;
if (diff == 2) {
output[offset+1] = values.s1;
} else {
output[offset+1] = values.s1;
output[offset+2] = values.s2;
}
} else {
vstore4(values, 0, output + offset);
}
}
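To make the NHWC-buffer-to-image2d mapping used by the in_out kernels above concrete, the following standalone snippet recomputes the same indices on the host (pure arithmetic, no OpenCL required; the shape and the chosen work item are arbitrary examples):

#include <cstdio>

int main() {
  const int N = 2, H = 3, W = 5, C = 8;        // example NHWC tensor shape
  const int image_width = ((C + 3) / 4) * W;   // RoundUpDiv4(C) * W texels
  const int image_height = N * H;

  // One work item of in_out_buffer_to_image: w = get_global_id(0), h = get_global_id(1).
  const int w = 6, h = 4;
  const int batch_idx = h / H;
  const int height_idx = h % H;
  const int width_idx = w % W;
  const int channel_idx = (w / W) * 4;         // first channel packed into this texel
  const int offset =
      ((batch_idx * H + height_idx) * W + width_idx) * C + channel_idx;

  std::printf("image %dx%d: texel (%d, %d) packs buffer[%d..%d]\n",
              image_width, image_height, w, h, offset, offset + 3);
  return 0;
}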
...@@ -11,4 +11,7 @@
#define VEC_DATA_TYPE_STR(data_type, size) data_type##size
#define VEC_DATA_TYPE(data_type, size) VEC_DATA_TYPE_STR(data_type, size)
#define CMD_TYPE_STR(cmd, type) cmd##type
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
...@@ -165,47 +165,85 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
}
}
// TODO : validation
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
__write_only image2d_t output,
__private const int in_ch_blks,
__private const int width) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);

const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 bias_value = read_imageh(bias, sampler, (int2)(out_w_blk, 1));
half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);

// Unrolling this loop hurt performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_x0 = in_x_base + out_w_blk;
const int in_x1 = in_x0 + out_w_blks;
const int in_x2 = in_x1 + out_w_blks;
const int in_x3 = in_x2 + out_w_blks;
in_x_base += width;

const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));

// The order matters, load input first then load filter, why?
const int filter_x0 = in_ch_blk << 2;
const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));

// Will prefetch into L2 improve performance? How to prefetch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out1 += in1.x * weights0;
out2 += in2.x * weights0;
out3 += in3.x * weights0;

out0 += in0.y * weights1;
out1 += in1.y * weights1;
out2 += in2.y * weights1;
out3 += in3.y * weights1;

out0 += in0.z * weights2;
out1 += in1.z * weights2;
out2 += in2.z * weights2;
out3 += in3.z * weights2;

out0 += in0.w * weights3;
out1 += in1.w * weights3;
out2 += in2.w * weights3;
out3 += in3.w * weights3;
}

const int out_x_offset = out_ch_blk * width;
const int w0 = out_w_blk;
write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);

const int w1 = w0 + out_w_blks;
if (w1 >= width) return;
write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);

const int w2 = w1 + out_w_blks;
if (w2 >= width) return;
write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);

const int w3 = w2 + out_w_blks;
if (w3 >= width) return;
write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
}
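Reading the rewritten conv_2d_1x1 kernel: get_global_id(0) selects a block of four output channels, get_global_id(1) one of out_w_blks strided width positions (each work item covers up to four of them), and get_global_id(2) one batch*height row. A small host-side sketch of which output texels a single work item writes, assuming the IN_OUT image layout described above (values are arbitrary examples):

#include <cstdio>

int main() {
  const int width = 13;                        // output width
  const int out_w_blks = (width + 3) / 4;      // global size in dimension 1
  const int out_ch_blk = 2, out_w_blk = 1, out_hb = 5;   // one (gid0, gid1, gid2)

  const int out_x_offset = out_ch_blk * width;
  for (int i = 0, w = out_w_blk; i < 4 && w < width; ++i, w += out_w_blks) {
    // Each write_imageh in the kernel stores four output channels at this texel.
    std::printf("work item writes out%d to texel (%d, %d)\n", i, out_x_offset + w, out_hb);
  }
  return 0;
}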
...@@ -3,6 +3,7 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...@@ -47,10 +48,25 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input, filter, bias, output);
return;
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_,
strides_, paddings_, output_shape.data(), paddings.data());
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
if (paddings[0] > 0 || paddings[1] > 0) {
Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
ConstructInputWithPadding(input, paddings.data(), &padded_input);
conv2d_func(&padded_input, filter, bias, output);
} else {
conv2d_func(input, filter, bias, output);
...
...@@ -3,45 +3,14 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/utils.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
void Conv1x1Naive(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
auto conv_2d =
cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
int>(program, "conv_2d_1x1_naive");
const index_t pixels = height * width;
cl_int error;
conv_2d(cl::EnqueueArgs(
runtime->command_queue(),
cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
static_cast<int>(pixels)),
cl::NDRange(1, 1, 128)),
*(static_cast<cl::Buffer *>(input->buffer())),
*(static_cast<cl::Buffer *>(filter->buffer())),
*(static_cast<cl::Buffer *>(bias->buffer())),
*(static_cast<cl::Buffer *>(output->buffer())),
static_cast<int>(input_channels), error);
MACE_CHECK(error == CL_SUCCESS);
}
void Conv1x1V2(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
...@@ -98,85 +67,44 @@ void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t input_channels = input->dim(1);
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
built_options.emplace("-DSTRIDE_1");
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
// FIXME temp hacking
static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
cl::Image3D input_image;
cl::Image3D output_image;
auto input_iter =
input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
if (input_iter != input_image_map.end()) {
input_image = input_iter->second;
} else {
// The batch dimension is collapsed with channel
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * input_channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
input_image = image;
input_image_map.clear();
input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
image);
}
auto output_iter =
output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
if (output_iter != output_image_map.end()) {
output_image = output_iter->second;
} else {
cl_int error;
cl::Image3D image =
cl::Image3D(OpenCLRuntime::Get()->context(),
CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
batch * channel_blocks, 0, 0, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
output_image = image;
output_image_map.clear();
output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
image);
}
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
if (bias != nullptr) {
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8));
MACE_CHECK(error == CL_SUCCESS, error);
}
...
...@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
...@@ -5,6 +5,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
// [(c+3)/4*W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
image_shape[1] = shape[0] * shape[1];
}
// [H * W * Ic, (Oc + 3) / 4]
void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO*/
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = shape[0] * shape[1] * shape[2];
image_shape[1] = RoundUpDiv4(shape.back());
}
// [(size + 3) / 4, 1]
void CalArgImageShape(const std::vector<index_t> &shape,
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 1);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[0]);
image_shape[1] = 1;
}
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape) {
switch (type) {
case FILTER:
CalFilterImageShape(shape, image_shape);
break;
case IN_OUT:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
std::string DataTypeToCLType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
case DT_HALF:
return "half";
case DT_UINT8:
return "uchar";
case DT_INT8:
return "char";
case DT_DOUBLE:
return "double";
case DT_INT32:
return "int";
case DT_UINT32:
return "int";
case DT_UINT16:
return "ushort";
case DT_INT16:
return "short";
case DT_INT64:
return "long";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DataTypeToOpenclCMDDataType(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "f";
case DT_HALF:
return "h";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
} // namespace kernels
} // namespace mace
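A quick sanity check of the three layouts with concrete shapes (hedged; the shapes are arbitrary examples): an IN_OUT tensor of NHWC {1, 32, 32, 64} maps to an image of {RoundUpDiv4(64)*32, 1*32} = {512, 32}; a FILTER of HWIO {3, 3, 16, 32} maps to {3*3*16, RoundUpDiv4(32)} = {144, 8}; an ARGUMENT of {17} maps to {RoundUpDiv4(17), 1} = {5, 1}. In code:

#include <vector>
#include "mace/kernels/opencl/helper.h"

void ImageShapeExamples() {
  using namespace mace::kernels;
  std::vector<size_t> image_shape;
  CalImage2DShape({1, 32, 32, 64}, IN_OUT, image_shape);   // -> {512, 32}
  CalImage2DShape({3, 3, 16, 32}, FILTER, image_shape);    // -> {144, 8}
  CalImage2DShape({17}, ARGUMENT, image_shape);            // -> {5, 1}
}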
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
enum BufferType {
FILTER = 0,
IN_OUT = 1,
ARGUMENT = 2
};
void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape);
std::string DataTypeToOpenclCMDDataType(const DataType dt);
std::string DataTypeToCLType(const DataType dt);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
...@@ -5,6 +5,7 @@
#include "mace/kernels/pooling.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
...@@ -5,6 +5,7 @@
#include "mace/kernels/relu.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
...@@ -5,6 +5,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
...@@ -8,6 +8,7 @@
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
namespace kernels {
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/buffer_to_image.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(BufferToImage, BufferToImageOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_BUFFER_TO_IMAGE_H_
#define MACE_OPS_BUFFER_TO_IMAGE_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class BufferToImageOp: public Operator<D, T> {
public:
BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
Tensor *output = this->Output(OUTPUT);
functor_(const_cast<Tensor *>(input_tensor), type, output);
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_BUFFER_TO_IMAGE_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "gtest/gtest.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/opencl/helper.h"
using namespace mace;
template<DeviceType D, typename T>
void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
OpsTestNet net;
OpDefBuilder("BufferToImage", "BufferToImageTest")
.Input("Input")
.Output("B2IOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", input_shape);
// Run
net.RunOp(D);
OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
.Input("B2IOutput")
.Output("I2BOutput")
.AddIntArg("buffer_type", type)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Check
ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5);
}
TEST(BufferToImageTest, ArgSmall) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
}
TEST(BufferToImageTest, ArgMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
}
TEST(BufferToImageTest, ArgLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {256});
}
TEST(BufferToImageTest, InputSmallSingleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 1});
}
TEST(BufferToImageTest, InputSmallMultipleChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {1, 2, 3, 3});
}
TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 2, 3, 3});
}
TEST(BufferToImageTest, InputMedia) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 13, 17, 128});
}
TEST(BufferToImageTest, InputLarge) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT, {3, 64, 64, 256});
}
TEST(BufferToImageTest, Filter1x1Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5});
}
TEST(BufferToImageTest, Filter1x1Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17});
}
TEST(BufferToImageTest, Filter1x1Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512});
}
TEST(BufferToImageTest, Filter3x3Small) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5});
}
TEST(BufferToImageTest, Filter3x3Media) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17});
}
TEST(BufferToImageTest, Filter3x3Large) {
TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
}
...@@ -17,9 +17,9 @@ template <DeviceType D, typename T>
class Conv2dOp : public ConvPool2dOpBase<D, T> {
public:
Conv2dOp(const OperatorDef &op_def, Workspace *ws)
: ConvPool2dOpBase<D, T>(op_def, ws),
functor_(this->strides_.data(), this->padding_,
this->dilations_.data()) {
}
bool Run() override {
...@@ -28,15 +28,6 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr;
Tensor *output = this->Output(OUTPUT);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
kernels::CalcPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), this->dilations_.data(),
this->strides_.data(), this->padding_, output_shape.data(),
paddings.data());
output->Resize(output_shape);
functor_.paddings_ = paddings;
functor_(input, filter, bias, output);
return true;
...
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/ops/image_to_buffer.h"
namespace mace {
REGISTER_OPENCL_OPERATOR(ImageToBuffer, ImageToBufferOp<DeviceType::OPENCL, float>);
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_IMAGE_TO_BUFFER_H_
#define MACE_OPS_IMAGE_TO_BUFFER_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
namespace mace {
template <DeviceType D, typename T>
class ImageToBufferOp: public Operator<D, T> {
public:
ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws), functor_(true) {}
bool Run() override {
const Tensor *input_tensor = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
"buffer_type", static_cast<int>(kernels::FILTER)));
functor_(output, type, const_cast<Tensor *>(input_tensor));
return true;
}
private:
kernels::BufferToImageFunctor<D, T> functor_;
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace mace
#endif // MACE_OPS_IMAGE_TO_BUFFER_H_
...@@ -12,6 +12,7 @@
#include "mace/core/net.h"
#include "mace/core/tensor.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
namespace mace {
...@@ -106,6 +107,32 @@ class OpsTestNet {
memcpy(input_data, data.data(), data.size() * sizeof(T));
}
template <DeviceType D, typename T>
void AddInputImageFromArray(const std::string &name,
const std::vector<index_t> &shape,
const std::vector<T> &data) {
Tensor *input =
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
std::vector<size_t> image_shape;
input->ResizeImage(shape, image_shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
const T *data_ptr = data.data();
const int type_size = sizeof(T);
const int row_pitch = shape[3];
auto mapped_image_pitch = input_mapper.mapped_image_pitch();
const size_t slice_size = mapped_image_pitch[1] / sizeof(T);
for (int c = 0; c < shape[0] * shape[1]; ++c) {
T *input_ptr = input_data + c * slice_size;
for (int h = 0; h < shape[2]; ++h) {
memcpy(input_ptr, data_ptr, row_pitch * type_size);
input_ptr += mapped_image_pitch[0] / sizeof(T);
data_ptr += row_pitch;
}
}
}
template <DeviceType D, typename T>
void AddRepeatedInput(const std::string &name,
const std::vector<index_t> &shape,
...@@ -126,19 +153,26 @@ class OpsTestNet {
ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
input->Resize(shape);
Tensor::MappingGuard input_mapper(input);
T *input_data = input->mutable_data<T>();
std::random_device rd;
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
if (DataTypeToEnum<T>::value == DT_HALF) {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return half_float::half_cast<half>(positive ? std::abs(nd(gen)) : nd(gen));
});
} else {
std::generate(input_data, input_data + input->size(),
[&gen, &nd, positive] {
return positive ? std::abs(nd(gen)) : nd(gen);
});
}
}
OperatorDef *NewOperatorDef() {
op_defs_.clear();
op_defs_.emplace_back(OperatorDef());
return &op_defs_[op_defs_.size() - 1];
}
...@@ -258,7 +292,8 @@ inline std::string ShapeToString(const Tensor &x) {
template <typename T>
struct is_floating_point_type {
static const bool value =
std::is_same<T, float>::value || std::is_same<T, double>::value
|| std::is_same<T, half>::value;
};
template <typename T>
...@@ -314,6 +349,7 @@ struct Expector<T, true> {
<< " index = " << i;
}
}
};
template <typename T>
...@@ -330,6 +366,7 @@ std::string ToString(const T &input) {
return ss.str();
}
} // namespace mace
#endif // MACE_OPS_TEST_UTIL_H_
...@@ -13,6 +13,17 @@ Integer RoundUp(Integer i, Integer factor) {
return (i + factor - 1) / factor * factor;
}
template <typename Integer, uint32_t factor>
Integer RoundUpDiv(Integer i) {
return (i + factor - 1) / factor;
}
// Partial specialization of function templates is not allowed
template <typename Integer>
Integer RoundUpDiv4(Integer i) {
return (i + 3) >> 2;
}
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
return (a + b - 1) / b;
...