提交 fe8b202b 编写于 作者: L Liangliang He

Merge branch 'master' into 'master'

Refactor tensor, accelerate gpu initialization time

See merge request !260
...@@ -26,17 +26,17 @@ class Allocator { ...@@ -26,17 +26,17 @@ class Allocator {
public: public:
Allocator() {} Allocator() {}
virtual ~Allocator() noexcept {} virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0; virtual void *New(size_t nbytes) const = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape, virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0; const DataType dt) const = 0;
virtual void Delete(void *data) = 0; virtual void Delete(void *data) const = 0;
virtual void DeleteImage(void *data) = 0; virtual void DeleteImage(void *data) const = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0; virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
virtual void *MapImage(void *buffer, virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0; std::vector<size_t> *mapped_image_pitch) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0; virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
virtual bool OnHost() = 0; virtual bool OnHost() const = 0;
template <typename T> template <typename T>
T *New(size_t num_elements) { T *New(size_t num_elements) {
...@@ -52,7 +52,7 @@ class Allocator { ...@@ -52,7 +52,7 @@ class Allocator {
class CPUAllocator : public Allocator { class CPUAllocator : public Allocator {
public: public:
~CPUAllocator() override {} ~CPUAllocator() override {}
void *New(size_t nbytes) override { void *New(size_t nbytes) const override {
VLOG(3) << "Allocate CPU buffer: " << nbytes; VLOG(3) << "Allocate CPU buffer: " << nbytes;
void *data = nullptr; void *data = nullptr;
#ifdef __ANDROID__ #ifdef __ANDROID__
...@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator { ...@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator {
} }
void *NewImage(const std::vector<size_t> &shape, void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override { const DataType dt) const override {
LOG(FATAL) << "Allocate CPU image"; LOG(FATAL) << "Allocate CPU image";
return nullptr; return nullptr;
} }
void Delete(void *data) override { void Delete(void *data) const override {
VLOG(3) << "Free CPU buffer"; VLOG(3) << "Free CPU buffer";
free(data); free(data);
} }
void DeleteImage(void *data) override { void DeleteImage(void *data) const override {
LOG(FATAL) << "Free CPU image"; LOG(FATAL) << "Free CPU image";
free(data); free(data);
}; };
void *Map(void *buffer, size_t nbytes) override { return buffer; } void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset;
}
void *MapImage(void *buffer, void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override { std::vector<size_t> *mapped_image_pitch) const override {
return buffer; return buffer;
} }
void Unmap(void *buffer, void *mapper_ptr) override {} void Unmap(void *buffer, void *mapper_ptr) const override {}
bool OnHost() override { return true; } bool OnHost() const override { return true; }
}; };
std::map<int32_t, Allocator *> *gAllocatorRegistry(); std::map<int32_t, Allocator *> *gAllocatorRegistry();
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
namespace mace {
// Abstract interface over a chunk of (possibly device-resident) memory.
// Concrete implementations visible in this file: Buffer (linear memory),
// Image (OpenCL image) and BufferSlice (a sub-range view of another buffer).
class BufferBase {
 public:
  BufferBase() : size_(0) {}
  // explicit: an index_t must not silently convert to a buffer object.
  explicit BufferBase(index_t size) : size_(size) {}
  virtual ~BufferBase() {}

  // Raw handle of the underlying storage (host pointer or device object).
  virtual void *buffer() = 0;
  // Host-accessible data; device-backed implementations require Map() first.
  virtual const void *raw_data() const = 0;
  virtual void *raw_mutable_data() = 0;
  // Map [offset, offset + length) into the host address space. `pitch`
  // receives row pitches for image-backed implementations (may be ignored).
  virtual void *Map(index_t offset,
                    index_t length,
                    std::vector<size_t> *pitch) const = 0;
  virtual void UnMap(void *mapped_ptr) const = 0;
  // Map/UnMap the whole buffer, caching the mapped pointer internally.
  virtual void Map(std::vector<size_t> *pitch) = 0;
  virtual void UnMap() = 0;
  virtual void Resize(index_t size) = 0;
  // Copy `length` bytes from `src + offset` into this buffer.
  virtual void Copy(void *src, index_t offset, index_t length) = 0;
  // True when data is host-addressable without mapping.
  virtual bool OnHost() const = 0;
  // Offset of this view inside its backing storage; non-zero only for slices.
  virtual index_t offset() const { return 0; }

  // Typed convenience accessors over raw_data()/raw_mutable_data().
  template<typename T>
  const T *data() const {
    return reinterpret_cast<const T *>(raw_data());
  }
  template<typename T>
  T *mutable_data() {
    return reinterpret_cast<T *>(raw_mutable_data());
  }
  // Size in bytes.
  index_t size() const {
    return size_;
  }

 protected:
  index_t size_;
};
// Linear memory buffer allocated through an Allocator. It either owns its
// storage (allocates in ctor/Resize, frees in dtor) or wraps caller-owned
// memory (is_data_owner_ == false).
class Buffer : public BufferBase {
 public:
  // Creates an empty buffer; storage appears on first Resize().
  explicit Buffer(Allocator *allocator)
      : BufferBase(0),
        allocator_(allocator),
        buf_(nullptr),
        mapped_buf_(nullptr),
        is_data_owner_(true) {}

  // Allocates `size` bytes immediately.
  Buffer(Allocator *allocator, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        mapped_buf_(nullptr),
        is_data_owner_(true) {
    buf_ = allocator->New(size);
  }

  // Wraps caller-owned memory; this buffer never frees `data`.
  Buffer(Allocator *allocator, void *data, index_t size)
      : BufferBase(size),
        allocator_(allocator),
        buf_(data),
        mapped_buf_(nullptr),
        is_data_owner_(false) {}

  virtual ~Buffer() {
    if (mapped_buf_ != nullptr) {
      UnMap();
    }
    if (is_data_owner_ && buf_ != nullptr) {
      allocator_->Delete(buf_);
    }
  }

  void *buffer() {
    MACE_CHECK_NOTNULL(buf_);
    return buf_;
  }

  const void *raw_data() const {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buf_);
      return buf_;
    } else {
      // Device buffers must be Map()ed before host access.
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
    }
  }

  void *raw_mutable_data() {
    if (OnHost()) {
      MACE_CHECK_NOTNULL(buf_);
      return buf_;
    } else {
      MACE_CHECK_NOTNULL(mapped_buf_);
      return mapped_buf_;
    }
  }

  // Maps a sub-range; `pitch` is unused for linear buffers.
  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
    MACE_CHECK_NOTNULL(buf_);
    return allocator_->Map(buf_, offset, length);
  }

  void UnMap(void *mapped_ptr) const {
    MACE_CHECK_NOTNULL(buf_);
    MACE_CHECK_NOTNULL(mapped_ptr);
    allocator_->Unmap(buf_, mapped_ptr);
  }

  void Map(std::vector<size_t> *pitch) {
    MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
    mapped_buf_ = Map(0, size_, pitch);
  }

  void UnMap() {
    UnMap(mapped_buf_);
    mapped_buf_ = nullptr;
  }

  // Reallocates when the requested size differs. Only legal on owned storage.
  void Resize(index_t size) {
    MACE_CHECK(is_data_owner_,
               "data is not owned by this buffer, cannot resize");
    if (size != size_) {
      if (buf_ != nullptr) {
        allocator_->Delete(buf_);
      }
      size_ = size;
      buf_ = allocator_->New(size);
    }
  }

  // Copies `length` bytes from `src + offset` into this buffer.
  // Fixed: writes through raw_mutable_data() instead of requiring
  // mapped_buf_, so host buffers work without a prior Map(); device buffers
  // still need to be mapped first (raw_mutable_data checks that).
  void Copy(void *src, index_t offset, index_t length) {
    MACE_CHECK(length <= size_, "out of buffer");
    memcpy(raw_mutable_data(), static_cast<char *>(src) + offset, length);
  }

  bool OnHost() const {
    return allocator_->OnHost();
  }

 private:
  Allocator *allocator_;  // not owned
  void *buf_;
  void *mapped_buf_;
  bool is_data_owner_;

  DISABLE_COPY_AND_ASSIGN(Buffer);
};
class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<index_t>())
* GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
}
virtual ~Image() {
if (mapped_buf_ != nullptr) {
UnMap();
}
if (buf_ != nullptr) {
allocator_->DeleteImage(buf_);
}
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
};
const void *raw_data() const {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
void *raw_mutable_data() {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
std::vector<size_t> image_shape() const {
return shape_;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK_NOTNULL(mapped_ptr);
allocator_->Unmap(buf_, mapped_ptr);
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
MACE_CHECK_NOTNULL(pitch);
mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
};
void UnMap() {
UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
bool OnHost() const {
return allocator_->OnHost();
}
private:
Allocator *allocator_;
std::vector<size_t> shape_;
DataType data_type_;
void *buf_;
void *mapped_buf_;
DISABLE_COPY_AND_ASSIGN(Image);
};
class BufferSlice : public BufferBase {
public:
BufferSlice() {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_,
"buffer slice offset + length (",
offset,
" + ",
length,
") should <= ",
size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
~BufferSlice() {
if (mapped_buf_ != nullptr) {
UnMap();
}
}
void *buffer() {
return buffer_->buffer();
};
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
}
void *raw_mutable_data() {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_NOT_IMPLEMENTED;
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
mapped_buf_ = buffer_->Map(offset_, length_, pitch);
};
void UnMap() {
MACE_CHECK_NOTNULL(mapped_buf_);
buffer_->UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
index_t offset() const {
return offset_;
}
bool OnHost() const {
return buffer_->OnHost();
}
private:
BufferBase *buffer_;
void *mapped_buf_;
index_t offset_;
index_t length_;
};
}
#endif // MACE_CORE_BUFFER_H_
...@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : ...@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
if (!net->Run()) { if (!net->Run()) {
LOG(FATAL) << "Net init run failed"; LOG(FATAL) << "Net init run failed";
} }
ws_->RemoveUnsedTensor();
net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type)); net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
} }
} }
......
...@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry, ...@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
DeviceType type) DeviceType type)
: op_registry_(op_registry), name_(net_def->name()) {} : op_registry_(op_registry), name_(net_def->name()) {}
SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) const NetMode mode)
: NetBase(op_registry, net_def, ws, type), : NetBase(op_registry, net_def, ws, type),
device_type_(type) { device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SimpleNet ", net_def->name()); MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) { for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx); const auto &operator_def = net_def->op(idx);
VLOG(3) << "Creating operator " << operator_def.name() << "(" VLOG(3) << "Creating operator " << operator_def.name() << "("
...@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, ...@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
} }
} }
bool SimpleNet::Run(RunMetadata *run_metadata) { bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD(); MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net"); MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
...@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet( ...@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
const NetMode mode) { const NetMode mode) {
std::unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode)); std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
return net; return net;
} }
......
...@@ -33,9 +33,9 @@ class NetBase { ...@@ -33,9 +33,9 @@ class NetBase {
DISABLE_COPY_AND_ASSIGN(NetBase); DISABLE_COPY_AND_ASSIGN(NetBase);
}; };
class SimpleNet : public NetBase { class SerialNet : public NetBase {
public: public:
SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry, SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def, const std::shared_ptr<const NetDef> net_def,
Workspace *ws, Workspace *ws,
DeviceType type, DeviceType type,
...@@ -47,7 +47,7 @@ class SimpleNet : public NetBase { ...@@ -47,7 +47,7 @@ class SimpleNet : public NetBase {
std::vector<std::unique_ptr<OperatorBase> > operators_; std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_; DeviceType device_type_;
DISABLE_COPY_AND_ASSIGN(SimpleNet); DISABLE_COPY_AND_ASSIGN(SerialNet);
}; };
std::unique_ptr<NetBase> CreateNet( std::unique_ptr<NetBase> CreateNet(
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ #ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ #define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#include <unordered_map>
#include "mace/core/allocator.h" #include "mace/core/allocator.h"
namespace mace { namespace mace {
...@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator { ...@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator {
public: public:
PreallocatedPooledAllocator() {} PreallocatedPooledAllocator() {}
virtual ~PreallocatedPooledAllocator() noexcept {} ~PreallocatedPooledAllocator() noexcept {}
virtual void PreallocateImage(int mem_id, void SetBuffer(int mem_id, std::unique_ptr<BufferBase> &&buffer) {
const std::vector<size_t> &image_shape, buffers_[mem_id] = std::move(buffer);
DataType data_type) = 0; }
virtual void *GetImage(int mem_id) = 0; BufferBase *GetBuffer(int mem_id) {
if (buffers_.find(mem_id) != buffers_.end()) {
return buffers_[mem_id].get();
} else {
return nullptr;
}
}
virtual bool HasImage(int mem_id) = 0; virtual bool HasBuffer(int mem_id) {
return buffers_.find(mem_id) != buffers_.end();
}
virtual std::vector<size_t> GetImageSize(int mem_id) = 0; private:
std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
}; };
} // namespace mace } // namespace mace
......
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
#include "mace/core/runtime/hexagon/quantize.h" #include "mace/core/runtime/hexagon/quantize.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/core/serializer.h"
namespace mace { namespace mace {
...@@ -37,7 +36,6 @@ class HexagonControlWrapper { ...@@ -37,7 +36,6 @@ class HexagonControlWrapper {
void SetGraphMode(int mode); void SetGraphMode(int mode);
private: private:
// CAVEAT: Need offset as HVX library reserves some ids
static constexpr int NODE_ID_OFFSET = 10000; static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) { inline uint32_t node_id(uint32_t nodeid) {
...@@ -45,7 +43,6 @@ class HexagonControlWrapper { ...@@ -45,7 +43,6 @@ class HexagonControlWrapper {
} }
int nn_id_; int nn_id_;
Serializer serializer_;
Quantizer quantizer_; Quantizer quantizer_;
std::vector<std::vector<index_t>> input_shapes_; std::vector<std::vector<index_t>> input_shapes_;
......
...@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { ...@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) { void *OpenCLAllocator::New(size_t nbytes) const {
VLOG(3) << "Allocate OpenCL buffer: " << nbytes; VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
cl_int error; cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
...@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) { ...@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) {
} }
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) { const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
...@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape, ...@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
return cl_image; return cl_image;
} }
void OpenCLAllocator::Delete(void *buffer) { void OpenCLAllocator::Delete(void *buffer) const {
VLOG(3) << "Free OpenCL buffer"; VLOG(3) << "Free OpenCL buffer";
if (buffer != nullptr) { if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer); cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
...@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) { ...@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) {
} }
} }
void OpenCLAllocator::DeleteImage(void *buffer) { void OpenCLAllocator::DeleteImage(void *buffer) const {
VLOG(3) << "Free OpenCL image"; VLOG(3) << "Free OpenCL image";
if (buffer != nullptr) { if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer); cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
...@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) { ...@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
} }
} }
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call // TODO(heliangliang) Non-blocking call
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
nbytes, nullptr, nullptr, &error); nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS); MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr; return mapped_ptr;
...@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { ...@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
// TODO : there is something wrong with half type. // TODO : there is something wrong with half type.
void *OpenCLAllocator::MapImage(void *buffer, void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) { std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image"; MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer); auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0}; std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1}; std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2); mapped_image_pitch->resize(2);
cl_int error; cl_int error;
void *mapped_ptr = void *mapped_ptr =
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image, OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region, origin, region,
&mapped_image_pitch[0], mapped_image_pitch->data(),
&mapped_image_pitch[1], mapped_image_pitch->data() + 1,
nullptr, nullptr, &error); nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error; MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr; return mapped_ptr;
} }
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) { void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer); auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue(); auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS); nullptr) == CL_SUCCESS);
} }
bool OpenCLAllocator::OnHost() { return false; } bool OpenCLAllocator::OnHost() const { return false; }
} // namespace mace } // namespace mace
...@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator { ...@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator {
~OpenCLAllocator() override; ~OpenCLAllocator() override;
void *New(size_t nbytes) override; void *New(size_t nbytes) const override;
/* /*
* Use Image2D with RGBA (128-bit) format to represent the image. * Use Image2D with RGBA (128-bit) format to represent the image.
...@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator { ...@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator {
* @ shape : [depth, ..., height, width ]. * @ shape : [depth, ..., height, width ].
*/ */
void *NewImage(const std::vector<size_t> &image_shape, void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override; const DataType dt) const override;
void Delete(void *buffer) override; void Delete(void *buffer) const override;
void DeleteImage(void *buffer) override; void DeleteImage(void *buffer) const override;
void *Map(void *buffer, size_t nbytes) override; void *Map(void *buffer, size_t offset, size_t nbytes) const override;
void *MapImage(void *buffer, void *MapImage(void *buffer,
const std::vector<size_t> &image_shape, const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override; std::vector<size_t> *mapped_image_pitch) const override;
void Unmap(void *buffer, void *mapped_ptr) override; void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() override; bool OnHost() const override;
}; };
} // namespace mace } // namespace mace
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
namespace mace {
// Binds the pool to the process-wide OpenCL device allocator, which performs
// the actual image allocation/deallocation.
OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator()
    : allocator(GetDeviceAllocator(DeviceType::OPENCL)) {
}
// Preallocated images are released by the unique_ptr custom deleters
// (see images_ in the header), so the destructor body is empty.
OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() {
}
// Allocates an OpenCL image for `mem_id` and records its shape so it can be
// looked up later via GetImage()/GetImageSize(). Fails fatally when the id
// is already in use.
void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
                                                         const std::vector<
                                                             size_t> &image_shape,
                                                         DataType data_type) {
  MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
  VLOG(2) << "Preallocate OpenCL image: " << mem_id << " "
          << image_shape[0] << ", " << image_shape[1];
  // Assigning the temporary unique_ptr already moves; the redundant
  // std::move wrapper around the temporary has been dropped. The custom
  // deleter hands the image back to the device allocator.
  images_[mem_id] = std::unique_ptr<void, std::function<void(void *)>>(
      allocator->NewImage(image_shape, data_type), [this](void *p) {
        this->allocator->DeleteImage(p);
      });
  image_shapes_[mem_id] = image_shape;
}
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include <unordered_map>
namespace mace {
// PreallocatedPooledAllocator backed by OpenCL images: images are created
// up-front (PreallocateImage) and handed out by integer memory id.
class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator {
 public:
  OpenCLPreallocatedPooledAllocator();
  ~OpenCLPreallocatedPooledAllocator() override;

  // Allocates an image of `image_shape`/`data_type` under `mem_id`;
  // fatal if the id is already in use. Implemented in the .cc file.
  void PreallocateImage(int mem_id,
                        const std::vector<size_t> &image_shape,
                        DataType data_type) override;

  // Returns the preallocated image handle; fatal when `mem_id` is unknown.
  // (in-class definitions are implicitly inline; redundant `inline` removed)
  void *GetImage(int mem_id) override {
    MACE_CHECK(HasImage(mem_id), "image does not exist");
    return images_.at(mem_id).get();
  }

  bool HasImage(int mem_id) override {
    return images_.find(mem_id) != images_.end();
  }

  // Shape recorded at preallocation time. Fixed: the previous operator[]
  // lookup silently default-inserted an empty shape for unknown ids; now an
  // unknown id fails fatally instead.
  std::vector<size_t> GetImageSize(int mem_id) override {
    MACE_CHECK(HasImage(mem_id), "image does not exist: ", mem_id);
    return image_shapes_.at(mem_id);
  }

 private:
  // mem_id -> image handle; freed through the allocator by a custom deleter.
  std::unordered_map<int, std::unique_ptr<void, std::function<void(void *)>>>
      images_;
  // mem_id -> shape passed to PreallocateImage.
  std::unordered_map<int, std::vector<size_t>> image_shapes_;
  Allocator *allocator;  // not owned; process-wide OpenCL allocator
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/serializer.h"
namespace mace {
// Tensor -> ConstTensor serialization is not supported yet: this always
// aborts via MACE_NOT_IMPLEMENTED (the return is never reached).
std::unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
                                                   const std::string &name) {
  MACE_NOT_IMPLEMENTED;
  return nullptr;
}
// Materializes a serialized ConstTensor into a freshly-allocated Tensor on
// the given device. Types narrower than 32 bits (and bool) are stored
// widened to int32 in the serialized form and are narrowed back on copy
// with CopyWithCast<int32_t, NarrowType>.
std::unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &const_tensor,
                                                DeviceType type) {
  std::unique_ptr<Tensor> tensor(
      new Tensor(GetDeviceAllocator(type), const_tensor.data_type()));
  std::vector<index_t> dims;
  for (const index_t d : const_tensor.dims()) {
    dims.push_back(d);
  }
  tensor->Resize(dims);

  switch (const_tensor.data_type()) {
    case DT_HALF:
      tensor->Copy<half>(reinterpret_cast<const half *>(const_tensor.data()),
                         const_tensor.data_size());
      break;
    case DT_FLOAT:
      tensor->Copy<float>(reinterpret_cast<const float *>(const_tensor.data()),
                          const_tensor.data_size());
      break;
    case DT_DOUBLE:
      tensor->Copy<double>(
          reinterpret_cast<const double *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT32:
      tensor->Copy<int32_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT64:
      tensor->Copy<int64_t>(
          reinterpret_cast<const int64_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_UINT8:
      tensor->Copy<uint8_t>(
          reinterpret_cast<const uint8_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT16:
      // BUG FIX: previously narrowed through uint16_t; signed 16-bit data
      // must be cast to int16_t (the targets of this and the DT_UINT16
      // case were swapped).
      tensor->CopyWithCast<int32_t, int16_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_INT8:
      tensor->CopyWithCast<int32_t, int8_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_UINT16:
      // BUG FIX: previously narrowed through int16_t (swapped with the
      // DT_INT16 case above); unsigned data uses uint16_t.
      tensor->CopyWithCast<int32_t, uint16_t>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    case DT_BOOL:
      tensor->CopyWithCast<int32_t, bool>(
          reinterpret_cast<const int32_t *>(const_tensor.data()),
          const_tensor.data_size());
      break;
    default:
      MACE_NOT_IMPLEMENTED;
      break;
  }
  return tensor;
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_SERIALIZER_H_
#define MACE_CORE_SERIALIZER_H_
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
// Converts between in-memory Tensor objects and the serialized ConstTensor
// representation embedded in a model (see serializer.cc for details).
class Serializer {
 public:
  Serializer() {}
  ~Serializer() {}
  // Tensor -> ConstTensor. NOTE(review): currently aborts via
  // MACE_NOT_IMPLEMENTED in serializer.cc.
  std::unique_ptr<ConstTensor> Serialize(const Tensor &tensor,
                                         const std::string &name);
  // ConstTensor -> newly allocated Tensor on the given device, copying
  // (and narrowing, where needed) the serialized data.
  std::unique_ptr<Tensor> Deserialize(const ConstTensor &const_tensor,
                                      DeviceType type);
  DISABLE_COPY_AND_ASSIGN(Serializer);
};
} // namespace mace
#endif // MACE_CORE_SERIALIZER_H_
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#ifndef MACE_CORE_TENSOR_H_ #ifndef MACE_CORE_TENSOR_H_
#define MACE_CORE_TENSOR_H_ #define MACE_CORE_TENSOR_H_
#include "mace/core/allocator.h" #include "mace/core/buffer.h"
#include "mace/utils/logging.h" #include "mace/utils/logging.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -46,7 +46,6 @@ namespace mace { ...@@ -46,7 +46,6 @@ namespace mace {
CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \ CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \
, LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;) , LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;)
namespace numerical_chars { namespace numerical_chars {
inline std::ostream &operator<<(std::ostream &os, char c) { inline std::ostream &operator<<(std::ostream &os, char c) {
return std::is_signed<char>::value ? os << static_cast<int>(c) return std::is_signed<char>::value ? os << static_cast<int>(c)
...@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { ...@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor { class Tensor {
public: public:
Tensor()
: alloc_(GetDeviceAllocator(DeviceType::CPU)),
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr),
unused_(false),
is_image_(false){
};
Tensor(Allocator *alloc, DataType type) Tensor(Allocator *alloc, DataType type)
: alloc_(alloc), : allocator_(alloc),
size_(0), dtype_(type),
dtype_(type), buffer_(nullptr),
buffer_(nullptr), is_buffer_owner_(true) {};
data_(nullptr),
unused_(false), Tensor(BufferBase *buffer, DataType dtype)
is_image_(false){ : dtype_(dtype),
}; buffer_(buffer),
is_buffer_owner_(false) {}
Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false) {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
~Tensor() { ~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
delete buffer_;
}
} }
inline DataType dtype() const { return dtype_; } inline DataType dtype() const { return dtype_; }
...@@ -93,182 +95,113 @@ class Tensor { ...@@ -93,182 +95,113 @@ class Tensor {
inline const std::vector<index_t> &shape() const { return shape_; } inline const std::vector<index_t> &shape() const { return shape_; }
inline const std::vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); } inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const { inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ", MACE_CHECK(index < shape_.size(), "Dim out of range: ",
index, " >= ", shape_.size()); index, " >= ", shape_.size());
return shape_[index]; return shape_[index];
} }
inline index_t size() const { return size_; } inline index_t size() const {
inline index_t raw_size() const { return size_ * SizeOfType(); }
inline const bool unused() const { return unused_; }
inline int64_t NumElements() const {
return std::accumulate(shape_.begin(), shape_.end(), 1, return std::accumulate(shape_.begin(), shape_.end(), 1,
std::multiplies<int64_t>()); std::multiplies<int64_t>());
} }
inline const bool OnHost() const { return alloc_->OnHost(); } inline index_t raw_size() const {
return size() * SizeOfType();
/*
* Map the device buffer as CPU buffer to access the data, unmap must be
* called later
*/
inline void Map() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType());
}
} }
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const { inline void *buffer() const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr); MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr,
data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch); "buffer is null");
return buffer_->buffer();
} }
/* inline index_t buffer_offset() const {
* Unmap the device buffer return buffer_->offset();
*/
inline void Unmap() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
alloc_->Unmap(buffer_.get(), data_);
data_ = nullptr;
}
} }
void *buffer() const { return buffer_.get(); }
inline const void *raw_data() const { inline const void *raw_data() const {
void *data = MappedBuffer(); MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK(data != nullptr || size_ == 0, return buffer_->raw_data();
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
} }
template <typename T> template<typename T>
inline const T *data() const { inline const T *data() const {
return static_cast<const T *>(raw_data()); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>();
} }
inline void *raw_mutable_data() { inline void *raw_mutable_data() {
void *data = MappedBuffer(); MACE_CHECK(buffer_ != nullptr, "buffer is null");
MACE_CHECK(data != nullptr || size_ == 0, return buffer_->raw_mutable_data();
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
} }
template <typename T> template<typename T>
inline T *mutable_data() { inline T *mutable_data() {
return static_cast<T *>(raw_mutable_data()); MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data());
} }
inline void Resize(const std::vector<index_t> &shape) { inline void Reshape(const std::vector<index_t> &shape) {
MACE_CHECK(!is_image_ || buffer_ == nullptr,
"Resize is not for image, use ResizeImage instead.");
is_image_ = false;
shape_ = shape; shape_ = shape;
index_t size = NumElements(); MACE_CHECK(raw_size() <= buffer_->size());
if (size_ != size) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
CASES(dtype_,
(buffer_ =
std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->New(size_ * sizeof(T)),
[this](void *p) {
this->alloc_->Delete(p);
})
)));
}
} }
inline void ResizeImage(const std::vector<index_t> &shape, inline void Resize(const std::vector<index_t> &shape) {
const std::vector<size_t> &image_shape) {
MACE_CHECK(is_image_ || buffer_ == nullptr,
"ResizeImage is not for buffer, use Resize instead.");
is_image_ = true;
shape_ = shape; shape_ = shape;
index_t size = NumElements(); if (buffer_ != nullptr) {
if (size_ != size) { buffer_->Resize(raw_size());
size_ = size; } else {
image_shape_ = image_shape; buffer_ = new Buffer(allocator_, raw_size());
if (!preallocated_image_shape_.empty()) { is_buffer_owner_ = true;
MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0]
&& preallocated_image_shape_[1] >= image_shape[1],
"image shape not large enough: preallocated ",
preallocated_image_shape_[0],
" ",
preallocated_image_shape_[1],
"apply for ",
image_shape[0],
" ",
image_shape[1]);
} else {
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->NewImage(image_shape, dtype_),
[this](void *p) { this->alloc_->DeleteImage(p); }));
preallocated_image_shape_ = image_shape;
}
} }
} }
inline void ResizeLike(const Tensor &other) { inline void ResizeLike(const Tensor &other) {
if (other.is_image()) { Resize(other.shape());
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
} }
inline void ResizeLike(const Tensor *other) { inline void ResizeLike(const Tensor *other) {
if (other->is_image()) { Resize(other->shape());
ResizeImage(other->shape(), other->image_shape()); }
} else {
Resize(other->shape()); inline void ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
if (buffer_ == nullptr) {
buffer_ = new Image(image_shape, dtype_);
is_buffer_owner_ = true;
} }
} }
inline void PreallocateImage(void *image, inline void CopyBytes(const void *src, size_t size) {
const std::vector<size_t>& image_shape) { MappingGuard guard(this);
is_image_ = true; memcpy(buffer_->raw_mutable_data(), src, size);
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
image, [](void *p) {
// tensor does not have ownership of preallocated memory
}));
preallocated_image_shape_ = image_shape;
} }
template <typename T> template<typename T>
inline void Copy(const T *src, index_t size) { inline void Copy(const T *src, index_t length) {
MACE_CHECK(size == size_, "copy src and dst with different size."); MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * size); CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
} }
template <typename SrcType, typename DstType> inline void Copy(const Tensor &other) {
inline void CopyWithCast(const SrcType *src, size_t size) { dtype_ = other.dtype_;
MACE_CHECK(static_cast<index_t>(size) == size_, ResizeLike(other);
"copy src and dst with different size."); MappingGuard map_other(&other);
std::unique_ptr<DstType[]> buffer(new DstType[size]); CopyBytes(other.raw_data(), other.size() * SizeOfType());
for (size_t i = 0; i < size; ++i) {
buffer[i] = static_cast<DstType>(src[i]);
}
CopyBytes(static_cast<const void *>(buffer.get()), sizeof(DstType) * size);
} }
inline void CopyBytes(const void *src, size_t size) { inline size_t SizeOfType() const {
MappingGuard map_this(this); size_t type_size = 0;
memcpy(raw_mutable_data(), src, size); CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline BufferBase *UnderlyingBuffer() const {
return buffer_;
} }
inline void DebugPrint() const { inline void DebugPrint() const {
...@@ -281,8 +214,8 @@ class Tensor { ...@@ -281,8 +214,8 @@ class Tensor {
os.str(""); os.str("");
os.clear(); os.clear();
MappingGuard guard(this); MappingGuard guard(this);
for (int i = 0; i < size_; ++i) { for (int i = 0; i < size(); ++i) {
if ( i != 0 && i % shape_[3] == 0) { if (i != 0 && i % shape_[3] == 0) {
os << "\n"; os << "\n";
} }
CASES(dtype_, (os << (this->data<T>()[i]) << ", ")); CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
...@@ -291,37 +224,11 @@ class Tensor { ...@@ -291,37 +224,11 @@ class Tensor {
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str(); << dim(2) << ", " << dim(3) << "], content:\n" << os.str();
} }
inline size_t SizeOfType() const {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline void Copy(const Tensor &other) {
alloc_ = other.alloc_;
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
inline void MarkUnused() {
this->unused_ = true;
}
class MappingGuard { class MappingGuard {
public: public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) { MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) { if (tensor_ != nullptr) {
if (tensor_->is_image()) { tensor_->buffer_->Map(&mapped_image_pitch_);
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
} }
} }
...@@ -331,42 +238,29 @@ class Tensor { ...@@ -331,42 +238,29 @@ class Tensor {
} }
~MappingGuard() { ~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap(); if (tensor_ != nullptr) tensor_->buffer_->UnMap();
} }
inline const std::vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; } inline const std::vector<size_t> &mapped_image_pitch() const {
return mapped_image_pitch_;
}
private: private:
const Tensor *tensor_; const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_; std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard); DISABLE_COPY_AND_ASSIGN(MappingGuard);
}; };
private: private:
inline void *MappedBuffer() const { Allocator *allocator_;
if (OnHost()) {
return buffer_.get();
}
return data_;
}
Allocator *alloc_;
index_t size_;
DataType dtype_; DataType dtype_;
// Raw buffer, must be mapped as host accessable data before
// read or write
std::unique_ptr<void, std::function<void(void*)>> buffer_;
// Mapped buffer
mutable void *data_;
std::vector<index_t> shape_; std::vector<index_t> shape_;
// Image for opencl BufferBase *buffer_;
bool unused_; BufferSlice buffer_slice_;
bool is_image_; bool is_buffer_owner_;
std::vector<size_t> image_shape_;
std::vector<size_t> preallocated_image_shape_;
DISABLE_COPY_AND_ASSIGN(Tensor); DISABLE_COPY_AND_ASSIGN(Tensor);
}; };
} // namespace tensor } // namespace tensor
......
...@@ -6,21 +6,11 @@ ...@@ -6,21 +6,11 @@
#include <vector> #include <vector>
#include "mace/core/workspace.h" #include "mace/core/workspace.h"
#include "mace/core/serializer.h"
#include "mace/core/arg_helper.h" #include "mace/core/arg_helper.h"
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
#include "mace/utils/timer.h" #include "mace/utils/timer.h"
namespace mace { namespace mace {
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
}
Tensor *Workspace::CreateTensor(const std::string &name, Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc, Allocator *alloc,
DataType type) { DataType type) {
...@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name, ...@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name,
VLOG(3) << "Tensor " << name << " already exists. Skipping."; VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else { } else {
VLOG(3) << "Creating Tensor " << name; VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type))); tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
} }
return GetTensor(name); return GetTensor(name);
} }
bool Workspace::RemoveTensor(const std::string &name) {
auto it = tensor_map_.find(name);
if (it != tensor_map_.end()) {
VLOG(3) << "Removing blob " << name << " from this workspace.";
tensor_map_.erase(it);
return true;
}
return false;
}
const Tensor *Workspace::GetTensor(const std::string &name) const { const Tensor *Workspace::GetTensor(const std::string &name) const {
if (tensor_map_.count(name)) { if (tensor_map_.count(name)) {
return tensor_map_.at(name).get(); return tensor_map_.at(name).get();
...@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { ...@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
return nullptr; return nullptr;
} }
void Workspace::RemoveUnsedTensor() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
while(iter != end_iter) {
auto old_iter = iter++;
if(old_iter->second->unused()) {
tensor_map_.erase(old_iter);
}
}
}
Tensor *Workspace::GetTensor(const std::string &name) { Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>( return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name)); static_cast<const Workspace *>(this)->GetTensor(name));
}
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
} }
void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
MACE_LATENCY_LOGGER(1, "Load model tensors"); MACE_LATENCY_LOGGER(1, "Load model tensors");
Serializer serializer; index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr
|| reinterpret_cast<long long>(const_tensor.data())
< reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
}
}
for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size,
static_cast<index_t>(
(reinterpret_cast<long long>(const_tensor.data())
- reinterpret_cast<long long>(model_data_ptr))
+ const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())));
}
VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
}
for (auto &const_tensor : net_def.tensors()) { for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name() VLOG(3) << "Tensor name: " << const_tensor.name()
...@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { ...@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
<< ", shape: " << ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(), << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end())); const_tensor.dims().end()));
tensor_map_[const_tensor.name()] = std::vector<index_t> dims;
serializer.Deserialize(const_tensor, type); for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
index_t
offset = (long long) const_tensor.data() - (long long) model_data_ptr;
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(),
offset,
const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
} }
if (type == DeviceType::OPENCL) { if (type == DeviceType::OPENCL) {
CreateImageOutputTensor(net_def); CreateImageOutputTensor(net_def);
} }
...@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
return; return;
} }
preallocated_allocator_ =
std::move(std::unique_ptr<PreallocatedPooledAllocator>(
new OpenCLPreallocatedPooledAllocator));
DataType dtype = DataType::DT_INVALID; DataType dtype = DataType::DT_INVALID;
// We use the data type of the first op (with mem id, must be image), // We use the data type of the first op (with mem id, must be image),
...@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { ...@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
} }
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) { for (auto &mem_block: net_def.mem_arena().mem_block()) {
preallocated_allocator_->PreallocateImage(mem_block.mem_id(), std::unique_ptr<BufferBase>
{mem_block.x(), mem_block.y()}, image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
dtype); preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
} }
VLOG(3) << "Preallocate image to tensors"; VLOG(3) << "Preallocate image to tensors";
auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
for (auto &op: net_def.op()) { for (auto &op: net_def.op()) {
if (op.has_mem_id()) { if (op.has_mem_id()) {
CreateTensor(op.output(0), allocator, dtype); std::unique_ptr<Tensor> tensor
tensor_map_[op.output(0)]->PreallocateImage( (new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
preallocated_allocator_->GetImage(op.mem_id()), tensor_map_[op.output(0)] = std::move(tensor);
preallocated_allocator_->GetImageSize(op.mem_id()));
} }
} }
} }
......
...@@ -15,26 +15,23 @@ class Workspace { ...@@ -15,26 +15,23 @@ class Workspace {
public: public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap; typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
Workspace() Workspace() {}
: preallocated_allocator_(nullptr) {}
~Workspace() {} ~Workspace() {}
std::vector<std::string> Tensors() const; Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type); DataType type);
bool RemoveTensor(const std::string &name);
void RemoveUnsedTensor();
inline bool HasTensor(const std::string &name) const { inline bool HasTensor(const std::string &name) const {
return tensor_map_.count(name); return tensor_map_.find(name) != tensor_map_.end();
} }
const Tensor *GetTensor(const std::string &name) const; const Tensor *GetTensor(const std::string &name) const;
Tensor *GetTensor(const std::string &name); Tensor *GetTensor(const std::string &name);
std::vector<std::string> Tensors() const;
void LoadModelTensor(const NetDef &net_def, DeviceType type); void LoadModelTensor(const NetDef &net_def, DeviceType type);
private: private:
...@@ -42,9 +39,11 @@ class Workspace { ...@@ -42,9 +39,11 @@ class Workspace {
TensorMap tensor_map_; TensorMap tensor_map_;
std::unique_ptr<PreallocatedPooledAllocator> preallocated_allocator_; std::unique_ptr<BufferBase> tensor_buffer_;
PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace); DISABLE_COPY_AND_ASSIGN(Workspace);
}; };
} // namespace mace } // namespace mace
......
...@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { ...@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
} }
} }
} }
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { ...@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
} }
} }
} }
DoActivation(output_data, output_data, output->NumElements(), activation_, DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { ...@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
valid_h_stop, valid_w_start, valid_w_stop); valid_h_stop, valid_w_start, valid_w_stop);
output_ptr = output->mutable_data<T>(); output_ptr = output->mutable_data<T>();
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase { ...@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase {
} }
} }
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_); relux_max_limit_);
} }
}; };
......
...@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type, const BufferType type,
Tensor *image, Tensor *image,
StatsFuture *future) { StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape; std::vector<size_t> image_shape;
if (!i2b_) { if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape); CalImage2DShape(buffer->shape(), type, image_shape);
...@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} else { } else {
image->ResizeImage(buffer->shape(), image_shape); image->ResizeImage(buffer->shape(), image_shape);
} }
buffer->MarkUnused();
} else { } else {
image_shape = image->image_shape(); Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape();
buffer->Resize(image->shape()); buffer->Resize(image->shape());
} }
...@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer, ...@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options); built_options);
uint32_t idx = 0; uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(buffer->buffer()))); b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
}
if (type == ARGUMENT) { if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0))); b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) { } else if(type == WEIGHT_HEIGHT) {
......
#include <common.h> #include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_w, __private const int filter_w,
__private const int out_channel, __private const int out_channel,
__private const int in_channel, __private const int in_channel,
...@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o ...@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
const int in_channel_idx = w % rounded_in_channel; const int in_channel_idx = w % rounded_in_channel;
const int h_idx = hw_idx / filter_w; const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w; const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+ in_channel_idx; + in_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
...@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic ...@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
} }
__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w, __private const int filter_w,
__private const int in_channel, __private const int in_channel,
__private const int multiplier, __private const int multiplier,
...@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = w / filter_w; const int h_idx = w / filter_w;
const int w_idx = w % filter_w; const int w_idx = w % filter_w;
const int offset = mad24(mad24(h_idx, filter_w, w_idx), const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx); in_channel, in_channel_idx);
const int size = in_channel - in_channel_idx; const int size = in_channel - in_channel_idx;
...@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = hw_idx / filter_w; const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w; const int w_idx = hw_idx % filter_w;
const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx), const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx), in_channel, in_channel_idx),
multiplier, m); multiplier, m);
// TODO support multiplier > 1 // TODO support multiplier > 1
...@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w ...@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
} }
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ ...@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const int height_idx = h % height; const int height_idx = h % height;
const int width_idx = w % width; const int width_idx = w % width;
const int channel_idx = w / width * 4; const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
const int size = channels - channel_idx; const int size = channels - channel_idx;
...@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ ...@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
} }
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count, __private const int count,
__write_only image2d_t output) { __write_only image2d_t output) {
int w = get_global_id(0); int w = get_global_id(0);
int h = get_global_id(1); int h = get_global_id(1);
const int offset = w * 4;
const int size = count - offset; const int offset = input_offset + w * 4;
const int size = count - w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (size < 4) { if (size < 4) {
switch(size) { switch(size) {
...@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ ...@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n ...@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
const int height_idx = (h % height_blks) << 2; const int height_idx = (h % height_blks) << 2;
const int width_idx = w % width; const int width_idx = w % width;
const int channel_idx = w / width; const int channel_idx = w / width;
int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
int size = height - height_idx; int size = height - height_idx;
...@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc ...@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height, __private const int height,
__private const int width, __private const int width,
__private const int channels, __private const int channels,
...@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n ...@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
const int height_idx = h % height; const int height_idx = h % height;
const int width_idx = (w % width) << 2; const int width_idx = (w % width) << 2;
const int channel_idx = w / width; const int channel_idx = w / width;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx; + channel_idx;
int size = width - width_idx; int size = width - width_idx;
...@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n ...@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
// only support 3x3 now // only support 3x3 now
__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels, __private const int in_channels,
__private const int height, __private const int height,
__private const int width, __private const int width,
...@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / ...@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
const int out_channels = get_global_size(1); const int out_channels = get_global_size(1);
const int out_channel_idx = h; const int out_channel_idx = h;
const int in_channel_idx = w << 2; const int in_channel_idx = w << 2;
const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width; const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36); const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0}; DATA_TYPE in[36] = {0};
DATA_TYPE4 tt; DATA_TYPE4 tt;
...@@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / ...@@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
tu3[3] = tu3[2]; tu3[3] = tu3[2];
tu3[2] = tt - tu3[1] / 2; tu3[2] = tt - tu3[1] / 2;
tu3[1] = tt + tu3[1] / 2; tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h); int2 coord = (int2)(w, h);
#pragma unroll #pragma unroll
for (short i = 0; i < 4; ++i) { for (short i = 0; i < 4; ++i) {
......
...@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0); index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1); index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] || if (strides_[0] != strides_[1] ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with " LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << "," << "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1] << " stride " << strides_[0] << "x" << strides_[1]
<< ",dilations " << dilations_[0] << "x" << dilations_[1] << ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet."; << " is not implemented yet.";
MACE_NOT_IMPLEMENTED; MACE_NOT_IMPLEMENTED;
} }
......
...@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (kernel_.get() == nullptr) { if (kernel_.get() == nullptr) {
MACE_CHECK(out_height > 0 && out_width > 0); MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels}; std::vector<index_t> output_shape{batch, out_height, out_width, channels};
if (input->is_image()) {
std::vector<size_t> output_image_shape; std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); CalImage2DShape(output_shape,
output->ResizeImage(output_shape, output_image_shape); BufferType::IN_OUT_CHANNEL,
} else { output_image_shape);
output->Resize(output_shape); output->ResizeImage(output_shape, output_image_shape);
}
float height_scale = float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_); CalculateResizeScale(in_height, out_height, align_corners_);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册