From bcbcbf7a01470fb5c66ab16a3229bb4fc9a1c670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Mon, 5 Mar 2018 10:16:24 +0800 Subject: [PATCH] Refactor tensor, accelerate gpu initialization time --- mace/core/allocator.h | 34 +- mace/core/buffer.h | 356 ++++++++++++++++++ mace/core/mace.cc | 1 - mace/core/net.cc | 8 +- mace/core/net.h | 6 +- mace/core/preallocated_pooled_allocator.h | 24 +- .../runtime/hexagon/hexagon_control_wrapper.h | 3 - mace/core/runtime/opencl/opencl_allocator.cc | 24 +- mace/core/runtime/opencl/opencl_allocator.h | 16 +- .../opencl_preallocated_pooled_allocator.cc | 30 -- .../opencl_preallocated_pooled_allocator.h | 45 --- mace/core/serializer.cc | 82 ---- mace/core/serializer.h | 29 -- mace/core/tensor.h | 292 +++++--------- mace/core/workspace.cc | 112 +++--- mace/core/workspace.h | 23 +- mace/kernels/batch_norm.h | 2 +- mace/kernels/conv_2d.h | 2 +- mace/kernels/depthwise_conv2d.h | 2 +- mace/kernels/fully_connected.h | 2 +- mace/kernels/opencl/buffer_to_image.cc | 11 +- mace/kernels/opencl/cl/buffer_to_image.cl | 29 +- mace/kernels/opencl/conv_2d_opencl.cc | 3 +- mace/kernels/opencl/resize_bilinear_opencl.cc | 13 +- 24 files changed, 623 insertions(+), 526 deletions(-) create mode 100644 mace/core/buffer.h delete mode 100644 mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc delete mode 100644 mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h delete mode 100644 mace/core/serializer.cc delete mode 100644 mace/core/serializer.h diff --git a/mace/core/allocator.h b/mace/core/allocator.h index f265ed24..7ab65f0b 100644 --- a/mace/core/allocator.h +++ b/mace/core/allocator.h @@ -26,17 +26,17 @@ class Allocator { public: Allocator() {} virtual ~Allocator() noexcept {} - virtual void *New(size_t nbytes) = 0; + virtual void *New(size_t nbytes) const = 0; virtual void *NewImage(const std::vector &image_shape, - const DataType dt) = 0; - virtual void Delete(void *data) = 0; - virtual void DeleteImage(void *data) = 0; - virtual void *Map(void *buffer, size_t nbytes) = 0; + const DataType dt) const = 0; + virtual void Delete(void *data) const = 0; + virtual void DeleteImage(void *data) const = 0; + virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0; virtual void *MapImage(void *buffer, const std::vector &image_shape, - std::vector &mapped_image_pitch) = 0; - virtual void Unmap(void *buffer, void *mapper_ptr) = 0; - virtual bool OnHost() = 0; + std::vector *mapped_image_pitch) const = 0; + virtual void Unmap(void *buffer, void *mapper_ptr) const = 0; + virtual bool OnHost() const = 0; template T *New(size_t num_elements) { @@ -52,7 +52,7 @@ class Allocator { class CPUAllocator : public Allocator { public: ~CPUAllocator() override {} - void *New(size_t nbytes) override { + void *New(size_t nbytes) const override { VLOG(3) << "Allocate CPU buffer: " << nbytes; void *data = nullptr; #ifdef __ANDROID__ @@ -67,27 +67,29 @@ class CPUAllocator : public Allocator { } void *NewImage(const std::vector &shape, - const DataType dt) override { + const DataType dt) const override { LOG(FATAL) << "Allocate CPU image"; return nullptr; } - void Delete(void *data) override { + void Delete(void *data) const override { VLOG(3) << "Free CPU buffer"; free(data); } - void DeleteImage(void *data) override { + void DeleteImage(void *data) const override { LOG(FATAL) << "Free CPU image"; free(data); }; - void *Map(void *buffer, size_t nbytes) override { return buffer; } + void *Map(void *buffer, size_t offset, size_t nbytes) const override { + return (char*)buffer + offset; + } void *MapImage(void *buffer, const std::vector &image_shape, - std::vector &mapped_image_pitch) override { + std::vector *mapped_image_pitch) const override { return buffer; } - void Unmap(void *buffer, void *mapper_ptr) override {} - bool OnHost() override { return true; } + void Unmap(void *buffer, void *mapper_ptr) const override {} + bool OnHost() const override { return true; } }; std::map *gAllocatorRegistry(); diff --git a/mace/core/buffer.h b/mace/core/buffer.h new file mode 100644 index 00000000..bada99c0 --- /dev/null +++ b/mace/core/buffer.h @@ -0,0 +1,356 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#ifndef MACE_CORE_BUFFER_H_ +#define MACE_CORE_BUFFER_H_ + +#include "mace/core/types.h" +#include "mace/core/allocator.h" +#include + +namespace mace { + +class BufferBase { + public: + BufferBase() : size_(0) {} + BufferBase(index_t size) : size_(size) {} + virtual ~BufferBase() {} + + virtual void *buffer() = 0; + + virtual const void *raw_data() const = 0; + + virtual void *raw_mutable_data() = 0; + + virtual void *Map(index_t offset, + index_t length, + std::vector *pitch) const = 0; + + virtual void UnMap(void *mapped_ptr) const = 0; + + virtual void Map(std::vector *pitch) = 0; + + virtual void UnMap() = 0; + + virtual void Resize(index_t size) = 0; + + virtual void Copy(void *src, index_t offset, index_t length) = 0; + + virtual bool OnHost() const = 0; + + virtual index_t offset() const { + return 0; + }; + + template + const T *data() const { + return reinterpret_cast(raw_data()); + } + + template + T *mutable_data() { + return reinterpret_cast(raw_mutable_data()); + } + + index_t size() const { + return size_; + } + + protected: + index_t size_; +}; + +class Buffer : public BufferBase { + public: + Buffer(Allocator *allocator) + : BufferBase(0), + allocator_(allocator), + buf_(nullptr), + mapped_buf_(nullptr), + is_data_owner_(true) {} + + Buffer(Allocator *allocator, index_t size) + : BufferBase(size), + allocator_(allocator), + mapped_buf_(nullptr), + is_data_owner_(true) { + buf_ = allocator->New(size); + } + + Buffer(Allocator *allocator, void *data, index_t size) + : BufferBase(size), + allocator_(allocator), + buf_(data), + mapped_buf_(nullptr), + is_data_owner_(false) {} + + virtual ~Buffer() { + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (is_data_owner_ && buf_ != nullptr) { + allocator_->Delete(buf_); + } + } + + void *buffer() { + MACE_CHECK_NOTNULL(buf_); + return buf_; + }; + + const void *raw_data() const { + if (OnHost()) { + MACE_CHECK_NOTNULL(buf_); + return buf_; + } else { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } + } + + void *raw_mutable_data() { + if (OnHost()) { + MACE_CHECK_NOTNULL(buf_); + return buf_; + } else { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } + } + + void *Map(index_t offset, index_t length, std::vector *pitch) const { + MACE_CHECK_NOTNULL(buf_); + return allocator_->Map(buf_, offset, length); + } + + void UnMap(void *mapped_ptr) const { + MACE_CHECK_NOTNULL(buf_); + MACE_CHECK_NOTNULL(mapped_ptr); + allocator_->Unmap(buf_, mapped_ptr); + } + + void Map(std::vector *pitch) { + MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped"); + mapped_buf_ = Map(0, size_, pitch); + }; + + void UnMap() { + UnMap(mapped_buf_); + mapped_buf_ = nullptr; + } + + void Resize(index_t size) { + MACE_CHECK(is_data_owner_, + "data is not owned by this buffer, cannot resize"); + if (size != size_) { + if (buf_ != nullptr) { + allocator_->Delete(buf_); + } + size_ = size; + buf_ = allocator_->New(size); + } + } + + void Copy(void *src, index_t offset, index_t length) { + MACE_CHECK_NOTNULL(mapped_buf_); + MACE_CHECK(length <= size_, "out of buffer"); + memcpy(mapped_buf_, (char *) src + offset, length); + } + + bool OnHost() const { + return allocator_->OnHost(); + } + + private: + Allocator *allocator_; + void *buf_; + void *mapped_buf_; + bool is_data_owner_; + + DISABLE_COPY_AND_ASSIGN(Buffer); +}; + +class Image : public BufferBase { + public: + Image() + : BufferBase(0), + allocator_(GetDeviceAllocator(OPENCL)), + buf_(nullptr), + mapped_buf_(nullptr) {} + + Image(std::vector shape, DataType data_type) + : BufferBase(std::accumulate(shape.begin(), shape.end(), + 1, std::multiplies()) + * GetEnumTypeSize(data_type)), + allocator_(GetDeviceAllocator(OPENCL)), + mapped_buf_(nullptr) { + shape_ = shape; + data_type_ = data_type; + buf_ = allocator_->NewImage(shape, data_type); + } + + virtual ~Image() { + if (mapped_buf_ != nullptr) { + UnMap(); + } + if (buf_ != nullptr) { + allocator_->DeleteImage(buf_); + } + } + + void *buffer() { + MACE_CHECK_NOTNULL(buf_); + return buf_; + }; + + const void *raw_data() const { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } + + void *raw_mutable_data() { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } + + std::vector image_shape() const { + return shape_; + } + + void *Map(index_t offset, index_t length, std::vector *pitch) const { + MACE_NOT_IMPLEMENTED; + return nullptr; + } + + void UnMap(void *mapped_ptr) const { + MACE_CHECK_NOTNULL(buf_); + MACE_CHECK_NOTNULL(mapped_ptr); + allocator_->Unmap(buf_, mapped_ptr); + } + + void Map(std::vector *pitch) { + MACE_CHECK_NOTNULL(buf_); + MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped"); + MACE_CHECK_NOTNULL(pitch); + mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch); + }; + + void UnMap() { + UnMap(mapped_buf_); + mapped_buf_ = nullptr; + }; + + void Resize(index_t size) {} + + void Copy(void *src, index_t offset, index_t length) { + MACE_NOT_IMPLEMENTED; + } + + bool OnHost() const { + return allocator_->OnHost(); + } + + private: + Allocator *allocator_; + std::vector shape_; + DataType data_type_; + void *buf_; + void *mapped_buf_; + + DISABLE_COPY_AND_ASSIGN(Image); +}; + +class BufferSlice : public BufferBase { + public: + BufferSlice() {} + BufferSlice(BufferBase *buffer, index_t offset, index_t length) + : BufferBase(buffer->size()), + buffer_(buffer), + mapped_buf_(nullptr), + offset_(offset), + length_(length) { + MACE_CHECK(offset >= 0, "buffer slice offset should >= 0"); + MACE_CHECK(offset + length <= size_, + "buffer slice offset + length (", + offset, + " + ", + length, + ") should <= ", + size_); + } + BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_, + other.offset_, + other.length_) {} + + ~BufferSlice() { + if (mapped_buf_ != nullptr) { + UnMap(); + } + } + + void *buffer() { + return buffer_->buffer(); + }; + + const void *raw_data() const { + if (OnHost()) { + MACE_CHECK_NOTNULL(buffer_); + return (char *) buffer_->raw_data() + offset_; + } else { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } + } + + void *raw_mutable_data() { + MACE_NOT_IMPLEMENTED; + return nullptr; + } + + void *Map(index_t offset, index_t length, std::vector *pitch) const { + MACE_NOT_IMPLEMENTED; + return nullptr; + } + + void UnMap(void *mapped_ptr) const { + MACE_NOT_IMPLEMENTED; + } + + void Map(std::vector *pitch) { + MACE_CHECK_NOTNULL(buffer_); + MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null"); + mapped_buf_ = buffer_->Map(offset_, length_, pitch); + }; + + void UnMap() { + MACE_CHECK_NOTNULL(mapped_buf_); + buffer_->UnMap(mapped_buf_); + mapped_buf_ = nullptr; + }; + + void Resize(index_t size) { + } + + void Copy(void *src, index_t offset, index_t length) { + MACE_NOT_IMPLEMENTED; + } + + index_t offset() const { + return offset_; + } + + bool OnHost() const { + return buffer_->OnHost(); + } + + private: + BufferBase *buffer_; + void *mapped_buf_; + index_t offset_; + index_t length_; +}; + +} + +#endif // MACE_CORE_BUFFER_H_ diff --git a/mace/core/mace.cc b/mace/core/mace.cc index 81e26246..58591a78 100644 --- a/mace/core/mace.cc +++ b/mace/core/mace.cc @@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) : if (!net->Run()) { LOG(FATAL) << "Net init run failed"; } - ws_->RemoveUnsedTensor(); net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type)); } } diff --git a/mace/core/net.cc b/mace/core/net.cc index 46febc68..ce44b951 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr op_registry, DeviceType type) : op_registry_(op_registry), name_(net_def->name()) {} -SimpleNet::SimpleNet(const std::shared_ptr op_registry, +SerialNet::SerialNet(const std::shared_ptr op_registry, const std::shared_ptr net_def, Workspace *ws, DeviceType type, const NetMode mode) : NetBase(op_registry, net_def, ws, type), device_type_(type) { - MACE_LATENCY_LOGGER(1, "Constructing SimpleNet ", net_def->name()); + MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name()); for (int idx = 0; idx < net_def->op_size(); ++idx) { const auto &operator_def = net_def->op(idx); VLOG(3) << "Creating operator " << operator_def.name() << "(" @@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr op_registry, } } -bool SimpleNet::Run(RunMetadata *run_metadata) { +bool SerialNet::Run(RunMetadata *run_metadata) { MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { @@ -99,7 +99,7 @@ std::unique_ptr CreateNet( Workspace *ws, DeviceType type, const NetMode mode) { - std::unique_ptr net(new SimpleNet(op_registry, net_def, ws, type, mode)); + std::unique_ptr net(new SerialNet(op_registry, net_def, ws, type, mode)); return net; } diff --git a/mace/core/net.h b/mace/core/net.h index 1f57794f..3b625393 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -33,9 +33,9 @@ class NetBase { DISABLE_COPY_AND_ASSIGN(NetBase); }; -class SimpleNet : public NetBase { +class SerialNet : public NetBase { public: - SimpleNet(const std::shared_ptr op_registry, + SerialNet(const std::shared_ptr op_registry, const std::shared_ptr net_def, Workspace *ws, DeviceType type, @@ -47,7 +47,7 @@ class SimpleNet : public NetBase { std::vector > operators_; DeviceType device_type_; - DISABLE_COPY_AND_ASSIGN(SimpleNet); + DISABLE_COPY_AND_ASSIGN(SerialNet); }; std::unique_ptr CreateNet( diff --git a/mace/core/preallocated_pooled_allocator.h b/mace/core/preallocated_pooled_allocator.h index dcb35070..75cf4117 100644 --- a/mace/core/preallocated_pooled_allocator.h +++ b/mace/core/preallocated_pooled_allocator.h @@ -5,6 +5,7 @@ #ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ #define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_ +#include #include "mace/core/allocator.h" namespace mace { @@ -13,17 +14,26 @@ class PreallocatedPooledAllocator { public: PreallocatedPooledAllocator() {} - virtual ~PreallocatedPooledAllocator() noexcept {} + ~PreallocatedPooledAllocator() noexcept {} - virtual void PreallocateImage(int mem_id, - const std::vector &image_shape, - DataType data_type) = 0; + void SetBuffer(int mem_id, std::unique_ptr &&buffer) { + buffers_[mem_id] = std::move(buffer); + } - virtual void *GetImage(int mem_id) = 0; + BufferBase *GetBuffer(int mem_id) { + if (buffers_.find(mem_id) != buffers_.end()) { + return buffers_[mem_id].get(); + } else { + return nullptr; + } + } - virtual bool HasImage(int mem_id) = 0; + virtual bool HasBuffer(int mem_id) { + return buffers_.find(mem_id) != buffers_.end(); + } - virtual std::vector GetImageSize(int mem_id) = 0; + private: + std::unordered_map> buffers_; }; } // namespace mace diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h index f24f9522..09a1c778 100644 --- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h +++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h @@ -11,7 +11,6 @@ #include "mace/core/runtime/hexagon/quantize.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" -#include "mace/core/serializer.h" namespace mace { @@ -37,7 +36,6 @@ class HexagonControlWrapper { void SetGraphMode(int mode); private: - // CAVEAT: Need offset as HVX library reserves some ids static constexpr int NODE_ID_OFFSET = 10000; inline uint32_t node_id(uint32_t nodeid) { @@ -45,7 +43,6 @@ class HexagonControlWrapper { } int nn_id_; - Serializer serializer_; Quantizer quantizer_; std::vector> input_shapes_; diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 929b4818..abc88bdd 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) { OpenCLAllocator::OpenCLAllocator() {} OpenCLAllocator::~OpenCLAllocator() {} -void *OpenCLAllocator::New(size_t nbytes) { +void *OpenCLAllocator::New(size_t nbytes) const { VLOG(3) << "Allocate OpenCL buffer: " << nbytes; cl_int error; cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(), @@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) { } void *OpenCLAllocator::NewImage(const std::vector &image_shape, - const DataType dt) { + const DataType dt) const { MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2"; VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1]; @@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector &image_shape, return cl_image; } -void OpenCLAllocator::Delete(void *buffer) { +void OpenCLAllocator::Delete(void *buffer) const { VLOG(3) << "Free OpenCL buffer"; if (buffer != nullptr) { cl::Buffer *cl_buffer = static_cast(buffer); @@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) { } } -void OpenCLAllocator::DeleteImage(void *buffer) { +void OpenCLAllocator::DeleteImage(void *buffer) const { VLOG(3) << "Free OpenCL image"; if (buffer != nullptr) { cl::Image2D *cl_image = static_cast(buffer); @@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) { } } -void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { +void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const { auto cl_buffer = static_cast(buffer); auto queue = OpenCLRuntime::Global()->command_queue(); // TODO(heliangliang) Non-blocking call cl_int error; void *mapped_ptr = - queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, + queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset, nbytes, nullptr, nullptr, &error); MACE_CHECK(error == CL_SUCCESS); return mapped_ptr; @@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) { // TODO : there is something wrong with half type. void *OpenCLAllocator::MapImage(void *buffer, const std::vector &image_shape, - std::vector &mapped_image_pitch) { + std::vector *mapped_image_pitch) const { MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image"; auto cl_image = static_cast(buffer); std::array origin = {0, 0, 0}; std::array region = {image_shape[0], image_shape[1], 1}; - mapped_image_pitch.resize(2); + mapped_image_pitch->resize(2); cl_int error; void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region, - &mapped_image_pitch[0], - &mapped_image_pitch[1], + mapped_image_pitch->data(), + mapped_image_pitch->data() + 1, nullptr, nullptr, &error); MACE_CHECK(error == CL_SUCCESS) << error; return mapped_ptr; } -void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) { +void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const { auto cl_buffer = static_cast(buffer); auto queue = OpenCLRuntime::Global()->command_queue(); MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr, nullptr) == CL_SUCCESS); } -bool OpenCLAllocator::OnHost() { return false; } +bool OpenCLAllocator::OnHost() const { return false; } } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h index 6aa8fbf9..7065b132 100644 --- a/mace/core/runtime/opencl/opencl_allocator.h +++ b/mace/core/runtime/opencl/opencl_allocator.h @@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator { ~OpenCLAllocator() override; - void *New(size_t nbytes) override; + void *New(size_t nbytes) const override; /* * Use Image2D with RGBA (128-bit) format to represent the image. @@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator { * @ shape : [depth, ..., height, width ]. */ void *NewImage(const std::vector &image_shape, - const DataType dt) override; + const DataType dt) const override; - void Delete(void *buffer) override; + void Delete(void *buffer) const override; - void DeleteImage(void *buffer) override; + void DeleteImage(void *buffer) const override; - void *Map(void *buffer, size_t nbytes) override; + void *Map(void *buffer, size_t offset, size_t nbytes) const override; void *MapImage(void *buffer, const std::vector &image_shape, - std::vector &mapped_image_pitch) override; + std::vector *mapped_image_pitch) const override; - void Unmap(void *buffer, void *mapped_ptr) override; + void Unmap(void *buffer, void *mapped_ptr) const override; - bool OnHost() override; + bool OnHost() const override; }; } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc deleted file mode 100644 index f1ef73a9..00000000 --- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.cc +++ /dev/null @@ -1,30 +0,0 @@ -// -// Copyright (c) 2018 XiaoMi All rights reserved. -// - -#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h" - -namespace mace { - -OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator() - : allocator(GetDeviceAllocator(DeviceType::OPENCL)) { -} - -OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() { -} - -void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id, - const std::vector< - size_t> &image_shape, - DataType data_type) { - MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id); - VLOG(2) << "Preallocate OpenCL image: " << mem_id << " " - << image_shape[0] << ", " << image_shape[1]; - images_[mem_id] = std::move(std::unique_ptr>( - allocator->NewImage(image_shape, data_type), [this](void *p) { - this->allocator->DeleteImage(p); - })); - image_shapes_[mem_id] = image_shape; -} - -} // namespace mace diff --git a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h b/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h deleted file mode 100644 index 960ff174..00000000 --- a/mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h +++ /dev/null @@ -1,45 +0,0 @@ -// -// Copyright (c) 2018 XiaoMi All rights reserved. -// - -#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_ -#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_ - -#include "mace/core/preallocated_pooled_allocator.h" -#include - -namespace mace { - -class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator { - public: - OpenCLPreallocatedPooledAllocator(); - - ~OpenCLPreallocatedPooledAllocator() override; - - void PreallocateImage(int mem_id, - const std::vector &image_shape, - DataType data_type) override; - - inline void *GetImage(int mem_id) override { - MACE_CHECK(HasImage(mem_id), "image does not exist"); - return images_[mem_id].get(); - } - - inline bool HasImage(int mem_id) override { - return images_.find(mem_id) != images_.end(); - } - - inline std::vector GetImageSize(int mem_id) override { - return image_shapes_[mem_id]; - } - - private: - std::unordered_map>> - images_; - std::unordered_map> image_shapes_; - Allocator *allocator; -}; - -} // namepsace mace - -#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_ diff --git a/mace/core/serializer.cc b/mace/core/serializer.cc deleted file mode 100644 index c171205f..00000000 --- a/mace/core/serializer.cc +++ /dev/null @@ -1,82 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/core/serializer.h" - -namespace mace { - -std::unique_ptr Serializer::Serialize(const Tensor &tensor, - const std::string &name) { - MACE_NOT_IMPLEMENTED; - return nullptr; -} - -std::unique_ptr Serializer::Deserialize(const ConstTensor &const_tensor, - DeviceType type) { - std::unique_ptr tensor( - new Tensor(GetDeviceAllocator(type), const_tensor.data_type())); - std::vector dims; - for (const index_t d : const_tensor.dims()) { - dims.push_back(d); - } - tensor->Resize(dims); - - switch (const_tensor.data_type()) { - case DT_HALF: - tensor->Copy(reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_FLOAT: - tensor->Copy(reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_DOUBLE: - tensor->Copy( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_INT32: - tensor->Copy( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_INT64: - tensor->Copy( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_UINT8: - tensor->Copy( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_INT16: - tensor->CopyWithCast( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_INT8: - tensor->CopyWithCast( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_UINT16: - tensor->CopyWithCast( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - case DT_BOOL: - tensor->CopyWithCast( - reinterpret_cast(const_tensor.data()), - const_tensor.data_size()); - break; - default: - MACE_NOT_IMPLEMENTED; - break; - } - - return tensor; -} - -} // namespace mace diff --git a/mace/core/serializer.h b/mace/core/serializer.h deleted file mode 100644 index fcc98a72..00000000 --- a/mace/core/serializer.h +++ /dev/null @@ -1,29 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_CORE_SERIALIZER_H_ -#define MACE_CORE_SERIALIZER_H_ - -#include "mace/core/tensor.h" -#include "mace/public/mace.h" - -namespace mace { - -class Serializer { - public: - Serializer() {} - ~Serializer() {} - - std::unique_ptr Serialize(const Tensor &tensor, - const std::string &name); - - std::unique_ptr Deserialize(const ConstTensor &const_tensor, - DeviceType type); - - DISABLE_COPY_AND_ASSIGN(Serializer); -}; - -} // namespace mace - -#endif // MACE_CORE_SERIALIZER_H_ diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 0fcda89c..cfe832ed 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -5,7 +5,7 @@ #ifndef MACE_CORE_TENSOR_H_ #define MACE_CORE_TENSOR_H_ -#include "mace/core/allocator.h" +#include "mace/core/buffer.h" #include "mace/utils/logging.h" #include "mace/core/types.h" #include "mace/public/mace.h" @@ -46,7 +46,6 @@ namespace mace { CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \ , LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;) - namespace numerical_chars { inline std::ostream &operator<<(std::ostream &os, char c) { return std::is_signed::value ? os << static_cast(c) @@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) { class Tensor { public: - Tensor() - : alloc_(GetDeviceAllocator(DeviceType::CPU)), - size_(0), - dtype_(DT_FLOAT), - buffer_(nullptr), - data_(nullptr), - unused_(false), - is_image_(false){ -}; - Tensor(Allocator *alloc, DataType type) - : alloc_(alloc), - size_(0), - dtype_(type), - buffer_(nullptr), - data_(nullptr), - unused_(false), - is_image_(false){ - }; + : allocator_(alloc), + dtype_(type), + buffer_(nullptr), + is_buffer_owner_(true) {}; + + Tensor(BufferBase *buffer, DataType dtype) + : dtype_(dtype), + buffer_(buffer), + is_buffer_owner_(false) {} + + Tensor(const BufferSlice &buffer_slice, DataType dtype) + : dtype_(dtype), + buffer_slice_(buffer_slice), + is_buffer_owner_(false) { + buffer_ = &buffer_slice_; + } + + Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {} ~Tensor() { + if (is_buffer_owner_ && buffer_ != nullptr) { + delete buffer_; + } } inline DataType dtype() const { return dtype_; } @@ -93,182 +95,113 @@ class Tensor { inline const std::vector &shape() const { return shape_; } - inline const std::vector &image_shape() const { return image_shape_; } - - inline const bool is_image() const { return is_image_; } - inline index_t dim_size() const { return shape_.size(); } inline index_t dim(unsigned int index) const { MACE_CHECK(index < shape_.size(), "Dim out of range: ", - index, " >= ", shape_.size()); + index, " >= ", shape_.size()); return shape_[index]; } - inline index_t size() const { return size_; } - - inline index_t raw_size() const { return size_ * SizeOfType(); } - - inline const bool unused() const { return unused_; } - - inline int64_t NumElements() const { + inline index_t size() const { return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); } - inline const bool OnHost() const { return alloc_->OnHost(); } - - /* - * Map the device buffer as CPU buffer to access the data, unmap must be - * called later - */ - inline void Map() const { - if (!OnHost()) { - MACE_CHECK(buffer_ != nullptr && data_ == nullptr); - data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType()); - } + inline index_t raw_size() const { + return size() * SizeOfType(); } - inline void MapImage(std::vector &mapped_image_pitch) const { - MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr); - data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch); + inline void *buffer() const { + MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr, + "buffer is null"); + return buffer_->buffer(); } - /* - * Unmap the device buffer - */ - inline void Unmap() const { - if (!OnHost()) { - MACE_CHECK(buffer_ != nullptr && data_ != nullptr); - alloc_->Unmap(buffer_.get(), data_); - data_ = nullptr; - } + inline index_t buffer_offset() const { + return buffer_->offset(); } - void *buffer() const { return buffer_.get(); } - inline const void *raw_data() const { - void *data = MappedBuffer(); - MACE_CHECK(data != nullptr || size_ == 0, - "The tensor is of non-zero shape, but its data is not allocated " - "or mapped yet."); - return data; + MACE_CHECK(buffer_ != nullptr, "buffer is null"); + return buffer_->raw_data(); } - template + template inline const T *data() const { - return static_cast(raw_data()); + MACE_CHECK(buffer_ != nullptr, "buffer is null"); + return buffer_->data(); } inline void *raw_mutable_data() { - void *data = MappedBuffer(); - MACE_CHECK(data != nullptr || size_ == 0, - "The tensor is of non-zero shape, but its data is not allocated " - "or mapped yet."); - return data; + MACE_CHECK(buffer_ != nullptr, "buffer is null"); + return buffer_->raw_mutable_data(); } - template + template inline T *mutable_data() { - return static_cast(raw_mutable_data()); + MACE_CHECK(buffer_ != nullptr, "buffer is null"); + return static_cast(buffer_->raw_mutable_data()); } - inline void Resize(const std::vector &shape) { - MACE_CHECK(!is_image_ || buffer_ == nullptr, - "Resize is not for image, use ResizeImage instead."); - is_image_ = false; + inline void Reshape(const std::vector &shape) { shape_ = shape; - index_t size = NumElements(); - if (size_ != size) { - size_ = size; - MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize"); - CASES(dtype_, - (buffer_ = - std::move(std::unique_ptr>( - alloc_->New(size_ * sizeof(T)), - [this](void *p) { - this->alloc_->Delete(p); - }) - ))); - } + MACE_CHECK(raw_size() <= buffer_->size()); } - inline void ResizeImage(const std::vector &shape, - const std::vector &image_shape) { - MACE_CHECK(is_image_ || buffer_ == nullptr, - "ResizeImage is not for buffer, use Resize instead."); - is_image_ = true; + inline void Resize(const std::vector &shape) { shape_ = shape; - index_t size = NumElements(); - if (size_ != size) { - size_ = size; - image_shape_ = image_shape; - if (!preallocated_image_shape_.empty()) { - MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0] - && preallocated_image_shape_[1] >= image_shape[1], - "image shape not large enough: preallocated ", - preallocated_image_shape_[0], - " ", - preallocated_image_shape_[1], - "apply for ", - image_shape[0], - " ", - image_shape[1]); - } else { - buffer_ = std::move(std::unique_ptr>( - alloc_->NewImage(image_shape, dtype_), - [this](void *p) { this->alloc_->DeleteImage(p); })); - preallocated_image_shape_ = image_shape; - } + if (buffer_ != nullptr) { + buffer_->Resize(raw_size()); + } else { + buffer_ = new Buffer(allocator_, raw_size()); + is_buffer_owner_ = true; } } inline void ResizeLike(const Tensor &other) { - if (other.is_image()) { - ResizeImage(other.shape(), other.image_shape()); - } else { - Resize(other.shape()); - } + Resize(other.shape()); } inline void ResizeLike(const Tensor *other) { - if (other->is_image()) { - ResizeImage(other->shape(), other->image_shape()); - } else { - Resize(other->shape()); + Resize(other->shape()); + } + + inline void ResizeImage(const std::vector &shape, + const std::vector &image_shape) { + shape_ = shape; + if (buffer_ == nullptr) { + buffer_ = new Image(image_shape, dtype_); + is_buffer_owner_ = true; } } - inline void PreallocateImage(void *image, - const std::vector& image_shape) { - is_image_ = true; - buffer_ = std::move(std::unique_ptr>( - image, [](void *p) { - // tensor does not have ownership of preallocated memory - })); - preallocated_image_shape_ = image_shape; + inline void CopyBytes(const void *src, size_t size) { + MappingGuard guard(this); + memcpy(buffer_->raw_mutable_data(), src, size); } - template - inline void Copy(const T *src, index_t size) { - MACE_CHECK(size == size_, "copy src and dst with different size."); - CopyBytes(static_cast(src), sizeof(T) * size); + template + inline void Copy(const T *src, index_t length) { + MACE_CHECK(length == size(), "copy src and dst with different size."); + CopyBytes(static_cast(src), sizeof(T) * length); } - template - inline void CopyWithCast(const SrcType *src, size_t size) { - MACE_CHECK(static_cast(size) == size_, - "copy src and dst with different size."); - std::unique_ptr buffer(new DstType[size]); - for (size_t i = 0; i < size; ++i) { - buffer[i] = static_cast(src[i]); - } - CopyBytes(static_cast(buffer.get()), sizeof(DstType) * size); + inline void Copy(const Tensor &other) { + dtype_ = other.dtype_; + ResizeLike(other); + MappingGuard map_other(&other); + CopyBytes(other.raw_data(), other.size() * SizeOfType()); } - inline void CopyBytes(const void *src, size_t size) { - MappingGuard map_this(this); - memcpy(raw_mutable_data(), src, size); + inline size_t SizeOfType() const { + size_t type_size = 0; + CASES(dtype_, type_size = sizeof(T)); + return type_size; + } + + inline BufferBase *UnderlyingBuffer() const { + return buffer_; } inline void DebugPrint() const { @@ -281,8 +214,8 @@ class Tensor { os.str(""); os.clear(); MappingGuard guard(this); - for (int i = 0; i < size_; ++i) { - if ( i != 0 && i % shape_[3] == 0) { + for (int i = 0; i < size(); ++i) { + if (i != 0 && i % shape_[3] == 0) { os << "\n"; } CASES(dtype_, (os << (this->data()[i]) << ", ")); @@ -291,37 +224,11 @@ class Tensor { << dim(2) << ", " << dim(3) << "], content:\n" << os.str(); } - inline size_t SizeOfType() const { - size_t type_size = 0; - CASES(dtype_, type_size = sizeof(T)); - return type_size; - } - - inline void Copy(const Tensor &other) { - alloc_ = other.alloc_; - dtype_ = other.dtype_; - ResizeLike(other); - MappingGuard map_other(&other); - if (is_image_) { - LOG(FATAL) << "Not support copy image tensor, please use Copy API."; - } else { - CopyBytes(other.raw_data(), size_ * SizeOfType()); - } - } - - inline void MarkUnused() { - this->unused_ = true; - } - class MappingGuard { public: MappingGuard(const Tensor *tensor) : tensor_(tensor) { if (tensor_ != nullptr) { - if (tensor_->is_image()) { - tensor_->MapImage(mapped_image_pitch_); - } else { - tensor_->Map(); - } + tensor_->buffer_->Map(&mapped_image_pitch_); } } @@ -331,42 +238,29 @@ class Tensor { } ~MappingGuard() { - if (tensor_ != nullptr) tensor_->Unmap(); + if (tensor_ != nullptr) tensor_->buffer_->UnMap(); } - inline const std::vector &mapped_image_pitch() const { return mapped_image_pitch_; } + inline const std::vector &mapped_image_pitch() const { + return mapped_image_pitch_; + } private: const Tensor *tensor_; std::vector mapped_image_pitch_; - DISABLE_COPY_AND_ASSIGN(MappingGuard); + DISABLE_COPY_AND_ASSIGN(MappingGuard); }; private: - inline void *MappedBuffer() const { - if (OnHost()) { - return buffer_.get(); - } - return data_; - } - - Allocator *alloc_; - index_t size_; + Allocator *allocator_; DataType dtype_; - // Raw buffer, must be mapped as host accessable data before - // read or write - std::unique_ptr> buffer_; - // Mapped buffer - mutable void *data_; std::vector shape_; - // Image for opencl - bool unused_; - bool is_image_; - std::vector image_shape_; - std::vector preallocated_image_shape_; + BufferBase *buffer_; + BufferSlice buffer_slice_; + bool is_buffer_owner_; - DISABLE_COPY_AND_ASSIGN(Tensor); + DISABLE_COPY_AND_ASSIGN(Tensor); }; } // namespace tensor diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 371ab2e0..a7adb939 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -6,21 +6,11 @@ #include #include "mace/core/workspace.h" -#include "mace/core/serializer.h" #include "mace/core/arg_helper.h" -#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h" #include "mace/utils/timer.h" namespace mace { -std::vector Workspace::Tensors() const { - std::vector names; - for (auto &entry : tensor_map_) { - names.push_back(entry.first); - } - return names; -} - Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, DataType type) { @@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name, VLOG(3) << "Tensor " << name << " already exists. Skipping."; } else { VLOG(3) << "Creating Tensor " << name; - tensor_map_[name] = std::move(std::unique_ptr(new Tensor(alloc, type))); + tensor_map_[name] = + std::move(std::unique_ptr(new Tensor(alloc, type))); } return GetTensor(name); } -bool Workspace::RemoveTensor(const std::string &name) { - auto it = tensor_map_.find(name); - if (it != tensor_map_.end()) { - VLOG(3) << "Removing blob " << name << " from this workspace."; - tensor_map_.erase(it); - return true; - } - return false; -} - const Tensor *Workspace::GetTensor(const std::string &name) const { if (tensor_map_.count(name)) { return tensor_map_.at(name).get(); @@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const { return nullptr; } - -void Workspace::RemoveUnsedTensor() { - auto iter = tensor_map_.begin(); - auto end_iter = tensor_map_.end(); - while(iter != end_iter) { - auto old_iter = iter++; - if(old_iter->second->unused()) { - tensor_map_.erase(old_iter); - } - } -} - Tensor *Workspace::GetTensor(const std::string &name) { return const_cast( - static_cast(this)->GetTensor(name)); + static_cast(this)->GetTensor(name)); +} + +std::vector Workspace::Tensors() const { + std::vector names; + for (auto &entry : tensor_map_) { + names.push_back(entry.first); + } + return names; } void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { MACE_LATENCY_LOGGER(1, "Load model tensors"); - Serializer serializer; + index_t model_data_size = 0; + unsigned char *model_data_ptr = nullptr; + for (auto &const_tensor : net_def.tensors()) { + if (model_data_ptr == nullptr + || reinterpret_cast(const_tensor.data()) + < reinterpret_cast(model_data_ptr)) { + model_data_ptr = const_cast(const_tensor.data()); + } + } + for (auto &const_tensor : net_def.tensors()) { + model_data_size = std::max(model_data_size, + static_cast( + (reinterpret_cast(const_tensor.data()) + - reinterpret_cast(model_data_ptr)) + + const_tensor.data_size() + * GetEnumTypeSize(const_tensor.data_type()))); + } + VLOG(3) << "Model data size: " << model_data_size; + + if (type == DeviceType::CPU) { + tensor_buffer_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size))); + } else { + tensor_buffer_ = std::move(std::unique_ptr( + new Buffer(GetDeviceAllocator(type), model_data_size))); + tensor_buffer_->Map(nullptr); + tensor_buffer_->Copy(model_data_ptr, 0, model_data_size); + tensor_buffer_->UnMap(); + } + for (auto &const_tensor : net_def.tensors()) { MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name()); VLOG(3) << "Tensor name: " << const_tensor.name() @@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) { << ", shape: " << MakeString(std::vector(const_tensor.dims().begin(), const_tensor.dims().end())); - tensor_map_[const_tensor.name()] = - serializer.Deserialize(const_tensor, type); + std::vector dims; + for (const index_t d : const_tensor.dims()) { + dims.push_back(d); + } + + index_t + offset = (long long) const_tensor.data() - (long long) model_data_ptr; + std::unique_ptr tensor( + new Tensor(BufferSlice(tensor_buffer_.get(), + offset, + const_tensor.data_size() + * GetEnumTypeSize(const_tensor.data_type())), + const_tensor.data_type())); + + tensor->Reshape(dims); + tensor_map_[const_tensor.name()] = std::move(tensor); } + if (type == DeviceType::OPENCL) { CreateImageOutputTensor(net_def); } @@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) { return; } - preallocated_allocator_ = - std::move(std::unique_ptr( - new OpenCLPreallocatedPooledAllocator)); DataType dtype = DataType::DT_INVALID; // We use the data type of the first op (with mem id, must be image), @@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { } MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid."); for (auto &mem_block: net_def.mem_arena().mem_block()) { - preallocated_allocator_->PreallocateImage(mem_block.mem_id(), - {mem_block.x(), mem_block.y()}, - dtype); + std::unique_ptr + image_buf(new Image({mem_block.x(), mem_block.y()}, dtype)); + preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf)); } VLOG(3) << "Preallocate image to tensors"; - auto allocator = GetDeviceAllocator(DeviceType::OPENCL); for (auto &op: net_def.op()) { if (op.has_mem_id()) { - CreateTensor(op.output(0), allocator, dtype); - tensor_map_[op.output(0)]->PreallocateImage( - preallocated_allocator_->GetImage(op.mem_id()), - preallocated_allocator_->GetImageSize(op.mem_id())); + std::unique_ptr tensor + (new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype)); + tensor_map_[op.output(0)] = std::move(tensor); } } } diff --git a/mace/core/workspace.h b/mace/core/workspace.h index c1bc17a5..84274914 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -15,26 +15,23 @@ class Workspace { public: typedef std::map> TensorMap; - Workspace() - : preallocated_allocator_(nullptr) {} + Workspace() {} ~Workspace() {} - std::vector Tensors() const; - - Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type); - - bool RemoveTensor(const std::string &name); - - void RemoveUnsedTensor(); + Tensor *CreateTensor(const std::string &name, + Allocator *alloc, + DataType type); inline bool HasTensor(const std::string &name) const { - return tensor_map_.count(name); + return tensor_map_.find(name) != tensor_map_.end(); } const Tensor *GetTensor(const std::string &name) const; Tensor *GetTensor(const std::string &name); + std::vector Tensors() const; + void LoadModelTensor(const NetDef &net_def, DeviceType type); private: @@ -42,9 +39,11 @@ class Workspace { TensorMap tensor_map_; - std::unique_ptr preallocated_allocator_; + std::unique_ptr tensor_buffer_; + + PreallocatedPooledAllocator preallocated_allocator_; - DISABLE_COPY_AND_ASSIGN(Workspace); + DISABLE_COPY_AND_ASSIGN(Workspace); }; } // namespace mace diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 0d489f40..bceee6ff 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase { } } } - DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, + DoActivation(output_ptr, output_ptr, output->size(), activation_, relux_max_limit_); } }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index e6c22cd9..f4f49565 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { } } } - DoActivation(output_data, output_data, output->NumElements(), activation_, + DoActivation(output_data, output_data, output->size(), activation_, relux_max_limit_); } }; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 141119a9..da4d00be 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { valid_h_stop, valid_w_start, valid_w_stop); output_ptr = output->mutable_data(); - DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, + DoActivation(output_ptr, output_ptr, output->size(), activation_, relux_max_limit_); } }; diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h index 4919e63a..031717f1 100644 --- a/mace/kernels/fully_connected.h +++ b/mace/kernels/fully_connected.h @@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase { } } - DoActivation(output_ptr, output_ptr, output->NumElements(), activation_, + DoActivation(output_ptr, output_ptr, output->size(), activation_, relux_max_limit_); } }; diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index f9d9e781..bc906163 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -14,7 +14,6 @@ void BufferToImageFunctor::operator()(Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) { - MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type"; std::vector image_shape; if (!i2b_) { CalImage2DShape(buffer->shape(), type, image_shape); @@ -25,9 +24,9 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } else { image->ResizeImage(buffer->shape(), image_shape); } - buffer->MarkUnused(); } else { - image_shape = image->image_shape(); + Image *image_buf = dynamic_cast(image->UnderlyingBuffer()); + image_shape = image_buf->image_shape(); buffer->Resize(image->shape()); } @@ -78,7 +77,11 @@ void BufferToImageFunctor::operator()(Tensor *buffer, built_options); uint32_t idx = 0; - b2f_kernel.setArg(idx++, *(static_cast(buffer->buffer()))); + b2f_kernel.setArg(idx++, *(static_cast(buffer->buffer()))); + if (!i2b_) { + MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned"); + b2f_kernel.setArg(idx++, static_cast(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype()))); + } if (type == ARGUMENT) { b2f_kernel.setArg(idx++, static_cast(buffer->dim(0))); } else if(type == WEIGHT_HEIGHT) { diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index d4a28e6e..42ab6617 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -1,6 +1,7 @@ #include __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */ + __private const int input_offset, __private const int filter_w, __private const int out_channel, __private const int in_channel, @@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o const int in_channel_idx = w % rounded_in_channel; const int h_idx = hw_idx / filter_w; const int w_idx = hw_idx % filter_w; - const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel + const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel + in_channel_idx; VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; @@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic } __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */ + __private const int input_offset, __private const int filter_w, __private const int in_channel, __private const int multiplier, @@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w const int h_idx = w / filter_w; const int w_idx = w % filter_w; - const int offset = mad24(mad24(h_idx, filter_w, w_idx), + const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx), in_channel, in_channel_idx); const int size = in_channel - in_channel_idx; @@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w const int h_idx = hw_idx / filter_w; const int w_idx = hw_idx % filter_w; - const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx), + const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx), in_channel, in_channel_idx), multiplier, m); // TODO support multiplier > 1 @@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w } __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ + __private const int input_offset, __private const int height, __private const int width, __private const int channels, @@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ const int height_idx = h % height; const int width_idx = w % width; const int channel_idx = w / width * 4; - const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_idx; const int size = channels - channel_idx; @@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ } __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ + __private const int input_offset, __private const int count, __write_only image2d_t output) { int w = get_global_id(0); int h = get_global_id(1); - const int offset = w * 4; - const int size = count - offset; + const int offset = input_offset + w * 4; + const int size = count - w * 4; + + VEC_DATA_TYPE(DATA_TYPE, 4) values = 0; if (size < 4) { switch(size) { @@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc + __private const int input_offset, __private const int height, __private const int width, __private const int channels, @@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n const int height_idx = (h % height_blks) << 2; const int width_idx = w % width; const int channel_idx = w / width; - int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_idx; int size = height - height_idx; @@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ + __private const int input_offset, __private const int height, __private const int width, __private const int channels, @@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n const int height_idx = h % height; const int width_idx = (w % width) << 2; const int channel_idx = w / width; - const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_idx; int size = width - width_idx; @@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n // only support 3x3 now __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W + __private const int input_offset, __private const int in_channels, __private const int height, __private const int width, @@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / const int out_channels = get_global_size(1); const int out_channel_idx = h; const int in_channel_idx = w << 2; - const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; const int length = min((in_channels - in_channel_idx) * 9, 36); DATA_TYPE in[36] = {0}; DATA_TYPE4 tt; @@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, / tu3[3] = tu3[2]; tu3[2] = tt - tu3[1] / 2; tu3[1] = tt + tu3[1] / 2; - + int2 coord = (int2)(w, h); #pragma unroll for (short i = 0; i < 4; ++i) { diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 94aa7838..5dc97944 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -65,13 +65,12 @@ void Conv2dFunctor::operator()(const Tensor *input, index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (!input->is_image() || strides_[0] != strides_[1] || + if (strides_[0] != strides_[1] || (dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] << ",dilations " << dilations_[0] << "x" << dilations_[1] - << " and input image: " << input->is_image() << " is not implemented yet."; MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index f4e91045..d8f4185e 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -27,13 +27,12 @@ void ResizeBilinearFunctor::operator()( if (kernel_.get() == nullptr) { MACE_CHECK(out_height > 0 && out_width > 0); std::vector output_shape{batch, out_height, out_width, channels}; - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); - } else { - output->Resize(output_shape); - } + + std::vector output_image_shape; + CalImage2DShape(output_shape, + BufferType::IN_OUT_CHANNEL, + output_image_shape); + output->ResizeImage(output_shape, output_image_shape); float height_scale = CalculateResizeScale(in_height, out_height, align_corners_); -- GitLab