Commit bcbcbf7a authored by 李寅

Refactor tensor, accelerate gpu initialization time

Parent d548b461
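At a glance: this change replaces Tensor's raw owned allocation (a unique_ptr<void> plus a separately tracked mapped pointer) with a BufferBase hierarchy (Buffer, Image, BufferSlice). All constant model tensors can then share one device buffer that is copied once, which is where the GPU initialization speedup comes from. A minimal sketch of the new ownership model, assuming the types defined in this diff; the sizes and offsets below are illustrative:

// Sketch only: one Buffer holds the whole model blob; each constant tensor
// views a slice of it instead of owning its own allocation.
index_t model_data_size = 600;  // assumed blob size in bytes
Buffer model_buffer(GetDeviceAllocator(DeviceType::OPENCL), model_data_size);
BufferSlice slice(&model_buffer, /*offset=*/400, /*length=*/200);
Tensor weight(slice, DT_FLOAT);  // tensor does not own the underlying buffer
weight.Reshape({50});            // 50 floats = 200 bytes, fits within the buffer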
......@@ -26,17 +26,17 @@ class Allocator {
public:
Allocator() {}
virtual ~Allocator() noexcept {}
virtual void *New(size_t nbytes) = 0;
virtual void *New(size_t nbytes) const = 0;
virtual void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) = 0;
virtual void Delete(void *data) = 0;
virtual void DeleteImage(void *data) = 0;
virtual void *Map(void *buffer, size_t nbytes) = 0;
const DataType dt) const = 0;
virtual void Delete(void *data) const = 0;
virtual void DeleteImage(void *data) const = 0;
virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
virtual void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) = 0;
virtual bool OnHost() = 0;
std::vector<size_t> *mapped_image_pitch) const = 0;
virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
virtual bool OnHost() const = 0;
template <typename T>
T *New(size_t num_elements) {
......@@ -52,7 +52,7 @@ class Allocator {
class CPUAllocator : public Allocator {
public:
~CPUAllocator() override {}
void *New(size_t nbytes) override {
void *New(size_t nbytes) const override {
VLOG(3) << "Allocate CPU buffer: " << nbytes;
void *data = nullptr;
#ifdef __ANDROID__
......@@ -67,27 +67,29 @@ class CPUAllocator : public Allocator {
}
void *NewImage(const std::vector<size_t> &shape,
const DataType dt) override {
const DataType dt) const override {
LOG(FATAL) << "Allocate CPU image";
return nullptr;
}
void Delete(void *data) override {
void Delete(void *data) const override {
VLOG(3) << "Free CPU buffer";
free(data);
}
void DeleteImage(void *data) override {
void DeleteImage(void *data) const override {
LOG(FATAL) << "Free CPU image";
free(data);
};
void *Map(void *buffer, size_t nbytes) override { return buffer; }
void *Map(void *buffer, size_t offset, size_t nbytes) const override {
return (char*)buffer + offset;
}
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override {
std::vector<size_t> *mapped_image_pitch) const override {
return buffer;
}
void Unmap(void *buffer, void *mapper_ptr) override {}
bool OnHost() override { return true; }
void Unmap(void *buffer, void *mapper_ptr) const override {}
bool OnHost() const override { return true; }
};
std::map<int32_t, Allocator *> *gAllocatorRegistry();
......
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_BUFFER_H_
#define MACE_CORE_BUFFER_H_
#include "mace/core/types.h"
#include "mace/core/allocator.h"
#include <vector>
namespace mace {
class BufferBase {
public:
BufferBase() : size_(0) {}
BufferBase(index_t size) : size_(size) {}
virtual ~BufferBase() {}
virtual void *buffer() = 0;
virtual const void *raw_data() const = 0;
virtual void *raw_mutable_data() = 0;
virtual void *Map(index_t offset,
index_t length,
std::vector<size_t> *pitch) const = 0;
virtual void UnMap(void *mapped_ptr) const = 0;
virtual void Map(std::vector<size_t> *pitch) = 0;
virtual void UnMap() = 0;
virtual void Resize(index_t size) = 0;
virtual void Copy(void *src, index_t offset, index_t length) = 0;
virtual bool OnHost() const = 0;
virtual index_t offset() const {
return 0;
};
template<typename T>
const T *data() const {
return reinterpret_cast<const T *>(raw_data());
}
template<typename T>
T *mutable_data() {
return reinterpret_cast<T *>(raw_mutable_data());
}
index_t size() const {
return size_;
}
protected:
index_t size_;
};
class Buffer : public BufferBase {
public:
Buffer(Allocator *allocator)
: BufferBase(0),
allocator_(allocator),
buf_(nullptr),
mapped_buf_(nullptr),
is_data_owner_(true) {}
Buffer(Allocator *allocator, index_t size)
: BufferBase(size),
allocator_(allocator),
mapped_buf_(nullptr),
is_data_owner_(true) {
buf_ = allocator->New(size);
}
Buffer(Allocator *allocator, void *data, index_t size)
: BufferBase(size),
allocator_(allocator),
buf_(data),
mapped_buf_(nullptr),
is_data_owner_(false) {}
virtual ~Buffer() {
if (mapped_buf_ != nullptr) {
UnMap();
}
if (is_data_owner_ && buf_ != nullptr) {
allocator_->Delete(buf_);
}
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
};
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buf_);
return buf_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
}
void *raw_mutable_data() {
if (OnHost()) {
MACE_CHECK_NOTNULL(buf_);
return buf_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_CHECK_NOTNULL(buf_);
return allocator_->Map(buf_, offset, length);
}
void UnMap(void *mapped_ptr) const {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK_NOTNULL(mapped_ptr);
allocator_->Unmap(buf_, mapped_ptr);
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
mapped_buf_ = Map(0, size_, pitch);
};
void UnMap() {
UnMap(mapped_buf_);
mapped_buf_ = nullptr;
}
void Resize(index_t size) {
MACE_CHECK(is_data_owner_,
"data is not owned by this buffer, cannot resize");
if (size != size_) {
if (buf_ != nullptr) {
allocator_->Delete(buf_);
}
size_ = size;
buf_ = allocator_->New(size);
}
}
void Copy(void *src, index_t offset, index_t length) {
MACE_CHECK_NOTNULL(mapped_buf_);
MACE_CHECK(length <= size_, "out of buffer");
memcpy(mapped_buf_, (char *) src + offset, length);
}
bool OnHost() const {
return allocator_->OnHost();
}
private:
Allocator *allocator_;
void *buf_;
void *mapped_buf_;
bool is_data_owner_;
DISABLE_COPY_AND_ASSIGN(Buffer);
};
class Image : public BufferBase {
public:
Image()
: BufferBase(0),
allocator_(GetDeviceAllocator(OPENCL)),
buf_(nullptr),
mapped_buf_(nullptr) {}
Image(std::vector<size_t> shape, DataType data_type)
: BufferBase(std::accumulate(shape.begin(), shape.end(),
1, std::multiplies<index_t>())
* GetEnumTypeSize(data_type)),
allocator_(GetDeviceAllocator(OPENCL)),
mapped_buf_(nullptr) {
shape_ = shape;
data_type_ = data_type;
buf_ = allocator_->NewImage(shape, data_type);
}
virtual ~Image() {
if (mapped_buf_ != nullptr) {
UnMap();
}
if (buf_ != nullptr) {
allocator_->DeleteImage(buf_);
}
}
void *buffer() {
MACE_CHECK_NOTNULL(buf_);
return buf_;
};
const void *raw_data() const {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
void *raw_mutable_data() {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
std::vector<size_t> image_shape() const {
return shape_;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK_NOTNULL(mapped_ptr);
allocator_->Unmap(buf_, mapped_ptr);
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buf_);
MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
MACE_CHECK_NOTNULL(pitch);
mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
};
void UnMap() {
UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
bool OnHost() const {
return allocator_->OnHost();
}
private:
Allocator *allocator_;
std::vector<size_t> shape_;
DataType data_type_;
void *buf_;
void *mapped_buf_;
DISABLE_COPY_AND_ASSIGN(Image);
};
class BufferSlice : public BufferBase {
public:
BufferSlice() {}
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
: BufferBase(buffer->size()),
buffer_(buffer),
mapped_buf_(nullptr),
offset_(offset),
length_(length) {
MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
MACE_CHECK(offset + length <= size_,
"buffer slice offset + length (",
offset,
" + ",
length,
") should <= ",
size_);
}
BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
other.offset_,
other.length_) {}
~BufferSlice() {
if (mapped_buf_ != nullptr) {
UnMap();
}
}
void *buffer() {
return buffer_->buffer();
};
const void *raw_data() const {
if (OnHost()) {
MACE_CHECK_NOTNULL(buffer_);
return (char *) buffer_->raw_data() + offset_;
} else {
MACE_CHECK_NOTNULL(mapped_buf_);
return mapped_buf_;
}
}
void *raw_mutable_data() {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
void UnMap(void *mapped_ptr) const {
MACE_NOT_IMPLEMENTED;
}
void Map(std::vector<size_t> *pitch) {
MACE_CHECK_NOTNULL(buffer_);
MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
mapped_buf_ = buffer_->Map(offset_, length_, pitch);
};
void UnMap() {
MACE_CHECK_NOTNULL(mapped_buf_);
buffer_->UnMap(mapped_buf_);
mapped_buf_ = nullptr;
};
void Resize(index_t size) {
}
void Copy(void *src, index_t offset, index_t length) {
MACE_NOT_IMPLEMENTED;
}
index_t offset() const {
return offset_;
}
bool OnHost() const {
return buffer_->OnHost();
}
private:
BufferBase *buffer_;
void *mapped_buf_;
index_t offset_;
index_t length_;
};
}
#endif // MACE_CORE_BUFFER_H_
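For orientation, a hedged sketch of the device-side Buffer lifecycle defined above: Map makes the device memory host-visible (the pitch argument only matters for images), and UnMap must be called before the device reuses the buffer. The same Map(nullptr)/Copy/UnMap sequence appears later in Workspace::LoadModelTensor:

// Assumed usage; the size is illustrative.
Buffer buf(GetDeviceAllocator(DeviceType::OPENCL), 1024);
buf.Map(nullptr);                              // map the whole buffer
float *host_view = buf.mutable_data<float>();
host_view[0] = 1.0f;                           // write through the mapped pointer
buf.UnMap();                                   // release the mapping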
......@@ -542,7 +542,6 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
if (!net->Run()) {
LOG(FATAL) << "Net init run failed";
}
ws_->RemoveUnsedTensor();
net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
}
}
......
......@@ -15,14 +15,14 @@ NetBase::NetBase(const std::shared_ptr<const OperatorRegistry> op_registry,
DeviceType type)
: op_registry_(op_registry), name_(net_def->name()) {}
SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
const NetMode mode)
: NetBase(op_registry, net_def, ws, type),
device_type_(type) {
MACE_LATENCY_LOGGER(1, "Constructing SimpleNet ", net_def->name());
MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto &operator_def = net_def->op(idx);
VLOG(3) << "Creating operator " << operator_def.name() << "("
......@@ -36,7 +36,7 @@ SimpleNet::SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
}
}
bool SimpleNet::Run(RunMetadata *run_metadata) {
bool SerialNet::Run(RunMetadata *run_metadata) {
MACE_MEMORY_LOGGING_GUARD();
MACE_LATENCY_LOGGER(1, "Running net");
for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
......@@ -99,7 +99,7 @@ std::unique_ptr<NetBase> CreateNet(
Workspace *ws,
DeviceType type,
const NetMode mode) {
std::unique_ptr<NetBase> net(new SimpleNet(op_registry, net_def, ws, type, mode));
std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
return net;
}
......
......@@ -33,9 +33,9 @@ class NetBase {
DISABLE_COPY_AND_ASSIGN(NetBase);
};
class SimpleNet : public NetBase {
class SerialNet : public NetBase {
public:
SimpleNet(const std::shared_ptr<const OperatorRegistry> op_registry,
SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
const std::shared_ptr<const NetDef> net_def,
Workspace *ws,
DeviceType type,
......@@ -47,7 +47,7 @@ class SimpleNet : public NetBase {
std::vector<std::unique_ptr<OperatorBase> > operators_;
DeviceType device_type_;
DISABLE_COPY_AND_ASSIGN(SimpleNet);
DISABLE_COPY_AND_ASSIGN(SerialNet);
};
std::unique_ptr<NetBase> CreateNet(
......
......@@ -5,6 +5,7 @@
#ifndef MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
#include <unordered_map>
#include "mace/core/allocator.h"
namespace mace {
......@@ -13,17 +14,26 @@ class PreallocatedPooledAllocator {
public:
PreallocatedPooledAllocator() {}
virtual ~PreallocatedPooledAllocator() noexcept {}
~PreallocatedPooledAllocator() noexcept {}
virtual void PreallocateImage(int mem_id,
const std::vector<size_t> &image_shape,
DataType data_type) = 0;
void SetBuffer(int mem_id, std::unique_ptr<BufferBase> &&buffer) {
buffers_[mem_id] = std::move(buffer);
}
virtual void *GetImage(int mem_id) = 0;
BufferBase *GetBuffer(int mem_id) {
if (buffers_.find(mem_id) != buffers_.end()) {
return buffers_[mem_id].get();
} else {
return nullptr;
}
}
virtual bool HasImage(int mem_id) = 0;
virtual bool HasBuffer(int mem_id) {
return buffers_.find(mem_id) != buffers_.end();
}
virtual std::vector<size_t> GetImageSize(int mem_id) = 0;
private:
std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
};
} // namespace mace
......
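The pool is thus reduced to a plain mem_id -> BufferBase map, with the virtual image methods on their way out. A hedged sketch of the new flow, mirroring CreateImageOutputTensor later in this commit; the image shape is assumed:

PreallocatedPooledAllocator pool;
std::unique_ptr<BufferBase> image(new Image({64, 32}, DT_HALF));  // [width, height]
pool.SetBuffer(/*mem_id=*/0, std::move(image));
if (pool.HasBuffer(0)) {
  Tensor t(pool.GetBuffer(0), DT_HALF);  // tensor does not own the pooled image
}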
......@@ -11,7 +11,6 @@
#include "mace/core/runtime/hexagon/quantize.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/core/serializer.h"
namespace mace {
......@@ -37,7 +36,6 @@ class HexagonControlWrapper {
void SetGraphMode(int mode);
private:
// CAVEAT: Need offset as HVX library reserves some ids
static constexpr int NODE_ID_OFFSET = 10000;
inline uint32_t node_id(uint32_t nodeid) {
......@@ -45,7 +43,6 @@ class HexagonControlWrapper {
}
int nn_id_;
Serializer serializer_;
Quantizer quantizer_;
std::vector<std::vector<index_t>> input_shapes_;
......
......@@ -35,7 +35,7 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
OpenCLAllocator::OpenCLAllocator() {}
OpenCLAllocator::~OpenCLAllocator() {}
void *OpenCLAllocator::New(size_t nbytes) {
void *OpenCLAllocator::New(size_t nbytes) const {
VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
cl_int error;
cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
......@@ -47,7 +47,7 @@ void *OpenCLAllocator::New(size_t nbytes) {
}
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
const DataType dt) {
const DataType dt) const {
MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
......@@ -67,7 +67,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
return cl_image;
}
void OpenCLAllocator::Delete(void *buffer) {
void OpenCLAllocator::Delete(void *buffer) const {
VLOG(3) << "Free OpenCL buffer";
if (buffer != nullptr) {
cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
......@@ -75,7 +75,7 @@ void OpenCLAllocator::Delete(void *buffer) {
}
}
void OpenCLAllocator::DeleteImage(void *buffer) {
void OpenCLAllocator::DeleteImage(void *buffer) const {
VLOG(3) << "Free OpenCL image";
if (buffer != nullptr) {
cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
......@@ -83,13 +83,13 @@ void OpenCLAllocator::DeleteImage(void *buffer) {
}
}
void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue();
// TODO(heliangliang) Non-blocking call
cl_int error;
void *mapped_ptr =
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0,
queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
nbytes, nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS);
return mapped_ptr;
......@@ -98,33 +98,33 @@ void *OpenCLAllocator::Map(void *buffer, size_t nbytes) {
// TODO: there is something wrong with the half type.
void *OpenCLAllocator::MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) {
std::vector<size_t> *mapped_image_pitch) const {
MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
auto cl_image = static_cast<cl::Image2D *>(buffer);
std::array<size_t, 3> origin = {0, 0, 0};
std::array<size_t, 3> region = {image_shape[0], image_shape[1], 1};
mapped_image_pitch.resize(2);
mapped_image_pitch->resize(2);
cl_int error;
void *mapped_ptr =
OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
origin, region,
&mapped_image_pitch[0],
&mapped_image_pitch[1],
mapped_image_pitch->data(),
mapped_image_pitch->data() + 1,
nullptr, nullptr, &error);
MACE_CHECK(error == CL_SUCCESS) << error;
return mapped_ptr;
}
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) {
void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
auto cl_buffer = static_cast<cl::Buffer *>(buffer);
auto queue = OpenCLRuntime::Global()->command_queue();
MACE_CHECK(queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr, nullptr,
nullptr) == CL_SUCCESS);
}
bool OpenCLAllocator::OnHost() { return false; }
bool OpenCLAllocator::OnHost() const { return false; }
} // namespace mace
......@@ -15,7 +15,7 @@ class OpenCLAllocator : public Allocator {
~OpenCLAllocator() override;
void *New(size_t nbytes) override;
void *New(size_t nbytes) const override;
/*
* Use Image2D with RGBA (128-bit) format to represent the image.
......@@ -23,21 +23,21 @@ class OpenCLAllocator : public Allocator {
* @ shape : [depth, ..., height, width ].
*/
void *NewImage(const std::vector<size_t> &image_shape,
const DataType dt) override;
const DataType dt) const override;
void Delete(void *buffer) override;
void Delete(void *buffer) const override;
void DeleteImage(void *buffer) override;
void DeleteImage(void *buffer) const override;
void *Map(void *buffer, size_t nbytes) override;
void *Map(void *buffer, size_t offset, size_t nbytes) const override;
void *MapImage(void *buffer,
const std::vector<size_t> &image_shape,
std::vector<size_t> &mapped_image_pitch) override;
std::vector<size_t> *mapped_image_pitch) const override;
void Unmap(void *buffer, void *mapped_ptr) override;
void Unmap(void *buffer, void *mapped_ptr) const override;
bool OnHost() override;
bool OnHost() const override;
};
} // namespace mace
......
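As the header comment notes, images are RGBA (128-bit) Image2D objects and NewImage expects a two-element [width, height] shape. A hedged end-to-end sketch against the const API above; the shape values are illustrative:

OpenCLAllocator alloc;
std::vector<size_t> image_shape = {64, 32};          // [width, height]
void *image = alloc.NewImage(image_shape, DT_HALF);  // a cl::Image2D underneath
std::vector<size_t> pitch;                           // filled with row/slice pitch
void *mapped = alloc.MapImage(image, image_shape, &pitch);
alloc.Unmap(image, mapped);
alloc.DeleteImage(image);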
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
namespace mace {
OpenCLPreallocatedPooledAllocator::OpenCLPreallocatedPooledAllocator()
: allocator(GetDeviceAllocator(DeviceType::OPENCL)) {
}
OpenCLPreallocatedPooledAllocator::~OpenCLPreallocatedPooledAllocator() {
}
void OpenCLPreallocatedPooledAllocator::PreallocateImage(int mem_id,
const std::vector<size_t> &image_shape,
DataType data_type) {
MACE_CHECK(!this->HasImage(mem_id), "Memory already exists: ", mem_id);
VLOG(2) << "Preallocate OpenCL image: " << mem_id << " "
<< image_shape[0] << ", " << image_shape[1];
images_[mem_id] = std::move(std::unique_ptr<void, std::function<void(void *)>>(
allocator->NewImage(image_shape, data_type), [this](void *p) {
this->allocator->DeleteImage(p);
}));
image_shapes_[mem_id] = image_shape;
}
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#define MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
#include "mace/core/preallocated_pooled_allocator.h"
#include <unordered_map>
namespace mace {
class OpenCLPreallocatedPooledAllocator : public PreallocatedPooledAllocator {
public:
OpenCLPreallocatedPooledAllocator();
~OpenCLPreallocatedPooledAllocator() override;
void PreallocateImage(int mem_id,
const std::vector<size_t> &image_shape,
DataType data_type) override;
inline void *GetImage(int mem_id) override {
MACE_CHECK(HasImage(mem_id), "image does not exist");
return images_[mem_id].get();
}
inline bool HasImage(int mem_id) override {
return images_.find(mem_id) != images_.end();
}
inline std::vector<size_t> GetImageSize(int mem_id) override {
return image_shapes_[mem_id];
}
private:
std::unordered_map<int, std::unique_ptr<void, std::function<void(void *)>>>
images_;
std::unordered_map<int, std::vector<size_t>> image_shapes_;
Allocator *allocator;
};
} // namespace mace
#endif // MACE_CORE_RUNTIME_OPENCL_PREALLOCATED_POOLED_ALLOCATOR_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/serializer.h"
namespace mace {
std::unique_ptr<ConstTensor> Serializer::Serialize(const Tensor &tensor,
const std::string &name) {
MACE_NOT_IMPLEMENTED;
return nullptr;
}
std::unique_ptr<Tensor> Serializer::Deserialize(const ConstTensor &const_tensor,
DeviceType type) {
std::unique_ptr<Tensor> tensor(
new Tensor(GetDeviceAllocator(type), const_tensor.data_type()));
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
tensor->Resize(dims);
switch (const_tensor.data_type()) {
case DT_HALF:
tensor->Copy<half>(reinterpret_cast<const half *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_FLOAT:
tensor->Copy<float>(reinterpret_cast<const float *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_DOUBLE:
tensor->Copy<double>(
reinterpret_cast<const double *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_INT32:
tensor->Copy<int32_t>(
reinterpret_cast<const int32_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_INT64:
tensor->Copy<int64_t>(
reinterpret_cast<const int64_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_UINT8:
tensor->Copy<uint8_t>(
reinterpret_cast<const uint8_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_INT16:
tensor->CopyWithCast<int32_t, uint16_t>(
reinterpret_cast<const int32_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_INT8:
tensor->CopyWithCast<int32_t, int8_t>(
reinterpret_cast<const int32_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_UINT16:
tensor->CopyWithCast<int32_t, int16_t>(
reinterpret_cast<const int32_t *>(const_tensor.data()),
const_tensor.data_size());
break;
case DT_BOOL:
tensor->CopyWithCast<int32_t, bool>(
reinterpret_cast<const int32_t *>(const_tensor.data()),
const_tensor.data_size());
break;
default:
MACE_NOT_IMPLEMENTED;
break;
}
return tensor;
}
} // namespace mace
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_SERIALIZER_H_
#define MACE_CORE_SERIALIZER_H_
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
namespace mace {
class Serializer {
public:
Serializer() {}
~Serializer() {}
std::unique_ptr<ConstTensor> Serialize(const Tensor &tensor,
const std::string &name);
std::unique_ptr<Tensor> Deserialize(const ConstTensor &const_tensor,
DeviceType type);
DISABLE_COPY_AND_ASSIGN(Serializer);
};
} // namespace mace
#endif // MACE_CORE_SERIALIZER_H_
......@@ -5,7 +5,7 @@
#ifndef MACE_CORE_TENSOR_H_
#define MACE_CORE_TENSOR_H_
#include "mace/core/allocator.h"
#include "mace/core/buffer.h"
#include "mace/utils/logging.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
......@@ -46,7 +46,6 @@ namespace mace {
CASES_WITH_DEFAULT(TYPE_ENUM, STMTS, LOG(FATAL) << "Type not set"; \
, LOG(FATAL) << "Unexpected type: " << TYPE_ENUM;)
namespace numerical_chars {
inline std::ostream &operator<<(std::ostream &os, char c) {
return std::is_signed<char>::value ? os << static_cast<int>(c)
......@@ -64,27 +63,30 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
class Tensor {
public:
Tensor()
: alloc_(GetDeviceAllocator(DeviceType::CPU)),
size_(0),
dtype_(DT_FLOAT),
buffer_(nullptr),
data_(nullptr),
unused_(false),
is_image_(false){
};
Tensor(Allocator *alloc, DataType type)
: alloc_(alloc),
size_(0),
dtype_(type),
buffer_(nullptr),
data_(nullptr),
unused_(false),
is_image_(false){
};
: allocator_(alloc),
dtype_(type),
buffer_(nullptr),
is_buffer_owner_(true) {};
Tensor(BufferBase *buffer, DataType dtype)
: dtype_(dtype),
buffer_(buffer),
is_buffer_owner_(false) {}
Tensor(const BufferSlice &buffer_slice, DataType dtype)
: dtype_(dtype),
buffer_slice_(buffer_slice),
is_buffer_owner_(false) {
buffer_ = &buffer_slice_;
}
Tensor() : Tensor(GetDeviceAllocator(CPU), DT_FLOAT) {}
~Tensor() {
if (is_buffer_owner_ && buffer_ != nullptr) {
delete buffer_;
}
}
inline DataType dtype() const { return dtype_; }
......@@ -93,182 +95,113 @@ class Tensor {
inline const std::vector<index_t> &shape() const { return shape_; }
inline const std::vector<size_t> &image_shape() const { return image_shape_; }
inline const bool is_image() const { return is_image_; }
inline index_t dim_size() const { return shape_.size(); }
inline index_t dim(unsigned int index) const {
MACE_CHECK(index < shape_.size(), "Dim out of range: ",
index, " >= ", shape_.size());
index, " >= ", shape_.size());
return shape_[index];
}
inline index_t size() const { return size_; }
inline index_t raw_size() const { return size_ * SizeOfType(); }
inline const bool unused() const { return unused_; }
inline int64_t NumElements() const {
inline index_t size() const {
return std::accumulate(shape_.begin(), shape_.end(), 1,
std::multiplies<int64_t>());
}
inline const bool OnHost() const { return alloc_->OnHost(); }
/*
* Map the device buffer as CPU buffer to access the data, unmap must be
* called later
*/
inline void Map() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->Map(buffer_.get(), size_ * SizeOfType());
}
inline index_t raw_size() const {
return size() * SizeOfType();
}
inline void MapImage(std::vector<size_t> &mapped_image_pitch) const {
MACE_CHECK(!OnHost() && buffer_ != nullptr && data_ == nullptr);
data_ = alloc_->MapImage(buffer_.get(), image_shape_, mapped_image_pitch);
inline void *buffer() const {
MACE_CHECK(buffer_ != nullptr && buffer_->buffer() != nullptr,
"buffer is null");
return buffer_->buffer();
}
/*
* Unmap the device buffer
*/
inline void Unmap() const {
if (!OnHost()) {
MACE_CHECK(buffer_ != nullptr && data_ != nullptr);
alloc_->Unmap(buffer_.get(), data_);
data_ = nullptr;
}
inline index_t buffer_offset() const {
return buffer_->offset();
}
void *buffer() const { return buffer_.get(); }
inline const void *raw_data() const {
void *data = MappedBuffer();
MACE_CHECK(data != nullptr || size_ == 0,
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_data();
}
template <typename T>
template<typename T>
inline const T *data() const {
return static_cast<const T *>(raw_data());
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->data<T>();
}
inline void *raw_mutable_data() {
void *data = MappedBuffer();
MACE_CHECK(data != nullptr || size_ == 0,
"The tensor is of non-zero shape, but its data is not allocated "
"or mapped yet.");
return data;
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return buffer_->raw_mutable_data();
}
template <typename T>
template<typename T>
inline T *mutable_data() {
return static_cast<T *>(raw_mutable_data());
MACE_CHECK(buffer_ != nullptr, "buffer is null");
return static_cast<T *>(buffer_->raw_mutable_data());
}
inline void Resize(const std::vector<index_t> &shape) {
MACE_CHECK(!is_image_ || buffer_ == nullptr,
"Resize is not for image, use ResizeImage instead.");
is_image_ = false;
inline void Reshape(const std::vector<index_t> &shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size) {
size_ = size;
MACE_CHECK(data_ == nullptr, "Buffer must be unmapped before resize");
CASES(dtype_,
(buffer_ =
std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->New(size_ * sizeof(T)),
[this](void *p) {
this->alloc_->Delete(p);
})
)));
}
MACE_CHECK(raw_size() <= buffer_->size());
}
inline void ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
MACE_CHECK(is_image_ || buffer_ == nullptr,
"ResizeImage is not for buffer, use Resize instead.");
is_image_ = true;
inline void Resize(const std::vector<index_t> &shape) {
shape_ = shape;
index_t size = NumElements();
if (size_ != size) {
size_ = size;
image_shape_ = image_shape;
if (!preallocated_image_shape_.empty()) {
MACE_CHECK(preallocated_image_shape_[0] >= image_shape[0]
&& preallocated_image_shape_[1] >= image_shape[1],
"image shape not large enough: preallocated ",
preallocated_image_shape_[0],
" ",
preallocated_image_shape_[1],
"apply for ",
image_shape[0],
" ",
image_shape[1]);
} else {
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
alloc_->NewImage(image_shape, dtype_),
[this](void *p) { this->alloc_->DeleteImage(p); }));
preallocated_image_shape_ = image_shape;
}
if (buffer_ != nullptr) {
buffer_->Resize(raw_size());
} else {
buffer_ = new Buffer(allocator_, raw_size());
is_buffer_owner_ = true;
}
}
inline void ResizeLike(const Tensor &other) {
if (other.is_image()) {
ResizeImage(other.shape(), other.image_shape());
} else {
Resize(other.shape());
}
Resize(other.shape());
}
inline void ResizeLike(const Tensor *other) {
if (other->is_image()) {
ResizeImage(other->shape(), other->image_shape());
} else {
Resize(other->shape());
Resize(other->shape());
}
inline void ResizeImage(const std::vector<index_t> &shape,
const std::vector<size_t> &image_shape) {
shape_ = shape;
if (buffer_ == nullptr) {
buffer_ = new Image(image_shape, dtype_);
is_buffer_owner_ = true;
}
}
inline void PreallocateImage(void *image,
const std::vector<size_t>& image_shape) {
is_image_ = true;
buffer_ = std::move(std::unique_ptr<void, std::function<void(void *)>>(
image, [](void *p) {
// tensor does not have ownership of preallocated memory
}));
preallocated_image_shape_ = image_shape;
inline void CopyBytes(const void *src, size_t size) {
MappingGuard guard(this);
memcpy(buffer_->raw_mutable_data(), src, size);
}
template <typename T>
inline void Copy(const T *src, index_t size) {
MACE_CHECK(size == size_, "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * size);
template<typename T>
inline void Copy(const T *src, index_t length) {
MACE_CHECK(length == size(), "copy src and dst with different size.");
CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
}
template <typename SrcType, typename DstType>
inline void CopyWithCast(const SrcType *src, size_t size) {
MACE_CHECK(static_cast<index_t>(size) == size_,
"copy src and dst with different size.");
std::unique_ptr<DstType[]> buffer(new DstType[size]);
for (size_t i = 0; i < size; ++i) {
buffer[i] = static_cast<DstType>(src[i]);
}
CopyBytes(static_cast<const void *>(buffer.get()), sizeof(DstType) * size);
inline void Copy(const Tensor &other) {
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
CopyBytes(other.raw_data(), other.size() * SizeOfType());
}
inline void CopyBytes(const void *src, size_t size) {
MappingGuard map_this(this);
memcpy(raw_mutable_data(), src, size);
inline size_t SizeOfType() const {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline BufferBase *UnderlyingBuffer() const {
return buffer_;
}
inline void DebugPrint() const {
......@@ -281,8 +214,8 @@ class Tensor {
os.str("");
os.clear();
MappingGuard guard(this);
for (int i = 0; i < size_; ++i) {
if ( i != 0 && i % shape_[3] == 0) {
for (int i = 0; i < size(); ++i) {
if (i != 0 && i % shape_[3] == 0) {
os << "\n";
}
CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
......@@ -291,37 +224,11 @@ class Tensor {
<< dim(2) << ", " << dim(3) << "], content:\n" << os.str();
}
inline size_t SizeOfType() const {
size_t type_size = 0;
CASES(dtype_, type_size = sizeof(T));
return type_size;
}
inline void Copy(const Tensor &other) {
alloc_ = other.alloc_;
dtype_ = other.dtype_;
ResizeLike(other);
MappingGuard map_other(&other);
if (is_image_) {
LOG(FATAL) << "Not support copy image tensor, please use Copy API.";
} else {
CopyBytes(other.raw_data(), size_ * SizeOfType());
}
}
inline void MarkUnused() {
this->unused_ = true;
}
class MappingGuard {
public:
MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) {
if (tensor_->is_image()) {
tensor_->MapImage(mapped_image_pitch_);
} else {
tensor_->Map();
}
tensor_->buffer_->Map(&mapped_image_pitch_);
}
}
......@@ -331,42 +238,29 @@ class Tensor {
}
~MappingGuard() {
if (tensor_ != nullptr) tensor_->Unmap();
if (tensor_ != nullptr) tensor_->buffer_->UnMap();
}
inline const std::vector<size_t> &mapped_image_pitch() const { return mapped_image_pitch_; }
inline const std::vector<size_t> &mapped_image_pitch() const {
return mapped_image_pitch_;
}
private:
const Tensor *tensor_;
std::vector<size_t> mapped_image_pitch_;
DISABLE_COPY_AND_ASSIGN(MappingGuard);
DISABLE_COPY_AND_ASSIGN(MappingGuard);
};
private:
inline void *MappedBuffer() const {
if (OnHost()) {
return buffer_.get();
}
return data_;
}
Allocator *alloc_;
index_t size_;
Allocator *allocator_;
DataType dtype_;
// Raw buffer, must be mapped as host accessable data before
// read or write
std::unique_ptr<void, std::function<void(void*)>> buffer_;
// Mapped buffer
mutable void *data_;
std::vector<index_t> shape_;
// Image for opencl
bool unused_;
bool is_image_;
std::vector<size_t> image_shape_;
std::vector<size_t> preallocated_image_shape_;
BufferBase *buffer_;
BufferSlice buffer_slice_;
bool is_buffer_owner_;
DISABLE_COPY_AND_ASSIGN(Tensor);
DISABLE_COPY_AND_ASSIGN(Tensor);
};
} // namespace mace
......
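A hedged illustration of the MappingGuard pattern above, now that mapping is delegated to the tensor's underlying BufferBase: the guard maps in its constructor and unmaps in its destructor, so device data is only touched inside its scope.

// Sketch: `t` is assumed to be an OpenCL-backed Tensor from this diff.
void ReadFirstElement(const Tensor *t) {
  Tensor::MappingGuard guard(t);         // maps t's buffer
  const float *data = t->data<float>();  // valid only while the guard lives
  VLOG(3) << "first element: " << data[0];
}                                        // guard's destructor unmaps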
......@@ -6,21 +6,11 @@
#include <vector>
#include "mace/core/workspace.h"
#include "mace/core/serializer.h"
#include "mace/core/arg_helper.h"
#include "mace/core/runtime/opencl/opencl_preallocated_pooled_allocator.h"
#include "mace/utils/timer.h"
namespace mace {
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
}
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
DataType type) {
......@@ -28,21 +18,12 @@ Tensor *Workspace::CreateTensor(const std::string &name,
VLOG(3) << "Tensor " << name << " already exists. Skipping.";
} else {
VLOG(3) << "Creating Tensor " << name;
tensor_map_[name] = std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
tensor_map_[name] =
std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
}
return GetTensor(name);
}
bool Workspace::RemoveTensor(const std::string &name) {
auto it = tensor_map_.find(name);
if (it != tensor_map_.end()) {
VLOG(3) << "Removing blob " << name << " from this workspace.";
tensor_map_.erase(it);
return true;
}
return false;
}
const Tensor *Workspace::GetTensor(const std::string &name) const {
if (tensor_map_.count(name)) {
return tensor_map_.at(name).get();
......@@ -52,26 +33,51 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
return nullptr;
}
void Workspace::RemoveUnsedTensor() {
auto iter = tensor_map_.begin();
auto end_iter = tensor_map_.end();
while(iter != end_iter) {
auto old_iter = iter++;
if(old_iter->second->unused()) {
tensor_map_.erase(old_iter);
}
}
}
Tensor *Workspace::GetTensor(const std::string &name) {
return const_cast<Tensor *>(
static_cast<const Workspace *>(this)->GetTensor(name));
static_cast<const Workspace *>(this)->GetTensor(name));
}
std::vector<std::string> Workspace::Tensors() const {
std::vector<std::string> names;
for (auto &entry : tensor_map_) {
names.push_back(entry.first);
}
return names;
}
void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
MACE_LATENCY_LOGGER(1, "Load model tensors");
Serializer serializer;
index_t model_data_size = 0;
unsigned char *model_data_ptr = nullptr;
for (auto &const_tensor : net_def.tensors()) {
if (model_data_ptr == nullptr
|| reinterpret_cast<long long>(const_tensor.data())
< reinterpret_cast<long long>(model_data_ptr)) {
model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
}
}
for (auto &const_tensor : net_def.tensors()) {
model_data_size = std::max(model_data_size,
static_cast<index_t>(
(reinterpret_cast<long long>(const_tensor.data())
- reinterpret_cast<long long>(model_data_ptr))
+ const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())));
}
VLOG(3) << "Model data size: " << model_data_size;
if (type == DeviceType::CPU) {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
} else {
tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
new Buffer(GetDeviceAllocator(type), model_data_size)));
tensor_buffer_->Map(nullptr);
tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
tensor_buffer_->UnMap();
}
for (auto &const_tensor : net_def.tensors()) {
MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
VLOG(3) << "Tensor name: " << const_tensor.name()
......@@ -79,9 +85,24 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
<< ", shape: "
<< MakeString(std::vector<index_t>(const_tensor.dims().begin(),
const_tensor.dims().end()));
tensor_map_[const_tensor.name()] =
serializer.Deserialize(const_tensor, type);
std::vector<index_t> dims;
for (const index_t d : const_tensor.dims()) {
dims.push_back(d);
}
index_t offset = (long long) const_tensor.data() - (long long) model_data_ptr;
std::unique_ptr<Tensor> tensor(
new Tensor(BufferSlice(tensor_buffer_.get(),
offset,
const_tensor.data_size()
* GetEnumTypeSize(const_tensor.data_type())),
const_tensor.data_type()));
tensor->Reshape(dims);
tensor_map_[const_tensor.name()] = std::move(tensor);
}
if (type == DeviceType::OPENCL) {
CreateImageOutputTensor(net_def);
}
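In other words, LoadModelTensor now finds the lowest data() pointer among the constant tensors, computes the blob's total byte extent, materializes one Buffer for it (copied to the device once in the non-CPU case), and wraps each constant as a BufferSlice at its byte offset. A worked example with assumed numbers:

// Two float32 constants laid out back to back in the model blob:
//   tensor A: data() = base,       100 elements -> 400 bytes
//   tensor B: data() = base + 400,  50 elements -> 200 bytes
// model_data_ptr  = base                      (minimum data() pointer)
// model_data_size = max(400, 400 + 200) = 600 (maximum end offset)
// B then becomes Tensor(BufferSlice(tensor_buffer_.get(), 400, 200), DT_FLOAT).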
......@@ -91,9 +112,6 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
if (!net_def.has_mem_arena() || net_def.mem_arena().mem_block_size() == 0) {
return;
}
preallocated_allocator_ =
std::move(std::unique_ptr<PreallocatedPooledAllocator>(
new OpenCLPreallocatedPooledAllocator));
DataType dtype = DataType::DT_INVALID;
// We use the data type of the first op (with mem id, must be image),
......@@ -116,18 +134,16 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
for (auto &mem_block: net_def.mem_arena().mem_block()) {
preallocated_allocator_->PreallocateImage(mem_block.mem_id(),
{mem_block.x(), mem_block.y()},
dtype);
std::unique_ptr<BufferBase>
image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
}
VLOG(3) << "Preallocate image to tensors";
auto allocator = GetDeviceAllocator(DeviceType::OPENCL);
for (auto &op: net_def.op()) {
if (op.has_mem_id()) {
CreateTensor(op.output(0), allocator, dtype);
tensor_map_[op.output(0)]->PreallocateImage(
preallocated_allocator_->GetImage(op.mem_id()),
preallocated_allocator_->GetImageSize(op.mem_id()));
std::unique_ptr<Tensor> tensor
(new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
tensor_map_[op.output(0)] = std::move(tensor);
}
}
}
......
......@@ -15,26 +15,23 @@ class Workspace {
public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
Workspace()
: preallocated_allocator_(nullptr) {}
Workspace() {}
~Workspace() {}
std::vector<std::string> Tensors() const;
Tensor *CreateTensor(const std::string &name, Allocator *alloc, DataType type);
bool RemoveTensor(const std::string &name);
void RemoveUnsedTensor();
Tensor *CreateTensor(const std::string &name,
Allocator *alloc,
DataType type);
inline bool HasTensor(const std::string &name) const {
return tensor_map_.count(name);
return tensor_map_.find(name) != tensor_map_.end();
}
const Tensor *GetTensor(const std::string &name) const;
Tensor *GetTensor(const std::string &name);
std::vector<std::string> Tensors() const;
void LoadModelTensor(const NetDef &net_def, DeviceType type);
private:
......@@ -42,9 +39,11 @@ class Workspace {
TensorMap tensor_map_;
std::unique_ptr<PreallocatedPooledAllocator> preallocated_allocator_;
std::unique_ptr<BufferBase> tensor_buffer_;
PreallocatedPooledAllocator preallocated_allocator_;
DISABLE_COPY_AND_ASSIGN(Workspace);
DISABLE_COPY_AND_ASSIGN(Workspace);
};
} // namespace mace
......
......@@ -127,7 +127,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
}
}
}
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
};
......
......@@ -616,7 +616,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
}
}
}
DoActivation(output_data, output_data, output->NumElements(), activation_,
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
};
......
......@@ -402,7 +402,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
valid_h_stop, valid_w_start, valid_w_stop);
output_ptr = output->mutable_data<T>();
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
};
......
......@@ -65,7 +65,7 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
}
DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
};
......
......@@ -14,7 +14,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
const BufferType type,
Tensor *image,
StatsFuture *future) {
MACE_CHECK(!buffer->is_image()) << "buffer must be buffer-type";
std::vector<size_t> image_shape;
if (!i2b_) {
CalImage2DShape(buffer->shape(), type, image_shape);
......@@ -25,9 +24,9 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
} else {
image->ResizeImage(buffer->shape(), image_shape);
}
buffer->MarkUnused();
} else {
image_shape = image->image_shape();
Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
image_shape = image_buf->image_shape();
buffer->Resize(image->shape());
}
......@@ -78,7 +77,11 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
built_options);
uint32_t idx = 0;
b2f_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(buffer->buffer())));
b2f_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(buffer->buffer())));
if (!i2b_) {
MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
}
if (type == ARGUMENT) {
b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
} else if(type == WEIGHT_HEIGHT) {
......
#include <common.h>
__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
__private const int input_offset,
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
......@@ -13,7 +14,7 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
const int in_channel_idx = w % rounded_in_channel;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
const int offset = input_offset + ((h_idx * filter_w + w_idx) * out_channel + out_channel_idx) * in_channel
+ in_channel_idx;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
......@@ -79,6 +80,7 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
}
__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
__private const int in_channel,
__private const int multiplier,
......@@ -92,7 +94,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = w / filter_w;
const int w_idx = w % filter_w;
const int offset = mad24(mad24(h_idx, filter_w, w_idx),
const int offset = input_offset + mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx);
const int size = in_channel - in_channel_idx;
......@@ -117,7 +119,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx % filter_w;
const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx),
const int offset = input_offset + mad24(mad24(mad24(h_idx, filter_w, w_idx),
in_channel, in_channel_idx),
multiplier, m);
// TODO support multiplier > 1
......@@ -128,6 +130,7 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
}
__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
......@@ -138,7 +141,7 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int size = channels - channel_idx;
......@@ -191,13 +194,16 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
}
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
__write_only image2d_t output) {
int w = get_global_id(0);
int h = get_global_id(1);
const int offset = w * 4;
const int size = count - offset;
const int offset = input_offset + w * 4;
const int size = count - w * 4;
VEC_DATA_TYPE(DATA_TYPE, 4) values = 0;
if (size < 4) {
switch(size) {
......@@ -241,6 +247,7 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
......@@ -253,7 +260,7 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
const int height_idx = (h % height_blks) << 2;
const int width_idx = w % width;
const int channel_idx = w / width;
int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
int size = height - height_idx;
......@@ -304,6 +311,7 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
__private const int width,
__private const int channels,
......@@ -314,7 +322,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
const int height_idx = h % height;
const int width_idx = (w % width) << 2;
const int channel_idx = w / width;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
int size = width - width_idx;
......@@ -336,6 +344,7 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
__private const int height,
__private const int width,
......@@ -345,7 +354,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
const int out_channels = get_global_size(1);
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt;
......@@ -390,7 +399,7 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
tu3[3] = tu3[2];
tu3[2] = tt - tu3[1] / 2;
tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h);
#pragma unroll
for (short i = 0; i < 4; ++i) {
......
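Each buffer-to-image kernel above gains an input_offset parameter so it can read from the middle of the shared model buffer; note the offset is passed in elements, not bytes. A hedged host-side sketch of how buffer_to_image.cc derives it, with buffer, b2f_kernel, and idx assumed to be set up as in that file:

// The byte offset comes from the tensor's BufferSlice; kernels index elements.
const index_t byte_offset = buffer->buffer_offset();
const index_t type_size = GetEnumTypeSize(buffer->dtype());
MACE_CHECK(byte_offset % type_size == 0, "buffer offset not aligned");
b2f_kernel.setArg(idx++, static_cast<uint32_t>(byte_offset / type_size));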
......@@ -65,13 +65,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
if (!input->is_image() || strides_[0] != strides_[1] ||
if (strides_[0] != strides_[1] ||
(dilations_[0] > 1 && (strides_[0] > 1 || kernel_h == 1))) {
LOG(WARNING) << "OpenCL conv2d kernel with "
<< "filter" << kernel_h << "x" << kernel_w << ","
<< " stride " << strides_[0] << "x" << strides_[1]
<< ",dilations " << dilations_[0] << "x" << dilations_[1]
<< " and input image: " << input->is_image()
<< " is not implemented yet.";
MACE_NOT_IMPLEMENTED;
}
......
......@@ -27,13 +27,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
if (kernel_.get() == nullptr) {
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> output_shape{batch, out_height, out_width, channels};
if (input->is_image()) {
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
} else {
output->Resize(output_shape);
}
std::vector<size_t> output_image_shape;
CalImage2DShape(output_shape,
BufferType::IN_OUT_CHANNEL,
output_image_shape);
output->ResizeImage(output_shape, output_image_shape);
float height_scale =
CalculateResizeScale(in_height, out_height, align_corners_);
......