提交 db3ad39f，作者：吴承辉

Merge branch 'share_buffer' into 'master'

Share tmp buffer among ops

See merge request !379
......@@ -6,6 +6,7 @@
#define MACE_CORE_BUFFER_H_
#include <vector>
#include <algorithm>
#include <functional>
#include "mace/core/allocator.h"
......@@ -161,12 +162,10 @@ class Buffer : public BufferBase {
bool OnHost() const { return allocator_->OnHost(); }
// Zero-fill the buffer's contents.
// Goes through raw_mutable_data() rather than buf_ directly so the write
// hits whatever pointer the buffer currently exposes (the pre-merge direct
// memset on buf_ was removed upstream and is dropped here).
void Clear() {
  memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size_);
}
private:
protected:
Allocator *allocator_;
void *buf_;
void *mapped_buf_;
......@@ -267,19 +266,23 @@ class Image : public BufferBase {
class BufferSlice : public BufferBase {
public:
// Default-construct an empty slice: zero size, no backing buffer, unmapped.
BufferSlice()
    : BufferBase(0), buffer_(nullptr), mapped_buf_(nullptr), offset_(0) {}
// Construct a view of [offset, offset + length) inside |buffer|.
// The slice's own size_ (from BufferBase) is the slice length, not the
// parent buffer's size. |buffer| is borrowed, not owned.
// Checks: offset must be non-negative and the slice must fit entirely
// inside the parent buffer.
BufferSlice(BufferBase *buffer, index_t offset, index_t length)
    : BufferBase(length),
      buffer_(buffer),
      mapped_buf_(nullptr),
      offset_(offset) {
  MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
  MACE_CHECK(offset + length <= buffer->size(),
             "buffer slice offset + length (",
             offset,
             " + ",
             length,
             ") should <= ",
             buffer->size());
}
BufferSlice(const BufferSlice &other)
: BufferSlice(other.buffer_, other.offset_, other.length_) {}
: BufferSlice(other.buffer_, other.offset_, other.size_) {}
~BufferSlice() {
if (buffer_ != nullptr && mapped_buf_ != nullptr) {
......@@ -303,8 +306,13 @@ class BufferSlice : public BufferBase {
}
// Mutable pointer to the slice's bytes.
// On host: offset directly into the parent buffer's storage.
// Off host: the slice must have been Map()ed first; returns the mapped
// pointer (which already points at this slice's region).
// The stale MACE_NOT_IMPLEMENTED residue from the pre-merge version is
// removed — it made the real implementation unreachable.
void *raw_mutable_data() {
  if (OnHost()) {
    MACE_CHECK_NOTNULL(buffer_);
    return reinterpret_cast<char*>(buffer_->raw_mutable_data()) + offset_;
  } else {
    MACE_CHECK_NOTNULL(mapped_buf_);
    return mapped_buf_;
  }
}
void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
......@@ -317,7 +325,7 @@ class BufferSlice : public BufferBase {
// Map this slice's region of the parent buffer into host-accessible memory.
// Must not already be mapped. size_ is the slice length (length_ member was
// removed upstream), so only the slice's own bytes are mapped.
void Map(std::vector<size_t> *pitch) {
  MACE_CHECK_NOTNULL(buffer_);
  MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
  mapped_buf_ = buffer_->Map(offset_, size_, pitch);
}
void UnMap() {
......@@ -326,7 +334,10 @@ class BufferSlice : public BufferBase {
mapped_buf_ = nullptr;
}
// A slice is a fixed view into its parent buffer and cannot be resized;
// accept only a no-op "resize" to the current size so Tensor::Resize with
// an unchanged shape still works. (The duplicate pre-merge one-line
// definition is removed — two definitions would not compile.)
void Resize(index_t size) {
  MACE_CHECK(size == size_, "resize buffer slice from ", size_,
             " to ", size, " is illegal");
}
void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
......@@ -335,15 +346,58 @@ class BufferSlice : public BufferBase {
bool OnHost() const { return buffer_->OnHost(); }
// Zero-fill the slice's bytes (host pointer or mapped pointer, whichever
// raw_mutable_data() resolves to). The stale MACE_NOT_IMPLEMENTED residue
// is removed — it would have aborted before the memset.
void Clear() {
  memset(raw_mutable_data(), 0, size_);
}
private:
BufferBase *buffer_;
void *mapped_buf_;
index_t offset_;
index_t length_;
};
// Bump-pointer scratch buffer intended to be shared among ops: callers carve
// non-overlapping BufferSlices out of one underlying Buffer via Scratch(),
// and reset the write cursor with Rewind() before the next op reuses it.
class ScratchBuffer: public Buffer {
public:
// Empty scratch buffer; storage comes later via GrowSize()/Resize().
explicit ScratchBuffer(Allocator *allocator)
: Buffer(allocator),
offset_(0) {}
// Scratch buffer with |size| bytes allocated up front.
ScratchBuffer(Allocator *allocator, index_t size)
: Buffer(allocator, size),
offset_(0) {}
// Wrap pre-existing storage |data| of |size| bytes (ownership per Buffer's
// contract — not visible here; confirm against Buffer's ctor).
ScratchBuffer(Allocator *allocator, void *data, index_t size)
: Buffer(allocator, data, size),
offset_(0) {}
virtual ~ScratchBuffer() {}
// Ensure capacity is at least |size| bytes; never shrinks.
// NOTE(review): growing reallocates via Buffer::Resize, which presumably
// invalidates slices handed out earlier — callers should Rewind() first,
// as the NEON conv2d caller does.
void GrowSize(index_t size) {
if (size > size_) {
Resize(size);
}
}
// Carve the next |size| bytes off the buffer and return them as a slice.
// Aborts (MACE_CHECK) if the remaining capacity is insufficient.
BufferSlice Scratch(index_t size) {
MACE_CHECK(offset_ + size <= size_,
"scratch size not enough: ",
offset_,
" + ",
size,
" > ",
size_);
BufferSlice slice(this, offset_, size);
offset_ += size;
return slice;
}
// Reset the allocation cursor; previously returned slices become reusable
// storage for subsequent Scratch() calls.
void Rewind() {
offset_ = 0;
}
private:
// Next free byte offset within the underlying buffer.
index_t offset_;
};
} // namespace mace
#endif // MACE_CORE_BUFFER_H_
......@@ -12,6 +12,9 @@
namespace mace {
// Eagerly create the host-side (CPU allocator) scratch buffer; it is the
// only scratch buffer the workspace owns — GetScratchBuffer() returns
// nullptr for non-CPU/NEON device types.
Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer(
GetDeviceAllocator(DeviceType::CPU))) {}
Tensor *Workspace::CreateTensor(const std::string &name,
Allocator *alloc,
DataType type) {
......@@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
}
}
// Return the shared scratch buffer for |device_type|.
// CPU and NEON share the single host scratch buffer; any other device type
// has no workspace-managed scratch buffer and gets nullptr.
ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
  switch (device_type) {
    case CPU:
    case NEON:
      return host_scratch_buffer_.get();
    default:
      return nullptr;
  }
}
} // namespace mace
......@@ -20,7 +20,7 @@ class Workspace {
public:
typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
Workspace() {}
Workspace();
~Workspace() {}
Tensor *CreateTensor(const std::string &name,
......@@ -39,6 +39,8 @@ class Workspace {
void LoadModelTensor(const NetDef &net_def, DeviceType type);
ScratchBuffer *GetScratchBuffer(DeviceType device_type);
private:
void CreateImageOutputTensor(const NetDef &net_def);
......@@ -48,6 +50,8 @@ class Workspace {
PreallocatedPooledAllocator preallocated_allocator_;
std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
DISABLE_COPY_AND_ASSIGN(Workspace);
};
......
......@@ -154,17 +154,28 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
std::function<void(const float *input, float *output)> conv_func;
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
&& stride_w == 1
&& dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8) {
std::function<void(const float *input, float *output)> conv_func;
bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8;
bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
std::vector<index_t> transformed_input_shape;
std::vector<index_t> transformed_output_shape;
std::vector<index_t> transformed_filter_shape;
if (use_winograd) {
extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
......@@ -181,12 +192,90 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area =
(WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
transformed_input_.Resize({in_tile_area, batch, input_channels,
tile_count});
transformed_filter_.Resize({in_tile_area, channels, input_channels});
transformed_output_.Resize({in_tile_area, batch, channels, tile_count});
conv_func = [=](const float *pad_input, float *pad_output) {
transformed_input_shape.insert(transformed_input_shape.end(),
{in_tile_area, batch, input_channels,
tile_count});
transformed_output_shape.insert(transformed_output_shape.end(),
{in_tile_area, batch, channels,
tile_count});
transformed_filter_shape.insert(transformed_filter_shape.end(),
{in_tile_area, channels, input_channels});
} else if (use_neon_3x3_s1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_3x3_s2) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
}
// decide scratch size before allocate it
index_t total_scratch_size = 0;
index_t transformed_input_size = 0;
index_t transformed_output_size = 0;
index_t padded_input_size = 0;
index_t padded_output_size = 0;
if (use_winograd) {
transformed_input_size =
std::accumulate(transformed_input_shape.begin(),
transformed_input_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
transformed_output_size =
std::accumulate(transformed_output_shape.begin(),
transformed_output_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
total_scratch_size += transformed_input_size + transformed_output_size;
}
if (extra_input_height != input_height || extra_input_width != input_width) {
padded_input_size =
batch * input_channels * (input_height + pad_top + pad_bottom)
* (input_width + pad_left + pad_right) * sizeof(float);
total_scratch_size += padded_input_size;
}
if (extra_output_height != height || extra_output_width != width) {
padded_output_size =
batch * channels * extra_output_height * extra_output_width
* sizeof(float);
total_scratch_size += padded_output_size;
}
// Init scratch buffer
scratch_->Rewind();
scratch_->GrowSize(total_scratch_size);
Tensor transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
Tensor
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
// decide which convolution function to call
if (use_winograd) {
transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape);
if (!is_filter_transformed_) {
transformed_filter_.Resize(transformed_filter_shape);
}
conv_func = [&](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
filter_data,
batch,
......@@ -195,26 +284,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
input_channels,
channels,
WINOGRAD_OUT_TILE_SIZE,
transformed_input_.mutable_data<float>(),
transformed_input.mutable_data<float>(),
transformed_filter_.mutable_data<float>(),
transformed_output_.mutable_data<float>(),
transformed_output.mutable_data<float>(),
is_filter_transformed_,
pad_output);
is_filter_transformed_ = true;
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
......@@ -227,21 +304,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
channels,
pad_output);
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_3x3_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
......@@ -254,8 +317,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
channels,
pad_output);
};
} else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
} else if (use_neon_1x1_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(input_data,
filter_data,
......@@ -287,28 +349,27 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
};
}
// pad input and output
const Tensor *pad_input_ptr = input;
// Keep this alive during kernel execution
if (extra_input_height != input_height || extra_input_width != input_width) {
padded_input.Clear();
ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
pad_left,
pad_right,
&padded_input_);
pad_input_ptr = &padded_input_;
&padded_input);
pad_input_ptr = &padded_input;
}
const float *pad_input_data = pad_input_ptr->data<float>();
Tensor *pad_output_ptr = output;
// Keep this alive during kernel execution
if (extra_output_height != height || extra_output_width != width) {
std::vector<index_t> extra_output_shape
{batch, channels, extra_output_height, extra_output_width};
padded_output_.Resize(extra_output_shape);
padded_output_.Clear();
pad_output_ptr = &padded_output_;
padded_output.Resize({batch, channels, extra_output_height,
extra_output_width});
padded_output.Clear();
pad_output_ptr = &padded_output;
}
const float *pad_input_data = pad_input_ptr->data<float>();
float *pad_output_data = pad_output_ptr->mutable_data<float>();
conv_func(pad_input_data, pad_output_data);
......
......@@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
......@@ -422,14 +423,16 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
is_filter_transformed_(false) {}
is_filter_transformed_(false),
scratch_(scratch) {}
void operator()(const Tensor *input,
const Tensor *filter,
......@@ -437,13 +440,9 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
Tensor *output,
StatsFuture *future);
// TODO(liyin): share tmp buffers among ops
Tensor padded_input_;
Tensor padded_output_;
Tensor transformed_input_;
Tensor transformed_filter_;
Tensor transformed_output_;
bool is_filter_transformed_;
ScratchBuffer *scratch_;
};
template <typename T>
......@@ -453,7 +452,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
......
......@@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
this->paddings_,
this->dilations_.data(),
kernels::ActivationType::NOOP,
0.0f) {}
0.0f,
ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
......@@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
ws->GetScratchBuffer(D)) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册