diff --git a/mace/core/buffer.h b/mace/core/buffer.h index b655fdc4b10857a7a446b1b7ffc859bf535427e9..08cbf1a9562c69e2344d25bf8d977f1b2a94ffea 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -6,6 +6,7 @@ #define MACE_CORE_BUFFER_H_ #include +#include #include #include "mace/core/allocator.h" @@ -161,12 +162,10 @@ class Buffer : public BufferBase { bool OnHost() const { return allocator_->OnHost(); } void Clear() { - if (buf_ != nullptr) { - memset(buf_, 0, size_); - } + memset(reinterpret_cast(raw_mutable_data()), 0, size_); } - private: + protected: Allocator *allocator_; void *buf_; void *mapped_buf_; @@ -267,19 +266,23 @@ class Image : public BufferBase { class BufferSlice : public BufferBase { public: BufferSlice() - : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {} + : BufferBase(0), buffer_(nullptr), mapped_buf_(nullptr), offset_(0) {} BufferSlice(BufferBase *buffer, index_t offset, index_t length) - : BufferBase(buffer->size()), - buffer_(buffer), - mapped_buf_(nullptr), - offset_(offset), - length_(length) { + : BufferBase(length), + buffer_(buffer), + mapped_buf_(nullptr), + offset_(offset) { MACE_CHECK(offset >= 0, "buffer slice offset should >= 0"); - MACE_CHECK(offset + length <= size_, "buffer slice offset + length (", - offset, " + ", length, ") should <= ", size_); + MACE_CHECK(offset + length <= buffer->size(), + "buffer slice offset + length (", + offset, + " + ", + length, + ") should <= ", + buffer->size()); } BufferSlice(const BufferSlice &other) - : BufferSlice(other.buffer_, other.offset_, other.length_) {} + : BufferSlice(other.buffer_, other.offset_, other.size_) {} ~BufferSlice() { if (buffer_ != nullptr && mapped_buf_ != nullptr) { @@ -303,8 +306,13 @@ class BufferSlice : public BufferBase { } void *raw_mutable_data() { - MACE_NOT_IMPLEMENTED; - return nullptr; + if (OnHost()) { + MACE_CHECK_NOTNULL(buffer_); + return reinterpret_cast(buffer_->raw_mutable_data()) + offset_; + } else { + MACE_CHECK_NOTNULL(mapped_buf_); + return mapped_buf_; + } } void *Map(index_t offset, index_t length, std::vector *pitch) const { @@ -317,7 +325,7 @@ class BufferSlice : public BufferBase { void Map(std::vector *pitch) { MACE_CHECK_NOTNULL(buffer_); MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null"); - mapped_buf_ = buffer_->Map(offset_, length_, pitch); + mapped_buf_ = buffer_->Map(offset_, size_, pitch); } void UnMap() { @@ -326,7 +334,10 @@ class BufferSlice : public BufferBase { mapped_buf_ = nullptr; } - void Resize(index_t size) { MACE_NOT_IMPLEMENTED; } + void Resize(index_t size) { + MACE_CHECK(size == size_, "resize buffer slice from ", size_, + " to ", size, " is illegal"); + } void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; } @@ -335,15 +346,58 @@ class BufferSlice : public BufferBase { bool OnHost() const { return buffer_->OnHost(); } void Clear() { - MACE_NOT_IMPLEMENTED; + memset(raw_mutable_data(), 0, size_); } private: BufferBase *buffer_; void *mapped_buf_; index_t offset_; - index_t length_; }; + +class ScratchBuffer: public Buffer { + public: + explicit ScratchBuffer(Allocator *allocator) + : Buffer(allocator), + offset_(0) {} + + ScratchBuffer(Allocator *allocator, index_t size) + : Buffer(allocator, size), + offset_(0) {} + + ScratchBuffer(Allocator *allocator, void *data, index_t size) + : Buffer(allocator, data, size), + offset_(0) {} + + virtual ~ScratchBuffer() {} + + void GrowSize(index_t size) { + if (size > size_) { + Resize(size); + } + } + + BufferSlice Scratch(index_t size) { + MACE_CHECK(offset_ + size <= size_, + "scratch size not enough: ", + offset_, + " + ", + size, + " > ", + size_); + BufferSlice slice(this, offset_, size); + offset_ += size; + return slice; + } + + void Rewind() { + offset_ = 0; + } + + private: + index_t offset_; +}; + } // namespace mace #endif // MACE_CORE_BUFFER_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 227c99737c1fc766c6a8fe0944ce6ea5b84cacc3..d068cbd8920ab2b155f05f07ea776925c2d75813 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -12,6 +12,9 @@ namespace mace { +Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer( + GetDeviceAllocator(DeviceType::CPU))) {} + Tensor *Workspace::CreateTensor(const std::string &name, Allocator *alloc, DataType type) { @@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) { } } +ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) { + if (device_type == CPU || device_type == NEON) { + return host_scratch_buffer_.get(); + } else { + return nullptr; + } +} + } // namespace mace diff --git a/mace/core/workspace.h b/mace/core/workspace.h index 1e1012672c30d388fe34ff645b50ed36a292c16b..c918a694efa8fe3837dc977f792122436bf74119 100644 --- a/mace/core/workspace.h +++ b/mace/core/workspace.h @@ -20,7 +20,7 @@ class Workspace { public: typedef std::map> TensorMap; - Workspace() {} + Workspace(); ~Workspace() {} Tensor *CreateTensor(const std::string &name, @@ -39,6 +39,8 @@ class Workspace { void LoadModelTensor(const NetDef &net_def, DeviceType type); + ScratchBuffer *GetScratchBuffer(DeviceType device_type); + private: void CreateImageOutputTensor(const NetDef &net_def); @@ -48,6 +50,8 @@ class Workspace { PreallocatedPooledAllocator preallocated_allocator_; + std::unique_ptr host_scratch_buffer_; + DISABLE_COPY_AND_ASSIGN(Workspace); }; diff --git a/mace/kernels/arm/conv_2d.cc b/mace/kernels/arm/conv_2d.cc index 7fc16cda27b4c3c93d491aa9caf6372247df6e96..04b79abc46922ebd258fb091fab98b7db088c8c4 100644 --- a/mace/kernels/arm/conv_2d.cc +++ b/mace/kernels/arm/conv_2d.cc @@ -154,17 +154,28 @@ void Conv2dFunctor::operator()(const Tensor *input, int pad_left = paddings[1] >> 1; int pad_right = paddings[1] - pad_left; - std::function conv_func; - auto input_data = input->data(); auto filter_data = filter->data(); auto bias_data = bias == nullptr ? nullptr : bias->data(); auto output_data = output->mutable_data(); - if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1 - && stride_w == 1 - && dilation_h == 1 && dilation_w == 1 - && input_channels >= 8 && channels >= 8) { + std::function conv_func; + + bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 + && input_channels >= 8 && channels >= 8; + bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 + && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; + bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + + std::vector transformed_input_shape; + std::vector transformed_output_shape; + std::vector transformed_filter_shape; + + if (use_winograd) { extra_output_height = RoundUp(height, WINOGRAD_OUT_TILE_SIZE); extra_input_height = std::max(padded_input_height, extra_output_height + 2); extra_output_width = RoundUp(width, WINOGRAD_OUT_TILE_SIZE); @@ -181,12 +192,90 @@ void Conv2dFunctor::operator()(const Tensor *input, index_t tile_count = tile_height_count * tile_width_count; index_t in_tile_area = (WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2); - transformed_input_.Resize({in_tile_area, batch, input_channels, - tile_count}); - transformed_filter_.Resize({in_tile_area, channels, input_channels}); - transformed_output_.Resize({in_tile_area, batch, channels, tile_count}); - conv_func = [=](const float *pad_input, float *pad_output) { + transformed_input_shape.insert(transformed_input_shape.end(), + {in_tile_area, batch, input_channels, + tile_count}); + transformed_output_shape.insert(transformed_output_shape.end(), + {in_tile_area, batch, channels, + tile_count}); + transformed_filter_shape.insert(transformed_filter_shape.end(), + {in_tile_area, channels, input_channels}); + } else if (use_neon_3x3_s1) { + extra_output_height = RoundUp(height, 2); + extra_input_height = std::max(padded_input_height, extra_output_height + 2); + extra_output_width = RoundUp(width, 4); + extra_input_width = std::max(padded_input_width, extra_output_width + 2); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + } else if (use_neon_3x3_s2) { + extra_output_height = height; + extra_input_height = + std::max(padded_input_height, (extra_output_height - 1) * 2 + 3); + extra_output_width = RoundUp(width, 4); + extra_input_width = + std::max(padded_input_width, (extra_output_width - 1) * 2 + 3); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + } + + // decide scratch size before allocate it + index_t total_scratch_size = 0; + index_t transformed_input_size = 0; + index_t transformed_output_size = 0; + index_t padded_input_size = 0; + index_t padded_output_size = 0; + if (use_winograd) { + transformed_input_size = + std::accumulate(transformed_input_shape.begin(), + transformed_input_shape.end(), + 1, + std::multiplies()) * sizeof(float); + transformed_output_size = + std::accumulate(transformed_output_shape.begin(), + transformed_output_shape.end(), + 1, + std::multiplies()) * sizeof(float); + total_scratch_size += transformed_input_size + transformed_output_size; + } + if (extra_input_height != input_height || extra_input_width != input_width) { + padded_input_size = + batch * input_channels * (input_height + pad_top + pad_bottom) + * (input_width + pad_left + pad_right) * sizeof(float); + total_scratch_size += padded_input_size; + } + if (extra_output_height != height || extra_output_width != width) { + padded_output_size = + batch * channels * extra_output_height * extra_output_width + * sizeof(float); + total_scratch_size += padded_output_size; + } + // Init scratch buffer + scratch_->Rewind(); + scratch_->GrowSize(total_scratch_size); + Tensor transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT); + Tensor + transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT); + Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT); + Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT); + + // decide which convolution function to call + if (use_winograd) { + transformed_input.Resize(transformed_input_shape); + transformed_output.Resize(transformed_output_shape); + if (!is_filter_transformed_) { + transformed_filter_.Resize(transformed_filter_shape); + } + + conv_func = [&](const float *pad_input, float *pad_output) { WinoGradConv3x3s1(pad_input, filter_data, batch, @@ -195,26 +284,14 @@ void Conv2dFunctor::operator()(const Tensor *input, input_channels, channels, WINOGRAD_OUT_TILE_SIZE, - transformed_input_.mutable_data(), + transformed_input.mutable_data(), transformed_filter_.mutable_data(), - transformed_output_.mutable_data(), + transformed_output.mutable_data(), is_filter_transformed_, pad_output); is_filter_transformed_ = true; }; - } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { - extra_output_height = RoundUp(height, 2); - extra_input_height = std::max(padded_input_height, extra_output_height + 2); - extra_output_width = RoundUp(width, 4); - extra_input_width = std::max(padded_input_width, extra_output_width + 2); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - + } else if (use_neon_3x3_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S1(pad_input, filter_data, @@ -227,21 +304,7 @@ void Conv2dFunctor::operator()(const Tensor *input, channels, pad_output); }; - } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 - && dilation_h == 1 && dilation_w == 1) { - extra_output_height = height; - extra_input_height = - std::max(padded_input_height, (extra_output_height - 1) * 2 + 3); - extra_output_width = RoundUp(width, 4); - extra_input_width = - std::max(padded_input_width, (extra_output_width - 1) * 2 + 3); - if (extra_input_height != padded_input_height) { - pad_bottom += (extra_input_height - padded_input_height); - } - if (extra_input_width != padded_input_width) { - pad_right += (extra_input_width - padded_input_width); - } - + } else if (use_neon_3x3_s2) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S2(pad_input, filter_data, @@ -254,8 +317,7 @@ void Conv2dFunctor::operator()(const Tensor *input, channels, pad_output); }; - } else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { + } else if (use_neon_1x1_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK1x1S1(input_data, filter_data, @@ -287,28 +349,27 @@ void Conv2dFunctor::operator()(const Tensor *input, }; } + // pad input and output const Tensor *pad_input_ptr = input; - // Keep this alive during kernel execution if (extra_input_height != input_height || extra_input_width != input_width) { + padded_input.Clear(); ConstructNCHWInputWithSpecificPadding(input, pad_top, pad_bottom, pad_left, pad_right, - &padded_input_); - pad_input_ptr = &padded_input_; + &padded_input); + pad_input_ptr = &padded_input; } - const float *pad_input_data = pad_input_ptr->data(); Tensor *pad_output_ptr = output; - // Keep this alive during kernel execution if (extra_output_height != height || extra_output_width != width) { - std::vector extra_output_shape - {batch, channels, extra_output_height, extra_output_width}; - padded_output_.Resize(extra_output_shape); - padded_output_.Clear(); - pad_output_ptr = &padded_output_; + padded_output.Resize({batch, channels, extra_output_height, + extra_output_width}); + padded_output.Clear(); + pad_output_ptr = &padded_output; } + const float *pad_input_data = pad_input_ptr->data(); float *pad_output_data = pad_output_ptr->mutable_data(); conv_func(pad_input_data, pad_output_data); diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 6833b9b66449f70f38ef56779722fbd67b06eaa7..9a8f7c758679616218fcb5b006e4fa8226da0263 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const std::vector &paddings, const int *dilations, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + ScratchBuffer *scratch) : Conv2dFunctorBase(strides, padding_type, paddings, @@ -422,14 +423,16 @@ struct Conv2dFunctor : Conv2dFunctorBase { const std::vector &paddings, const int *dilations, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + ScratchBuffer *scratch) : Conv2dFunctorBase(strides, padding_type, paddings, dilations, activation, relux_max_limit), - is_filter_transformed_(false) {} + is_filter_transformed_(false), + scratch_(scratch) {} void operator()(const Tensor *input, const Tensor *filter, @@ -437,13 +440,9 @@ struct Conv2dFunctor : Conv2dFunctorBase { Tensor *output, StatsFuture *future); - // TODO(liyin): share tmp buffers among ops - Tensor padded_input_; - Tensor padded_output_; - Tensor transformed_input_; Tensor transformed_filter_; - Tensor transformed_output_; bool is_filter_transformed_; + ScratchBuffer *scratch_; }; template @@ -453,7 +452,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { const std::vector &paddings, const int *dilations, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + ScratchBuffer *scratch) : Conv2dFunctorBase(strides, padding_type, paddings, diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h index 528f1e1f67c32b037262e0019847eb25fdc62a4c..33758808dde5a145c7e575c29268716f766ac5ed 100644 --- a/mace/ops/conv_2d.h +++ b/mace/ops/conv_2d.h @@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase { this->paddings_, this->dilations_.data(), kernels::ActivationType::NOOP, - 0.0f) {} + 0.0f, + ws->GetScratchBuffer(D)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h index db9c6e3a48920c1346b12ac2943d42f940fb6c8f..29df5913f680ed712596b99b58d6455339144df8 100644 --- a/mace/ops/fused_conv_2d.h +++ b/mace/ops/fused_conv_2d.h @@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase { kernels::StringToActivationType( OperatorBase::GetSingleArgument("activation", "NOOP")), - OperatorBase::GetSingleArgument("max_limit", 0.0f)) {} + OperatorBase::GetSingleArgument("max_limit", 0.0f), + ws->GetScratchBuffer(D)) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT);