Merge branch 'share_buffer' into 'master'

Share tmp buffer among ops. See merge request !379

Merge branch 'share_buffer' into 'master'
Share tmp buffer among ops. See merge request !379
db3ad39f · 吴承辉 · a0a7849e · 76521e98 · db3ad39f · db3ad39f
7 changed files
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -6,6 +6,7 @@
 #define MACE_CORE_BUFFER_H_
 #include <vector>
+#include <algorithm>
 #include <functional>
 #include "mace/core/allocator.h"
@@ -161,12 +162,10 @@ class Buffer : public BufferBase {
  bool OnHost() const { return allocator_->OnHost(); }
  void Clear() {
-    if (buf_ != nullptr) {
+    memset(reinterpret_cast<char*>(raw_mutable_data()), 0, size_);
-      memset(buf_, 0, size_);
-    }
  }
- private:
+ protected:
  Allocator *allocator_;
  void *buf_;
  void *mapped_buf_;
@@ -267,19 +266,23 @@ class Image : public BufferBase {
 class BufferSlice : public BufferBase {
 public:
  BufferSlice()
-      : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
+      : BufferBase(0), buffer_(nullptr), mapped_buf_(nullptr), offset_(0) {}
  BufferSlice(BufferBase *buffer, index_t offset, index_t length)
-      : BufferBase(buffer->size()),
+    : BufferBase(length),
      buffer_(buffer),
      mapped_buf_(nullptr),
-        offset_(offset),
+      offset_(offset) {
-        length_(length) {
    MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
-    MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
+    MACE_CHECK(offset + length <= buffer->size(),
-               offset, " + ", length, ") should <= ", size_);
+               "buffer slice offset + length (",
+               offset,
+               " + ",
+               length,
+               ") should <= ",
+               buffer->size());
  }
  BufferSlice(const BufferSlice &other)
-      : BufferSlice(other.buffer_, other.offset_, other.length_) {}
+      : BufferSlice(other.buffer_, other.offset_, other.size_) {}
  ~BufferSlice() {
    if (buffer_ != nullptr && mapped_buf_ != nullptr) {
@@ -303,8 +306,13 @@ class BufferSlice : public BufferBase {
  }
  void *raw_mutable_data() {
-    MACE_NOT_IMPLEMENTED;
+    if (OnHost()) {
-    return nullptr;
+      MACE_CHECK_NOTNULL(buffer_);
+      return reinterpret_cast<char*>(buffer_->raw_mutable_data()) + offset_;
+    } else {
+      MACE_CHECK_NOTNULL(mapped_buf_);
+      return mapped_buf_;
+    }
  }
  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
@@ -317,7 +325,7 @@ class BufferSlice : public BufferBase {
  void Map(std::vector<size_t> *pitch) {
    MACE_CHECK_NOTNULL(buffer_);
    MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
-    mapped_buf_ = buffer_->Map(offset_, length_, pitch);
+    mapped_buf_ = buffer_->Map(offset_, size_, pitch);
  }
  void UnMap() {
@@ -326,7 +334,10 @@ class BufferSlice : public BufferBase {
    mapped_buf_ = nullptr;
  }
-  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
+  void Resize(index_t size) {  // a slice cannot change size
+    MACE_CHECK(size == size_, "resize buffer slice from ", size_,  // only size == size_ passes
+      " to ", size, " is illegal");
+  }
  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
@@ -335,15 +346,58 @@ class BufferSlice : public BufferBase {
  bool OnHost() const { return buffer_->OnHost(); }
  void Clear() {
-    MACE_NOT_IMPLEMENTED;
+    memset(raw_mutable_data(), 0, size_);
  }
 private:
  BufferBase *buffer_;
  void *mapped_buf_;
  index_t offset_;
-  index_t length_;
 };
+class ScratchBuffer: public Buffer {  // Buffer that hands out BufferSlices from a moving offset
+ public:
+  explicit ScratchBuffer(Allocator *allocator)  // each constructor forwards to the matching Buffer constructor
+    : Buffer(allocator),
+      offset_(0) {}  // slices start at offset 0
+  ScratchBuffer(Allocator *allocator, index_t size)
+    : Buffer(allocator, size),
+      offset_(0) {}
+  ScratchBuffer(Allocator *allocator, void *data, index_t size)
+    : Buffer(allocator, data, size),
+      offset_(0) {}
+  virtual ~ScratchBuffer() {}
+  void GrowSize(index_t size) {  // grow-only: never resizes to a smaller size
+    if (size > size_) {  // Resize is skipped when size_ is already >= size
+      Resize(size);
+    }
+  }
+  BufferSlice Scratch(index_t size) {  // returns the next `size`-long slice and advances offset_
+    MACE_CHECK(offset_ + size <= size_,  // the requested slice must end within size_
+               "scratch size not enough: ",
+               offset_,
+               " + ",
+               size,
+               " > ",
+               size_);
+    BufferSlice slice(this, offset_, size);  // slice refers to this buffer at [offset_, offset_ + size)
+    offset_ += size;  // the next Scratch() call starts right after this slice
+    return slice;
+  }
+  void Rewind() {  // makes the next Scratch() start at offset 0 again
+    offset_ = 0;
+  }
+ private:
+  index_t offset_;  // start position of the next slice returned by Scratch()
+};
 }  // namespace mace
 #endif  // MACE_CORE_BUFFER_H_
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 namespace mace {
+Workspace::Workspace() : host_scratch_buffer_(new ScratchBuffer(  // workspace owns one host ScratchBuffer
+  GetDeviceAllocator(DeviceType::CPU))) {}  // constructed with the CPU device allocator
 Tensor *Workspace::CreateTensor(const std::string &name,
                                Allocator *alloc,
                                DataType type) {
@@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  }
 }
+ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {  // non-owning pointer, or nullptr
+  if (device_type == CPU || device_type == NEON) {  // CPU and NEON share the single host scratch buffer
+    return host_scratch_buffer_.get();
+  } else {
+    return nullptr;  // no scratch buffer exists for any other device type
+  }
+}
 }  // namespace mace
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -20,7 +20,7 @@ class Workspace {
 public:
  typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
-  Workspace() {}
+  Workspace();
  ~Workspace() {}
  Tensor *CreateTensor(const std::string &name,
@@ -39,6 +39,8 @@ class Workspace {
  void LoadModelTensor(const NetDef &net_def, DeviceType type);
+  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
 private:
  void CreateImageOutputTensor(const NetDef &net_def);
@@ -48,6 +50,8 @@ class Workspace {
  PreallocatedPooledAllocator preallocated_allocator_;
+  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
  DISABLE_COPY_AND_ASSIGN(Workspace);
 };

--- a/mace/kernels/arm/conv_2d.cc
+++ b/mace/kernels/arm/conv_2d.cc
@@ -154,17 +154,28 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
  int pad_left = paddings[1] >> 1;
  int pad_right = paddings[1] - pad_left;
-  std::function<void(const float *input, float *output)> conv_func;
  auto input_data = input->data<float>();
  auto filter_data = filter->data<float>();
  auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
  auto output_data = output->mutable_data<float>();
-  if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
+  std::function<void(const float *input, float *output)> conv_func;
-    && stride_w == 1
-    && dilation_h == 1 && dilation_w == 1
+  bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3
-    && input_channels >= 8 && channels >= 8) {
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
+    && input_channels >= 8 && channels >= 8;
+  bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
+  bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
+    && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
+  bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
+  std::vector<index_t> transformed_input_shape;
+  std::vector<index_t> transformed_output_shape;
+  std::vector<index_t> transformed_filter_shape;
+  if (use_winograd) {
    extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
    extra_input_height = std::max(padded_input_height, extra_output_height + 2);
    extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
@@ -181,12 +192,90 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
    index_t tile_count = tile_height_count * tile_width_count;
    index_t in_tile_area =
      (WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
-    transformed_input_.Resize({in_tile_area, batch, input_channels,
+    transformed_input_shape.insert(transformed_input_shape.end(),
+                                   {in_tile_area, batch, input_channels,
+                                    tile_count});
+    transformed_output_shape.insert(transformed_output_shape.end(),
+                                    {in_tile_area, batch, channels,
                                     tile_count});
-    transformed_filter_.Resize({in_tile_area, channels, input_channels});
+    transformed_filter_shape.insert(transformed_filter_shape.end(),
-    transformed_output_.Resize({in_tile_area, batch, channels, tile_count});
+                                    {in_tile_area, channels, input_channels});
+  } else if (use_neon_3x3_s1) {
+    extra_output_height = RoundUp<index_t>(height, 2);
+    extra_input_height = std::max(padded_input_height, extra_output_height + 2);
+    extra_output_width = RoundUp<index_t>(width, 4);
+    extra_input_width = std::max(padded_input_width, extra_output_width + 2);
+    if (extra_input_height != padded_input_height) {
+      pad_bottom += (extra_input_height - padded_input_height);
+    }
+    if (extra_input_width != padded_input_width) {
+      pad_right += (extra_input_width - padded_input_width);
+    }
+  } else if (use_neon_3x3_s2) {
+    extra_output_height = height;
+    extra_input_height =
+      std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
+    extra_output_width = RoundUp<index_t>(width, 4);
+    extra_input_width =
+      std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
+    if (extra_input_height != padded_input_height) {
+      pad_bottom += (extra_input_height - padded_input_height);
+    }
+    if (extra_input_width != padded_input_width) {
+      pad_right += (extra_input_width - padded_input_width);
+    }
+  }
-    conv_func = [=](const float *pad_input, float *pad_output) {
+  // decide scratch size before allocate it
+  index_t total_scratch_size = 0;
+  index_t transformed_input_size = 0;
+  index_t transformed_output_size = 0;
+  index_t padded_input_size = 0;
+  index_t padded_output_size = 0;
+  if (use_winograd) {
+    transformed_input_size =
+      std::accumulate(transformed_input_shape.begin(),
+                      transformed_input_shape.end(),
+                      1,
+                      std::multiplies<index_t>()) * sizeof(float);
+    transformed_output_size =
+      std::accumulate(transformed_output_shape.begin(),
+                      transformed_output_shape.end(),
+                      1,
+                      std::multiplies<index_t>()) * sizeof(float);
+    total_scratch_size += transformed_input_size + transformed_output_size;
+  }
+  if (extra_input_height != input_height || extra_input_width != input_width) {
+    padded_input_size =
+      batch * input_channels * (input_height + pad_top + pad_bottom)
+        * (input_width + pad_left + pad_right) * sizeof(float);
+    total_scratch_size += padded_input_size;
+  }
+  if (extra_output_height != height || extra_output_width != width) {
+    padded_output_size =
+      batch * channels * extra_output_height * extra_output_width
+        * sizeof(float);
+    total_scratch_size += padded_output_size;
+  }
+  // Init scratch buffer
+  scratch_->Rewind();
+  scratch_->GrowSize(total_scratch_size);
+  Tensor transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
+  Tensor
+    transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
+  Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
+  Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+  // decide which convolution function to call
+  if (use_winograd) {
+    transformed_input.Resize(transformed_input_shape);
+    transformed_output.Resize(transformed_output_shape);
+    if (!is_filter_transformed_) {
+      transformed_filter_.Resize(transformed_filter_shape);
+    }
+    conv_func = [&](const float *pad_input, float *pad_output) {
      WinoGradConv3x3s1(pad_input,
                        filter_data,
                        batch,
@@ -195,26 +284,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                        input_channels,
                        channels,
                        WINOGRAD_OUT_TILE_SIZE,
-                        transformed_input_.mutable_data<float>(),
+                        transformed_input.mutable_data<float>(),
                        transformed_filter_.mutable_data<float>(),
-                        transformed_output_.mutable_data<float>(),
+                        transformed_output.mutable_data<float>(),
                        is_filter_transformed_,
                        pad_output);
      is_filter_transformed_ = true;
    };
-  } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
+  } else if (use_neon_3x3_s1) {
-    && dilation_h == 1 && dilation_w == 1) {
-    extra_output_height = RoundUp<index_t>(height, 2);
-    extra_input_height = std::max(padded_input_height, extra_output_height + 2);
-    extra_output_width = RoundUp<index_t>(width, 4);
-    extra_input_width = std::max(padded_input_width, extra_output_width + 2);
-    if (extra_input_height != padded_input_height) {
-      pad_bottom += (extra_input_height - padded_input_height);
-    }
-    if (extra_input_width != padded_input_width) {
-      pad_right += (extra_input_width - padded_input_width);
-    }
    conv_func = [=](const float *pad_input, float *pad_output) {
      Conv2dNeonK3x3S1(pad_input,
                       filter_data,
@@ -227,21 +304,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                       channels,
                       pad_output);
    };
-  } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
+  } else if (use_neon_3x3_s2) {
-    && dilation_h == 1 && dilation_w == 1) {
-    extra_output_height = height;
-    extra_input_height =
-      std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
-    extra_output_width = RoundUp<index_t>(width, 4);
-    extra_input_width =
-      std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
-    if (extra_input_height != padded_input_height) {
-      pad_bottom += (extra_input_height - padded_input_height);
-    }
-    if (extra_input_width != padded_input_width) {
-      pad_right += (extra_input_width - padded_input_width);
-    }
    conv_func = [=](const float *pad_input, float *pad_output) {
      Conv2dNeonK3x3S2(pad_input,
                       filter_data,
@@ -254,8 +317,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                       channels,
                       pad_output);
    };
-  } else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
+  } else if (use_neon_1x1_s1) {
-    && dilation_h == 1 && dilation_w == 1) {
    conv_func = [=](const float *pad_input, float *pad_output) {
      Conv2dNeonK1x1S1(input_data,
                       filter_data,
@@ -287,28 +349,27 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
    };
  }
+  // pad input and output
  const Tensor *pad_input_ptr = input;
-  // Keep this alive during kernel execution
  if (extra_input_height != input_height || extra_input_width != input_width) {
+    padded_input.Clear();
    ConstructNCHWInputWithSpecificPadding(input,
                                          pad_top,
                                          pad_bottom,
                                          pad_left,
                                          pad_right,
-                                          &padded_input_);
+                                          &padded_input);
-    pad_input_ptr = &padded_input_;
+    pad_input_ptr = &padded_input;
  }
-  const float *pad_input_data = pad_input_ptr->data<float>();
  Tensor *pad_output_ptr = output;
-  // Keep this alive during kernel execution
  if (extra_output_height != height || extra_output_width != width) {
-    std::vector<index_t> extra_output_shape
+    padded_output.Resize({batch, channels, extra_output_height,
-      {batch, channels, extra_output_height, extra_output_width};
+                           extra_output_width});
-    padded_output_.Resize(extra_output_shape);
+    padded_output.Clear();
-    padded_output_.Clear();
+    pad_output_ptr = &padded_output;
-    pad_output_ptr = &padded_output_;
  }
+  const float *pad_input_data = pad_input_ptr->data<float>();
  float *pad_output_data = pad_output_ptr->mutable_data<float>();
  conv_func(pad_input_data, pad_output_data);

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                const std::vector<int> &paddings,
                const int *dilations,
                const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
      : Conv2dFunctorBase(strides,
                          padding_type,
                          paddings,
@@ -422,14 +423,16 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
                const std::vector<int> &paddings,
                const int *dilations,
                const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
    : Conv2dFunctorBase(strides,
                        padding_type,
                        paddings,
                        dilations,
                        activation,
                        relux_max_limit),
-      is_filter_transformed_(false) {}
+      is_filter_transformed_(false),
+      scratch_(scratch) {}
  void operator()(const Tensor *input,
                  const Tensor *filter,
@@ -437,13 +440,9 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
                  Tensor *output,
                  StatsFuture *future);
-  // TODO(liyin): share tmp buffers among ops
-  Tensor padded_input_;
-  Tensor padded_output_;
-  Tensor transformed_input_;
  Tensor transformed_filter_;
-  Tensor transformed_output_;
  bool is_filter_transformed_;
+  ScratchBuffer *scratch_;
 };
 template <typename T>
@@ -453,7 +452,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                const std::vector<int> &paddings,
                const int *dilations,
                const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
      : Conv2dFunctorBase(strides,
                          padding_type,
                          paddings,

--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
                 this->paddings_,
                 this->dilations_.data(),
                 kernels::ActivationType::NOOP,
-                 0.0f) {}
+                 0.0f,
+                 ws->GetScratchBuffer(D)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/fused_conv_2d.h
+++ b/mace/ops/fused_conv_2d.h
@@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 ws->GetScratchBuffer(D)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);