Preallocate full size scratch buffer for variable-length models

b3524eb5 · 李寅 · 94b88a0c · b3524eb5 · b3524eb5 · b3524eb5
5 changed files
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -469,6 +469,7 @@ class ScratchBuffer: public Buffer {

  MaceStatus GrowSize(index_t size) {
    if (size > size_) {
+      VLOG(1) << "Grow scratch size to: " << size;
      MACE_CHECK(offset_ == 0, "scratch is being used, cannot grow size");
      return Resize(size);
    }

--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -117,6 +117,15 @@ class Operator : public OperatorBase {
        }
        outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor(
          output_str, context->device()->allocator(), output_type)));
+
+        if (i < operator_def.output_shape_size()) {
+          std::vector<index_t>
+              shape_configured(operator_def.output_shape(i).dims_size());
+          for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
+            shape_configured[dim] = operator_def.output_shape(i).dims(dim);
+          }
+          ws->GetTensor(output_str)->SetShapeConfigured(shape_configured);
+        }
      }
    }
  }

--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include <functional>
+#include <algorithm>

 #include "mace/core/buffer.h"
 #include "mace/core/preallocated_pooled_allocator.h"
@@ -159,6 +160,34 @@ class Tensor {

  inline const std::vector<index_t> &shape() const { return shape_; }

+  // Returns the element-wise maximum of the current shape and the shape
+  // stored by SetShapeConfigured(). When no configured shape is stored, the
+  // current shape is returned unchanged.
+  inline std::vector<index_t> max_shape() const {
+    if (shape_configured_.empty()) {
+      return shape();
+    }
+    const std::vector<index_t> &current = shape();
+    std::vector<index_t> upper_bound(current);
+    // The two shapes are compared dimension by dimension, so their ranks
+    // have to agree.
+    MACE_CHECK(current.size() == shape_configured_.size());
+    for (size_t d = 0; d < upper_bound.size(); ++d) {
+      if (upper_bound[d] < shape_configured_[d]) {
+        upper_bound[d] = shape_configured_[d];
+      }
+    }
+    return upper_bound;
+  }
+
+  // Returns the product of the dimensions of max_shape(); an empty shape
+  // yields 1.
+  inline index_t max_size() const {
+    const std::vector<index_t> dims = max_shape();
+    // std::accumulate takes its accumulator type from the initial value, so
+    // the seed must be an index_t: a plain `1` would make the accumulator an
+    // int and narrow every partial product if index_t is wider than int.
+    return std::accumulate(dims.begin(),
+                           dims.end(),
+                           static_cast<index_t>(1),
+                           std::multiplies<index_t>());
+  }
+
+  // Returns max_size() multiplied by SizeOfType() (presumably the size in
+  // bytes of a tensor of max_shape() -- confirm against SizeOfType()).
+  inline index_t raw_max_size() const {
+    const index_t element_count = max_size();
+    return element_count * SizeOfType();
+  }
+
+  // Stores a copy of `shape_configured`; max_shape() compares it against the
+  // current shape.
+  inline void SetShapeConfigured(const std::vector<index_t> &shape_configured) {
+    this->shape_configured_ = shape_configured;
+  }
+
  inline index_t dim_size() const { return shape_.size(); }

  inline index_t dim(unsigned int index) const {
@@ -431,6 +460,7 @@ class Tensor {
  Allocator *allocator_;
  DataType dtype_;
  std::vector<index_t> shape_;
+  std::vector<index_t> shape_configured_;
  std::vector<size_t> image_shape_;
  BufferBase *buffer_;
  BufferSlice buffer_slice_;

--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -340,6 +340,17 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
                       output_type);
        }
      }
+
+      for (int output_idx = 0; output_idx < op.output_shape_size();
+           ++output_idx) {
+        std::vector<index_t>
+            shape_configured(op.output_shape(output_idx).dims_size());
+        for (size_t dim = 0; dim < shape_configured.size(); ++dim) {
+          shape_configured[dim] = op.output_shape(output_idx).dims(dim);
+        }
+        tensor_map_[op.output(output_idx)]->SetShapeConfigured(
+            shape_configured);
+      }
    }
  }
  return MaceStatus::MACE_SUCCESS;

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -91,6 +91,14 @@ struct MatMulFunctor : OpKernel {

    auto scratch_buffer = context_->workspace()->GetScratchBuffer(D);
    scratch_buffer->Rewind();
+    index_t scratch_size = C->raw_max_size();
+    if (!A->is_weight()) {
+      scratch_size += A->raw_max_size();
+    }
+    if (!B->is_weight()) {
+      scratch_size += B->raw_max_size();
+    }
+    scratch_buffer->GrowSize(scratch_size);

    sgemm_.Run(a_ptr_base,
               b_ptr_base,