diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index b655fdc4b10857a7a446b1b7ffc859bf535427e9..08cbf1a9562c69e2344d25bf8d977f1b2a94ffea 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -6,6 +6,7 @@
 #define MACE_CORE_BUFFER_H_
 
 #include <vector>
+#include <algorithm>
 #include <functional>
 
 #include "mace/core/allocator.h"
@@ -161,12 +162,10 @@ class Buffer : public BufferBase {
   bool OnHost() const { return allocator_->OnHost(); }
 
   void Clear() {
-    if (buf_ != nullptr) {
-      memset(buf_, 0, size_);
-    }
+    memset(raw_mutable_data(), 0, size_);  // void* needs no cast for memset
   }
 
- private:
+ protected:
   Allocator *allocator_;
   void *buf_;
   void *mapped_buf_;
@@ -267,19 +266,23 @@ class Image : public BufferBase {
 class BufferSlice : public BufferBase {
  public:
   BufferSlice()
-      : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
+      : BufferBase(0), buffer_(nullptr), mapped_buf_(nullptr), offset_(0) {}
   BufferSlice(BufferBase *buffer, index_t offset, index_t length)
-      : BufferBase(buffer->size()),
-        buffer_(buffer),
-        mapped_buf_(nullptr),
-        offset_(offset),
-        length_(length) {
+    : BufferBase(length),  // size_ is the slice length, not the parent's
+      buffer_(buffer),
+      mapped_buf_(nullptr),
+      offset_(offset) {
     MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
-    MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
-               offset, " + ", length, ") should <= ", size_);
+    // Reject a null parent or negative length up front; the bound below is
+    // checked against the parent's size because size_ is the slice length.
+    MACE_CHECK_NOTNULL(buffer);
+    MACE_CHECK(length >= 0, "buffer slice length should >= 0");
+    MACE_CHECK(offset + length <= buffer->size(),
+               "buffer slice offset + length (", offset, " + ", length,
+               ") should <= ", buffer->size());
   }
   BufferSlice(const BufferSlice &other)
-      : BufferSlice(other.buffer_, other.offset_, other.length_) {}
+      : BufferSlice(other.buffer_, other.offset_, other.size_) {}
 
   ~BufferSlice() {
     if (buffer_ != nullptr && mapped_buf_ != nullptr) {
@@ -303,8 +306,13 @@ class BufferSlice : public BufferBase {
   }
 
   void *raw_mutable_data() {
-    MACE_NOT_IMPLEMENTED;
-    return nullptr;
+    // OnHost() dereferences buffer_, so validate it before branching.
+    MACE_CHECK_NOTNULL(buffer_);
+    if (OnHost()) {
+      return reinterpret_cast<char*>(buffer_->raw_mutable_data()) + offset_;
+    }
+    MACE_CHECK_NOTNULL(mapped_buf_);  // device path: Map() must come first
+    return mapped_buf_;
   }
 
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
@@ -317,7 +325,7 @@ class BufferSlice : public BufferBase {
   void Map(std::vector<size_t> *pitch) {
     MACE_CHECK_NOTNULL(buffer_);
     MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
-    mapped_buf_ = buffer_->Map(offset_, length_, pitch);
+    mapped_buf_ = buffer_->Map(offset_, size_, pitch);
   }
 
   void UnMap() {
@@ -326,7 +334,10 @@ class BufferSlice : public BufferBase {
     mapped_buf_ = nullptr;
   }
 
-  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
+  void Resize(index_t size) {  // fixed-size view: only a no-op is legal
+    MACE_CHECK(size == size_,
+               "resize buffer slice from ", size_, " to ", size, " is illegal");
+  }
 
   void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
 
@@ -335,15 +346,58 @@ class BufferSlice : public BufferBase {
   bool OnHost() const { return buffer_->OnHost(); }
 
   void Clear() {
-    MACE_NOT_IMPLEMENTED;
+    memset(raw_mutable_data(), 0, size_);  // zeroes only this slice's bytes
   }
 
  private:
   BufferBase *buffer_;
   void *mapped_buf_;
   index_t offset_;
-  index_t length_;
 };
+
+// Scratch arena layered on Buffer: Scratch() hands out consecutive
+// BufferSlice views starting at offset_, and Rewind() makes the whole buffer
+// available again. Slices reference this buffer through a raw pointer, so
+// they must not outlive it.
+class ScratchBuffer: public Buffer {
+ public:
+  explicit ScratchBuffer(Allocator *allocator)
+    : Buffer(allocator), offset_(0) {}
+
+  ScratchBuffer(Allocator *allocator, index_t size)
+    : Buffer(allocator, size), offset_(0) {}
+
+  ScratchBuffer(Allocator *allocator, void *data, index_t size)
+    : Buffer(allocator, data, size), offset_(0) {}
+
+  virtual ~ScratchBuffer() {}
+
+  // Ensures the buffer holds at least `size` bytes; never shrinks.
+  void GrowSize(index_t size) {
+    if (size > size_) {
+      Resize(size);
+    }
+  }
+
+  // Carves the next `size` bytes out of the buffer and advances the cursor.
+  // Fails a MACE_CHECK when `size` is negative or exceeds what is left.
+  BufferSlice Scratch(index_t size) {
+    // A negative size would move the cursor backwards and alias live slices.
+    MACE_CHECK(size >= 0, "scratch size should >= 0, got ", size);
+    MACE_CHECK(offset_ + size <= size_,
+               "scratch size not enough: ", offset_, " + ", size, " > ", size_);
+    BufferSlice slice(this, offset_, size);
+    offset_ += size;
+    return slice;
+  }
+
+  // Resets the cursor; later slices will overlap ones handed out earlier.
+  void Rewind() { offset_ = 0; }
+
+ private:
+  index_t offset_;  // start of the not-yet-handed-out region
+};
+
 }  // namespace mace
 
 #endif  // MACE_CORE_BUFFER_H_
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 227c99737c1fc766c6a8fe0944ce6ea5b84cacc3..d068cbd8920ab2b155f05f07ea776925c2d75813 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -12,6 +12,9 @@
 
 namespace mace {
 
+Workspace::Workspace() : host_scratch_buffer_(  // host (CPU) scratch arena
+  new ScratchBuffer(GetDeviceAllocator(DeviceType::CPU))) {}
+
 Tensor *Workspace::CreateTensor(const std::string &name,
                                 Allocator *alloc,
                                 DataType type) {
@@ -159,4 +162,12 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
   }
 }
 
+// Returns the workspace's host scratch buffer for CPU and NEON; every other
+// device type has no scratch buffer and gets nullptr.
+ScratchBuffer *Workspace::GetScratchBuffer(DeviceType device_type) {
+  const bool runs_on_host = device_type == CPU || device_type == NEON;
+  // The workspace keeps ownership; callers only borrow the pointer.
+  return runs_on_host ? host_scratch_buffer_.get() : nullptr;
+}
+
 }  // namespace mace
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 1e1012672c30d388fe34ff645b50ed36a292c16b..c918a694efa8fe3837dc977f792122436bf74119 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -20,7 +20,7 @@ class Workspace {
  public:
   typedef std::map<std::string, std::unique_ptr<Tensor>> TensorMap;
 
-  Workspace() {}
+  Workspace();
   ~Workspace() {}
 
   Tensor *CreateTensor(const std::string &name,
@@ -39,6 +39,8 @@ class Workspace {
 
   void LoadModelTensor(const NetDef &net_def, DeviceType type);
 
+  ScratchBuffer *GetScratchBuffer(DeviceType device_type);
+
  private:
   void CreateImageOutputTensor(const NetDef &net_def);
 
@@ -48,6 +50,8 @@ class Workspace {
 
   PreallocatedPooledAllocator preallocated_allocator_;
 
+  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
+
   DISABLE_COPY_AND_ASSIGN(Workspace);
 };
 
diff --git a/mace/kernels/arm/conv_2d.cc b/mace/kernels/arm/conv_2d.cc
index 7fc16cda27b4c3c93d491aa9caf6372247df6e96..04b79abc46922ebd258fb091fab98b7db088c8c4 100644
--- a/mace/kernels/arm/conv_2d.cc
+++ b/mace/kernels/arm/conv_2d.cc
@@ -154,17 +154,28 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
   int pad_left = paddings[1] >> 1;
   int pad_right = paddings[1] - pad_left;
 
-  std::function<void(const float *input, float *output)> conv_func;
-
   auto input_data = input->data<float>();
   auto filter_data = filter->data<float>();
   auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
   auto output_data = output->mutable_data<float>();
 
-  if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1
-    && stride_w == 1
-    && dilation_h == 1 && dilation_w == 1
-    && input_channels >= 8 && channels >= 8) {
+  std::function<void(const float *input, float *output)> conv_func;
+
+  bool use_winograd = USE_WINOGRAD && filter_h == 3 && filter_w == 3
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
+    && input_channels >= 8 && channels >= 8;
+  bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
+  bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
+    && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
+  bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
+    && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
+
+  std::vector<index_t> transformed_input_shape;
+  std::vector<index_t> transformed_output_shape;
+  std::vector<index_t> transformed_filter_shape;
+
+  if (use_winograd) {
     extra_output_height = RoundUp<index_t>(height, WINOGRAD_OUT_TILE_SIZE);
     extra_input_height = std::max(padded_input_height, extra_output_height + 2);
     extra_output_width = RoundUp<index_t>(width, WINOGRAD_OUT_TILE_SIZE);
@@ -181,12 +192,90 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
     index_t tile_count = tile_height_count * tile_width_count;
     index_t in_tile_area =
       (WINOGRAD_OUT_TILE_SIZE + 2) * (WINOGRAD_OUT_TILE_SIZE + 2);
-    transformed_input_.Resize({in_tile_area, batch, input_channels,
-                               tile_count});
-    transformed_filter_.Resize({in_tile_area, channels, input_channels});
-    transformed_output_.Resize({in_tile_area, batch, channels, tile_count});
 
-    conv_func = [=](const float *pad_input, float *pad_output) {
+    transformed_input_shape.insert(transformed_input_shape.end(),
+                                   {in_tile_area, batch, input_channels,
+                                    tile_count});
+    transformed_output_shape.insert(transformed_output_shape.end(),
+                                    {in_tile_area, batch, channels,
+                                     tile_count});
+    transformed_filter_shape.insert(transformed_filter_shape.end(),
+                                    {in_tile_area, channels, input_channels});
+  } else if (use_neon_3x3_s1) {
+    extra_output_height = RoundUp<index_t>(height, 2);
+    extra_input_height = std::max(padded_input_height, extra_output_height + 2);
+    extra_output_width = RoundUp<index_t>(width, 4);
+    extra_input_width = std::max(padded_input_width, extra_output_width + 2);
+    if (extra_input_height != padded_input_height) {
+      pad_bottom += (extra_input_height - padded_input_height);
+    }
+    if (extra_input_width != padded_input_width) {
+      pad_right += (extra_input_width - padded_input_width);
+    }
+  } else if (use_neon_3x3_s2) {
+    extra_output_height = height;
+    extra_input_height =
+      std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
+    extra_output_width = RoundUp<index_t>(width, 4);
+    extra_input_width =
+      std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
+    if (extra_input_height != padded_input_height) {
+      pad_bottom += (extra_input_height - padded_input_height);
+    }
+    if (extra_input_width != padded_input_width) {
+      pad_right += (extra_input_width - padded_input_width);
+    }
+  }
+
+  // decide scratch size before allocate it
+  index_t total_scratch_size = 0;
+  index_t transformed_input_size = 0;
+  index_t transformed_output_size = 0;
+  index_t padded_input_size = 0;
+  index_t padded_output_size = 0;
+  if (use_winograd) {
+    // Seed with index_t: a plain `1` would make std::accumulate carry the
+    // running product in an int instead of index_t.
+    transformed_input_size =
+      std::accumulate(transformed_input_shape.begin(),
+                      transformed_input_shape.end(), static_cast<index_t>(1),
+                      std::multiplies<index_t>()) * sizeof(float);
+    transformed_output_size =
+      std::accumulate(transformed_output_shape.begin(),
+                      transformed_output_shape.end(), static_cast<index_t>(1),
+                      std::multiplies<index_t>()) * sizeof(float);
+    total_scratch_size += transformed_input_size + transformed_output_size;
+  }
+  if (extra_input_height != input_height || extra_input_width != input_width) {
+    padded_input_size =
+      batch * input_channels * (input_height + pad_top + pad_bottom)
+        * (input_width + pad_left + pad_right) * sizeof(float);
+    total_scratch_size += padded_input_size;
+  }
+  if (extra_output_height != height || extra_output_width != width) {
+    padded_output_size =
+      batch * channels * extra_output_height * extra_output_width
+        * sizeof(float);
+    total_scratch_size += padded_output_size;
+  }
+  // Init scratch buffer
+  scratch_->Rewind();
+  scratch_->GrowSize(total_scratch_size);
+  Tensor transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
+  Tensor
+    transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
+  Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
+  Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
+
+  // decide which convolution function to call
+  if (use_winograd) {
+    transformed_input.Resize(transformed_input_shape);
+    transformed_output.Resize(transformed_output_shape);
+    if (!is_filter_transformed_) {
+      transformed_filter_.Resize(transformed_filter_shape);
+    }
+
+    conv_func = [&](const float *pad_input, float *pad_output) {
       WinoGradConv3x3s1(pad_input,
                         filter_data,
                         batch,
@@ -195,26 +284,14 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                         input_channels,
                         channels,
                         WINOGRAD_OUT_TILE_SIZE,
-                        transformed_input_.mutable_data<float>(),
+                        transformed_input.mutable_data<float>(),
                         transformed_filter_.mutable_data<float>(),
-                        transformed_output_.mutable_data<float>(),
+                        transformed_output.mutable_data<float>(),
                         is_filter_transformed_,
                         pad_output);
       is_filter_transformed_ = true;
     };
-  } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
-    && dilation_h == 1 && dilation_w == 1) {
-    extra_output_height = RoundUp<index_t>(height, 2);
-    extra_input_height = std::max(padded_input_height, extra_output_height + 2);
-    extra_output_width = RoundUp<index_t>(width, 4);
-    extra_input_width = std::max(padded_input_width, extra_output_width + 2);
-    if (extra_input_height != padded_input_height) {
-      pad_bottom += (extra_input_height - padded_input_height);
-    }
-    if (extra_input_width != padded_input_width) {
-      pad_right += (extra_input_width - padded_input_width);
-    }
-
+  } else if (use_neon_3x3_s1) {
     conv_func = [=](const float *pad_input, float *pad_output) {
       Conv2dNeonK3x3S1(pad_input,
                        filter_data,
@@ -227,21 +304,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                        channels,
                        pad_output);
     };
-  } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
-    && dilation_h == 1 && dilation_w == 1) {
-    extra_output_height = height;
-    extra_input_height =
-      std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
-    extra_output_width = RoundUp<index_t>(width, 4);
-    extra_input_width =
-      std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
-    if (extra_input_height != padded_input_height) {
-      pad_bottom += (extra_input_height - padded_input_height);
-    }
-    if (extra_input_width != padded_input_width) {
-      pad_right += (extra_input_width - padded_input_width);
-    }
-
+  } else if (use_neon_3x3_s2) {
     conv_func = [=](const float *pad_input, float *pad_output) {
       Conv2dNeonK3x3S2(pad_input,
                        filter_data,
@@ -254,8 +317,7 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                        channels,
                        pad_output);
     };
-  } else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1
-    && dilation_h == 1 && dilation_w == 1) {
+  } else if (use_neon_1x1_s1) {
     conv_func = [=](const float *pad_input, float *pad_output) {
       Conv2dNeonK1x1S1(input_data,
                        filter_data,
@@ -287,28 +349,27 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
     };
   }
 
+  // pad input and output
   const Tensor *pad_input_ptr = input;
-  // Keep this alive during kernel execution
   if (extra_input_height != input_height || extra_input_width != input_width) {
+    padded_input.Clear();
     ConstructNCHWInputWithSpecificPadding(input,
                                           pad_top,
                                           pad_bottom,
                                           pad_left,
                                           pad_right,
-                                          &padded_input_);
-    pad_input_ptr = &padded_input_;
+                                          &padded_input);
+    pad_input_ptr = &padded_input;
   }
-  const float *pad_input_data = pad_input_ptr->data<float>();
 
   Tensor *pad_output_ptr = output;
-  // Keep this alive during kernel execution
   if (extra_output_height != height || extra_output_width != width) {
-    std::vector<index_t> extra_output_shape
-      {batch, channels, extra_output_height, extra_output_width};
-    padded_output_.Resize(extra_output_shape);
-    padded_output_.Clear();
-    pad_output_ptr = &padded_output_;
+    padded_output.Resize({batch, channels, extra_output_height,
+                           extra_output_width});
+    padded_output.Clear();
+    pad_output_ptr = &padded_output;
   }
+  const float *pad_input_data = pad_input_ptr->data<float>();
   float *pad_output_data = pad_output_ptr->mutable_data<float>();
 
   conv_func(pad_input_data, pad_output_data);
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 6833b9b66449f70f38ef56779722fbd67b06eaa7..9a8f7c758679616218fcb5b006e4fa8226da0263 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -297,7 +297,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                 const std::vector<int> &paddings,
                 const int *dilations,
                 const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
       : Conv2dFunctorBase(strides,
                           padding_type,
                           paddings,
@@ -422,14 +423,16 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
                 const std::vector<int> &paddings,
                 const int *dilations,
                 const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
     : Conv2dFunctorBase(strides,
                         padding_type,
                         paddings,
                         dilations,
                         activation,
                         relux_max_limit),
-      is_filter_transformed_(false) {}
+      is_filter_transformed_(false),
+      scratch_(scratch) {}
 
   void operator()(const Tensor *input,
                   const Tensor *filter,
@@ -437,13 +440,9 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
 
-  // TODO(liyin): share tmp buffers among ops
-  Tensor padded_input_;
-  Tensor padded_output_;
-  Tensor transformed_input_;
   Tensor transformed_filter_;
-  Tensor transformed_output_;
   bool is_filter_transformed_;
+  ScratchBuffer *scratch_;
 };
 
 template <typename T>
@@ -453,7 +452,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                 const std::vector<int> &paddings,
                 const int *dilations,
                 const ActivationType activation,
-                const float relux_max_limit)
+                const float relux_max_limit,
+                ScratchBuffer *scratch)
       : Conv2dFunctorBase(strides,
                           padding_type,
                           paddings,
diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h
index 528f1e1f67c32b037262e0019847eb25fdc62a4c..33758808dde5a145c7e575c29268716f766ac5ed 100644
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -24,7 +24,8 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
                  this->paddings_,
                  this->dilations_.data(),
                  kernels::ActivationType::NOOP,
-                 0.0f) {}
+                 0.0f,
+                 ws->GetScratchBuffer(D)) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h
index db9c6e3a48920c1346b12ac2943d42f940fb6c8f..29df5913f680ed712596b99b58d6455339144df8 100644
--- a/mace/ops/fused_conv_2d.h
+++ b/mace/ops/fused_conv_2d.h
@@ -27,7 +27,8 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
                  kernels::StringToActivationType(
                      OperatorBase::GetSingleArgument<std::string>("activation",
                                                                   "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 ws->GetScratchBuffer(D)) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);