diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index dd750a389b348223392b49fe06a39c8dda9005b9..1e3601a4a5f2f38dbf0bfa6d5acc8dbd21c2fa4d 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -152,6 +152,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   T relux_max_limit_;
   cl::Kernel kernel_;
   std::string tuning_key_prefix_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 6e9ba2d4111a1cb9e67a7adb4cc3131d250e352c..3a5a45df4dd3476e4ab7a2f58bae658b461e206a 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -91,6 +91,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 1e6a12bff0f3f0878a5c017f81c21aa4a79c40a0..57f0f4d66a0107ef6e907e4c5579bed0feef2be3 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -156,6 +156,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namepsace kernels
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index 28adcf8deb2034a0fb7b9812cb6975265e09f3fa..d8e411efa8a70ebd2ca850f6ac91fa1bd2198fe6 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -62,6 +62,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namepsace kernels
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index e627121d176a7f36be3f33c9cdc4085048d55f2a..da2ce094a141984c49ad21a208bdaafb8a97311e 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -55,6 +55,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   const int groups_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 021b0f6176f7a8c39bc8525215802dee5ea08f24..6870594641baaab5aae866c033107a7b6df0507c 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -83,6 +83,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                   Tensor *output,
                   StatsFuture *future);
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namepsace kernels
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index a4a24eedae4b7c92f2ff1b1841c78b6f99d566bc..b107d33229c0b77be24e0702db9cf0585801b06f 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index c0a1719f86f70559a6ab6bd97072b31e21e2ead5..dc6b737077ab16b093d3993c5f414430fa17d186 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -439,6 +439,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 263dfb808a4810d4d1a1ca49c4728d813cda1552..1aa883d568ff493fa092e23637b2b6accf1d8a38 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -94,6 +94,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index b8a740215f3c3d23a85cc4d55184ab0b65e4c13e..5c527d4593e02bead0e55998674690a4c5864e50 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -90,6 +90,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
   cl::Kernel kernel_;
   std::vector<uint32_t> gws_;
   std::vector<uint32_t> lws_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 180e38cafad10fc07eb9d51ac7cbde501ef94982..9792cae56889275053362ed6e7d230ff744fd4ac 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -58,6 +58,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+  }
+
+  if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
@@ -66,6 +69,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
     kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index a6863a59a9abfa2fa56ac5555f820d6101ca13fb..9f9571d0637026330e1d2f5ad2dea31f116eeefc 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -32,15 +32,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     MACE_CHECK(channels == input_tensors[i]->dim(3));
   }
 
-  std::vector<index_t> output_shape = input_tensors[0]->shape();
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
-  output_tensor->ResizeImage(output_shape, output_image_shape);
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t width_pixels = channel_blocks * width;
-  const index_t batch_height_pixels = batch * height;
-
   if (kernel_.get() == nullptr) {
     if (input_tensors.size() > 4) {
       MACE_NOT_IMPLEMENTED;
@@ -55,11 +46,26 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
 
+  }
+
+  std::vector<index_t> output_shape = input_tensors[0]->shape();
+
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_pixels = channel_blocks * width;
+  const index_t batch_height_pixels = batch * height;
+
+  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    output_tensor->ResizeImage(output_shape, output_image_shape);
+
     uint32_t idx = 0;
     for (auto input : input_tensors) {
       kernel_.setArg(idx++, *(input->opencl_image()));
     }
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
+
+    input_shape_ = input_tensors[0]->shape();
   }
 
   const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 8f14f34bedb97c7d9a228eebc0128448db1d4023..d9dfb8254d0bea67c0eb78c673579e5f57301fd5 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
 
     kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
-
+  }
+  if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
@@ -73,6 +74,8 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, relux_max_limit_);
+
+    input_shape_ = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index 613b633bc8ca9a366d88a74dfcfc669b7f1b1ce1..d2490000b71a034a8cbe19f9ada9e5f5e1ed08fa 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -33,10 +33,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(bias->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
+    input_shape_ = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 3325ff24f1eff56a1dea3aec9323f490d40aac3c..a88b3b059cfacd31249f09dffd72f8ddee230c00 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -13,9 +13,10 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
-                                       Tensor *output,
-                                       StatsFuture *future) {
+void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
+    const Tensor *input,
+    Tensor *output,
+    StatsFuture *future) {
   output->ResizeLike(input);
 
   const index_t batch = input->dim(0);
@@ -39,12 +40,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *inpu
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
-
+  }
+  if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, groups_);
     kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
     kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
   }
   const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
                            static_cast<uint32_t>(width),
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 119ec7cd61a99f915ff7ec29443839ea2923d3a4..e99ab0605d02714b6851cb3cb8cf96f865ae5e1c 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -15,6 +15,7 @@ static void Concat2(cl::Kernel *kernel,
                     const Tensor *input0,
                     const Tensor *input1,
                     const DataType dt,
+                    std::vector<index_t> *prev_input_shape,
                     Tensor *output,
                     StatsFuture *future) {
   const index_t batch = output->dim(0);
@@ -41,6 +42,8 @@ static void Concat2(cl::Kernel *kernel,
     }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
     uint32_t idx = 0;
     kernel->setArg(idx++,
                    *(static_cast<const cl::Image2D *>(input0->opencl_image())));
@@ -49,6 +52,7 @@ static void Concat2(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
     kernel->setArg(idx++,
                    *(static_cast<cl::Image2D *>(output->opencl_image())));
+    *prev_input_shape = input0->shape();
   }
 
   const uint32_t gws[3] = {
@@ -142,7 +146,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
   switch (inputs_count) {
     case 2:
       Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              output, future);
+              &input_shape_, output, future);
       break;
     default:
       if (divisible_four) {
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 3ed87e7c6b85eb5d7f83ecdd3d2aca90b10a9ed9..46683fd1709eda83be6826d4e2519d28bf4956b1 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -18,6 +18,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              const ActivationType activation,
                              const float relux_max_limit,
                              const DataType dt,
+                             std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future);
 
@@ -31,6 +32,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              const ActivationType activation,
                              const float relux_max_limit,
                              const DataType dt,
+                             std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future);
 
@@ -44,6 +46,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          const ActivationType activation,
                          const float relux_max_limit,
                          const DataType dt,
+                         std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future);
 
@@ -57,8 +60,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
       const Tensor *bias, const int stride, const int *padding,
       const int *dilations, const ActivationType activation,
-      const float relux_max_limit, const DataType dt, Tensor *output,
-      StatsFuture *future);
+      const float relux_max_limit, const DataType dt,
+      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5] = {
       Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -97,11 +100,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     auto conv2d_func = selector[kernel_h - 1];
     conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
-                DataTypeToEnum<T>::value, output, future);
+                DataTypeToEnum<T>::value, &input_shape_, output, future);
   } else {
     Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
-                 DataTypeToEnum<T>::value, output, future);
+                 DataTypeToEnum<T>::value, &input_shape_, output, future);
   }
 }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 41eaad5633917ece4126506aea31f2bad08afd98..4109a97932163919e436a1847549c44ef8d60e31 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -20,6 +20,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              const ActivationType activation,
                              const float relux_max_limit,
                              const DataType dt,
+                             std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future) {
   const index_t batch = output->dim(0);
@@ -68,6 +69,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     auto runtime = OpenCLRuntime::Global();
     *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
@@ -83,6 +86,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int>(height));
     kernel->setArg(idx++, static_cast<int>(width));
     kernel->setArg(idx++, stride);
+
+    *prev_input_shape = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index df2672c95a1aa0ac9ab421df1a2e91de039e8f5f..ba047cdfad9e6280020d98d92170ea3c8820aa9d 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,6 +22,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              const ActivationType activation,
                              const float relux_max_limit,
                              const DataType dt,
+                             std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future) {
   const index_t batch = output->dim(0);
@@ -62,7 +63,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
     auto runtime = OpenCLRuntime::Global();
     *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
-
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
@@ -81,6 +83,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
     kernel->setArg(idx++, dilations[1]);
+
+    *prev_input_shape = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index c317aa8c635ca817a30b01213bf5aa7f8355c44e..fd48605f2cfee1827a559af03a799120b9561e52 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -22,6 +22,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          const ActivationType activation,
                          const float relux_max_limit,
                          const DataType dt,
+                         std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future) {
   const index_t batch = output->dim(0);
@@ -62,7 +63,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
     auto runtime = OpenCLRuntime::Global();
     *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
-
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
@@ -83,6 +85,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
     kernel->setArg(idx++, dilations[1]);
+
+    *prev_input_shape = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 1b99188b3acb34fb5c87c5f30a58de6c5b400f15..37b587dcef2caddae7ae5d73254a8c87dbf9f5a1 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -21,6 +21,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                      const ActivationType activation,
                      const float relux_max_limit,
                      const DataType dt,
+                     std::vector<index_t> *prev_input_shape,
                      Tensor *output,
                      StatsFuture *future) {
   const index_t batch = output->dim(0);
@@ -35,17 +36,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
   if (kernel->get() == nullptr) {
-    const index_t input_batch = input->dim(0);
-    const index_t input_height = input->dim(1);
-    const index_t input_width = input->dim(2);
-
-    const index_t filter_height = filter->dim(0);
-    const index_t filter_width = filter->dim(1);
-    MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
-    MACE_CHECK(multiplier * input_channels == channels);
-    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
-               input_channels);
-
     auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
@@ -80,6 +70,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
 
     *kernel =
         runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
+    const index_t input_batch = input->dim(0);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    const index_t filter_height = filter->dim(0);
+    const index_t filter_width = filter->dim(1);
+    MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
+    MACE_CHECK(multiplier * input_channels == channels);
+    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
+               input_channels);
 
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
@@ -102,6 +104,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
       kernel->setArg(idx++, static_cast<short>(dilations[0]));
       kernel->setArg(idx++, static_cast<short>(dilations[1]));
     }
+    *prev_input_shape = input->shape();
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -120,9 +123,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *bias,
     Tensor *output,
     StatsFuture *future) {
-  typedef void (*Conv2dOpenclFunction)(const Tensor *input,
-                                       const Tensor *filter, const Tensor *bias,
-                                       Tensor *output, StatsFuture *future);
+
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
   if (strides_[0] != strides_[1]) {
@@ -163,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
 
   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                   dilations_, activation_, relux_max_limit_,
-                  DataTypeToEnum<T>::value, output, future);
+                  DataTypeToEnum<T>::value, &input_shape_, output, future);
 }
 
 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index 82312c75338d7ba8f79da37194a783beb959da45..dde05b29e2b2a6c8264ced78dea7d8fb3a37ef65 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -36,6 +36,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(input_shape_, input0->shape())) {
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input0->opencl_image()));
     kernel_.setArg(idx++, *(input1->opencl_image()));
@@ -44,6 +46,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
       kernel_.setArg(idx++, coeff_[1]);
     }
     kernel_.setArg(idx++, *(output->opencl_image()));
+    input_shape_ = input0->shape();
   }
 
   const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index abcbfe526349ad64b758cc2897e80cabca4c6a61..d5db519025514be82b5101de3c25c74c444c9b59 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -13,6 +13,7 @@ void FCWXKernel(cl::Kernel *kernel,
                 const Tensor *input,
                 const Tensor *weight,
                 const Tensor *bias,
+                std::vector<index_t> *prev_input_shape,
                 Tensor *output,
                 const ActivationType activation,
                 std::vector<uint32_t> &gws,
@@ -67,6 +68,11 @@ void FCWXKernel(cl::Kernel *kernel,
     const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
     lws = {gws[0], gws[1], inter_local_blks};
 
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
+    const index_t batch = output->dim(0);
+    const index_t output_blocks = RoundUpDiv4(output->dim(3));
+
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(weight->opencl_image()));
@@ -80,6 +86,10 @@ void FCWXKernel(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
     kernel->setArg(idx++, static_cast<int>(output_blocks));
     kernel->setArg(idx++, relux_max_limit);
+
+    gws[2] = static_cast<uint32_t>(batch * output_blocks);
+
+    *prev_input_shape = input->shape();
   }
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
@@ -103,6 +113,7 @@ void FCWTXKernel(cl::Kernel *kernel,
                  const Tensor *input,
                  const Tensor *weight,
                  const Tensor *bias,
+                 std::vector<index_t> *prev_input_shape,
                  Tensor *output,
                  const ActivationType activation,
                  std::vector<uint32_t> &gws,
@@ -141,6 +152,9 @@ void FCWTXKernel(cl::Kernel *kernel,
     *kernel =
         runtime->BuildKernel("fully_connected", kernel_name, built_options);
 
+    lws = {16, 64, 1};
+  }
+  if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(weight->opencl_image()));
@@ -155,14 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
     kernel->setArg(idx++, relux_max_limit);
 
     const index_t batch = output->dim(0);
-    const index_t output_size = output->dim(3);
-
-    const index_t output_blocks = RoundUpDiv4(output_size);
+    const index_t output_blocks = RoundUpDiv4(output->dim(3));
 
     gws = {
         static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
     };
-    lws = {16, 64, 1};
+
+    *prev_input_shape = input->shape();
   }
 
   std::stringstream ss;
@@ -185,11 +198,11 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
   output->ResizeImage(output_shape, output_image_shape);
 
   if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
-    FCWTXKernel<T>(&kernel_, input, weight, bias, output,
+    FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
                    activation_, gws_, lws_, relux_max_limit_, future);
   } else {
-    FCWXKernel<T>(&kernel_, input, weight, bias, output,
-                     activation_, gws_, lws_, relux_max_limit_, future);
+    FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
+                  activation_, gws_, lws_, relux_max_limit_, future);
   }
 };
 
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 6513415a02a00574dbfc1b22c1c909e94e6bfd49..56bf295ee2dec5451f9d142ccd0e63441b37e545 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -71,6 +71,13 @@ inline bool LimitKernelTime() {
   return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
 }
 
+// Returns true iff both vectors have the same size and equal elements at
+// every index; vector's operator== performs exactly that check.
+template <typename T>
+bool IsVecEqual(const std::vector<T> &input0, const std::vector<T> &input1) {
+  return input0 == input1;
+}
+
 namespace {
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index d453c29308133aa81f5b19422b020d06dbba49fc..4b61edb271df814b4bdcea251d28b2ca03cf3be4 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -36,17 +36,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
-
-    uint32_t idx = 0;
-    kernel_.setArg(idx++, *(A->opencl_image()));
-    kernel_.setArg(idx++, *(B->opencl_image()));
-    kernel_.setArg(idx++, *(C->opencl_image()));
-    kernel_.setArg(idx++, static_cast<int>(height));
-    kernel_.setArg(idx++, static_cast<int>(width));
-    kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
-    kernel_.setArg(idx++, static_cast<int>(height_blocks));
-    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
   }
+  uint32_t idx = 0;
+  kernel_.setArg(idx++, *(A->opencl_image()));
+  kernel_.setArg(idx++, *(B->opencl_image()));
+  kernel_.setArg(idx++, *(C->opencl_image()));
+  kernel_.setArg(idx++, static_cast<int>(height));
+  kernel_.setArg(idx++, static_cast<int>(width));
+  kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
+  kernel_.setArg(idx++, static_cast<int>(height_blocks));
+  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
 
   const uint32_t gws[2] = {
       static_cast<uint32_t>(width_blocks),
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index d9256776d1f094a505de40a92bf79a1553cd1272..d8a6d675a8da5749d3a2cf02360e3ec619a809ff 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -17,31 +17,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        StatsFuture *future) {
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
       << "Pooling opencl kernel not support dilation yet";
-  std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
-                                       input->dim(3)};
-
-  std::vector<int> paddings(2);
-  if (paddings_.empty()) {
-    kernels::CalcNHWCPaddingAndOutputSize(
-        input->shape().data(), filter_shape.data(), dilations_, strides_,
-        padding_type_, output_shape.data(), paddings.data());
-  } else {
-    paddings = paddings_;
-    CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
-                   dilations_, strides_, RoundType::CEIL, output_shape.data());
-  }
-
-  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
-  output->ResizeImage(output_shape, output_image_shape);
-
-  index_t batch = output->dim(0);
-  index_t out_height = output->dim(1);
-  index_t out_width = output->dim(2);
-  index_t channels = output->dim(3);
-
-  index_t channel_blocks = (channels + 3) / 4;
 
   if (kernel_.get() == nullptr) {
     const DataType dt = DataTypeToEnum<T>::value;
@@ -62,18 +37,49 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(input_shape_, input->shape())) {
+    std::vector<index_t> output_shape(4);
+    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
+                                         input->dim(3)};
+
+    std::vector<int> paddings(2);
+    if (paddings_.empty()) {
+      kernels::CalcNHWCPaddingAndOutputSize(
+          input->shape().data(), filter_shape.data(), dilations_, strides_,
+          padding_type_, output_shape.data(), paddings.data());
+    } else {
+      paddings = paddings_;
+      CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
+                     dilations_, strides_, RoundType::CEIL, output_shape.data());
+    }
+
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    output->ResizeImage(output_shape, output_image_shape);
+
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(output->dim(1)));
     kernel_.setArg(idx++, paddings[0] / 2);
     kernel_.setArg(idx++, paddings[1] / 2);
     kernel_.setArg(idx++, strides_[0]);
     kernel_.setArg(idx++, kernels_[0]);
     kernel_.setArg(idx++, *(output->opencl_image()));
+
+    input_shape_ = input->shape();
   }
 
+  index_t batch = output->dim(0);
+  index_t out_height = output->dim(1);
+  index_t out_width = output->dim(2);
+  index_t channels = output->dim(3);
+
+  index_t channel_blocks = (channels + 3) / 4;
+
+
   const uint32_t gws[3] = {
       static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
       static_cast<uint32_t>(batch * out_height),
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 470a335deb264610638cdbfda11f8bffeb974062..a3bb2ee1951f433ae41a4c2dc41367fe77d1e497 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -25,6 +25,18 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t out_width = out_width_;
 
   if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
+    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    kernel_ =
+        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+
+  }
+  if (!IsVecEqual(input_shape_, input->shape())) {
     MACE_CHECK(out_height > 0 && out_width > 0);
     std::vector<index_t> output_shape{batch, out_height, out_width, channels};
 
@@ -38,16 +50,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     float width_scale =
         CalculateResizeScale(in_width, out_width, align_corners_);
 
-    auto runtime = OpenCLRuntime::Global();
-    std::set<std::string> built_options;
-    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
-    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
-    auto dt = DataTypeToEnum<T>::value;
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    kernel_ =
-        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
-
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
@@ -56,6 +58,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(in_height));
     kernel_.setArg(idx++, static_cast<int32_t>(in_width));
     kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+
+    input_shape_ = input->shape();
+
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 25e1c9e4aa98a354524df81fa92c0c4c21bd5710..4aabe9017f06073ddffe7e04871b62b76da15dc6 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -34,11 +34,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(logits->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(channels));
     kernel_.setArg(idx++, remain_channels);
     kernel_.setArg(idx++, *(output->opencl_image()));
+    input_shape_ = logits->shape();
   }
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 0cecb0a7809d8cc44535af095c53b03959dda28c..91f5564d520de9e11ad832231060f37ea3f64191 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -43,6 +43,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
 
+  }
+  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
     if (b2s_) {
       kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
@@ -59,6 +61,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
     kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
     kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
+
+    space_shape_ = space_tensor->shape();
   }
 
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index ee7d5d12e84aec4a878ca4575c4565271eb33e7c..c07ccc9944786e8cbcd8dde4aa6ada7794542019 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -14,6 +14,21 @@ namespace kernels {
 template <typename T>
 void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
+
+  if (kernel_.get() == nullptr) {
+    std::string obfuscated_kernel_name =
+        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
+    std::set<std::string> built_options;
+    built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    auto runtime = OpenCLRuntime::Global();
+    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
+                                   built_options);
+
+  }
   std::vector<index_t> output_shape(4);
   std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
   std::vector<int> paddings(2);
@@ -27,29 +42,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                    paddings_.data(), dilations_.data(), strides_.data(),
                    RoundType::FLOOR, output_shape.data());
   }
-
   const index_t round_h = (output_shape[1] + 1) / 2;
   const index_t round_w = (output_shape[2] + 1) / 2;
   const index_t out_width = input_tensor->dim(0) * round_h * round_w;
 
-  if (kernel_.get() == nullptr) {
+  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     output_shape = {16, input_tensor->dim(3), out_width, 1};
     std::vector<size_t> image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
     output_tensor->ResizeImage(output_shape, image_shape);
 
-    std::string obfuscated_kernel_name =
-        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
-    std::set<std::string> built_options;
-    built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
-    built_options.emplace("-DDATA_TYPE=" +
-                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" +
-                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    auto runtime = OpenCLRuntime::Global();
-    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
-                                   built_options);
-
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input_tensor->opencl_image()));
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
@@ -60,6 +62,8 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
+
+    input_shape_ = input_tensor->shape();
   }
 
   const uint32_t gws[2] = {
@@ -79,11 +83,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *bias,
     Tensor *output_tensor,
     StatsFuture *future) {
-  std::vector<index_t> output_shape = {batch_, height_, width_,
-                                       input_tensor->dim(1)};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
-  output_tensor->ResizeImage(output_shape, image_shape);
 
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
@@ -121,6 +120,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     auto runtime = OpenCLRuntime::Global();
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
+  }
+  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
+    std::vector<index_t> output_shape = {batch_, height_, width_,
+                                         input_tensor->dim(1)};
+    std::vector<size_t> image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+    output_tensor->ResizeImage(output_shape, image_shape);
 
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
@@ -139,6 +145,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
     kernel_.setArg(idx++, relux_max_limit_);
+
+    input_shape_ = input_tensor->shape();
   }
 
   const uint32_t gws[2] = {
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 6bd5d94e1684d5228dcb1a468d05220f904deaae..bc9892e5864d420f9505de9462df5a17eedb4241 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -182,6 +182,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index bdd94192b8ae4f91a08c2088a31a7188b22e765e..52c1da102926870d6e65dfa52ee68c7ff5a43f76 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -172,6 +172,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index 7ff375d31be90228628dc91b16326eab4079ddd6..d5bc5717d8cfdbfc391de634f08d8fd427e5ca9d 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -57,6 +57,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namepsace kernels
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 402bf97cb956a8e7dfcb75645cc1fe395282f8e9..ef7467b57acd0fc1d3563148ec53dd1ea4869a9f 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -54,6 +54,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> space_shape_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index 464a59ced093d123c2853c37bedeea8879cb68c0..f3b7f7d640328860f5ffdc5dc6b065e78e324896 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -49,6 +49,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 struct WinogradInverseTransformFunctorBase {
@@ -105,6 +106,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  std::vector<index_t> input_shape_;
 };
 
 }  // namespace kernels
diff --git a/tools/wino_conv.py b/tools/wino_conv.py
index a8cdf3d8e88586b10dd3256de3670978c2a2e5f2..0dc3f8d611e32c6cf931bd8ec9228cb8a25408ab 100644
--- a/tools/wino_conv.py
+++ b/tools/wino_conv.py
@@ -2,22 +2,89 @@ import numpy as np
 import math
 import tensorflow as tf
 
-A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
-A = np.transpose(A_T)
-B_T = np.array([
+A_T = {}
+A = {}
+B_T = {}
+B = {}
+G = {}
+G_T = {}
+# f(2, 3)
+A_T[4] = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
+A[4] = np.transpose(A_T[4])
+B_T[4] = np.array([
   [1, 0, -1, 0],
   [0, 1, 1, 0],
   [0, -1, 1, 0],
   [0, 1, 0, -1]
 ]).astype(np.float32)
-B = np.transpose(B_T)
-G = np.array([
+B[4] = np.transpose(B_T[4])
+G[4] = np.array([
   [1, 0, 0],
   [0.5, 0.5, 0.5],
   [0.5, -0.5, 0.5],
   [0, 0, 1],
 ]).astype(np.float32)
-G_T = np.transpose(G)
+G_T[4] = np.transpose(G[4])
+
+# f(4, 3)
+A_T[6] = np.array([
+  [1, 1,  1, 1,  1, 0],
+  [0, 1, -1, 2, -2, 0],
+  [0, 1,  1, 4,  4, 0],
+  [0, 1, -1, 8, -8, 1],
+]).astype(np.float32)
+A[6] = np.transpose(A_T[6])
+B_T[6] = np.array([
+  [4,  0, -5,  0, 1, 0],
+  [0, -4, -4,  1, 1, 0],
+  [0,  4, -4, -1, 1, 0],
+  [0, -2, -1,  2, 1, 0],
+  [0,  2, -1, -2, 1, 0],
+  [0,  4,  0, -5, 0, 1],
+]).astype(np.float32)
+B[6] = np.transpose(B_T[6])
+G[6] = np.array([
+  [1/4.0 ,   0    ,  0    ],
+  [-1/6.0, -1/6.0 , -1/6.0],
+  [-1/6.0,  1/6.0 , -1/6.0],
+  [1/24.0, 1/12.0 , 1/6.0 ],
+  [1/24.0, -1/12.0, 1/6.0 ],
+  [ 0    ,  0     ,  1    ],
+]).astype(np.float32)
+G_T[6] = np.transpose(G[6])
+
+# f(6, 3)
+A_T[8] = np.array([
+  [1, 1, 1 , 1 ,  1 ,  1  ,   1  , 0],
+  [0, 1, -1, 2 , -2 , 1/2. , -1/2. , 0],
+  [0, 1, 1 , 4 ,  4 , 1/4. ,  1/4. , 0],
+  [0, 1, -1, 8 , -8 , 1/8. , -1/8. , 0],
+  [0, 1, 1 , 16, 16 , 1/16., 1/16. , 0],
+  [0, 1, -1, 32, -32, 1/32., -1/32., 1],
+]).astype(np.float32)
+A[8] = np.transpose(A_T[8])
+B_T[8] = np.array([
+  [1,  0  , -21/4.,   0  ,  21/4.,   0  , -1, 0],
+  [0,  1  ,   1  , -17/4., -17/4.,   1  , 1 , 0],
+  [0,  -1 ,   1  , 17/4. , -17/4.,  -1  , 1 , 0],
+  [0, 1/2. ,  1/4. , -5/2. , -5/4.,   2  , 1 , 0],
+  [0, -1/2.,  1/4. ,  5/2. , -5/4.,  -2  , 1 , 0],
+  [0,  2  ,   4  , -5/2. ,  -5  ,  1/2. , 1 , 0],
+  [0,  -2 ,   4  ,  5/2. ,  -5  , -1/2. , 1 , 0],
+  [0,  -1 ,   0  , 21/4. ,   0  , -21/4., 0 , 1],
+]).astype(np.float32)
+B[8] = np.transpose(B_T[8])
+G[8] = np.array([
+ [ 1    ,   0    ,  0  ],
+ [-2/9. , -2/9.  , -2/9.],
+ [-2/9. ,  2/9.  , -2/9.],
+ [1/90. , 1/45.  , 2/45.],
+ [1/90. , -1/45. , 2/45.],
+ [32/45., 16/45. , 8/45.],
+ [32/45., -16/45., 8/45.],
+ [ 0    ,  0     ,  1   ],
+]).astype(np.float32)
+G_T[8] = np.transpose(G[8])
 
 
 def output_shape(input_shape, filter_shape):
@@ -29,55 +96,54 @@ def output_shape(input_shape, filter_shape):
   return out_shape
 
 
-def winog_conv(input, filter):
-  m = 2
-  r = 3
+def winograd_conv(m, r, input, filter):
   alpha = m + r - 1
+  print 'Winograd(m = %d, r = %d, tile size=%d' % (m, r, alpha)
+  alpha_square = alpha * alpha
   input_shape = input.shape
   filter_shape = filter.shape
   out_shape = output_shape(input_shape, filter_shape)
 
   K = filter_shape[0]
   C = input_shape[1]
-  U = np.zeros((K * 16, C))
+  U = np.zeros((K * alpha_square, C))
 
   for k in range(K):
     for c in range(C):
-      u = np.dot(np.dot(G, filter[k, c, :, :]), G_T)
-      for i in range(4):
-        for j in range(4) :
-          U[(i * 4 + j) * K + k, c] = u[i, j]
+      u = np.dot(np.dot(G[alpha], filter[k, c, :, :]), G_T[alpha])
+      for i in range(alpha):
+        for j in range(alpha) :
+          U[(i * alpha + j) * K + k, c] = u[i, j]
 
   print 'filter out: ', U.shape
-  print U[0, 0]
-  U.astype(np.float32).tofile("filter_out")
 
-  rounded_h = int(math.ceil(out_shape[2] / 2.0))
-  rounded_w = int(math.ceil(out_shape[3] / 2.0))
+  rounded_h = int(math.ceil(out_shape[2] / (m * 1.0)))
+  rounded_w = int(math.ceil(out_shape[3] / (m * 1.0)))
   P = input_shape[0] * rounded_h * rounded_w
-  V = np.zeros((C * 16, P))
+  V = np.zeros((C * alpha_square, P))
   for p in range(P):
     for c in range(C):
       n = p / (rounded_w * rounded_h)
       t = p % (rounded_h * rounded_w)
       h_idx = t / rounded_w
       w_idx = t % rounded_w
-      h_start = h_idx * 2
-      w_start = w_idx * 2
-      h_end = min(h_start+4, input_shape[2])
-      w_end = min(w_start+4, input_shape[3])
-      d = np.zeros((4, 4))
-      d[0:h_end-h_start, 0:w_end-w_start] = input[n, c, h_start:h_end, w_start:w_end]
-      v = np.dot(np.dot(B_T, d), B)
-      for i in range(4):
-        for j in range(4):
-          V[(i*4+j)*C + c, p] = v[i, j]
-
-  tmp = V.reshape(16, C, P, 1)
+      h_start = h_idx * m
+      w_start = w_idx * m
+      h_end = min(h_start+alpha, input_shape[2])
+      w_end = min(w_start+alpha, input_shape[3])
+      d = np.zeros((alpha, alpha))
+      d[0:h_end-h_start, 0:w_end-w_start] = \
+              input[n, c, h_start:h_end, w_start:w_end]
+      v = np.dot(np.dot(B_T[alpha], d), B[alpha])
+      for i in range(alpha):
+        for j in range(alpha):
+          V[(i*alpha+j)*C + c, p] = v[i, j]
+
+  tmp = V.reshape(alpha_square, C, P, 1)
   print 'input out: ', tmp.shape
   tmp.astype(np.float32).tofile("C")
-  M = np.zeros((16 * K, P))
-  for i in range(alpha * alpha):
+  M = np.zeros((alpha_square * K, P))
+  for i in range(alpha_square):
     u = U[i * K : (i+1) * K, :]
     v = V[i * C : (i+1) * C, :]
     M[i * K : (i+1) * K, :] = np.dot(u, v)
@@ -87,17 +153,17 @@ def winog_conv(input, filter):
   res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
   for k in range(K):
     for b in range(P):
-      m = np.zeros((4, 4))
-      for i in range(4):
-        for j in range(4):
-          m[i][j] = M[(i*4+j) * K + k, b]
-      y = np.dot(np.dot(A_T, m), A)
-      for i in range(2):
-        for j in range(2):
+      tm = np.zeros((alpha, alpha))
+      for i in range(alpha):
+        for j in range(alpha):
+          tm[i][j] = M[(i*alpha+j) * K + k, b]
+      y = np.dot(np.dot(A_T[alpha], tm), A[alpha])
+      for i in range(m):
+        for j in range(m):
           n = b / (rounded_h * rounded_w)
           t = b % (rounded_h * rounded_w)
-          p = (t / rounded_w) * 2 + i
-          q = (t % rounded_w) * 2 + j
+          p = (t / rounded_w) * m + i
+          q = (t % rounded_w) * m + j
           if p >= out_shape[2] or q >= out_shape[3]:
             continue
           res[n, p, q, k] = y[i, j]
@@ -115,25 +181,27 @@ def tf_conv(input, filter):
 
 
 def main():
-  input = np.random.random([7, 61, 71, 31]).astype(np.float32)
+  input = np.random.random([5, 23, 29, 15]).astype(np.float32)
   # input = np.fromfile(file="A", dtype=np.float32)
   # input = input.reshape(1, 3, 3, 5)
   print 'input shape: ', input.shape
-  input.tofile("A")
-  filter = np.random.random([3, 3, 31, 31]).astype(np.float32)
+  # input.tofile("A")
+  filter = np.random.random([3, 3, 15, 13]).astype(np.float32)
   tf_out = tf_conv(input, filter)
   input = input.transpose((0, 3, 1, 2))
   filter = filter.transpose((3, 2, 0, 1))
   print 'filter shape: ', filter.shape
-  filter.tofile("filter_in")
-  winog_out = winog_conv(input, filter)
-  res = np.allclose(tf_out, winog_out)
-  if res:
-    print "=========Pass========="
-  else:
-    print "=========Failed========="
-    print "TF: ", tf_out
-    print "Winograd: ", winog_out
+  # filter.tofile("filter_in")
+  for i in [2, 4, 6]:
+    print "==========f(%d,3)==========" % i
+    winograd_out = winograd_conv(i, 3, input, filter)
+    res = np.allclose(tf_out, winograd_out)
+    if res:
+      print "=========Pass========="
+    else:
+      print "=========Failed======="
+      print "TF: ", tf_out
+      print "Winograd: ", winograd_out
 
 
 if __name__ == '__main__':