Merge branch 'develop' into step_rnn/opt_ddim_lite

test=develop

Merge branch 'develop' into step_rnn/opt_ddim_lite
test=develop
6554854a · Liu Yiqun · aef8084f · 9171b70e · 6554854a · 6554854a
57 changed file
--- a/cmake/cross_compiling/ios.cmake
+++ b/cmake/cross_compiling/ios.cmake
@@ -120,6 +120,7 @@
 #

 ## Lite settings
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto")
 if (ARM_TARGET_OS STREQUAL "ios")
  set(PLATFORM "OS")
 elseif(ARM_TARGET_OS STREQUAL "ios64")

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -305,6 +305,26 @@ if(NOT IOS)
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels})
+
+    lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
+        ${ops} ${host_kernels}
+        ARM_DEPS ${arm_kernels}
+        NPU_DEPS ${npu_kernels}
+        XPU_DEPS ${xpu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        X86_DEPS ${x86_kernels}
+        CUDA_DEPS ${cuda_kernels})
+    lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
+        ${ops} ${host_kernels}
+        ARM_DEPS ${arm_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        XPU_DEPS ${xpu_kernels}
+        CL_DEPS ${opencl_kernels}
+        FPGA_DEPS ${fpga_kernels}
+        X86_DEPS ${x86_kernels}
+        CUDA_DEPS ${cuda_kernels})
 endif()

 #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc

--- a/lite/api/lite_multithread_test.cc
+++ b/lite/api/lite_multithread_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <string>
+#include <vector>
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/device_info.h"
+#include "lite/core/profile/timer.h"
+#include "lite/utils/cp_logging.h"
+#include "lite/utils/string.h"
+#ifdef LITE_WITH_PROFILE
+#include "lite/core/profile/basic_profiler.h"
+#endif             // LITE_WITH_PROFILE
+#include <thread>  // NOLINT
+
+using paddle::lite::profile::Timer;
+
+DEFINE_string(input_shape,
+              "1,3,224,224",
+              "input shapes, separated by colon and comma");
+
+DEFINE_string(model_dir_0, "", "model_dir_0");
+DEFINE_string(input_shape_0,
+              "1,3,224,224",
+              "input shapes another, separated by colon and comma");
+
+DEFINE_bool(use_optimize_nb,
+            false,
+            "optimized & naive buffer model for mobile devices");
+
+DEFINE_int32(test_type, 0, "multithread test type");
+
+namespace paddle {
+namespace lite_api {
+
+void OutputOptModel(const std::string& load_model_dir,
+                    const std::string& save_optimized_model_dir,
+                    const std::vector<std::vector<int64_t>>& input_shapes) {
+  lite_api::CxxConfig config;
+  config.set_model_dir(load_model_dir);
+  config.set_valid_places({
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  // delete old optimized model
+  int ret = system(
+      paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
+          .c_str());
+  if (ret == 0) {
+    LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
+  }
+  predictor->SaveOptimizedModel(save_optimized_model_dir,
+                                LiteModelType::kNaiveBuffer);
+  LOG(INFO) << "Load model from " << load_model_dir;
+  LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
+}
+
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+void Run(const std::vector<std::vector<int64_t>>& input_shapes,
+         const std::string& model_dir,
+         const PowerMode power_mode,
+         const int thread_num,
+         const int repeat,
+         int tid,
+         const int warmup_times = 5) {
+  lite_api::MobileConfig config;
+  config.set_model_dir(model_dir);
+  config.set_power_mode(power_mode);
+  config.set_threads(thread_num);
+
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  for (int j = 0; j < input_shapes.size(); ++j) {
+    auto input_tensor = predictor->GetInput(j);
+    input_tensor->Resize(input_shapes[j]);
+    auto input_data = input_tensor->mutable_data<float>();
+    int input_num = 1;
+    for (int i = 0; i < input_shapes[j].size(); ++i) {
+      input_num *= input_shapes[j][i];
+    }
+    for (int i = 0; i < input_num; ++i) {
+      input_data[i] = 1.f;
+    }
+  }
+
+  for (int i = 0; i < warmup_times; ++i) {
+    predictor->Run();
+  }
+
+  Timer ti;
+  for (int j = 0; j < repeat; ++j) {
+    ti.Start();
+    predictor->Run();
+    float t = ti.Stop();
+    auto output = predictor->GetOutput(0);
+    auto out = output->data<float>();
+    LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
+              << " output[0]:" << out[0] << "; output[1]:" << out[1];
+  }
+  LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
+            << ", power_mode: " << static_cast<int>(power_mode)
+            << ", threads num " << thread_num
+            << ", avg time: " << ti.LapTimes().Avg() << "ms"
+            << ", min time: " << ti.LapTimes().Min() << " ms"
+            << ", max time: " << ti.LapTimes().Max() << " ms.";
+}
+
+void RunTestType_00(const std::vector<std::vector<int64_t>>& input_shapes,
+                    const std::string& model_dir,
+                    const PowerMode power_mode,
+                    const int thread_num,
+                    const int repeat,
+                    const int warmup_times = 5) {
+  std::thread run_th0(Run,
+                      input_shapes,
+                      model_dir,
+                      power_mode,
+                      thread_num,
+                      repeat,
+                      0,
+                      warmup_times);
+  Run(input_shapes, model_dir, power_mode, thread_num, repeat, 1, warmup_times);
+  run_th0.join();
+}
+void RunTestType_01(const std::vector<std::vector<int64_t>>& input_shapes,
+                    const std::string& model_dir,
+                    const std::vector<std::vector<int64_t>>& input_shapes_0,
+                    const std::string& model_dir_0,
+                    const PowerMode power_mode,
+                    const int thread_num,
+                    const int repeat,
+                    const int warmup_times = 5) {
+  std::thread run_th0(Run,
+                      input_shapes,
+                      model_dir,
+                      power_mode,
+                      thread_num,
+                      repeat,
+                      0,
+                      warmup_times);
+  Run(input_shapes_0,
+      model_dir_0,
+      power_mode,
+      thread_num,
+      repeat,
+      1,
+      warmup_times);
+  run_th0.join();
+}
+
+void run_with_predictor(std::shared_ptr<PaddlePredictor> predictor,
+                        const std::vector<std::vector<int64_t>>& input_shapes,
+                        int index,
+                        const std::string& name) {
+  for (int j = 0; j < input_shapes.size(); ++j) {
+    auto input_tensor = predictor->GetInput(j);
+    input_tensor->Resize(input_shapes[j]);
+    auto input_data = input_tensor->mutable_data<float>();
+    int input_num = 1;
+    for (int i = 0; i < input_shapes[j].size(); ++i) {
+      input_num *= input_shapes[j][i];
+    }
+    for (int i = 0; i < input_num; ++i) {
+      input_data[i] = 1.f;
+    }
+  }
+
+  Timer ti;
+  ti.Start();
+  predictor->Run();
+  float t = ti.Stop();
+
+  auto output = predictor->GetOutput(0);
+  auto out = output->data<float>();
+  LOG(INFO) << "[thread " << index << "] name: " << name
+            << ",run time: " << ti.LapTimes().Avg() << "ms"
+            << " output[0]:" << out[0] << "; output[1]:" << out[1];
+}
+void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
+                    const std::string& model_dir,
+                    const PowerMode power_mode,
+                    const int thread_num,
+                    const int repeat,
+                    int warmup = 5) {
+  lite_api::MobileConfig config;
+  config.set_model_dir(model_dir);
+  config.set_power_mode(power_mode);
+  config.set_threads(thread_num);
+
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  for (int i = 0; i < repeat; ++i) {
+    std::thread pre_th0(
+        run_with_predictor, predictor, input_shapes, i, model_dir);
+    pre_th0.join();
+  }
+}
+void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
+                    const std::string& model_dir,
+                    const std::vector<std::vector<int64_t>>& input_shapes_0,
+                    const std::string& model_dir_0,
+                    const PowerMode power_mode,
+                    const int thread_num,
+                    const int repeat,
+                    int warmup = 5) {
+  lite_api::MobileConfig config;
+  config.set_model_dir(model_dir);
+  config.set_power_mode(power_mode);
+  config.set_threads(thread_num);
+
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  config.set_model_dir(model_dir_0);
+  auto predictor_0 = lite_api::CreatePaddlePredictor(config);
+
+  for (int i = 0; i < 2 * repeat; i += 2) {
+    std::thread pre_th0(
+        run_with_predictor, predictor, input_shapes, i, model_dir);
+    std::thread pre_th1(
+        run_with_predictor, predictor_0, input_shapes_0, i + 1, model_dir_0);
+    pre_th0.join();
+    pre_th1.join();
+  }
+}
+
+#endif
+
+}  // namespace lite_api
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_model_dir == "") {
+    LOG(INFO) << "usage: "
+              << "--model_dir /path/to/your/model";
+    exit(0);
+  }
+  std::string save_optimized_model_dir = "";
+  std::string save_optimized_model_dir_0 = "";
+  if (FLAGS_use_optimize_nb) {
+    save_optimized_model_dir = FLAGS_model_dir;
+    save_optimized_model_dir_0 = FLAGS_model_dir_0;
+  } else {
+    save_optimized_model_dir = FLAGS_model_dir + "opt2";
+    save_optimized_model_dir_0 = FLAGS_model_dir_0 + "opt2";
+  }
+
+  auto split_string =
+      [](const std::string& str_in) -> std::vector<std::string> {
+    std::vector<std::string> str_out;
+    std::string tmp_str = str_in;
+    while (!tmp_str.empty()) {
+      size_t next_offset = tmp_str.find(":");
+      str_out.push_back(tmp_str.substr(0, next_offset));
+      if (next_offset == std::string::npos) {
+        break;
+      } else {
+        tmp_str = tmp_str.substr(next_offset + 1);
+      }
+    }
+    return str_out;
+  };
+
+  auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
+    std::vector<int64_t> shape;
+    std::string tmp_str = str_shape;
+    while (!tmp_str.empty()) {
+      int dim = atoi(tmp_str.data());
+      shape.push_back(dim);
+      size_t next_offset = tmp_str.find(",");
+      if (next_offset == std::string::npos) {
+        break;
+      } else {
+        tmp_str = tmp_str.substr(next_offset + 1);
+      }
+    }
+    return shape;
+  };
+
+  std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
+  std::vector<std::vector<int64_t>> input_shapes;
+  for (int i = 0; i < str_input_shapes.size(); ++i) {
+    input_shapes.push_back(get_shape(str_input_shapes[i]));
+  }
+  std::vector<std::string> str_input_shapes_0 =
+      split_string(FLAGS_input_shape_0);
+  std::vector<std::vector<int64_t>> input_shapes_0;
+  for (int i = 0; i < str_input_shapes_0.size(); ++i) {
+    input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
+  }
+
+  if (!FLAGS_use_optimize_nb) {
+    // Output optimized model
+    paddle::lite_api::OutputOptModel(
+        FLAGS_model_dir, save_optimized_model_dir, input_shapes);
+    paddle::lite_api::OutputOptModel(
+        FLAGS_model_dir_0, save_optimized_model_dir_0, input_shapes_0);
+  }
+
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  // Run inference using optimized model
+  if (FLAGS_test_type == 0) {
+    paddle::lite_api::RunTestType_00(
+        input_shapes,
+        save_optimized_model_dir,
+        static_cast<paddle::lite_api::PowerMode>(0),
+        FLAGS_threads,
+        FLAGS_repeats,
+        5);
+    LOG(INFO) << "=========above is case 0, below is case "
+                 "1============================";
+    paddle::lite_api::RunTestType_10(
+        input_shapes,
+        save_optimized_model_dir,
+        static_cast<paddle::lite_api::PowerMode>(0),
+        FLAGS_threads,
+        FLAGS_repeats);
+  }
+  if (FLAGS_test_type == 1) {
+    paddle::lite_api::RunTestType_01(
+        input_shapes,
+        save_optimized_model_dir,
+        input_shapes_0,
+        save_optimized_model_dir_0,
+        static_cast<paddle::lite_api::PowerMode>(0),
+        FLAGS_threads,
+        FLAGS_repeats,
+        5);
+    LOG(INFO) << "=========above is case 0, below is case "
+                 "1============================";
+    paddle::lite_api::RunTestType_11(
+        input_shapes,
+        save_optimized_model_dir,
+        input_shapes_0,
+        save_optimized_model_dir_0,
+        static_cast<paddle::lite_api::PowerMode>(0),
+        FLAGS_threads,
+        FLAGS_repeats);
+  }
+
+#endif
+  return 0;
+}
--- a/lite/backends/x86/cpu_info.cc
+++ b/lite/backends/x86/cpu_info.cc
@@ -32,26 +32,37 @@
 #include <gflags/gflags.h>
 #include <algorithm>

-DEFINE_double(fraction_of_cpu_memory_to_use,
-              1,
-              "Default use 100% of CPU memory for PaddlePaddle,"
-              "reserve the rest for page tables, etc");
-DEFINE_uint64(initial_cpu_memory_in_mb,
-              500ul,
-              "Initial CPU memory for PaddlePaddle, in MD unit.");
-
-DEFINE_double(
-    fraction_of_cuda_pinned_memory_to_use,
-    0.5,
-    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
-    "reserve the rest for page tables, etc");
+#include "lite/utils/env.h"
+
+// DEFINE_double(fraction_of_cpu_memory_to_use,
+//               1,
+//               "Default use 100% of CPU memory for PaddlePaddle,"
+//               "reserve the rest for page tables, etc");
+double fraction_of_cpu_memory_to_use =
+    paddle::lite::GetDoubleFromEnv("fraction_of_cpu_memory_to_use", 1);
+
+// DEFINE_uint64(initial_cpu_memory_in_mb,
+//               500ul,
+//               "Initial CPU memory for PaddlePaddle, in MD unit.");
+uint64_t initial_cpu_memory_in_mb =
+    paddle::lite::GetUInt64FromEnv("initial_cpu_memory_in_mb", 500ul);
+
+// DEFINE_double(
+//     fraction_of_cuda_pinned_memory_to_use,
+//     0.5,
+//     "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
+//     "reserve the rest for page tables, etc");
+double fraction_of_cuda_pinned_memory_to_use = paddle::lite::GetDoubleFromEnv(
+    "fraction_of_cuda_pinned_memory_to_use", 0.5);

 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+bool use_pinned_memory =
+    paddle::lite::GetBoolFromEnv("use_pinned_memory", true);

 namespace paddle {
 namespace lite {
@@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }

 size_t CpuMinChunkSize() {
@@ -92,15 +103,14 @@ size_t CpuMinChunkSize() {
 size_t CpuMaxChunkSize() {
  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
  // or the initial_cpu_memory_in_mb.
-  return std::min(
-      static_cast<size_t>(CpuMaxAllocSize() / 32),
-      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
+  return std::min(static_cast<size_t>(CpuMaxAllocSize() / 32),
+                  static_cast<size_t>(initial_cpu_memory_in_mb * 1 << 20));
 }

 size_t CUDAPinnedMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
-  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
+  return fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
 }

 size_t CUDAPinnedMinChunkSize() {

--- a/lite/backends/x86/dynamic_loader.cc
+++ b/lite/backends/x86/dynamic_loader.cc
@@ -22,36 +22,46 @@ limitations under the License. */
 #include "lite/backends/x86/cupti_lib_path.h"
 #include "lite/backends/x86/port.h"
 #include "lite/backends/x86/warpctc_lib_path.h"
+#include "lite/utils/env.h"
 #include "lite/utils/paddle_enforce.h"

-DEFINE_string(cudnn_dir,
-              "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
+// DEFINE_string(cudnn_dir,
+//               "",
+//               "Specify path for loading libcudnn.so. For instance, "
+//               "/usr/local/cudnn/lib. If empty [default], dlopen "
+//               "will search cudnn from LD_LIBRARY_PATH");
+std::string cudnn_dir = paddle::lite::GetStringFromEnv("cudnn_dir");  // NOLINT

-DEFINE_string(cuda_dir,
-              "",
-              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
+// DEFINE_string(cuda_dir,
+//               "",
+//               "Specify path for loading cuda library, such as libcublas, "
+//               "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+//               "dlopen will search cuda from LD_LIBRARY_PATH");
+std::string cuda_dir = paddle::lite::GetStringFromEnv("cuda_dir");  // NOLINT

-DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+std::string f_warpctc_dir =                         // NOLINT
+    paddle::lite::GetStringFromEnv("warpctc_dir");  // NOLINT

-DEFINE_string(nccl_dir,
-              "",
-              "Specify path for loading nccl library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
+// DEFINE_string(nccl_dir,
+//               "",
+//               "Specify path for loading nccl library, such as libcublas, "
+//               "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+//               "dlopen will search cuda from LD_LIBRARY_PATH");
+std::string nccl_dir = paddle::lite::GetStringFromEnv("nccl_dir");  // NOLINT

-DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
+// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
+std::string cupti_dir = paddle::lite::GetStringFromEnv("cupti_dir");  // NOLINT

-DEFINE_string(
-    tensorrt_dir,
-    "",
-    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+// DEFINE_string(
+//     tensorrt_dir,
+//     "",
+//     "Specify path for loading tensorrt library, such as libnvinfer.so.");
+std::string tensorrt_dir =                           // NOLINT
+    paddle::lite::GetStringFromEnv("tensorrt_dir");  // NOLINT

-DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+std::string mklml_dir = paddle::lite::GetStringFromEnv("mklml_dir");  // NOLINT

 namespace paddle {
 namespace lite {
@@ -180,28 +190,28 @@ auto error_msg =

 void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
+  return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib);
+  return GetDsoHandleFromSearchPath(cuda_dir, win_cublas_lib);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
+  return GetDsoHandleFromSearchPath(cuda_dir, "libcublas.so");
 #endif
 }

 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
+  return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.dylib", false);
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
+  return GetDsoHandleFromSearchPath(cudnn_dir, win_cudnn_lib);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
+  return GetDsoHandleFromSearchPath(cudnn_dir, "libcudnn.so", false);
 #endif
 }

 void* GetCUPTIDsoHandle() {
  std::string cupti_path = cupti_lib_path;
-  if (!FLAGS_cupti_dir.empty()) {
-    cupti_path = FLAGS_cupti_dir;
+  if (!cupti_dir.empty()) {
+    cupti_path = cupti_dir;
  }
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
@@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() {

 void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+  return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.dylib");
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib);
+  return GetDsoHandleFromSearchPath(cuda_dir, win_curand_lib);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
+  return GetDsoHandleFromSearchPath(cuda_dir, "libcurand.so");
 #endif
 }

 void* GetWarpCTCDsoHandle() {
  std::string warpctc_dir = warpctc_lib_path;
-  if (!FLAGS_warpctc_dir.empty()) {
-    warpctc_dir = FLAGS_warpctc_dir;
+  if (!f_warpctc_dir.empty()) {
+    warpctc_dir = f_warpctc_dir;
  }
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib");
@@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() {

 void* GetNCCLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
+  return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.dylib");
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
+  return GetDsoHandleFromSearchPath(nccl_dir, "libnccl.so");
 #endif
 }

 void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+  return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.dylib");
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+  return GetDsoHandleFromSearchPath(tensorrt_dir, "libnvinfer.so");
 #endif
 }

 void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
+  return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib");
 #elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll");
+  return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
+  return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.so");
 #endif
 }


--- a/lite/backends/x86/jit/gen_base.cc
+++ b/lite/backends/x86/jit/gen_base.cc
@@ -21,13 +21,15 @@
 // posix_memalign
 #include "lite/backends/x86/cpu_info.h"
 #include "lite/backends/x86/jit/macro.h"
+#include "lite/utils/env.h"
 #include "lite/utils/paddle_enforce.h"

 #ifndef _WIN32
 #define posix_memalign_free free
 #endif

-DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
+// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
+bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");

 namespace paddle {
 namespace lite {

--- a/lite/backends/x86/jit/gen_base.h
+++ b/lite/backends/x86/jit/gen_base.h
@@ -20,7 +20,8 @@
 #include <vector>
 #include "lite/backends/x86/jit/kernel_base.h"

-DECLARE_bool(dump_jitcode);
+// DECLARE_bool(dump_jitcode);
+extern bool dump_jitcode;

 namespace paddle {
 namespace lite {
@@ -36,7 +37,7 @@ class GenBase : public Kernel {
  template <typename Func>
  Func getCode() const {
    const unsigned char* code = this->getCodeInternal();
-    if (FLAGS_dump_jitcode) {
+    if (dump_jitcode) {
      this->dumpCode(code);
    }
    // Note: failed to cast with reinterpret_cast<const Func> on Mac clang,

--- a/lite/backends/x86/math/beam_search.cc
+++ b/lite/backends/x86/math/beam_search.cc
@@ -86,7 +86,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
    //    selected_ids->mutable_data<int64_t>(dims, platform::CPUPlace());
    // auto *selected_scores_data =
    //    selected_scores->mutable_data<float>(dims, platform::CPUPlace());
-    parent_idx->Resize({static_cast<int64_t>(num_instances)});
+    parent_idx->Resize(
+        std::vector<int64_t>({static_cast<int64_t>(num_instances)}));
    auto *parent_idx_data =
        parent_idx ? parent_idx->mutable_data<int>(TARGET(kX86)) : nullptr;
    // auto *parent_idx_data =

--- a/lite/backends/x86/math/detail/avx_mathfun.h
+++ b/lite/backends/x86/math/detail/avx_mathfun.h
@@ -41,9 +41,11 @@

  (this is the zlib license)
 */
-
+#pragma once
 #include "lite/backends/x86/cpu_info.h"

+namespace paddle {
+namespace lite {
 /* __m128 is ugly to write */
 typedef __m256 v8sf;   // vector of 8 float (avx)
 typedef __m256i v8si;  // vector of 8 int   (avx)
@@ -134,7 +136,7 @@ typedef union imm_xmm_union {
    return (ret);                                        \
  }

-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)

@@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
    return (ret);                                                     \
  }

-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
@@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *(v8sf *)_ps256_1;  // NOLINT

  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *(v8sf *)_ps256_min_norm_pos);  // NOLINT
+  /* cut off denormalized stuff */                     // NOLINT

  // can be done with AVX2
  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);  // NOLINT
+  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);             // NOLINT

  // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);  // NOLINT
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);
@@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) {
     } else { x = x - 1.0; }
  */
  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask =
+      _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);  // NOLINT
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) {

  v8sf z = _mm256_mul_ps(x, x);

-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *(v8sf *)_ps256_cephes_log_p0;  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);  // NOLINT
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);  // NOLINT
  y = _mm256_add_ps(y, tmp);

-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);  // NOLINT
  y = _mm256_sub_ps(y, tmp);

-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);  // NOLINT
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
@@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *(v8sf *)_ps256_1;  // NOLINT

-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);  // NOLINT
+  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);  // NOLINT

  /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);  // NOLINT
+  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);           // NOLINT

  /* how to perform a floorf with SSE: just below */
  // imm0 = _mm256_cvttps_epi32(fx);
@@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) {
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);     // NOLINT
+  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);  // NOLINT
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x, x);

-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);  // NOLINT
  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);  // NOLINT
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);
@@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) {
  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);  // NOLINT
  imm0 = avx2_mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
@@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) {  // any x

  sign_bit = x;
  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);  // NOLINT
  /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);  // NOLINT

  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);  // NOLINT

 /*
  Here we start a series of integer operations, which are in the
@@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) {  // any x
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);     // NOLINT
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);  // NOLINT
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);  // NOLINT
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) {  // any x

     Both branches will be computed.
  */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);    // NOLINT
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);  // NOLINT
 #else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);  // NOLINT
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);  // NOLINT

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);  // NOLINT

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);  // NOLINT
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);  // NOLINT

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);  // NOLINT

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) {  // any x

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;  // NOLINT
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;  // NOLINT
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;  // NOLINT
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
@@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) {  // any x
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *(v8sf *)_ps256_coscof_p0;  // NOLINT
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);  // NOLINT
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);  // NOLINT
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);  // NOLINT
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);  // NOLINT

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
@@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) {  // any x
 #endif

  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);  // NOLINT

  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);  // NOLINT

 #ifdef __AVX2__
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);     // NOLINT
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);  // NOLINT
  y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);  // NOLINT

  /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);  // NOLINT
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);    // NOLINT
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);  // NOLINT
 #else

  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);  // NOLINT
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);  // NOLINT

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);  // NOLINT

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);  // NOLINT
+  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);  // NOLINT

-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);  // NOLINT
+  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);  // NOLINT

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);  // NOLINT

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) {  // any x

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;  // NOLINT
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;  // NOLINT
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;  // NOLINT
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) {  // any x
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *(v8sf *)_ps256_coscof_p0;  // NOLINT
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);  // NOLINT
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);  // NOLINT
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);  // NOLINT
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);  // NOLINT

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
@@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  sign_bit_sin = x;
  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);  // NOLINT
  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin =
+      _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);  // NOLINT

  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);  // NOLINT

 #ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);     // NOLINT
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);  // NOLINT

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);  // NOLINT
  imm0 = avx2_mm256_slli_epi32(imm0, 29);
  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);    // NOLINT
+  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);  // NOLINT
 // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);  // NOLINT
+  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);  // NOLINT

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);  // NOLINT

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);  // NOLINT
+  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);  // NOLINT

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);  // NOLINT
+  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);  // NOLINT

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;  // NOLINT
+  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;  // NOLINT
+  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;  // NOLINT
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  x = _mm256_add_ps(x, xmm3);

 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);     // NOLINT
+  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);  // NOLINT
  imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);  // NOLINT
+  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);  // NOLINT

-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);  // NOLINT
+  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);  // NOLINT

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);
@@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *(v8sf *)_ps256_coscof_p0;  // NOLINT

  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);  // NOLINT
  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);  // NOLINT
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);  // NOLINT
  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);  // NOLINT

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *(v8sf *)_ps256_sincof_p0;  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);  // NOLINT
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
@@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
 }
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -83,14 +83,11 @@ class KernelBase {
 #if defined(LITE_WITH_CUDA)
    WorkSpace::Global_CUDA().AllocReset();
 #endif
-
 #ifdef LITE_WITH_PROFILE
-    CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
-                        "When LITE_WITH_PROFILE is defined, please set a "
-                        "Profiler for Instruction.";
-    profiler_->StartTiming(profile_id_, ctx_.get());
+    profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
+    profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
    Run();
-    profiler_->StopTiming(profile_id_, ctx_.get());
+    profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
 #else
    Run();
 #endif

--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -120,6 +120,7 @@ class Buffer {
    if (space_ > 0) {
      TargetFree(target_, data_);
    }
+    data_ = nullptr;
    target_ = TargetType::kHost;
    space_ = 0;
  }

--- a/lite/core/profile/profiler.cc
+++ b/lite/core/profile/profiler.cc
@@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
 };
 }

-int Profiler::NewTimer(const OpCharacter& ch) {
-  StatisUnit unit;
-  unit.character = ch;
+std::map<Type, std::string> TypeStr{
+    {Type::kUnk, "Unknown"},
+    {Type::kCreate, "Create"},
+    {Type::kDispatch, "Dispatch"},
+};
+
+StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) {
+  create_t.reset(new DeviceTimer<TargetType::kHost>());
  if (ch.target == TargetType::kCUDA) {
 #ifdef LITE_WITH_CUDA
-    unit.timer.reset(new DeviceTimer<TargetType::kCUDA>());
+    dispatch_t.reset(new DeviceTimer<TargetType::kCUDA>());
 #else
    LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the "
                  "default x86 timer is used instead.";
 #endif
  } else {
-    unit.timer.reset(new DeviceTimer<TargetType::kHost>());
+    dispatch_t.reset(new DeviceTimer<TargetType::kHost>());
  }
+}
+
+lite::profile::Timer* StatisUnit::Timer(Type type) {
+  if (type == Type::kCreate) {
+    return create_t.get();
+  } else if (type == Type::kDispatch) {
+    return dispatch_t.get();
+  }
+  LOG(FATAL) << "Timer cannot be returned for unknown platforms.";
+  return nullptr;
+}
+
+int Profiler::NewTimer(const OpCharacter& ch) {
+  StatisUnit unit(ch);
  units_.push_back(std::move(unit));
  return units_.size() - 1;
 }

-void Profiler::StartTiming(const int index, KernelContext* ctx) {
+void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
  CHECK_LT(index, units_.size())
      << "The timer index in the profiler is out of range.";
-  units_[index].timer->Start(ctx);
+  units_[index].Timer(type)->Start(ctx);
 }

-float Profiler::StopTiming(const int index, KernelContext* ctx) {
+float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
  CHECK_LT(index, units_.size())
      << "The timer index in the profiler is out of range.";
-  return units_[index].timer->Stop(ctx);
+  return units_[index].Timer(type)->Stop(ctx);
 }

-std::string Profiler::Summary(bool concise, size_t w) {
+std::string Profiler::Summary(Type type, bool concise, size_t w) {
  using std::setw;
  using std::left;
  using std::fixed;
@@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) {
  std::string title;
  // Title.
  if (concise) {
-    ss << "Timing cycle = " << units_.front().timer->LapTimes().Size()
+    ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size()
       << std::endl;
-    ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w
+    ss << "===== Concise " << TypeStr.find(type)->second
+       << " Profiler Summary: " << name_ << ", Exclude " << w
       << " warm-ups =====" << std::endl;
  } else {
-    ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w
+    ss << "===== Detailed " << TypeStr.find(type)->second
+       << " Profiler Summary: " << name_ << ", Exclude " << w
       << " warm-ups =====" << std::endl;
  }
  ss << setw(25) << left << "Operator Type"
@@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) {
  if (concise) {
    std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
    for (auto& unit : units_) {
-      auto ch = summary.find(unit.character);
+      auto ch = summary.find(unit.Character());
      if (ch != summary.end()) {
-        ch->second.avg += unit.timer->LapTimes().Avg(w);
-        ch->second.min += unit.timer->LapTimes().Min(w);
-        ch->second.max += unit.timer->LapTimes().Max(w);
+        ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
+        ch->second.min += unit.Timer(type)->LapTimes().Min(w);
+        ch->second.max += unit.Timer(type)->LapTimes().Max(w);
      } else {
-        TimeInfo info({unit.timer->LapTimes().Avg(w),
-                       unit.timer->LapTimes().Min(w),
-                       unit.timer->LapTimes().Max(w)});
-        summary.insert({unit.character, info});
+        TimeInfo info({unit.Timer(type)->LapTimes().Avg(w),
+                       unit.Timer(type)->LapTimes().Min(w),
+                       unit.Timer(type)->LapTimes().Max(w)});
+        summary.insert({unit.Character(), info});
      }
    }
    for (const auto& item : summary) {
@@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) {
    }
  } else {
    for (auto& unit : units_) {
+      const auto& times = unit.Timer(type)->LapTimes();
      // clang-format off
-      ss << setw(25) << left << fixed << unit.character.op_type                \
-         << " " << setw(40) << left << fixed << unit.character.kernel_name     \
-         << " " << setw(12) << left << fixed << unit.character.remark          \
-         << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w)  \
-         << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w)  \
-         << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w)  \
-         << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \
+      ss << setw(25) << left << fixed << unit.Character().op_type            \
+         << " " << setw(40) << left << fixed << unit.Character().kernel_name \
+         << " " << setw(12) << left << fixed << unit.Character().remark      \
+         << " " << setw(12) << left << fixed << times.Avg(w)                 \
+         << " " << setw(12) << left << fixed << times.Min(w)                 \
+         << " " << setw(12) << left << fixed << times.Max(w)                 \
+         << " " << setw(12) << left << fixed << times.Last(w)                \
         << std::endl;
      // clang-format on
    }

--- a/lite/core/profile/profiler.h
+++ b/lite/core/profile/profiler.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -22,6 +23,14 @@ namespace paddle {
 namespace lite {
 namespace profile {

+enum class Type {
+  kUnk = 0,
+  kCreate,
+  kDispatch,
+};
+
+extern std::map<Type, std::string> TypeStr;
+
 struct TimeInfo {
  float avg;
  float min;
@@ -35,8 +44,15 @@ struct OpCharacter {
  std::string remark{std::string("N/A")};
 };

-struct StatisUnit {
-  std::unique_ptr<Timer> timer;
+class StatisUnit final {
+ public:
+  explicit StatisUnit(const OpCharacter& ch);
+  lite::profile::Timer* Timer(Type type);
+  const OpCharacter& Character() const { return character; }
+
+ protected:
+  std::unique_ptr<lite::profile::Timer> create_t;
+  std::unique_ptr<lite::profile::Timer> dispatch_t;
  OpCharacter character;
 };

@@ -45,9 +61,9 @@ class Profiler final {
  Profiler() = default;
  explicit Profiler(const std::string& name) : name_(name) {}
  int NewTimer(const OpCharacter& ch);
-  void StartTiming(const int index, KernelContext* ctx);
-  float StopTiming(const int index, KernelContext* ctx);
-  std::string Summary(bool concise = true, size_t warm_up = 10);
+  void StartTiming(Type type, const int index, KernelContext* ctx);
+  float StopTiming(Type type, const int index, KernelContext* ctx);
+  std::string Summary(Type type, bool concise = true, size_t warm_up = 10);

 private:
  std::string name_{std::string("N/A")};

--- a/lite/core/profile/test_timer.cc
+++ b/lite/core/profile/test_timer.cc
@@ -69,10 +69,10 @@ TEST(profiler, real_latency) {
  ch.op_type = "operator/1";
  ch.kernel_name = "kernel/1";
  int idx = profiler.NewTimer(ch);
-  profiler.StartTiming(idx, &ctx);
+  profiler.StartTiming(Type::kDispatch, idx, &ctx);
  std::this_thread::sleep_for(std::chrono::milliseconds(10));
-  profiler.StopTiming(idx, &ctx);
-  std::cout << profiler.Summary();
+  profiler.StopTiming(Type::kDispatch, idx, &ctx);
+  std::cout << profiler.Summary(Type::kDispatch);
 }
 #endif


--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -147,7 +147,7 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PROFILE
  }
 #ifdef LITE_WITH_PROFILE
-  LOG(INFO) << "\n" << profiler_.Summary(false, 0);
+  LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
 #endif  // LITE_WITH_PROFILE
 }

@@ -252,8 +252,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
 }

 void Instruction::Run() {
+#ifdef LITE_WITH_PROFILE
+  CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
+                      "When LITE_WITH_PROFILE is defined, please set a "
+                      "Profiler for Instruction.";
+  profiler_->StartTiming(
+      profile::Type::kCreate, profile_id_, kernel_->mutable_context());
+#endif
  CHECK(op_) << "op null";
  CHECK(kernel_) << "kernel null";
+
  if (first_epoch_) {
    first_epoch_ = false;
    CHECK(op_->CheckShape());
@@ -263,10 +271,7 @@ void Instruction::Run() {
    return;
  }

-  // VLOG(4) << "kernel launch";
  op_->InferShape();
-  // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
-  //        << TargetToStr(kernel_->target());
  kernel_->Launch();
  has_run_ = true;
 }

--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -143,7 +143,8 @@ class LITE_API RuntimeProgram {
  }
  ~RuntimeProgram() {
 #ifdef LITE_WITH_PROFILE
-    LOG(INFO) << "\n" << profiler_.Summary();
+    LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
+    LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
 #endif  // LITE_WITH_PROFILE
  }


--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
@@ -233,6 +233,10 @@ class TensorLite {
        (static_cast<char *>(buffer_->data()) + offset_));
  }

+  void clear() {
+    buffer_->Free();
+    offset_ = 0;
+  }
  size_t data_size() const { return this->dims().production(); }

  size_t memory_size() const { return memory_size_; }

--- a/lite/kernels/arm/conditional_block_compute.cc
+++ b/lite/kernels/arm/conditional_block_compute.cc
@@ -34,6 +34,9 @@ void ConditionalBlockCompute::PrepareForRun() {
 }
 void ConditionalBlockCompute::Run() {
  auto& param = Param<operators::ConditionalBlockParam>();
+  for (auto& out : param.outs) {
+    out->clear();
+  }
  bool need_run = true;
  if (param.is_scalar_condition) {
    auto* cond = param.cond;

--- a/lite/kernels/arm/split_lod_tensor_compute.cc
+++ b/lite/kernels/arm/split_lod_tensor_compute.cc
@@ -82,6 +82,10 @@ void SplitLodTensorCompute::Run() {
        ranges.begin(), ranges.end(), 0UL, [](size_t a, const CopyRange &b) {
          return a + b.end - b.begin;
        });
+    if (height == 0) {
+      out->clear();
+      continue;
+    }
    auto x_dim = x->dims();
    x_dim[0] = static_cast<int64_t>(height);
    out->Resize(x_dim);

--- a/lite/kernels/arm/unsqueeze_compute.cc
+++ b/lite/kernels/arm/unsqueeze_compute.cc
@@ -54,12 +54,12 @@ REGISTER_LITE_KERNEL(unsqueeze,
                     kNCHW,
                     paddle::lite::kernels::host::UnsqueezeCompute,
                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .BindInput("AxesTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("AxesTensorList",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .Finalize();

 REGISTER_LITE_KERNEL(unsqueeze2,
@@ -68,11 +68,11 @@ REGISTER_LITE_KERNEL(unsqueeze2,
                     kNCHW,
                     paddle::lite::kernels::host::Unsqueeze2Compute,
                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .BindInput("AxesTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("AxesTensorList",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
    .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
--- a/lite/kernels/arm/yolo_box_compute.cc
+++ b/lite/kernels/arm/yolo_box_compute.cc
@@ -54,7 +54,8 @@ REGISTER_LITE_KERNEL(yolo_box,
                     paddle::lite::kernels::arm::YoloBoxCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("ImgSize",
+               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
--- a/lite/kernels/cuda/softmax_compute.cu
+++ b/lite/kernels/cuda/softmax_compute.cu
@@ -156,8 +156,8 @@ void SoftmaxCompute::PrepareForRun() {
  cudaGetDevice(&device_id);
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, device_id);
-  sharedmem_size = deviceProp.sharedMemPerBlock;
-  max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS;
+  sharedmem_size_ = deviceProp.sharedMemPerBlock;
+  max_dimsize_ = sharedmem_size_ / sizeof(float) / CUDA_NUM_THREADS;
 }

 void SoftmaxCompute::Run() {
@@ -174,29 +174,27 @@ void SoftmaxCompute::Run() {
  int outer_num = x_dims.Slice(0, axis).production();
  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
  int total_threads = inner_num * outer_num;
-  int axis_size = x_dims[axis];
+  axis_size_ = x_dims[axis];

  const int threads = CUDA_NUM_THREADS;
  const int blocks = (total_threads + threads - 1) / threads;
  auto input_data = param.x->data<float>();
  auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
-  if (axis_size <= max_dimsize) {
-    int use_sharemem_size = axis_size * threads * sizeof(float);
+  if (axis_size_ <= max_dimsize_) {
+    int use_sharemem_size = axis_size_ * threads * sizeof(float);
    sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
        total_threads,
        input_data,
        output_data,
        inner_num,
        outer_num,
-        axis_size);
+        axis_size_);
  } else {
    //! re_alloc device memory
-    Tensor tmax_data;
-    Tensor tsum_data;
-    tmax_data.Resize({1, 1, 1, outer_num * inner_num});
-    tsum_data.Resize({1, 1, 1, outer_num * inner_num});
-    auto max_data = tmax_data.mutable_data<float>(TARGET(kCUDA));
-    auto sum_data = tsum_data.mutable_data<float>(TARGET(kCUDA));
+    tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
+    tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
+    auto max_data = tmax_data_.mutable_data<float>(TARGET(kCUDA));
+    auto sum_data = tsum_data_.mutable_data<float>(TARGET(kCUDA));
    //! firstly, get maximum data
    float min_data = std::numeric_limits<float>::lowest();
    softmax_max_kernel<float><<<blocks, threads, 0, stream>>>(total_threads,
@@ -205,7 +203,7 @@ void SoftmaxCompute::Run() {
                                                              min_data,
                                                              inner_num,
                                                              outer_num,
-                                                              axis_size);
+                                                              axis_size_);
    //! then, compute exp and sum data
    softmax_sub_exp_sum_kernel<float><<<blocks, threads, 0, stream>>>(
        total_threads,
@@ -215,10 +213,10 @@ void SoftmaxCompute::Run() {
        sum_data,
        inner_num,
        outer_num,
-        axis_size);
+        axis_size_);
    //! last, compute divided output
    softmax_divid_output_kernel<float><<<blocks, threads, 0, stream>>>(
-        total_threads, output_data, sum_data, inner_num, outer_num, axis_size);
+        total_threads, output_data, sum_data, inner_num, outer_num, axis_size_);
  }
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);

--- a/lite/kernels/cuda/softmax_compute.h
+++ b/lite/kernels/cuda/softmax_compute.h
@@ -30,9 +30,11 @@ class SoftmaxCompute
  virtual ~SoftmaxCompute() = default;

 private:
-  size_t sharedmem_size;
-  int num_threads;
-  int max_dimsize;
+  lite::Tensor tmax_data_;
+  lite::Tensor tsum_data_;
+  size_t sharedmem_size_;
+  int max_dimsize_;
+  int axis_size_;
 };

 }  // namespace cuda

--- a/lite/kernels/npu/bridges/engine.h
+++ b/lite/kernels/npu/bridges/engine.h
@@ -28,12 +28,14 @@ namespace subgraph {

 class Engine {
 public:
-  Engine(int block_idx,
+  Engine(KernelContext *ctx,
+         int block_idx,
         cpp::BlockDesc *block_desc,
         const std::vector<std::string> &input_names,
         const std::vector<std::string> &output_names,
         lite::Scope *scope)
-      : block_idx_(block_idx),
+      : ctx_(ctx),
+        block_idx_(block_idx),
        block_desc_(block_desc),
        input_names_(input_names),
        output_names_(output_names),
@@ -55,6 +57,7 @@ class Engine {

  virtual bool InputShapeChanged();

+  KernelContext *ctx_{nullptr};
  int block_idx_;
  cpp::BlockDesc *block_desc_;
  std::vector<std::string> input_names_;

--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
@@ -207,7 +207,8 @@ int SubgraphEngine::LaunchDeviceProgram() {

 void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
-  engine_.reset(new SubgraphEngine(param.sub_block_idx,
+  engine_.reset(new SubgraphEngine(ctx_.get(),
+                                   param.sub_block_idx,
                                   param.sub_block_desc,
                                   param.input_data_names,
                                   param.output_data_names,

--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
@@ -29,13 +29,14 @@ namespace npu {

 class SubgraphEngine : public subgraph::Engine {
 public:
-  SubgraphEngine(int block_idx,
+  SubgraphEngine(KernelContext *ctx,
+                 int block_idx,
                 cpp::BlockDesc *block_desc,
                 const std::vector<std::string> &input_names,
                 const std::vector<std::string> &output_names,
                 Scope *scope)
      : subgraph::Engine(
-            block_idx, block_desc, input_names, output_names, scope) {}
+            ctx, block_idx, block_desc, input_names, output_names, scope) {}

 protected:
  int BuildDeviceProgram() override;

--- a/lite/kernels/x86/gru_compute.cc
+++ b/lite/kernels/x86/gru_compute.cc
@@ -13,10 +13,13 @@
 // limitations under the License.

 #include "lite/kernels/x86/gru_compute.h"
+#include "lite/utils/env.h"

-DEFINE_int32(paddle_num_threads,
-             1,
-             "Number of threads for each paddle instance.");
+// DEFINE_int32(paddle_num_threads,
+//              1,
+//              "Number of threads for each paddle instance.");
+int32_t paddle_num_threads =
+    paddle::lite::GetIntFromEnv("paddle_num_threads", 1);

 REGISTER_LITE_KERNEL(gru,
                     kX86,

--- a/lite/kernels/x86/gru_compute.h
+++ b/lite/kernels/x86/gru_compute.h
@@ -26,7 +26,8 @@
 #include "lite/core/types.h"
 #include "lite/fluid/eigen.h"

-DECLARE_int32(paddle_num_threads);
+// DECLARE_int32(paddle_num_threads);
+extern int32_t paddle_num_threads;

 namespace paddle {
 namespace lite {
@@ -109,7 +110,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {

 #ifdef PADDLE_WITH_MKLML
    // use MKL packed to speedup GEMM
-    if (FLAGS_paddle_num_threads >= 4) {
+    if (paddle_num_threads >= 4) {
      auto blas = lite::x86::math::GetBlas<TARGET(kX86), T>(context);
      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix,
                                       1 /*height of C*/,

--- a/lite/kernels/xpu/bridges/matmul_op.cc
+++ b/lite/kernels/xpu/bridges/matmul_op.cc
@@ -49,9 +49,10 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  auto out_type = kernel->GetOutputDeclType("Out");
  CHECK(out_type->precision() == PRECISION(kFloat));
  CHECK(out_type->layout() == DATALAYOUT(kNCHW));
+  auto out = scope->FindMutableTensor(out_name);
+  auto out_dims = out->dims();

  auto transpose_x = op_info->GetAttr<bool>("transpose_X");
-  CHECK(!transpose_x) << "XPU only support transpose_x == true now";
  auto transpose_y = op_info->GetAttr<bool>("transpose_Y");
  auto alpha = op_info->GetAttr<float>("alpha");

@@ -71,11 +72,68 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    y_node = graph->AddNode(y_name, y_dims);
  }

-  auto matmul_node =
-      graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y);
-  graph->AddNode(out_name, graph->builder_.CreateScale(matmul_node, alpha));
-
-  return SUCCESS;
+  // Matmul node
+  if (x_dims.size() > 2 && y_dims.size() >= 2) {
+    // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
+    // x: [B, M, K], y: [K, N], out: [B, M, N]
+    // Reshape and transposed X node
+    if (x_dims.size() != 3) {
+      auto m = static_cast<int>(x_dims[x_dims.size() - 2]);
+      auto k = static_cast<int>(x_dims[x_dims.size() - 1]);
+      x_node =
+          graph->AddNode(x_name + "/reshape",
+                         graph->builder_.CreateReshape(*x_node, {-1, m, k}));
+      if (transpose_x) {
+        x_node =
+            graph->AddNode(x_name + "/reshape/transpose",
+                           graph->builder_.CreateTranspose(*x_node, {0, 2, 1}));
+      }
+    }
+    // Reshape and transposed Y node
+    if (y_dims.size() != 3) {
+      auto k = static_cast<int>(y_dims[y_dims.size() - 2]);
+      auto n = static_cast<int>(y_dims[y_dims.size() - 1]);
+      y_node =
+          graph->AddNode(y_name + "/reshape",
+                         graph->builder_.CreateReshape(*y_node, {-1, k, n}));
+      if (!transpose_y) {
+        y_node =
+            graph->AddNode(y_name + "/reshape/transpose",
+                           graph->builder_.CreateTranspose(*y_node, {0, 2, 1}));
+      }
+    }
+    // Matmul node
+    auto matmul_node = graph->AddNode(
+        out_name, graph->builder_.CreateBatchMatmul(*x_node, *y_node));
+    if (fabs(alpha - 1) > 1e-6f) {
+      matmul_node = graph->AddNode(
+          out_name, graph->builder_.CreateScale(*matmul_node, alpha));
+    }
+    if (out_dims.size() != 3) {
+      graph->AddNode(out_name,
+                     graph->builder_.CreateReshape(
+                         *matmul_node, CvtShape<xtcl::Integer>(out_dims)));
+    }
+  } else if (x_dims.size() == 2 && y_dims.size() == 2) {
+    // x: [M, K], y: [K, N], out: [M, N]
+    if (transpose_x) {
+      x_node = graph->AddNode(x_name + "/transpose",
+                              graph->builder_.CreateTranspose(*x_node, {1, 0}));
+    }
+    auto matmul_node = graph->AddNode(
+        out_name,
+        graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y));
+    if (fabs(alpha - 1) > 1e-6f) {
+      matmul_node = graph->AddNode(
+          out_name, graph->builder_.CreateScale(*matmul_node, alpha));
+    }
+  } else if (x_dims.size() == 1 && y_dims.size() == 1) {
+    // x: [K], y: [K], out: [1]
+    // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
+    LOG(FATAL) << "[XPU] Not supported.";
+    return FAILED;
+  }
+  return REBUILD_WHEN_SHAPE_CHANGED;
 }

 }  // namespace xpu

--- a/lite/kernels/xpu/bridges/mul_op.cc
+++ b/lite/kernels/xpu/bridges/mul_op.cc
@@ -67,15 +67,27 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    x_node =
        graph->AddNode(x_name + "/reshape",
                       graph->builder_.CreateReshape(
-                           *x_node, {-1, static_cast<int>(y_matrix_dims[0])}));
+                           *x_node, {-1, static_cast<int>(x_matrix_dims[1])}));
  }

  // Y node
-  auto y_const_node = graph->AddNode(y_name, *y, y_matrix_dims);
+  std::shared_ptr<xtcl::xExpr> y_node = nullptr;
+  if (graph->HasNode(y_name)) {
+    y_node = graph->GetNode(y_name);
+  } else {
+    y_node = graph->AddNode(y_name, y_dims);
+  }
+  // Flatten Y node
+  if (y_dims.size() != 2) {
+    y_node =
+        graph->AddNode(y_name + "/reshape",
+                       graph->builder_.CreateReshape(
+                           *y_node, {static_cast<int>(y_matrix_dims[0]), -1}));
+  }

  // Reshape the matmul node with the inferred shape as the output node
  auto matmul_node = graph->AddNode(
-      out_name, graph->builder_.CreateMatmul2D(*x_node, *y_const_node, false));
+      out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false));
  if (out_dims.size() != 2) {
    graph->AddNode(out_name,
                   graph->builder_.CreateReshape(

--- a/lite/kernels/xpu/subgraph_compute.cc
+++ b/lite/kernels/xpu/subgraph_compute.cc
@@ -197,7 +197,8 @@ int SubgraphEngine::LaunchDeviceProgram() {

 void SubgraphCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
-  engine_.reset(new SubgraphEngine(param.sub_block_idx,
+  engine_.reset(new SubgraphEngine(ctx_.get(),
+                                   param.sub_block_idx,
                                   param.sub_block_desc,
                                   param.input_data_names,
                                   param.output_data_names,

--- a/lite/kernels/xpu/subgraph_compute.h
+++ b/lite/kernels/xpu/subgraph_compute.h
@@ -29,13 +29,14 @@ namespace xpu {

 class SubgraphEngine : public subgraph::Engine {
 public:
-  SubgraphEngine(int block_idx,
+  SubgraphEngine(KernelContext *ctx,
+                 int block_idx,
                 cpp::BlockDesc *block_desc,
                 const std::vector<std::string> &input_names,
                 const std::vector<std::string> &output_names,
                 Scope *scope)
      : subgraph::Engine(
-            block_idx, block_desc, input_names, output_names, scope) {}
+            ctx, block_idx, block_desc, input_names, output_names, scope) {}

 protected:
  int BuildDeviceProgram() override;

--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
 add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
 add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
 add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
+add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS})

 # 2.basic ops not used in basic models
 add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
@@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP
 add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS})
 add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS})
 add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS})
-add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS})
 add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS})
 add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS})
 add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS})
-
 add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
 add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS})
 add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS})

--- a/lite/operators/attention_padding_mask_op.cc
+++ b/lite/operators/attention_padding_mask_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "lite/operators/attention_padding_mask_op.h"
+#include <vector>
 #include "lite/core/op_registry.h"
 #include "lite/core/scope.h"

@@ -39,7 +40,8 @@ bool AttentionPaddingMaskOp::InferShape() const {
      << "Mismatch batch size, bottom0: " << att_batch
      << ", bottom1: " << src_batch;

-  param_.pad_begin->Resize({static_cast<int64_t>(src_batch)});
+  param_.pad_begin->Resize(
+      std::vector<int64_t>({static_cast<int64_t>(src_batch)}));
  param_.Out->Resize(param_.X->dims());
  param_.Out->set_lod(param_.X->lod());


--- a/lite/operators/instance_norm_op.cc
+++ b/lite/operators/instance_norm_op.cc
@@ -46,8 +46,9 @@ bool InstanceNormOp::InferShape() const {
  auto x_dims = param_.x->dims();
  int64_t batch_size = x_dims[0];
  int64_t channel_size = x_dims[1];
-  param_.saved_mean->Resize({batch_size * channel_size});
-  param_.saved_variance->Resize({batch_size * channel_size});
+  param_.saved_mean->Resize(std::vector<int64_t>({batch_size * channel_size}));
+  param_.saved_variance->Resize(
+      std::vector<int64_t>({batch_size * channel_size}));
  param_.out->Resize(x_dims);
  return true;
 }

--- a/lite/operators/reduce_prod_op.cc
+++ b/lite/operators/reduce_prod_op.cc
@@ -50,7 +50,7 @@ bool ReduceProdOpLite::InferShape() const {
    if (keep_dim) {
      out->Resize({static_cast<int64_t>(x_rank), 1});
    } else {
-      out->Resize({1});
+      out->Resize(std::vector<int64_t>({1L}));
    }
  } else {
    auto dims_vector = x_dims.Vectorize();

--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -30,6 +30,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

 if(LITE_BUILD_EXTRA)
    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/mul_compute_test.cc
+++ b/lite/tests/kernels/mul_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include <string>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
+
+namespace paddle {
+namespace lite {
+
+class MulComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string type_ = "mul";
+  std::string x_ = "x";
+  std::string y_ = "y";
+  std::string out_ = "out";
+  DDim x_dims_{{1, 2}};
+  DDim y_dims_{{2, 1}};
+  int x_num_col_dims_{1};
+  int y_num_col_dims_{1};
+
+ public:
+  MulComputeTester(const Place& place,
+                   const std::string& alias,
+                   DDim x_dims,
+                   DDim y_dims,
+                   int x_num_col_dims,
+                   int y_num_col_dims)
+      : TestCase(place, alias),
+        x_dims_(x_dims),
+        y_dims_(y_dims),
+        x_num_col_dims_(x_num_col_dims),
+        y_num_col_dims_(y_num_col_dims) {}
+
+  void RunBaseline(Scope* scope) override {
+    auto* x = scope->FindTensor(x_);
+    auto* y = scope->FindTensor(y_);
+    auto x_mat_dims = x_dims_.Flatten2D(x_num_col_dims_);
+    auto y_mat_dims = y_dims_.Flatten2D(y_num_col_dims_);
+    CHECK_EQ(x_mat_dims[1], y_mat_dims[0]);
+
+    auto* out = scope->NewTensor(out_);
+    CHECK(out);
+    std::vector<int64_t> out_shape;
+    for (int i = 0; i < x_num_col_dims_; i++) {
+      out_shape.push_back(x_dims_[i]);
+    }
+    for (int i = y_num_col_dims_; i < y_dims_.size(); i++) {
+      out_shape.push_back(y_dims_[i]);
+    }
+    out->Resize(DDim(out_shape));
+
+    auto x_data = x->data<float>();
+    auto y_data = y->data<float>();
+    auto* out_data = out->mutable_data<float>();
+
+    const int M = x_mat_dims[0];
+    const int K = x_mat_dims[1];
+    const int N = y_mat_dims[1];
+    for (int m = 0; m < M; ++m) {
+      for (int n = 0; n < N; ++n) {
+        out_data[m * N + n] = 0;
+        for (int k = 0; k < K; ++k) {
+          out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n];
+        }
+      }
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType(type_);
+    op_desc->SetInput("X", {x_});
+    op_desc->SetInput("Y", {y_});
+    op_desc->SetOutput("Out", {out_});
+    op_desc->SetAttr("x_num_col_dims", x_num_col_dims_);
+    op_desc->SetAttr("y_num_col_dims", y_num_col_dims_);
+  }
+
+  void PrepareData() override {
+    std::vector<float> x(x_dims_.production());
+    fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
+    SetCommonTensor(x_, x_dims_, x.data());
+
+    std::vector<float> y(y_dims_.production());
+    fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
+    SetCommonTensor(y_, y_dims_, y.data());
+  }
+};
+
+void TestMul(const std::vector<int64_t>& x_dims,
+             const std::vector<int64_t>& y_dims,
+             int x_num_col_dims,
+             int y_num_col_dims,
+             const Place& place,
+             float abs_error) {
+  std::unique_ptr<arena::TestCase> tester(new MulComputeTester(place,
+                                                               "def",
+                                                               DDim(x_dims),
+                                                               DDim(y_dims),
+                                                               x_num_col_dims,
+                                                               y_num_col_dims));
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
+}
+
+TEST(Mul, precision) {
+  LOG(INFO) << "test mul op";
+  float abs_error = 2e-5;
+  Place place;
+#if defined(LITE_WITH_XPU)
+  place = TARGET(kXPU);
+#else
+  return;
+#endif
+
+  TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error);
+  TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error);
+  TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error);
+  TestMul({4, 60}, {5, 4, 3, 2}, 1, 3, place, abs_error);
+  TestMul({2, 3, 4, 5}, {60, 4}, 1, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {20, 4}, 2, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {5, 4}, 3, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1, place, abs_error);
+  TestMul({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2, place, abs_error);
+  TestMul({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2, place, abs_error);
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/tests/kernels/unsqueeze_compute_test.cc
+++ b/lite/tests/kernels/unsqueeze_compute_test.cc
@@ -107,6 +107,7 @@ class UnsqueezeComputeTester : public arena::TestCase {
  }

  void PrepareData() override {
+    SetPrecisionType(out_, PRECISION(kFloat));
    std::vector<float> in_data(dims_.production());
    for (int i = 0; i < dims_.production(); ++i) {
      in_data[i] = i;
@@ -213,6 +214,7 @@ class Unsqueeze2ComputeTester : public arena::TestCase {
  }

  void PrepareData() override {
+    SetPrecisionType(out_, PRECISION(kFloat));
    std::vector<float> in_data(dims_.production());
    for (int i = 0; i < dims_.production(); ++i) {
      in_data[i] = i;

--- a/lite/tools/ci_build.sh
+++ b/lite/tools/ci_build.sh
@@ -1042,23 +1042,6 @@ function main {
                build_test_arm_subtask_armlinux
                shift
                ;;
-            build_test_arm_model_mobilenetv1)
-                build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1
-                build_test_arm_subtask_model test_mobilenetv1_int8 MobileNetV1_quant
-                shift
-                ;;
-            build_test_arm_model_mobilenetv2)
-                build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
-                shift
-                ;;
-            build_test_arm_model_resnet50)
-                build_test_arm_subtask_model test_resnet50 resnet50
-                shift
-                ;;
-            build_test_arm_model_inceptionv4)
-                build_test_arm_subtask_model test_inceptionv4 inception_v4_simple
-                shift
-                ;;
            check_style)
                check_style
                shift

--- a/lite/utils/env.h
+++ b/lite/utils/env.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+#include <string>
+
+namespace paddle {
+namespace lite {
+
+static std::string GetStringFromEnv(const std::string& str,
+                                    const std::string& def = "") {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return std::string(variable);
+}
+
+static bool GetBoolFromEnv(const std::string& str, bool def = false) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  if (strcmp(variable, "false") == 0 || strcmp(variable, "0") == 0) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+static int GetIntFromEnv(const std::string& str, int def = 0) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atoi(variable);
+}
+
+static double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return atof(variable);
+}
+
+static uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
+  char* variable = std::getenv(str.c_str());
+  if (!variable) {
+    return def;
+  }
+  return static_cast<uint64_t>(atol(variable));
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/mobile/src/framework/cl/cl_image.cpp
+++ b/mobile/src/framework/cl/cl_image.cpp
@@ -18,6 +18,37 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace framework {

+void CLImage::PrintTensor(const CLImage &cl_image) const {
+  size_t width = cl_image.ImageDims()[0];
+  size_t height = cl_image.ImageDims()[1];
+
+  half_t *image_data = new half_t[height * width * 4];
+  cl_int err;
+  cl_mem image = cl_image.GetCLImage();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
+                           region, 0, 0, image_data, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(err);
+
+  PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0,
+                        "cl_image numel should not be 0 ");
+  float *tensor_data = new float[cl_image.numel()];
+  auto converter = cl_image.Converter();
+  converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
+                         cl_image.dims());
+  int stride = cl_image.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+
+  for (int i = 0; i < cl_image.numel(); i++) {
+    printf("%f \n", tensor_data[i]);
+  }
+
+  delete[](tensor_data);
+  delete[](image_data);
+}
+
 void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context,
                     cl_command_queue commandQueue, cl_kernel kernel) {
  tensor->mutable_data<float>();

--- a/mobile/src/framework/cl/cl_image.h
+++ b/mobile/src/framework/cl/cl_image.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <iostream>
 #include <memory>
 #include <vector>

@@ -285,6 +286,7 @@ class CLImage {
  cl_event GetClEvent() const { return cl_event_.get(); }

  CLImageConverterBase *Converter() const { return image_converter_; }
+  void PrintTensor(const CLImage &cl_image) const;

 private:
  void InitCLImage(cl_context context, size_t width, size_t height,

--- a/mobile/src/framework/cl/cl_tool.h
+++ b/mobile/src/framework/cl/cl_tool.h
@@ -21,13 +21,14 @@ namespace framework {

 const char* opencl_error_to_str(cl_int error);

-#define CL_CHECK_ERRORS(ERR)                                          \
-  if (ERR != CL_SUCCESS) {                                            \
-    printf(                                                           \
-        "OpenCL error with code %s happened in file %s at line %d. "  \
-        "Exiting.\n",                                                 \
-        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
-        __LINE__);                                                    \
+#define CL_CHECK_ERRORS(ERR)                                                  \
+  if (ERR != CL_SUCCESS) {                                                    \
+    printf(                                                                   \
+        "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
+        "%d. "                                                                \
+        "Exiting.\033[0m\n",                                                  \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,         \
+        __LINE__);                                                            \
  }

 }  // namespace framework

--- a/mobile/src/framework/executor.cpp
+++ b/mobile/src/framework/executor.cpp
@@ -363,7 +363,10 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
        DLOG << "InitNoPersistableMemory var " << var_desc->Name();
        auto tensor = var->template GetMutable<LoDTensor>();
        if (tensor->IsInitialized() && tensor->dims().size() == 4) {
-          DLOG << "var's tensor is Initialized or dims size != 4";
+          // don't change user's input and avoid memory leaks
+          if (feed_indices_.find(var_desc->Name()) != feed_indices_.end()) {
+            break;
+          }
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
  cl_int status;
  int index = 0;

-  if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
+  const int filter_height = param.Filter()->dims()[2];
+  const int filter_width = param.Filter()->dims()[3];
+  if (filter_height == 1 && filter_width == 1) {
    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
    CL_CHECK_ERRORS(status);

@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
    CL_CHECK_ERRORS(status);

-    if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
+    if (filter_height == 3 && filter_width == 3) {
      // normal conv
      if (param.Filter()->dims()[0] == param.Output()->dims()[1] &&
          param.Filter()->dims()[1] == param.Input()->dims()[1]) {
@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
        status = clSetKernelArg(kernel, index++, sizeof(int), &group);
        CL_CHECK_ERRORS(status);
      }
+    } else if (filter_height != 3 && filter_width != 3) {
+      // not 3x3
+      if (param.Filter()->dims()[1] == 1 &&
+          param.Input()->dims()[1] == param.Output()->dims()[1]) {
+        // deepwise basic use in not 3x3
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
+        CL_CHECK_ERRORS(status);
+
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
+        CL_CHECK_ERRORS(status);
+      }
    }

    status = clEnqueueNDRangeKernel(

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -24,980 +24,1101 @@ conv_add_bn_relu

 #include "cl_common.h"

-__kernel void conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter,
-
+__kernel void conv_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif

 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height,
-                                              __private const int output_c,
-                                              __private const int filter_channel,
-                                              __private const int group) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif

+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int output_c, __private const int filter_channel,
+    __private const int group) {

-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);

-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }

-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;

-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-    half4 input[9];
-    if (group == 1) {
-        for (int i = 0; i < input_c; ++i) {
-            int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-            input[0] = select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[1] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y - dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[2] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-
-            input[3] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x - dilation, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[4] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[5] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-            input[6] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-            input[7] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-            input[8] = select(read_imageh(input_image, sampler,
-                              (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                              (half4)(0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-
-/*
-            for (int j = 0; j < 9; ++j) {
-                int2 pos_of_weight;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-            }
-*/
-                int j = 0;
-                int2 pos_of_weight;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 1;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 2;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 3;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 4;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-                j = 5;
-                pos_of_weight.x = i * 3 + j % 3;
-                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                weight_x = read_imageh(filter, sampler, pos_of_weight);
-                output.x += dot(input[j], weight_x);
-
-                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-                weight_y = read_imageh(filter, sampler, pos_of_weight);
-                output.y += dot(input[j], weight_y);
-
-                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-                weight_z = read_imageh(filter, sampler, pos_of_weight);
-                output.z += dot(input[j], weight_z);
-
-                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-                weight_w = read_imageh(filter, sampler, pos_of_weight);
-                output.w += dot(input[j], weight_w);
-
-               j = 6;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
-
-               j = 7;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
-
-               j = 8;
-               pos_of_weight.x = i * 3 + j % 3;
-               pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-               weight_x = read_imageh(filter, sampler, pos_of_weight);
-               output.x += dot(input[j], weight_x);
-
-               pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-               weight_y = read_imageh(filter, sampler, pos_of_weight);
-               output.y += dot(input[j], weight_y);
-
-               pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-               weight_z = read_imageh(filter, sampler, pos_of_weight);
-               output.z += dot(input[j], weight_z);
-
-               pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-               weight_w = read_imageh(filter, sampler, pos_of_weight);
-               output.w += dot(input[j], weight_w);
+  half4 output = 0.0f;
+#endif

+  half4 input[9];
+  if (group == 1) {
+    for (int i = 0; i < input_c; ++i) {
+      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                           in_pos_in_one_block.y);
+      input[0] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[1] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[2] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
+
+      input[3] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
+
+      input[4] = select(
+          read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+          (half4)(0.0f),
+          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                     in_pos_in_one_block.x >= input_width ||
+                     in_pos_in_one_block.y >= input_height)
+                    << 15));
+
+      input[5] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
+
+      input[6] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      input[7] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      input[8] =
+          select(read_imageh(input_image, sampler,
+                             (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                 (half4)(0.0f),
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
+
+      /*
+                  for (int j = 0; j < 9; ++j) {
+                      int2 pos_of_weight;
+                      pos_of_weight.x = i * 3 + j % 3;
+                      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+                      half4 weight_x = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.x += dot(input[j], weight_x);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+                      half4 weight_y = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.y += dot(input[j], weight_y);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+                      half4 weight_z = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.z += dot(input[j], weight_z);
+
+                      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+                      half4 weight_w = read_imageh(filter, sampler,
+         pos_of_weight);
+                      output.w += dot(input[j], weight_w);
+                  }
+      */
+      int j = 0;
+      int2 pos_of_weight;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 1;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 2;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 3;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 4;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 5;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 6;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 7;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+
+      j = 8;
+      pos_of_weight.x = i * 3 + j % 3;
+      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+      weight_x = read_imageh(filter, sampler, pos_of_weight);
+      output.x += dot(input[j], weight_x);
+
+      pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+      weight_y = read_imageh(filter, sampler, pos_of_weight);
+      output.y += dot(input[j], weight_y);
+
+      pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+      weight_z = read_imageh(filter, sampler, pos_of_weight);
+      output.z += dot(input[j], weight_z);
+
+      pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+      weight_w = read_imageh(filter, sampler, pos_of_weight);
+      output.w += dot(input[j], weight_w);
+    }
+  } else {
+    for (int i = 0; i < 4; i++) {
+      int used_input_channel_num =
+          (out_c * 4 + i) / (output_c / group) * filter_channel;
+      for (int f_c = 0; f_c < filter_channel; ++f_c) {
+        int input_c = used_input_channel_num + f_c;
+        int input_block = input_c / 4;
+        int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
+                             in_pos_in_one_block.y);
+        input[0] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                       in_pos_in_one_block.y - dilation < 0 ||
+                       in_pos_in_one_block.x - dilation >= input_width ||
+                       in_pos_in_one_block.y - dilation >= input_height)
+                      << 15));
+        input[1] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x, pos_in.y - dilation)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                              in_pos_in_one_block.y - dilation < 0 ||
+                              in_pos_in_one_block.x >= input_width ||
+                              in_pos_in_one_block.y - dilation >= input_height)
+                             << 15));
+        input[2] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                       in_pos_in_one_block.y - dilation < 0 ||
+                       in_pos_in_one_block.x + dilation >= input_width ||
+                       in_pos_in_one_block.y - dilation >= input_height)
+                      << 15));
+        input[3] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x - dilation, pos_in.y)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                              in_pos_in_one_block.y < 0 ||
+                              in_pos_in_one_block.x - dilation >= input_width ||
+                              in_pos_in_one_block.y >= input_height)
+                             << 15));
+        input[4] = select(
+            read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                       in_pos_in_one_block.x >= input_width ||
+                       in_pos_in_one_block.y >= input_height)
+                      << 15));
+        input[5] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x + dilation, pos_in.y)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                              in_pos_in_one_block.y < 0 ||
+                              in_pos_in_one_block.x + dilation >= input_width ||
+                              in_pos_in_one_block.y >= input_height)
+                             << 15));
+        input[6] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x - dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+        input[7] =
+            select(read_imageh(input_image, sampler,
+                               (int2)(pos_in.x, pos_in.y + dilation)),
+                   (half4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                              in_pos_in_one_block.y + dilation < 0 ||
+                              in_pos_in_one_block.x >= input_width ||
+                              in_pos_in_one_block.y + dilation >= input_height)
+                             << 15));
+        input[8] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+            (half4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x + dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+
+        half tmp_out = 0;
+        for (int j = 0; j < 9; j++) {
+          int2 pos_of_weight;
+          pos_of_weight.x = (f_c / 4) * 3 + j % 3;
+          pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
+          half4 weight = read_imageh(filter, sampler, pos_of_weight);
+          int f_c_offset = f_c % 4;
+          half f_value;
+          if (f_c_offset == 0) {
+            f_value = weight.x;
+          } else if (f_c_offset == 1) {
+            f_value = weight.y;
+          } else if (f_c_offset == 2) {
+            f_value = weight.z;
+          } else if (f_c_offset == 3) {
+            f_value = weight.w;
+          }
+          int input_c_offset = input_c % 4;
+          half input_value;
+          if (input_c_offset == 0) {
+            input_value = input[j].x;
+          } else if (input_c_offset == 1) {
+            input_value = input[j].y;
+          } else if (input_c_offset == 2) {
+            input_value = input[j].z;
+          } else if (input_c_offset == 3) {
+            input_value = input[j].w;
+          }
+          tmp_out += f_value * input_value;
        }
-    } else {
-        for (int i = 0; i < 4; i++) {
-            int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel;
-            for (int f_c = 0; f_c < filter_channel; ++f_c) {
-                int input_c = used_input_channel_num + f_c;
-                int input_block = input_c / 4;
-                int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-                input[0] = select(read_imageh(input_image, sampler,
-                                    (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                                    (half4)(0.0f),
-                                    (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[1] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y - dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[2] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
-                input[3] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x - dilation, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[4] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[5] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-                input[6] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-                input[7] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-                input[8] = select(read_imageh(input_image, sampler,
-                                  (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                                  (half4)(0.0f),
-                                  (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
-
-                half tmp_out = 0;
-                for (int j = 0; j < 9; j++) {
-                    int2 pos_of_weight;
-                    pos_of_weight.x = (f_c / 4) * 3 + j % 3;
-                    pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
-                    half4 weight = read_imageh(filter, sampler, pos_of_weight);
-                    int f_c_offset = f_c % 4;
-                    half f_value;
-                    if (f_c_offset == 0) {
-                        f_value = weight.x;
-                    } else if (f_c_offset == 1) {
-                        f_value = weight.y;
-                    } else if (f_c_offset == 2) {
-                        f_value = weight.z;
-                    } else if (f_c_offset == 3) {
-                        f_value = weight.w;
-                    }
-                    int input_c_offset = input_c % 4;
-                    half input_value;
-                    if (input_c_offset == 0) {
-                        input_value = input[j].x;
-                    } else if (input_c_offset == 1) {
-                        input_value = input[j].y;
-                    } else if (input_c_offset == 2) {
-                        input_value = input[j].z;
-                    } else if (input_c_offset == 3) {
-                        input_value = input[j].w;
-                    }
-                    tmp_out += f_value * input_value;
-                }
-
-                if (i == 0) {
-                    output.x += tmp_out;
-                } else if (i == 1) {
-                    output.y += tmp_out;
-                } else if (i == 2) {
-                    output.z += tmp_out;
-                } else if (i == 3) {
-                    output.w += tmp_out;
-                }
-            }
+
+        if (i == 0) {
+          output.x += tmp_out;
+        } else if (i == 1) {
+          output.y += tmp_out;
+        } else if (i == 2) {
+          output.z += tmp_out;
+        } else if (i == 3) {
+          output.w += tmp_out;
        }
+      }
    }
-
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif

-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }

-   // dilation == 1
-__kernel void conv_3x3spl(__private const int item_ch,
-                               __private const int item_w,
-                               __private const int item_h,
-                               __read_only image2d_t input_image,
-                               __read_only image2d_t filter_image,
+// dilation == 1
+__kernel void conv_3x3spl(
+    __private const int item_ch, __private const int item_w,
+    __private const int item_h, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                               __write_only image2d_t output_image,
-                               __private const int stride,
-                               __private const int pad,
-                               __private const int dilation,
-                               __private const int in_ch,
-                               __private const int in_w,
-                               __private const int in_h,
-                               __private const int out_w,
-                               __private const int out_h) {
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    // item_id
-    const int item_ch_id = get_global_id(0);
-    const int item_w_id = get_global_id(1);
-    const int item_h_id = get_global_id(2);
-
-    // out_width_id_per_blk and out_batch_id
-    int out_batch_id = item_h_id / in_h;
-    int out_w_base_id = item_ch_id * out_w;
-    int out_w_id0 = item_w_id;
-    int out_w_id1 = out_w_id0 + item_w;
-    int out_w_id2 = out_w_id1 + item_w;
-    int out_w_id3 = out_w_id2 + item_w;
-    int out_w_id4 = out_w_id3 + item_w;
-
-    // in_width_id_per_blk and in_height_id_per_batch
-    int in_h_id = (item_h_id % out_h) * stride - pad;
-    int in_w_id0 = item_w_id * stride - pad;
-    int in_w_id1 = in_w_id0 + item_w * stride;
-    int in_w_id2 = in_w_id1 + item_w * stride;
-    int in_w_id3 = in_w_id2 + item_w * stride;
-    int in_w_id4 = in_w_id3 + item_w * stride;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w,
+    __private const int in_h, __private const int out_w,
+    __private const int out_h) {
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  // item_id
+  const int item_ch_id = get_global_id(0);
+  const int item_w_id = get_global_id(1);
+  const int item_h_id = get_global_id(2);
+
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
+  int out_w_base_id = item_ch_id * out_w;
+  int out_w_id0 = item_w_id;
+  int out_w_id1 = out_w_id0 + item_w;
+  int out_w_id2 = out_w_id1 + item_w;
+  int out_w_id3 = out_w_id2 + item_w;
+  int out_w_id4 = out_w_id3 + item_w;
+
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;

 #ifdef BIASE_CH

-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
-    output[1] = output[0];
-    output[2] = output[0];
-    output[3] = output[0];
-    output[4] = output[0];
+  half4 output[5];
+  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+  output[1] = output[0];
+  output[2] = output[0];
+  output[3] = output[0];
+  output[4] = output[0];

 #elif defined(BIASE_ELE)

-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
-    if (out_w_id1 < out_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
-    }
-    if (out_w_id2 < out_w) {
-        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
-    }
-    if (out_w_id3 < out_w) {
-        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
-    }
-    if (out_w_id4 < out_w) {
-        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
-    }
+  half4 output[5];
+  output[0] =
+      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+  if (out_w_id1 < out_w) {
+    output[1] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id1, item_h_id));
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id2, item_h_id));
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id3, item_h_id));
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id4, item_h_id));
+  }
 #else
-    half4 output[5] = {0.0f};
-#endif
-
-    half4 filter[4] = {0.0f};
-    half4 filter_trans[4] = {0.0f};
-    half4 input[5] = {0.0f};
-
-    int filter_h_val0 = item_ch_id * 4 * 3;
-    int filter_h_val1 = filter_h_val0 + 3;
-    int filter_h_val2 = filter_h_val1 + 3;
-    int filter_h_val3 = filter_h_val2 + 3;
-
-    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
-        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
-
-        const int in_w_base_id = mul24(ch, in_w);
-
-        int filter_w_val = ch * 3;
-
-        for (int h = 0; h < 3; h++) {
-
-            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
-                                 (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
-
-            for (int w = 0; w < 3; w++) {
-
-                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
-                                  (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
-                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
-                                   (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
-                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
-                                   (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
-                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
-                                   (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
-                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
-                                   (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
-
-                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
-                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
-                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
-                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
-
-                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
-                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
-                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
-                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
-
-                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
-                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
-                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
-                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
-                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
-
-                output[0] = mad(input[0].x, filter_trans[0], output[0]);
-                output[1] = mad(input[1].x, filter_trans[0], output[1]);
-                output[2] = mad(input[2].x, filter_trans[0], output[2]);
-                output[3] = mad(input[3].x, filter_trans[0], output[3]);
-                output[4] = mad(input[4].x, filter_trans[0], output[4]);
-
-                if (ch_surplus < 3) {
-                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
-                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
-                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
-                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
-                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
-                }
-                if (ch_surplus < 2) {
-                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
-                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
-                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
-                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
-                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
-                }
-                if (ch_surplus < 1) {
-                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
-                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
-                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
-                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
-                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
-                }
-            }
+  half4 output[5] = {0.0f};
+#endif
+
+  half4 filter[4] = {0.0f};
+  half4 filter_trans[4] = {0.0f};
+  half4 input[5] = {0.0f};
+
+  int filter_h_val0 = item_ch_id * 4 * 3;
+  int filter_h_val1 = filter_h_val0 + 3;
+  int filter_h_val2 = filter_h_val1 + 3;
+  int filter_h_val3 = filter_h_val2 + 3;
+
+  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+    const int in_w_base_id = mul24(ch, in_w);
+
+    int filter_w_val = ch * 3;
+
+    for (int h = 0; h < 3; h++) {
+      int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                            (out_batch_id * in_h + in_h_id + h < 0 ||
+                             out_batch_id * in_h + in_h_id + h >= in_h));
+
+      for (int w = 0; w < 3; w++) {
+        int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+        int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+        int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+        int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+        int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+        filter[0] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
+        filter[1] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
+        filter[2] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
+        filter[3] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3
+
+        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
+                                  filter[3].x);  // in_ch:0,out_ch:0-3
+        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
+                                  filter[3].y);  // in_ch:1,out_ch:0-3
+        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
+                                  filter[3].z);  // in_ch:2,out_ch:0-3
+        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
+                                  filter[3].w);  // in_ch:3,out_ch:0-3
+
+        input[0] =
+            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+        input[1] =
+            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+        input[2] =
+            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+        input[3] =
+            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+        input[4] =
+            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+        output[0] = mad(input[0].x, filter_trans[0], output[0]);
+        output[1] = mad(input[1].x, filter_trans[0], output[1]);
+        output[2] = mad(input[2].x, filter_trans[0], output[2]);
+        output[3] = mad(input[3].x, filter_trans[0], output[3]);
+        output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+        if (ch_surplus < 3) {
+          output[0] = mad(input[0].y, filter_trans[1], output[0]);
+          output[1] = mad(input[1].y, filter_trans[1], output[1]);
+          output[2] = mad(input[2].y, filter_trans[1], output[2]);
+          output[3] = mad(input[3].y, filter_trans[1], output[3]);
+          output[4] = mad(input[4].y, filter_trans[1], output[4]);
+        }
+        if (ch_surplus < 2) {
+          output[0] = mad(input[0].z, filter_trans[2], output[0]);
+          output[1] = mad(input[1].z, filter_trans[2], output[1]);
+          output[2] = mad(input[2].z, filter_trans[2], output[2]);
+          output[3] = mad(input[3].z, filter_trans[2], output[3]);
+          output[4] = mad(input[4].z, filter_trans[2], output[4]);
+        }
+        if (ch_surplus < 1) {
+          output[0] = mad(input[0].w, filter_trans[3], output[0]);
+          output[1] = mad(input[1].w, filter_trans[3], output[1]);
+          output[2] = mad(input[2].w, filter_trans[3], output[2]);
+          output[3] = mad(input[3].w, filter_trans[3], output[3]);
+          output[4] = mad(input[4].w, filter_trans[3], output[4]);
        }
+      }
    }
+  }
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (out_w_id1 < out_w) {
-        output[1] =  mad(scale, output[1], biase);
-    }
-    if (out_w_id2 < out_w) {
-        output[2] =  mad(scale, output[2], biase);
-    }
-    if (out_w_id3 < out_w) {
-        output[3] =  mad(scale, output[3], biase);
-    }
-    if (out_w_id4 < out_w) {
-        output[4] =  mad(scale, output[4], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (out_w_id1 < out_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = mad(scale, output[2], biase);
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = mad(scale, output[3], biase);
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = mad(scale, output[4], biase);
+  }
 #endif

 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
-    output[2] = activation(output[2]);
-    output[3] = activation(output[3]);
-    output[4] = activation(output[4]);
-#endif
-    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
-    if (out_w_id1 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
-    }
-    if (out_w_id2 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
-    }
-    if (out_w_id3 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
-    }
-    if (out_w_id4 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
-    }
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
+  output[2] = activation(output[2]);
+  output[3] = activation(output[3]);
+  output[4] = activation(output[4]);
+#endif
+  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
+               output[0]);
+  if (out_w_id1 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
+                 output[1]);
+  }
+  if (out_w_id2 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
+                 output[2]);
+  }
+  if (out_w_id3 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
+                 output[3]);
+  }
+  if (out_w_id4 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
+                 output[4]);
+  }
 }

-
-
-__kernel void depth_conv_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
+__kernel void depth_conv_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height, /* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {

+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);

-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

-    const int batch_index = out_nh / output_height;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-    const int out_nh_in_one_batch = out_nh % output_height;
+  const int batch_index = out_nh / output_height;

+  const int out_nh_in_one_batch = out_nh % output_height;

-    int2 stride_xy = (int2)(stride, stride);
-    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);

-    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
+  half4 output = 0.0f;
 #endif

-    const int filter_width = 3;
-    const int filter_height = 3;
-
-    int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
-
-    int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
-
-    int filter_x = pos_in_filter_block.x ;
-    int filter_y = pos_in_filter_block.y ;
-
-    half4 inputs[9];
-
-        inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
-
-        inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-        /*
-        if (output_pos.x == 112 && output_pos.y == 0) {
-              half4 input1 = inputs[3];
-              float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-              printf(" input4 3 - %v4hlf \n", in);
-              printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
-        }
-        */
-
-
-        inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-        inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
-
-        inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-        inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-        inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
-                           (half4)(0.0f),
-                           (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
-
-    half4 filters[9];
-    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
-    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
-    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
-    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
-    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
-    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
-    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
-    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
-    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
-
-    for(int i = 0 ;i < 9 ; i++){
-     output += inputs[i] * filters[i];
-    }
+  const int filter_width = 3;
+  const int filter_height = 3;
+
+  int2 pos_in_input_block =
+      (int2)(out_c * input_width, batch_index * input_height);
+
+  int2 pos_in_filter_block =
+      (int2)(out_c * filter_width, batch_index * filter_height);
+
+  int filter_x = pos_in_filter_block.x;
+  int filter_y = pos_in_filter_block.y;
+
+  half4 inputs[9];
+
+  inputs[0] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
+                                in_pos_in_one_block.y - 1 < 0 ||
+                                in_pos_in_one_block.x - 1 >= input_width ||
+                                in_pos_in_one_block.y - 1 >= input_height)
+                               << 15));
+
+  inputs[1] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y - 1 >= input_height)
+                << 15));
+
+  inputs[2] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
+                                in_pos_in_one_block.y - 1 < 0 ||
+                                in_pos_in_one_block.x + 1 >= input_width ||
+                                in_pos_in_one_block.y - 1 >= input_height)
+                               << 15));
+
+  inputs[3] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x - 1 >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+  /*
+  if (output_pos.x == 112 && output_pos.y == 0) {
+        half4 input1 = inputs[3];
+        float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+        printf(" input4 3 - %v4hlf \n", in);
+        printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
+  }
+  */
+
+  inputs[4] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+
+  inputs[5] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x + 1 >= input_width ||
+                 in_pos_in_one_block.y >= input_height)
+                << 15));
+
+  inputs[6] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
+                                in_pos_in_one_block.y + 1 < 0 ||
+                                in_pos_in_one_block.x - 1 >= input_width ||
+                                in_pos_in_one_block.y + 1 >= input_height)
+                               << 15));
+
+  inputs[7] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f),
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
+                 in_pos_in_one_block.x >= input_width ||
+                 in_pos_in_one_block.y + 1 >= input_height)
+                << 15));
+
+  inputs[8] = select(
+      read_imageh(input, sampler,
+                  (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                         pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+      (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
+                                in_pos_in_one_block.y + 1 < 0 ||
+                                in_pos_in_one_block.x + 1 >= input_width ||
+                                in_pos_in_one_block.y + 1 >= input_height)
+                               << 15));
+
+  half4 filters[9];
+  filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
+  filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
+  filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
+
+  for (int i = 0; i < 9; i++) {
+    output += inputs[i] * filters[i];
+  }
 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif

+  /*
+  if (output_pos.x == 112 && output_pos.y == 0) {
+      for (int i = 0; i < 9; ++i) {
+          half4 input1 = inputs[i];
+          float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+          printf(" input4 %d - %v4hlf \n", i, in);
+      }
+      float4 out = (float4)(output.x, output.y, output.z, output.w);
+      printf(" depth wise output output4 = %v4hlf \n", out);
+      printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
+      printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
+      printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
+      printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
+  }
+  */

-    /*
-
-    if (output_pos.x == 112 && output_pos.y == 0) {
-
-        for (int i = 0; i < 9; ++i) {
-            half4 input1 = inputs[i];
-            float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-            printf(" input4 %d - %v4hlf \n", i, in);
-        }
-
-        float4 out = (float4)(output.x, output.y, output.z, output.w);
-        printf(" depth wise output output4 = %v4hlf \n", out);
-        printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
-        printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
-        printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
-        printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
-    }
-
-    */
-
-    write_imageh(output_image, output_pos, output);
-
+  write_imageh(output_image, output_pos, output);
 }

-
-
-__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
-                                              __private const int ou_w_blk,
-                                              __private const int ou_nh,
-                                              __read_only image2d_t input,
-                                              __read_only image2d_t filter,
+__kernel void depth_conv_3x3s1(
+    __private const int ou_ch_blk, __private const int ou_w_blk,
+    __private const int ou_nh, __read_only image2d_t input,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int pad,
-                                              __private const int dilation,
-                                              __private const int in_ch,
-                                              __private const int in_w,/* of one block */
-                                              __private const int in_h, /* of one block */
-                                              __private const int ou_w,
-                                              __private const int ou_h) {
-
-    const int ou_ch_blk_id = get_global_id(0);
-    const int ou_w_blk_id = get_global_id(1);
-    const int ou_nh_id = get_global_id(2);
-    const int w_blk_size = 2;
-
-    const int batch_id = ou_nh_id / ou_h;
-    int ou_col_id = ou_w_blk_id * w_blk_size;
-    int ou_row_id = ou_nh_id % ou_h;
-    int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
-
-    // input pos in one block and on batch
-    int col_id = ou_col_id - pad;
-    int row_id = ou_row_id - pad;
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-#ifdef BIASE_CH
-    half4 output[2];
-    output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
-    output[1] = output[0];
-#elif defined(BIASE_ELE)
-    half4 output[2];
-    output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
-    if (ou_col_id + 1 < ou_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
-    }
-#else
-    half4 output[2] = {0.0f};
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w, /* of one block */
+    __private const int in_h,                            /* of one block */
+    __private const int ou_w, __private const int ou_h) {

-    half4 inputs[12];
+  const int ou_ch_blk_id = get_global_id(0);
+  const int ou_w_blk_id = get_global_id(1);
+  const int ou_nh_id = get_global_id(2);
+  const int w_blk_size = 2;

-    int filter_x = ou_ch_blk_id * 3;
-    int filter_y = 0;
-    half4 filters[9];
-    filters[0] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y));
-    filters[1] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
-    filters[2] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
+  const int batch_id = ou_nh_id / ou_h;
+  int ou_col_id = ou_w_blk_id * w_blk_size;
+  int ou_row_id = ou_nh_id % ou_h;
+  int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);

-    int in_x = mad24(ou_ch_blk_id, in_w, col_id);
-    int in_y = mad24(batch_id, in_h, row_id);
+  // input pos in one block and on batch
+  int col_id = ou_col_id - pad;
+  int row_id = ou_row_id - pad;

-    int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
-    int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
-    inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
-    int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
-    inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
-    int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
-    inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
-    int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
-    inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-    output[0] = mad(inputs[0], filters[0], output[0]);
-    output[1] = mad(inputs[1], filters[0], output[1]);
+#ifdef BIASE_CH
+  half4 output[2];
+  output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
+  output[1] = output[0];
+#elif defined(BIASE_ELE)
+  half4 output[2];
+  output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
+  }
+#else
+  half4 output[2] = {0.0f};
+#endif

-    output[0] = mad(inputs[1], filters[1], output[0]);
-    output[1] = mad(inputs[2], filters[1], output[1]);
+  half4 inputs[12];

-    output[0] = mad(inputs[2], filters[2], output[0]);
-    output[1] = mad(inputs[3], filters[2], output[1]);
+  int filter_x = ou_ch_blk_id * 3;
+  int filter_y = 0;
+  half4 filters[9];
+  filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));

+  int in_x = mad24(ou_ch_blk_id, in_w, col_id);
+  int in_y = mad24(batch_id, in_h, row_id);

-    filters[3] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
-    filters[4] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
-    filters[5] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
+  int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
+  int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
+  inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
+  int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
+  inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
+  int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
+  inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
+  int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
+  inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));

+  output[0] = mad(inputs[0], filters[0], output[0]);
+  output[1] = mad(inputs[1], filters[0], output[1]);

-    int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
-    inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
-    inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
-    inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
-    inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
+  output[0] = mad(inputs[1], filters[1], output[0]);
+  output[1] = mad(inputs[2], filters[1], output[1]);

+  output[0] = mad(inputs[2], filters[2], output[0]);
+  output[1] = mad(inputs[3], filters[2], output[1]);

-    output[0] = mad(inputs[4], filters[3], output[0]);
-    output[1] = mad(inputs[5], filters[3], output[1]);
+  filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));

-    output[0] = mad(inputs[5], filters[4], output[0]);
-    output[1] = mad(inputs[6], filters[4], output[1]);
+  int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
+  inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
+  inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
+  inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
+  inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));

-    output[0] = mad(inputs[6], filters[5], output[0]);
-    output[1] = mad(inputs[7], filters[5], output[1]);
+  output[0] = mad(inputs[4], filters[3], output[0]);
+  output[1] = mad(inputs[5], filters[3], output[1]);

+  output[0] = mad(inputs[5], filters[4], output[0]);
+  output[1] = mad(inputs[6], filters[4], output[1]);

-    filters[6] =  read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
-    filters[7] =  read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
-    filters[8] =  read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
+  output[0] = mad(inputs[6], filters[5], output[0]);
+  output[1] = mad(inputs[7], filters[5], output[1]);

-    int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
-    inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
-    inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
-    inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
-    inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
+  filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));

+  int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
+  inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
+  inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
+  inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
+  inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));

-    output[0] = mad(inputs[8], filters[6], output[0]);
-    output[1] = mad(inputs[9], filters[6], output[1]);
+  output[0] = mad(inputs[8], filters[6], output[0]);
+  output[1] = mad(inputs[9], filters[6], output[1]);

-    output[0] = mad(inputs[9], filters[7], output[0]);
-    output[1] = mad(inputs[10], filters[7], output[1]);
+  output[0] = mad(inputs[9], filters[7], output[0]);
+  output[1] = mad(inputs[10], filters[7], output[1]);

-    output[0] = mad(inputs[10], filters[8], output[0]);
-    output[1] = mad(inputs[11], filters[8], output[1]);
+  output[0] = mad(inputs[10], filters[8], output[0]);
+  output[1] = mad(inputs[11], filters[8], output[1]);
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (ou_col_id + 1 < ou_w) {
-        output[1] = mad(scale, output[1], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
 #endif

 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
 #endif

-    write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
-    if (ou_col_id + 1 < ou_w) {
-        write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
-    }
-
+  write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
+  if (ou_col_id + 1 < ou_w) {
+    write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
+  }
 }

-__kernel void conv_1x1(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input_image,
-                       __read_only image2d_t filter,
+__kernel void conv_1x1(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                       __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                       __read_only image2d_t new_scale,
-                       __read_only image2d_t new_biase,
-#endif
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int dilation,
-                       __private const int input_width,/* of one block */
-                       __private const int input_height,/* of one block */
-                       __private const int output_width,
-                       __private const int output_height) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const uint kernelHXW = 1;
  int2 stride_xy = (int2)(stride, stride);
  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
+  half4 output = 0.0f;
 #endif

-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        half4 input = read_imageh(input_image, sampler, pos_in);
-
-        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-/*
-        output.x = dot(input, weight0);
-        output.y = dot(input, weight1);
-        output.z = dot(input, weight2);
-        output.w = dot(input, weight3);
-*/
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    half4 input = read_imageh(input_image, sampler, pos_in);

-        output = mad(input.x, weight0, output);
-        output = mad(input.y, weight1, output);
-        output = mad(input.z, weight2, output);
-        output = mad(input.w, weight3, output);
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    /*
+            output.x = dot(input, weight0);
+            output.y = dot(input, weight1);
+            output.z = dot(input, weight2);
+            output.w = dot(input, weight3);
+    */

-   }
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple(
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
    __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,__private const int input_c_origin,
-    __private const int dilation,
+    __private const int offset, __private const int input_c,
+    __private const int input_c_origin, __private const int dilation,
    __private const int input_width,  /* of one block */
    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-) {
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {
  half zero = 0.0f;
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple(
  int out_w2 = out_w + global_size_dim1 * 2;
  int out_w3 = out_w + global_size_dim1 * 3;

-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple(

 #ifdef BIASE_CH
  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = output0;
-    half4 output2 = output0;
-    half4 output3 = output0;
+  half4 output1 = output0;
+  half4 output2 = output0;
+  half4 output3 = output0;
 #elif defined(BIASE_ELE)
  half4 output0 = read_imageh(bias, sampler, output_pos0);
-    half4 output1 = output0;
-    half4 output2 = output0;
-    half4 output3 = output0;
+  half4 output1 = output0;
+  half4 output2 = output0;
+  half4 output3 = output0;

 #else
  half4 output0 = 0.0f;
@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple(

  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
    half4 input0 = read_imageh(input_image, sampler, pos_in);

    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple(
    output0 = mad(input0.z, weight2, output0);
    output0 = mad(input0.w, weight3, output0);
    // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
    half4 input1 = read_imageh(input_image, sampler, pos_in);

    output1 = mad(input1.x, weight0, output1);
@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple(
    output1 = mad(input1.w, weight3, output1);

    // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
    half4 input2 = read_imageh(input_image, sampler, pos_in);

    output2 = mad(input2.x, weight0, output2);
@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple(
    output2 = mad(input2.w, weight3, output2);

    // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
    half4 input3 = read_imageh(input_image, sampler, pos_in);

    output3 = mad(input3.x, weight0, output3);
@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple(

 #ifdef BATCH_NORM
  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
  output0 = activation(output0);
-     output1 = activation(output1);
-     output2 = activation(output2);
-     output3 = activation(output3);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif

  if (out_w0 < old_w) {
    write_imageh(output_image, output_pos0, output0);
  }

-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
    write_imageh(output_image, output_pos1, output1);
  }

-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
    write_imageh(output_image, output_pos2, output2);
  }

-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
    write_imageh(output_image, output_pos3, output3);
  }
 }
@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped(
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
 #endif
    __write_only image2d_t output_image, __private const int stride,
-    __private const int offset, __private const int input_c,__private const int input_c_origin,
-    __private const int dilation,
+    __private const int offset, __private const int input_c,
+    __private const int input_c_origin, __private const int dilation,
    __private const int input_width,  /* of one block */
    __private const int input_height, /* of one block */
-    __private const int output_width,
-    __private const int output_height,
-    __private const int old_w
-    ) {
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {

  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped(
  int out_w2 = out_w + global_size_dim1 * 2;
  int out_w3 = out_w + global_size_dim1 * 3;

-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped(
      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);

 #ifdef BIASE_CH
-    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output0 = read_imageh(bias, sampler, output_pos0);
-    half4 output1 = read_imageh(bias, sampler, output_pos1);
-    half4 output2 = read_imageh(bias, sampler, output_pos2);
-    half4 output3 = read_imageh(bias, sampler, output_pos3);
+  half4 output0 = read_imageh(bias, sampler, output_pos0);
+  half4 output1 = read_imageh(bias, sampler, output_pos1);
+  half4 output2 = read_imageh(bias, sampler, output_pos2);
+  half4 output3 = read_imageh(bias, sampler, output_pos3);

 #else
  half4 output0 = 0.0f;
@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped(
  int burndary_index = input_c * 4 - input_c_origin;
  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
    half4 input0 = read_imageh(input_image, sampler, pos_in);

    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped(
    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));

-    if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){
-      if (burndary_index==0){
+    if ((max_w_bound - pos_in.x - 1) < input_width &&
+        (max_w_bound - pos_in.x - 1) >= 0) {
+      if (burndary_index == 0) {
        output0 = mad(input0.x, weight0, output0);
        output0 = mad(input0.y, weight1, output0);
        output0 = mad(input0.z, weight2, output0);
        output0 = mad(input0.w, weight3, output0);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
        output0 = mad(input0.x, weight0, output0);
        output0 = mad(input0.y, weight1, output0);
        output0 = mad(input0.z, weight2, output0);
        output0 = mad(0.0f, weight3, output0);

-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
        output0 = mad(input0.x, weight0, output0);
        output0 = mad(input0.y, weight1, output0);
        output0 = mad(0.0f, weight2, output0);
        output0 = mad(0.0f, weight3, output0);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
        output0 = mad(input0.x, weight0, output0);
        output0 = mad(0.0f, weight1, output0);
        output0 = mad(0.0f, weight2, output0);
        output0 = mad(0.0f, weight3, output0);
      }
-    }else {
+    } else {
      output0 = mad(input0.x, weight0, output0);
      output0 = mad(input0.y, weight1, output0);
      output0 = mad(input0.z, weight2, output0);
@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped(
    }

    // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
    half4 input1 = read_imageh(input_image, sampler, pos_in);

-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
        output1 = mad(input1.x, weight0, output1);
        output1 = mad(input1.y, weight1, output1);
        output1 = mad(input1.z, weight2, output1);
        output1 = mad(input1.w, weight3, output1);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
        output1 = mad(input1.x, weight0, output1);
        output1 = mad(input1.y, weight1, output1);
        output1 = mad(input1.z, weight2, output1);
        output1 = mad(0.0f, weight3, output1);

-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
        output1 = mad(input1.x, weight0, output1);
        output1 = mad(input1.y, weight1, output1);
        output1 = mad(0.0f, weight2, output1);
        output1 = mad(0.0f, weight3, output1);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
        output1 = mad(input1.x, weight0, output1);
        output1 = mad(0.0f, weight1, output1);
        output1 = mad(0.0f, weight2, output1);
        output1 = mad(0.0f, weight3, output1);
      }
-    }else {
+    } else {
      output1 = mad(input1.x, weight0, output1);
      output1 = mad(input1.y, weight1, output1);
      output1 = mad(input1.z, weight2, output1);
@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped(
    }

    // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
    half4 input2 = read_imageh(input_image, sampler, pos_in);

-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
        output2 = mad(input2.x, weight0, output2);
        output2 = mad(input2.y, weight1, output2);
        output2 = mad(input2.z, weight2, output2);
        output2 = mad(input2.w, weight3, output2);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
        output2 = mad(input2.x, weight0, output2);
        output2 = mad(input2.y, weight1, output2);
        output2 = mad(input2.z, weight2, output2);
        output2 = mad(0.0f, weight3, output2);

-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
        output2 = mad(input2.x, weight0, output2);
        output2 = mad(input2.y, weight1, output2);
        output2 = mad(0.0f, weight2, output2);
        output2 = mad(0.0f, weight3, output2);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
        output2 = mad(input2.x, weight0, output2);
        output2 = mad(0.0f, weight1, output2);
        output2 = mad(0.0f, weight2, output2);
        output2 = mad(0.0f, weight3, output2);
      }
-    }else {
+    } else {
      output2 = mad(input2.x, weight0, output2);
      output2 = mad(input2.y, weight1, output2);
      output2 = mad(input2.z, weight2, output2);
@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped(
    }

    // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
    half4 input3 = read_imageh(input_image, sampler, pos_in);

-    if (abs(max_w_bound - pos_in.x) < input_width){
-      if (burndary_index==0){
+    if (abs(max_w_bound - pos_in.x) < input_width) {
+      if (burndary_index == 0) {
        output3 = mad(input3.x, weight0, output3);
        output3 = mad(input3.y, weight1, output3);
        output3 = mad(input3.z, weight2, output3);
        output3 = mad(input3.w, weight3, output3);
-      } else if (burndary_index==1){
+      } else if (burndary_index == 1) {
        output3 = mad(input3.x, weight0, output3);
        output3 = mad(input3.y, weight1, output3);
        output3 = mad(input3.z, weight2, output3);
        output3 = mad(0.0f, weight3, output3);

-      } else if (burndary_index==2){
+      } else if (burndary_index == 2) {
        output3 = mad(input3.x, weight0, output3);
        output3 = mad(input3.y, weight1, output3);
        output3 = mad(0.0f, weight2, output3);
        output3 = mad(0.0f, weight3, output3);
-      } else if (burndary_index==3){
+      } else if (burndary_index == 3) {
        output3 = mad(input3.x, weight0, output3);
        output3 = mad(0.0f, weight1, output3);
        output3 = mad(0.0f, weight2, output3);
        output3 = mad(0.0f, weight3, output3);
      }
-    }else {
+    } else {
      output3 = mad(input3.x, weight0, output3);
      output3 = mad(input3.y, weight1, output3);
      output3 = mad(input3.z, weight2, output3);
@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped(
  }

 #ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
-     output0 = activation(output0);
-     output1 = activation(output1);
-     output2 = activation(output2);
-     output3 = activation(output3);
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
 #endif

  if (out_w0 < old_w) {
    write_imageh(output_image, output_pos0, output0);
  }

-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
    write_imageh(output_image, output_pos1, output1);
  }

-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
    write_imageh(output_image, output_pos2, output2);
  }

-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
    write_imageh(output_image, output_pos3, output3);
  }
 }

-__kernel void conv_7x7(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_7x7(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif

 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const int filter_n0 = 4 * out_c + 0;
-    const int filter_n1 = 4 * out_c + 1;
-    const int filter_n2 = 4 * out_c + 2;
-    const int filter_n3 = 4 * out_c + 3;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;

-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;

+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;

-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-   half4 input;
-   half4 filter[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-   int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int j = 0; j < 7; j++){
-         for(int k = 0; k < 7; k++){
-          input  =  select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x + (j - 3) * dilation, pos_in.y +  (k - 3) * dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
-         int filter_h = k;
-         int filter_w = j;
-         int filter_c = i;
-
-         filter_pos0.x = filter_c * 7 + filter_w;
-         filter_pos0.y = filter_n0 * 7 + filter_h;
-
-         filter_pos1.x = filter_c * 7 + filter_w;
-         filter_pos1.y = filter_n1 * 7 + filter_h;
-
-         filter_pos2.x = filter_c * 7 + filter_w;
-         filter_pos2.y = filter_n2 * 7 + filter_h;
-
-         filter_pos3.x = filter_c * 7 + filter_w;
-         filter_pos3.y = filter_n3 * 7 + filter_h;
-
-         filter[0] =  read_imageh(filter_image, sampler, filter_pos0);
-         filter[1] =  read_imageh(filter_image, sampler, filter_pos1);
-         filter[2] =  read_imageh(filter_image, sampler, filter_pos2);
-         filter[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-         output.x += dot(input, filter[0]);
-         output.y += dot(input, filter[1]);
-         output.z += dot(input, filter[2]);
-         output.w += dot(input, filter[3]);
-         }
-        }
+  half4 output = 0.0f;
+#endif
+
+  half4 input;
+  half4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int j = 0; j < 7; j++) {
+      for (int k = 0; k < 7; k++) {
+        input = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 3) * dilation,
+                               pos_in.y + (k - 3) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 7 + filter_w;
+        filter_pos0.y = filter_n0 * 7 + filter_h;
+
+        filter_pos1.x = filter_c * 7 + filter_w;
+        filter_pos1.y = filter_n1 * 7 + filter_h;
+
+        filter_pos2.x = filter_c * 7 + filter_w;
+        filter_pos2.y = filter_n2 * 7 + filter_h;
+
+        filter_pos3.x = filter_c * 7 + filter_w;
+        filter_pos3.y = filter_n3 * 7 + filter_h;
+
+        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
+        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
+        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
+        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+      }
    }
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif

-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }

-__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_7x7Pt1x2(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif

 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w1 = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    if (out_c >= global_size_dim0 ||
-        out_w1 >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const int out_w = out_w1 * 2;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w1 = get_global_id(1);
+  const int out_nh = get_global_id(2);

-    int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
+  if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const int out_w = out_w1 * 2;

-    const int filter_n0 = 4 * out_c + 0;
-    const int filter_n1 = 4 * out_c + 1;
-    const int filter_n2 = 4 * out_c + 2;
-    const int filter_n3 = 4 * out_c + 3;
+  int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);

-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int filter_n0 = 4 * out_c + 0;
+  const int filter_n1 = 4 * out_c + 1;
+  const int filter_n2 = 4 * out_c + 2;
+  const int filter_n3 = 4 * out_c + 3;

-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;

+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;

-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

-    half4 output0 = 0.0f;
-    half4 output1 = 0.0f;
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
 #ifdef BIASE_CH
-    output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    output1 = output0;
+  output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  output1 = output0;
 #elif defined(BIASE_ELE)
-    output0 = read_imageh(bias, sampler, output_pos);
-    output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
+  output0 = read_imageh(bias, sampler, output_pos);
+  output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
 #else
-    output0 = 0.0f;
-    output1 = 0.0f;
-#endif
-
-   half4 input[8];
-   half4 filter0[4];
-   half4 filter1[4];
-   half4 filter2[4];
-   half4 filter3[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int k = 0; k < 7; k++){
-         for (int j = 0; j < 8; j++) {
-             input[j]  =  select(read_imageh(input_image, sampler,
-                                                                  (int2)(pos_in.x + (j - 3) * dilation, pos_in.y +  (k - 3) * dilation)),
-                                                                  (half4)(0.0f),
-                                                                  (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
-
-             int filter_h = k;
-             int filter_w = j;
-             int filter_c = i;
-
-             if (j < 7) {
-                filter_pos0.x = filter_c * 7 + filter_w;
-                filter_pos0.y = filter_n0 * 7 + filter_h;
-
-                filter_pos1.x = filter_c * 7 + filter_w;
-                filter_pos1.y = filter_n1 * 7 + filter_h;
-
-                filter_pos2.x = filter_c * 7 + filter_w;
-                filter_pos2.y = filter_n2 * 7 + filter_h;
-
-                filter_pos3.x = filter_c * 7 + filter_w;
-                filter_pos3.y = filter_n3 * 7 + filter_h;
-
-                filter0[0] =  read_imageh(filter_image, sampler, filter_pos0);
-                filter0[1] =  read_imageh(filter_image, sampler, filter_pos1);
-                filter0[2] =  read_imageh(filter_image, sampler, filter_pos2);
-                filter0[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-                output0.x += dot(input[j], filter0[0]);
-                output0.y += dot(input[j], filter0[1]);
-                output0.z += dot(input[j], filter0[2]);
-                output0.w += dot(input[j], filter0[3]);
-             }
-
-             if (j > 0) {
-               output1.x += dot(input[j], filter1[0]);
-               output1.y += dot(input[j], filter1[1]);
-               output1.z += dot(input[j], filter1[2]);
-               output1.w += dot(input[j], filter1[3]);
-             }
-
-             filter1[0] = filter0[0];
-             filter1[1] = filter0[1];
-             filter1[2] = filter0[2];
-             filter1[3] = filter0[3];
-         }
+  output0 = 0.0f;
+  output1 = 0.0f;
+#endif
+
+  half4 input[8];
+  half4 filter0[4];
+  half4 filter1[4];
+  half4 filter2[4];
+  half4 filter3[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int k = 0; k < 7; k++) {
+      for (int j = 0; j < 8; j++) {
+        input[j] = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 3) * dilation,
+                               pos_in.y + (k - 3) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
+                << 15));
+
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        if (j < 7) {
+          filter_pos0.x = filter_c * 7 + filter_w;
+          filter_pos0.y = filter_n0 * 7 + filter_h;
+
+          filter_pos1.x = filter_c * 7 + filter_w;
+          filter_pos1.y = filter_n1 * 7 + filter_h;
+
+          filter_pos2.x = filter_c * 7 + filter_w;
+          filter_pos2.y = filter_n2 * 7 + filter_h;
+
+          filter_pos3.x = filter_c * 7 + filter_w;
+          filter_pos3.y = filter_n3 * 7 + filter_h;
+
+          filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
+          filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
+          filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
+          filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+          output0.x += dot(input[j], filter0[0]);
+          output0.y += dot(input[j], filter0[1]);
+          output0.z += dot(input[j], filter0[2]);
+          output0.w += dot(input[j], filter0[3]);
+        }
+
+        if (j > 0) {
+          output1.x += dot(input[j], filter1[0]);
+          output1.y += dot(input[j], filter1[1]);
+          output1.z += dot(input[j], filter1[2]);
+          output1.w += dot(input[j], filter1[3]);
        }
-   }
+
+        filter1[0] = filter0[0];
+        filter1[1] = filter0[1];
+        filter1[2] = filter0[2];
+        filter1[3] = filter0[3];
+      }
+    }
+  }

 #ifdef BATCH_NORM
-    half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
-    half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
-    output0 = output0 * s + b;
-    output1 = output1 * s + b;
+  half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
+  half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * s + b;
+  output1 = output1 * s + b;
 #endif

 #ifdef RELU
-    output0 = activation(output0);
-    output1 = activation(output1);
+  output0 = activation(output0);
+  output1 = activation(output1);
 #endif
-    write_imageh(output_image, output_pos, output0);
-    if ((output_pos.x + 1) % output_width != 0) {
-      write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
-    }
+  write_imageh(output_image, output_pos, output0);
+  if ((output_pos.x + 1) % output_width != 0) {
+    write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
+  }
 }

 // dilation == 1
-__kernel void conv_7x7spl(__private const int item_ch,
-                          __private const int item_w,
-                          __private const int item_h,
-                          __read_only image2d_t input_image,
-                          __read_only image2d_t filter_image,
+__kernel void conv_7x7spl(
+    __private const int item_ch, __private const int item_w,
+    __private const int item_h, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-                          __write_only image2d_t output_image,
-                          __private const int stride,
-                          __private const int pad,
-                          __private const int dilation,
-                          __private const int in_ch,
-                          __private const int in_w,
-                          __private const int in_h,
-                          __private const int out_w,
-                          __private const int out_h) {
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-    // filter
-    const int filter_w = 7;
-    const int filter_h = 7;
-
-    // item_id
-    const int item_ch_id = get_global_id(0);
-    const int item_w_id = get_global_id(1);
-    const int item_h_id = get_global_id(2);
-
-    // out_width_id_per_blk and out_batch_id
-    int out_batch_id = item_h_id / in_h;
-    int out_w_base_id = item_ch_id * out_w;
-    int out_w_id0 = item_w_id;
-    int out_w_id1 = out_w_id0 + item_w;
-    int out_w_id2 = out_w_id1 + item_w;
-    int out_w_id3 = out_w_id2 + item_w;
-    int out_w_id4 = out_w_id3 + item_w;
-
-    // in_width_id_per_blk and in_height_id_per_batch
-    int in_h_id = (item_h_id % out_h) * stride - pad;
-    int in_w_id0 = item_w_id * stride - pad;
-    int in_w_id1 = in_w_id0 + item_w * stride;
-    int in_w_id2 = in_w_id1 + item_w * stride;
-    int in_w_id3 = in_w_id2 + item_w * stride;
-    int in_w_id4 = in_w_id3 + item_w * stride;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int pad, __private const int dilation,
+    __private const int in_ch, __private const int in_w,
+    __private const int in_h, __private const int out_w,
+    __private const int out_h) {
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  // filter
+  const int filter_w = 7;
+  const int filter_h = 7;
+
+  // item_id
+  const int item_ch_id = get_global_id(0);
+  const int item_w_id = get_global_id(1);
+  const int item_h_id = get_global_id(2);
+
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
+  int out_w_base_id = item_ch_id * out_w;
+  int out_w_id0 = item_w_id;
+  int out_w_id1 = out_w_id0 + item_w;
+  int out_w_id2 = out_w_id1 + item_w;
+  int out_w_id3 = out_w_id2 + item_w;
+  int out_w_id4 = out_w_id3 + item_w;
+
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;

 #ifdef BIASE_CH

-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
-    output[1] = output[0];
-    output[2] = output[0];
-    output[3] = output[0];
-    output[4] = output[0];
+  half4 output[5];
+  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+  output[1] = output[0];
+  output[2] = output[0];
+  output[3] = output[0];
+  output[4] = output[0];

 #elif defined(BIASE_ELE)

-    half4 output[5];
-    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
-    if (out_w_id1 < out_w) {
-        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
-    }
-    if (out_w_id2 < out_w) {
-        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
-    }
-    if (out_w_id3 < out_w) {
-        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
-    }
-    if (out_w_id4 < out_w) {
-        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
-    }
+  half4 output[5];
+  output[0] =
+      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+  if (out_w_id1 < out_w) {
+    output[1] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id1, item_h_id));
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id2, item_h_id));
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id3, item_h_id));
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = read_imageh(bias, sampler,
+                            (int2)(out_w_base_id + out_w_id4, item_h_id));
+  }
 #else
-    half4 output[5] = {0.0f};
-#endif
-
-    half4 filter[4] = {0.0f};
-    half4 filter_trans[4] = {0.0f};
-    half4 input[5] = {0.0f};
-
-    int filter_h_val0 = item_ch_id * 4 * filter_h;
-    int filter_h_val1 = filter_h_val0 + filter_h;
-    int filter_h_val2 = filter_h_val1 + filter_h;
-    int filter_h_val3 = filter_h_val2 + filter_h;
-
-    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
-        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
-
-        const int in_w_base_id = mul24(ch, in_w);
-
-        int filter_w_val = ch * filter_w;
-
-        for (int h = 0; h < filter_h; h++) {
-
-            int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
-                                  (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
-
-            for (int w = 0; w < filter_w; w++) {
-
-                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
-                                       (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
-                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
-                                       (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
-                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
-                                       (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
-                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
-                                       (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
-                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
-                                       (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
-
-                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
-                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
-                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
-                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
-
-                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
-                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
-                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
-                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
-
-                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
-                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
-                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
-                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
-                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
-
-                output[0] = mad(input[0].x, filter_trans[0], output[0]);
-                output[1] = mad(input[1].x, filter_trans[0], output[1]);
-                output[2] = mad(input[2].x, filter_trans[0], output[2]);
-                output[3] = mad(input[3].x, filter_trans[0], output[3]);
-                output[4] = mad(input[4].x, filter_trans[0], output[4]);
-
-                if (ch_surplus < 3) {
-                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
-                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
-                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
-                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
-                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
-                }
-                if (ch_surplus < 2) {
-                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
-                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
-                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
-                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
-                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
-                }
-                if (ch_surplus < 1) {
-                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
-                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
-                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
-                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
-                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
-                }
-            }
+  half4 output[5] = {0.0f};
+#endif
+
+  half4 filter[4] = {0.0f};
+  half4 filter_trans[4] = {0.0f};
+  half4 input[5] = {0.0f};
+
+  int filter_h_val0 = item_ch_id * 4 * filter_h;
+  int filter_h_val1 = filter_h_val0 + filter_h;
+  int filter_h_val2 = filter_h_val1 + filter_h;
+  int filter_h_val3 = filter_h_val2 + filter_h;
+
+  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+    const int in_w_base_id = mul24(ch, in_w);
+
+    int filter_w_val = ch * filter_w;
+
+    for (int h = 0; h < filter_h; h++) {
+      int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                            (out_batch_id * in_h + in_h_id + h < 0 ||
+                             out_batch_id * in_h + in_h_id + h >= in_h));
+
+      for (int w = 0; w < filter_w; w++) {
+        int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+        int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+        int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+        int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+        int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+        filter[0] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
+        filter[1] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
+        filter[2] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
+        filter[3] = read_imageh(
+            filter_image, sampler,
+            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3
+
+        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
+                                  filter[3].x);  // in_ch:0,out_ch:0-3
+        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
+                                  filter[3].y);  // in_ch:1,out_ch:0-3
+        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
+                                  filter[3].z);  // in_ch:2,out_ch:0-3
+        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
+                                  filter[3].w);  // in_ch:3,out_ch:0-3
+
+        input[0] =
+            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+        input[1] =
+            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+        input[2] =
+            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+        input[3] =
+            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+        input[4] =
+            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+        output[0] = mad(input[0].x, filter_trans[0], output[0]);
+        output[1] = mad(input[1].x, filter_trans[0], output[1]);
+        output[2] = mad(input[2].x, filter_trans[0], output[2]);
+        output[3] = mad(input[3].x, filter_trans[0], output[3]);
+        output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+        if (ch_surplus < 3) {
+          output[0] = mad(input[0].y, filter_trans[1], output[0]);
+          output[1] = mad(input[1].y, filter_trans[1], output[1]);
+          output[2] = mad(input[2].y, filter_trans[1], output[2]);
+          output[3] = mad(input[3].y, filter_trans[1], output[3]);
+          output[4] = mad(input[4].y, filter_trans[1], output[4]);
        }
+        if (ch_surplus < 2) {
+          output[0] = mad(input[0].z, filter_trans[2], output[0]);
+          output[1] = mad(input[1].z, filter_trans[2], output[1]);
+          output[2] = mad(input[2].z, filter_trans[2], output[2]);
+          output[3] = mad(input[3].z, filter_trans[2], output[3]);
+          output[4] = mad(input[4].z, filter_trans[2], output[4]);
+        }
+        if (ch_surplus < 1) {
+          output[0] = mad(input[0].w, filter_trans[3], output[0]);
+          output[1] = mad(input[1].w, filter_trans[3], output[1]);
+          output[2] = mad(input[2].w, filter_trans[3], output[2]);
+          output[3] = mad(input[3].w, filter_trans[3], output[3]);
+          output[4] = mad(input[4].w, filter_trans[3], output[4]);
+        }
+      }
    }
+  }
 #ifdef BATCH_NORM
-    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
-    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
-    output[0] = mad(scale, output[0], biase);
-    if (out_w_id1 < out_w) {
-        output[1] =  mad(scale, output[1], biase);
-    }
-    if (out_w_id2 < out_w) {
-        output[2] =  mad(scale, output[2], biase);
-    }
-    if (out_w_id3 < out_w) {
-        output[3] =  mad(scale, output[3], biase);
-    }
-    if (out_w_id4 < out_w) {
-        output[4] =  mad(scale, output[4], biase);
-    }
+  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (out_w_id1 < out_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
+  if (out_w_id2 < out_w) {
+    output[2] = mad(scale, output[2], biase);
+  }
+  if (out_w_id3 < out_w) {
+    output[3] = mad(scale, output[3], biase);
+  }
+  if (out_w_id4 < out_w) {
+    output[4] = mad(scale, output[4], biase);
+  }
 #endif

 #ifdef RELU
-    output[0] = activation(output[0]);
-    output[1] = activation(output[1]);
-    output[2] = activation(output[2]);
-    output[3] = activation(output[3]);
-    output[4] = activation(output[4]);
-#endif
-    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
-    if (out_w_id1 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
-    }
-    if (out_w_id2 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
-    }
-    if (out_w_id3 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
-    }
-    if (out_w_id4 < out_w) {
-        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
-    }
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
+  output[2] = activation(output[2]);
+  output[3] = activation(output[3]);
+  output[4] = activation(output[4]);
+#endif
+  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
+               output[0]);
+  if (out_w_id1 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
+                 output[1]);
+  }
+  if (out_w_id2 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
+                 output[2]);
+  }
+  if (out_w_id3 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
+                 output[3]);
+  }
+  if (out_w_id4 < out_w) {
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
+                 output[4]);
+  }
 }

-__kernel void conv_5x5(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter_image,
-
+__kernel void conv_5x5(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter_image,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif

 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-    const filter_n0 = 4 * out_c + 0;
-    const filter_n1 = 4 * out_c + 1;
-    const filter_n2 = 4 * out_c + 2;
-    const filter_n3 = 4 * out_c + 3;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {

-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+  const filter_n0 = 4 * out_c + 0;
+  const filter_n1 = 4 * out_c + 1;
+  const filter_n2 = 4 * out_c + 2;
+  const filter_n3 = 4 * out_c + 3;

-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;

+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;

-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

 #ifdef BIASE_CH
-    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    half4 output = read_imageh(bias, sampler, output_pos);
+  half4 output = read_imageh(bias, sampler, output_pos);
 #else
-    half4 output = 0.0f;
-#endif
-
-   half4 input;
-   half4 filter[4];
-   int2 filter_pos0;
-   int2 filter_pos1;
-   int2 filter_pos2;
-   int2 filter_pos3;
-   for (int i = 0; i < input_c; ++i) {
-   int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        for(int j = 0; j < 5; j++){
-         for(int k = 0; k < 5; k++){
-          input  =  select(read_imageh(input_image, sampler,
-                                (int2)(pos_in.x + (j - 2) * dilation, pos_in.y +  (k - 2) * dilation)),
-                                (half4)(0.0f),
-                                (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15));
-         int filter_h = k;
-         int filter_w = j;
-         int filter_c = i;
-
-         filter_pos0.x = filter_c * 5 + filter_w;
-         filter_pos0.y = filter_n0 * 5 + filter_h;
-
-         filter_pos1.x = filter_c * 5 + filter_w;
-         filter_pos1.y = filter_n1 * 5 + filter_h;
-
-         filter_pos2.x = filter_c * 5 + filter_w;
-         filter_pos2.y = filter_n2 * 5 + filter_h;
-
-         filter_pos3.x = filter_c * 5 + filter_w;
-         filter_pos3.y = filter_n3 * 5 + filter_h;
-
-         filter[0] =  read_imageh(filter_image, sampler, filter_pos0);
-         filter[1] =  read_imageh(filter_image, sampler, filter_pos1);
-         filter[2] =  read_imageh(filter_image, sampler, filter_pos2);
-         filter[3] =  read_imageh(filter_image, sampler, filter_pos3);
-
-         output.x += dot(input, filter[0]);
-         output.y += dot(input, filter[1]);
-         output.z += dot(input, filter[2]);
-         output.w += dot(input, filter[3]);
-         }
-        }
+  half4 output = 0.0f;
+#endif
+
+  half4 input;
+  half4 filter[4];
+  int2 filter_pos0;
+  int2 filter_pos1;
+  int2 filter_pos2;
+  int2 filter_pos3;
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    for (int j = 0; j < 5; j++) {
+      for (int k = 0; k < 5; k++) {
+        input = select(
+            read_imageh(input_image, sampler,
+                        (int2)(pos_in.x + (j - 2) * dilation,
+                               pos_in.y + (k - 2) * dilation)),
+            (half4)(0.0f),
+            (ushort4)(
+                (in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
+                 in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
+                 in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
+                << 15));
+        int filter_h = k;
+        int filter_w = j;
+        int filter_c = i;
+
+        filter_pos0.x = filter_c * 5 + filter_w;
+        filter_pos0.y = filter_n0 * 5 + filter_h;
+
+        filter_pos1.x = filter_c * 5 + filter_w;
+        filter_pos1.y = filter_n1 * 5 + filter_h;
+
+        filter_pos2.x = filter_c * 5 + filter_w;
+        filter_pos2.y = filter_n2 * 5 + filter_h;
+
+        filter_pos3.x = filter_c * 5 + filter_w;
+        filter_pos3.y = filter_n3 * 5 + filter_h;
+
+        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
+        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
+        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
+        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
+
+        output.x += dot(input, filter[0]);
+        output.y += dot(input, filter[1]);
+        output.z += dot(input, filter[2]);
+        output.w += dot(input, filter[3]);
+      }
    }
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif

-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }

-__kernel void convBNAdd_3x3(__private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-                                              __private const int global_size_dim2,
-                                              __read_only image2d_t input_image,
-                                              __read_only image2d_t filter,
-
+__kernel void convBNAdd_3x3(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                                              __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif

 #ifdef BATCH_NORM
-                                              __read_only image2d_t new_scale,
-                                              __read_only image2d_t new_biase,
-#endif
-
-                                              __write_only image2d_t output_image,
-                                              __private const int stride,
-                                              __private const int offset,
-                                              __private const int input_c,
-                                              __private const int dilation,
-                                              __private const int input_width,/* of one block */
-                                              __private const int input_height,/* of one block */
-                                              __private const int output_width,
-                                              __private const int output_height) {
-
-    const int out_c = get_global_id(0);
-    const int out_w = get_global_id(1);
-    const int out_nh = get_global_id(2);
-
-    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-
-    if (out_c >= global_size_dim0 ||
-        out_w >= global_size_dim1 ||
-        out_nh >= global_size_dim2) {
-        return;
-    }
-
-
-    int2 stride_xy;
-    stride_xy.x = stride;
-    stride_xy.y = stride;
-
-    int2 ouput_pos_in_one_block;
-    ouput_pos_in_one_block.x = out_w;
-    ouput_pos_in_one_block.y = out_nh;
-
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP          |
-                              CLK_FILTER_NEAREST;
-
-    int2 in_pos_in_one_block;
-    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
-    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif

+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {

-    half4 output = (half4)0.0f;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);

-   half4 input[9];
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        input[0] = select(read_imageh(input_image, sampler,
-                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                            (half4)(0.0f),
-                            (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }

-        input[1] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y - dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;

-        input[2] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;

-        input[3] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x - dilation, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-        input[4] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

-        input[5] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  half4 output = (half4)0.0f;

-        input[6] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+  half4 input[9];

-        input[7] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    input[0] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[1] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[2] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y - dilation < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y - dilation >= input_height)
+                         << 15));
+
+    input[3] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y >= input_height)
+                         << 15));
+
+    input[4] = select(
+        read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+        (half4)(0.0f),
+        (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                   in_pos_in_one_block.x >= input_width ||
+                   in_pos_in_one_block.y >= input_height)
+                  << 15));
+
+    input[5] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y >= input_height)
+                         << 15));
+
+    input[6] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x - dilation >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));
+
+    input[7] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));
+
+    input[8] =
+        select(read_imageh(input_image, sampler,
+                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+               (half4)(0.0f),
+               (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                          in_pos_in_one_block.y + dilation < 0 ||
+                          in_pos_in_one_block.x + dilation >= input_width ||
+                          in_pos_in_one_block.y + dilation >= input_height)
+                         << 15));

-        input[8] = select(read_imageh(input_image, sampler,
-                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                          (half4)(0.0f),
-                          (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+    /*
+            for (int j = 0; j < 9; ++j) {
+                int2 pos_of_weight;
+                pos_of_weight.x = i * 3 + j % 3;
+                pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+                half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+                output.x += dot(input[j], weight_x);

+                pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+                half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+                output.y += dot(input[j], weight_y);

-/*
-        for (int j = 0; j < 9; ++j) {
-            int2 pos_of_weight;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-        }
-*/
-            int j = 0;
-            int2 pos_of_weight;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 1;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 2;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 3;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 4;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-            j = 5;
-            pos_of_weight.x = i * 3 + j % 3;
-            pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-            weight_x = read_imageh(filter, sampler, pos_of_weight);
-            output.x += dot(input[j], weight_x);
-
-            pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-            weight_y = read_imageh(filter, sampler, pos_of_weight);
-            output.y += dot(input[j], weight_y);
-
-            pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-            weight_z = read_imageh(filter, sampler, pos_of_weight);
-            output.z += dot(input[j], weight_z);
-
-            pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-            weight_w = read_imageh(filter, sampler, pos_of_weight);
-            output.w += dot(input[j], weight_w);
-
-           j = 6;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
-
-           j = 7;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
-
-           j = 8;
-           pos_of_weight.x = i * 3 + j % 3;
-           pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-           weight_x = read_imageh(filter, sampler, pos_of_weight);
-           output.x += dot(input[j], weight_x);
-
-           pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
-           weight_y = read_imageh(filter, sampler, pos_of_weight);
-           output.y += dot(input[j], weight_y);
-
-           pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
-           weight_z = read_imageh(filter, sampler, pos_of_weight);
-           output.z += dot(input[j], weight_z);
-
-           pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
-           weight_w = read_imageh(filter, sampler, pos_of_weight);
-           output.w += dot(input[j], weight_w);
+                pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+                half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+                output.z += dot(input[j], weight_z);

-    }
+                pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+                half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+                output.w += dot(input[j], weight_w);
+            }
+    */
+    int j = 0;
+    int2 pos_of_weight;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 1;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 2;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 3;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 4;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 5;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 6;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 7;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+
+    j = 8;
+    pos_of_weight.x = i * 3 + j % 3;
+    pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
+    weight_x = read_imageh(filter, sampler, pos_of_weight);
+    output.x += dot(input[j], weight_x);
+
+    pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
+    weight_y = read_imageh(filter, sampler, pos_of_weight);
+    output.y += dot(input[j], weight_y);
+
+    pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
+    weight_z = read_imageh(filter, sampler, pos_of_weight);
+    output.z += dot(input[j], weight_z);
+
+    pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
+    weight_w = read_imageh(filter, sampler, pos_of_weight);
+    output.w += dot(input[j], weight_w);
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef BIASE_CH
-    output += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output += read_imageh(bias, sampler, output_pos);
+  output += read_imageh(bias, sampler, output_pos);
 #endif

 #ifdef RELU
-    output = activation(output);
+  output = activation(output);
 #endif

-    write_imageh(output_image, output_pos, output);
+  write_imageh(output_image, output_pos, output);
 }

-__kernel void convBNAdd_1x1(__private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-                       __read_only image2d_t input_image,
-                       __read_only image2d_t filter,
+__kernel void convBNAdd_1x1(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-                       __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-                       __read_only image2d_t new_scale,
-                       __read_only image2d_t new_biase,
-#endif
-                       __write_only image2d_t output_image,
-                       __private const int stride,
-                       __private const int offset,
-                       __private const int input_c,
-                       __private const int dilation,
-                       __private const int input_width,/* of one block */
-                       __private const int input_height,/* of one block */
-                       __private const int output_width,
-                       __private const int output_height) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                           CLK_ADDRESS_CLAMP         |
-                           CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const uint kernelHXW = 1;
  int2 stride_xy = (int2)(stride, stride);
  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
-  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
-
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

  half4 output = 0.0f;

-   for (int i = 0; i < input_c; ++i) {
-        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
-        half4 input = read_imageh(input_image, sampler, pos_in);
-
-        half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
-        half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
-        half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
-        half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
-/*
-        output.x = dot(input, weight0);
-        output.y = dot(input, weight1);
-        output.z = dot(input, weight2);
-        output.w = dot(input, weight3);
-*/
+  for (int i = 0; i < input_c; ++i) {
+    int2 pos_in =
+        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+    half4 input = read_imageh(input_image, sampler, pos_in);

-        output = mad(input.x, weight0, output);
-        output = mad(input.y, weight1, output);
-        output = mad(input.z, weight2, output);
-        output = mad(input.w, weight3, output);
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    /*
+            output.x = dot(input, weight0);
+            output.y = dot(input, weight1);
+            output.z = dot(input, weight2);
+            output.w = dot(input, weight3);
+    */

-   }
+    output = mad(input.x, weight0, output);
+    output = mad(input.y, weight1, output);
+    output = mad(input.z, weight2, output);
+    output = mad(input.w, weight3, output);
+  }

 #ifdef BATCH_NORM
-    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef BIASE_CH
-    output += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output += read_imageh(bias, sampler, output_pos);
+  output += read_imageh(bias, sampler, output_pos);
 #endif

 #ifdef RELU
@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
 }

 __kernel void convBNAdd_1x1_spl(
-        __private const int global_size_dim0, __private const int global_size_dim1,
-        __private const int global_size_dim2, __read_only image2d_t input_image,
-        __read_only image2d_t filter,
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
-        __read_only image2d_t bias,
+    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-        __read_only image2d_t new_scale, __read_only image2d_t new_biase,
-#endif
-        __write_only image2d_t output_image, __private const int stride,
-        __private const int offset, __private const int input_c,
-        __private const int dilation,
-        __private const int input_width,  /* of one block */
-        __private const int input_height, /* of one block */
-        __private const int output_width,
-        __private const int output_height,
-        __private const int old_w
-) {
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int old_w) {

  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl(
  int out_w2 = out_w + global_size_dim1 * 2;
  int out_w3 = out_w + global_size_dim1 * 3;

-  int outpos_main = mul24(out_c , old_w);
+  int outpos_main = mul24(out_c, old_w);
  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);

  const sampler_t sampler =
-          CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 stride_xy = (int2)(stride, stride);

  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
  int2 in_pos_in_one_block0 =
-          ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);

  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
  int2 in_pos_in_one_block1 =
-          ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);

  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
  int2 in_pos_in_one_block2 =
-          ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);

  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
  int2 in_pos_in_one_block3 =
-          ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
-
+      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);

  half4 output0 = 0.0f;
  half4 output1 = 0.0f;
@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl(

  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
-    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
+                         in_pos_in_one_block0.y);
    half4 input0 = read_imageh(input_image, sampler, pos_in);

    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl(
    output0 = mad(input0.w, weight3, output0);

    // -------------1--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
+                    in_pos_in_one_block1.y);
    half4 input1 = read_imageh(input_image, sampler, pos_in);
    //
    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl(
    output1 = mad(input1.w, weight3, output1);

    // -------------2--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
+                    in_pos_in_one_block2.y);
    half4 input2 = read_imageh(input_image, sampler, pos_in);

    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl(
    output2 = mad(input2.w, weight3, output2);

    // -------------3--------------
-    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
+                    in_pos_in_one_block3.y);
    half4 input3 = read_imageh(input_image, sampler, pos_in);

    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl(
  }

 #ifdef BATCH_NORM
-    output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));

-    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
-          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+  output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+            read_imageh(new_biase, sampler, (int2)(out_c, 0));
 #endif

 #ifdef BIASE_CH
-    output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
-    output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
+  output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
-    output0 += read_imageh(bias, sampler, output_pos0);
-    output1 += read_imageh(bias, sampler, output_pos1);
-    output2 += read_imageh(bias, sampler, output_pos2);
-    output3 += read_imageh(bias, sampler, output_pos3);
+  output0 += read_imageh(bias, sampler, output_pos0);
+  output1 += read_imageh(bias, sampler, output_pos1);
+  output2 += read_imageh(bias, sampler, output_pos2);
+  output3 += read_imageh(bias, sampler, output_pos3);
 #endif

 #ifdef RELU
@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl(
    write_imageh(output_image, output_pos0, output0);
  }

-  if (out_w1 < old_w){
+  if (out_w1 < old_w) {
    write_imageh(output_image, output_pos1, output1);
  }

-  if (out_w2 < old_w){
+  if (out_w2 < old_w) {
    write_imageh(output_image, output_pos2, output2);
  }

-  if (out_w3 < old_w){
+  if (out_w3 < old_w) {
    write_imageh(output_image, output_pos3, output3);
  }
 }

+__kernel void depth_conv(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input,
+    __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width, __private const int output_height,
+    __private const int filter_width, __private const int filter_height) {

+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);

+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const int batch_index = out_nh / output_height;
+  const int out_nh_in_one_batch = out_nh % output_height;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+  int2 in_pos_in_one_block =
+      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+#ifdef BIASE_CH
+  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  half4 output = read_imageh(bias, sampler, output_pos);
+#else
+  half4 output = 0.0f;
+#endif

+  int2 pos_in_input_block =
+      (int2)(out_c * input_width, batch_index * input_height);
+  int2 pos_in_filter_block =
+      (int2)(out_c * filter_width, batch_index * filter_height);
+  int filter_x = pos_in_filter_block.x;
+  int filter_y = pos_in_filter_block.y;
+  int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
+  int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;
+  int2 align = {filter_width / 2, filter_height / 2};
+  /*  if (output_pos.x == 0 && output_pos.y == 0){
+      printf("align.x=%d  align.y=%d \n ",align.x,align.y);
+      printf("stride=%d \n ",stride);
+    }*/
+  for (int fy = 0; fy < filter_height; ++fy) {
+    for (int fx = 0; fx < filter_width; ++fx) {
+      int x_off = fx - align.x;
+      int y_off = fy - align.y;
+      /*      if (output_pos.x == 0 && output_pos.y == 0){
+              printf("fx=%d  fy=%d \n ",fx,fy);
+              printf("x_off=%d  y_off=%d \n ",x_off,y_off);
+            }*/
+      half4 in = select(
+          read_imageh(input, sampler,
+                      (int2)(input_x_base + x_off, input_y_base + y_off)),
+          (half4)(0.0f),
+          (ushort4)((in_pos_in_one_block.x + x_off < 0 ||
+                     in_pos_in_one_block.y + y_off < 0 ||
+                     in_pos_in_one_block.x + x_off >= input_width ||
+                     in_pos_in_one_block.y + y_off >= input_height)
+                    << 15));
+      half4 f =
+          read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy));
+      output += in * f;
+      /*if (output_pos.x ==111  && output_pos.y == 0){
+        printf("in={ %f , %f , %f , %f } \n
+      ",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w));
+        printf("filter={ %f , %f , %f , %f } \n
+      ",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w));
+        printf("output={ %f , %f , %f , %f } \n
+      ",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w));
+      }*/
+    }
+  }
+#ifdef BATCH_NORM
+  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+           read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif

-
-
+#ifdef RELU
+  output = activation(output);
+#endif
+  write_imageh(output_image, output_pos, output);
+}
\ No newline at end of file
--- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
-     int x = get_global_id(0);
-     int y = get_global_id(1);
-     const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-     int2 coords;
-     coords.x = x;
-     coords.y = y;
-     half4 in = read_imageh(input, sampler, coords);
-     half4 biase = read_imageh(bias, sampler, coords);
-     half4 output = in * biase;
-     write_imageh(outputImage,coords,output);
- }
-
-
-__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only
-image2d_t outputImage, int w) {
+__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,
+                              __write_only image2d_t outputImage) {
  int x = get_global_id(0);
  int y = get_global_id(1);
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  half4 in = read_imageh(input, sampler, coords);
+  half4 biase = read_imageh(bias, sampler, coords);
+  half4 output = in * biase;
+  write_imageh(outputImage, coords, output);
+}
+
+__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,
+                          __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coords;
  coords.x = x;
  coords.y = y;
  int2 coords_bias;
-  coords_bias.x = x/w;
+  coords_bias.x = x / w;
  coords_bias.y = 0;
  half4 in = read_imageh(input, sampler, coords);
  half4 biase = read_imageh(bias, sampler, coords_bias);
  half4 output = in * biase;
-  write_imageh(outputImage,coords,output);
+  write_imageh(outputImage, coords, output);
 }
+
+// etc : 1 1 1 72
+// run time Y  [value,0,0,0] * 72
+__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias,
+                             __write_only image2d_t outputImage, int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  int2 coords_bias0;
+  int2 coords_bias1;
+  int2 coords_bias2;
+  int2 coords_bias3;
+
+  /*  if (x == 0 && y == 0) {
+      half4 b = (half4){0, 0, 0, 0};
+  #define PPI(j, k)                                                          \
+    b = read_imageh(bias, sampler, (int2){j, k});                            \
+    printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
+           convert_float(b.y), convert_float(b.z), convert_float(b.w));
+      for (int i = 0; i < 73; ++i) {
+        PPI(i, 0);
+      }
+  #undef PPI
+    }*/
+
+  coords_bias0.x = x / w * 4;
+  coords_bias0.y = 0;
+
+  coords_bias1.x = x / w * 4 + 1;
+  coords_bias1.y = 0;
+
+  coords_bias2.x = x / w * 4 + 2;
+  coords_bias2.y = 0;
+
+  coords_bias3.x = x / w * 4 + 3;
+  coords_bias3.y = 0;
+
+  half4 biase0 = read_imageh(bias, sampler, coords_bias0);
+  half4 biase1 = read_imageh(bias, sampler, coords_bias1);
+  half4 biase2 = read_imageh(bias, sampler, coords_bias2);
+  half4 biase3 = read_imageh(bias, sampler, coords_bias3);
+  /*  if (x == 0 && y == 0) {
+      printf("bias0={ %f , %f , %f , %f }\n ",
+             convert_float(biase0.x), convert_float(biase0.y),
+             convert_float(biase0.z), convert_float(biase0.w));
+
+      printf("bias1={ %f , %f , %f , %f }\n ",
+             convert_float(biase1.x), convert_float(biase1.y),
+             convert_float(biase1.z), convert_float(biase1.w));
+      printf("bias2={ %f , %f , %f , %f }\n ",
+             convert_float(biase2.x), convert_float(biase2.y),
+             convert_float(biase2.z), convert_float(biase2.w));
+      printf("bias3={ %f , %f , %f , %f }\n ",
+             convert_float(biase3.x), convert_float(biase3.y),
+             convert_float(biase3.z), convert_float(biase3.w));
+    }*/
+  half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
+  half4 in = read_imageh(input, sampler, coords);
+  half4 output = mad(in, biase, 0);
+  write_imageh(outputImage, coords, output);
+}
\ No newline at end of file
--- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    // other depthwise not with filter 3x3
+    DLOG << "depth_conv basic ";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
                    param.NewScale(), param.NewBias());
      break;

--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
+
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    DLOG << "init depwise conv basic";
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
                                 build_options);
    }

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
                    param.NewBias());
      break;

--- a/mobile/src/operators/kernel/cl/conv_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp
@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
    }
    DLOG << "depth_conv 3x3";

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param);
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {

    DLOG << "depth_conv 3x3";

+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] != 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT;
+    this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options);
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    //    if (param->Strides()[0] == param->Strides()[1] &&
@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_DEPTHWISEBASIC_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, true);
      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:

--- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef ELEMENTWISEMUL_OP

 #include "operators/kernel/elementwise_mul_kernel.h"
+#include <framework/cl/cl_half.h>
+#include <iostream>
 #include "framework/cl/cl_image.h"

 namespace paddle_mobile {
@@ -23,19 +25,24 @@ namespace operators {
 template <>
 bool ElementwiseMulKernel<GPU_CL, float>::Init(
    ElementwiseMulParam<GPU_CL> *param) {
-  DLOG << "-----init add-----";
  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
      const_cast<framework::CLImage *>(param->InputY()));
  if (bias->dims() == param->InputX()->dims()) {
+    DLOG << "init element wise mul";
    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
-  } else if (bias->dims().size() == 4) {
+  } else if (bias->dims().size() == 1) {
+    DLOG << "init channel_mul";
    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
+  } else if (bias->dims().size() == 2) {
+    // etc. input  1 72 28 28
+    // filter 1 72
+    DLOG << "init channel_mul_d2";
+    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
  } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
  }
  return true;
 }
-
 template <>
 void ElementwiseMulKernel<GPU_CL, float>::Compute(
    const ElementwiseMulParam<GPU_CL> &param) {
@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
-  } else if (bias->dims().size() == 4) {
-    DLOG << "zp7 444";
+  } else if (bias->dims().size() == 1) {
+    DLOG << "channel mul";
    cl_mem input_image = input->GetCLImage();
    cl_mem bias_image = bias->GetCLImage();
    cl_mem output_image = output->GetCLImage();
@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
    CL_CHECK_ERRORS(status);
    auto width = input->ImageWidth();
    auto height = input->ImageHeight();
-    DLOG << "dede:" << width << "," << height;
    size_t global_work_size[2] = {width, height};
    status =
        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                               NULL, global_work_size, NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
+  } else if (bias->dims().size() == 2) {
+    DLOG << "channel mul d2";
+
+    // etc. input  1 72 28 28
+    // filter 1 72   -->  1 1 1 72
+    DLOG << "input->ImageDims():  " << input->ImageDims();
+    DLOG << "bias->ImageDims():  " << bias->ImageDims();
+    DLOG << "out->ImageDims():  " << output->ImageDims();
+
+    DLOG << "channel mul d2";
+    cl_mem input_image = input->GetCLImage();
+    cl_mem bias_image = bias->GetCLImage();
+    cl_mem output_image = output->GetCLImage();
+    int tensor_w = input->dims()[input->dims().size() - 1];
+    status = clSetKernelArg(kernel, 0, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&input_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 1, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&bias_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 2, sizeof(cl_mem),
+                            reinterpret_cast<void *>(&output_image));
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel, 3, sizeof(cl_int),
+                            reinterpret_cast<void *>(&tensor_w));
+    CL_CHECK_ERRORS(status);
+    auto width = input->ImageWidth();
+    auto height = input->ImageHeight();
+    size_t global_work_size[2] = {width, height};
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                               NULL, global_work_size, NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+
+    //    bias->PrintTensor(*bias);
  } else {
-    DLOG << "error:bias dims is error";
+    PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet")
  }
 }


--- a/mobile/src/operators/op_param.h
+++ b/mobile/src/operators/op_param.h
@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
    EXEC_SLIDINGWINDOW5x5_FLOAT,
    EXEC_SLIDINGWINDOW7x7_FLOAT,
    EXEC_GEMM1x1s1_FLOAT,
+    EXEC_DEPTHWISEBASIC_FLOAT,
  };

  ExecMode &ExecMode() const { return exec_mode_; }

--- a/mobile/test/net/test_net_multi_feed.cpp
+++ b/mobile/test/net/test_net_multi_feed.cpp
@@ -216,4 +216,6 @@ void test(int argc, char *argv[]) {
    std::cout << std::endl;
  }
 }
+#else
+int main() {}
 #endif