diff --git a/.gitignore b/.gitignore
index 7543585c33046943ce017c07a07882bdbd989968..e24bb3325979c9e6151ae090b853f733d2151dea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,8 @@ tags
 .idea/
 cmake-build-debug/
 *.pyc
+mace/codegen/models/
+mace/codegen/opencl/
+mace/codegen/opencl_bin/
+mace/codegen/tuning/
+mace/codegen/version/
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index d20c5cef7eb1ad76f24a98d71106277db8ed4324..36ef202935037af7f0b2dc46e7bae8d8c4aa9efd 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -53,6 +53,7 @@ class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
   void *New(size_t nbytes) override {
+    VLOG(3) << "Allocate CPU buffer: " << nbytes;
     void *data = nullptr;
 #ifdef __ANDROID__
     data = memalign(kMaceAlignment, nbytes);
@@ -67,11 +68,18 @@ class CPUAllocator : public Allocator {
 
   void *NewImage(const std::vector<size_t> &shape,
                  const DataType dt) override {
+    LOG(FATAL) << "Allocate CPU image";
     return nullptr;
   }
 
-  void Delete(void *data) override { free(data); }
-  void DeleteImage(void *data) override { free(data); };
+  void Delete(void *data) override {
+    VLOG(3) << "Free CPU buffer";
+    free(data);
+  }
+  void DeleteImage(void *data) override {
+    LOG(FATAL) << "Free CPU image";
+    free(data);
+  };
   void *Map(void *buffer, size_t nbytes) override { return buffer; }
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index 9c8b5ceee552a0a2d8df5bf62eb6608fdd44c47d..280b84d38659605c29ee71c6f479747bd506abac 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -36,6 +36,7 @@ OpenCLAllocator::OpenCLAllocator() {}
 
 OpenCLAllocator::~OpenCLAllocator() {}
 void *OpenCLAllocator::New(size_t nbytes) {
+  VLOG(3) << "Allocate OpenCL buffer: " << nbytes;
   cl_int error;
   cl::Buffer *buffer = new cl::Buffer(OpenCLRuntime::Global()->context(),
                                       CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
@@ -48,6 +49,7 @@ void *OpenCLAllocator::New(size_t nbytes) {
 void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                 const DataType dt) {
   MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
+  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
 
   cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
 
@@ -64,6 +66,7 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
 }
 
 void OpenCLAllocator::Delete(void *buffer) {
+  VLOG(3) << "Free OpenCL buffer";
   if (buffer != nullptr) {
     cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
     delete cl_buffer;
@@ -71,6 +74,7 @@ void OpenCLAllocator::Delete(void *buffer) {
 }
 
 void OpenCLAllocator::DeleteImage(void *buffer) {
+  VLOG(3) << "Free OpenCL image";
   if (buffer != nullptr) {
     cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
     delete cl_image;
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index 34d8da3156934b48d481fbe2b67a4cb8b4764fbc..cb64f77991fd14bd33281889a6660481e2f9a1b3 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -337,13 +337,13 @@ OpenCLLibraryImpl *openclLibraryImpl = nullptr;
 }  // namespace
 
 void LoadOpenCLLibrary() {
-  if (openclLibraryImpl == nullptr) {
-    openclLibraryImpl = new OpenCLLibraryImpl();
-    MACE_CHECK(openclLibraryImpl->Load());
-  }
+  MACE_CHECK(openclLibraryImpl == nullptr);
+  openclLibraryImpl = new OpenCLLibraryImpl();
+  MACE_CHECK(openclLibraryImpl->Load());
 }
 
 void UnloadOpenCLLibrary() {
+  MACE_CHECK_NOTNULL(openclLibraryImpl);
   openclLibraryImpl->Unload();
   delete openclLibraryImpl;
   openclLibraryImpl = nullptr;
diff --git a/mace/examples/mace_run.cc b/mace/examples/mace_run.cc
index 7a02b8efca692a67ef1167f1e2032500a8a83732..3de72de9289a419b98f9122c0f196f2fdb101b75 100644
--- a/mace/examples/mace_run.cc
+++ b/mace/examples/mace_run.cc
@@ -12,13 +12,14 @@
  *          --output_file=mace.out  \
  *          --device=NEON
  */
+#include <malloc.h>
+#include <cstdlib>
 #include <fstream>
-#include <numeric>
 #include <iostream>
-#include <cstdlib>
+#include <numeric>
 #include "mace/utils/command_line_flags.h"
-#include "mace/utils/logging.h"
 #include "mace/utils/env_time.h"
+#include "mace/utils/logging.h"
 
 #include "mace/core/public/mace.h"
 #include "mace/core/public/version.h"
@@ -44,7 +45,7 @@ void ParseShape(const string &str, vector<int64_t> *shape) {
 }
 
 DeviceType ParseDeviceType(const string &device_str) {
-  if(device_str.compare("CPU") == 0) {
+  if (device_str.compare("CPU") == 0) {
     return DeviceType::CPU;
   } else if (device_str.compare("NEON") == 0) {
     return DeviceType::NEON;
@@ -55,6 +56,53 @@ DeviceType ParseDeviceType(const string &device_str) {
   }
 }
 
+// Logs one allocator counter if it changed between two mallinfo snapshots.
+//
+// label:  text printed in front of the current value.
+// before: the counter as read from the previous snapshot.
+// after:  the counter as read from the current snapshot.
+//
+// Templated so the value is streamed with the field's own type; the delta is
+// taken in int64_t so that a shrinking counter prints as a negative number.
+template <typename T>
+static void LogCounterChange(const char *label, T before, T after) {
+  if (before != after) {
+    LOG(INFO) << label << after << ", diff: "
+              << (static_cast<int64_t>(after) - static_cast<int64_t>(before));
+  }
+}
+
+// Takes a fresh mallinfo() snapshot and logs every field that differs from
+// |prev|; unchanged fields are not logged.
+//
+// Returns the new snapshot so the caller can pass it to the next call.
+struct mallinfo LogMallinfoChange(struct mallinfo prev) {
+  struct mallinfo curr = mallinfo();
+
+  LogCounterChange("Non-mmapped space allocated (bytes): ",
+                   prev.arena, curr.arena);
+  LogCounterChange("Number of free chunks: ",
+                   prev.ordblks, curr.ordblks);
+  LogCounterChange("Number of free fastbin blocks: ",
+                   prev.smblks, curr.smblks);
+  LogCounterChange("Number of mmapped regions: ",
+                   prev.hblks, curr.hblks);
+  LogCounterChange("Space allocated in mmapped regions (bytes): ",
+                   prev.hblkhd, curr.hblkhd);
+  LogCounterChange("Maximum total allocated space (bytes): ",
+                   prev.usmblks, curr.usmblks);
+  LogCounterChange("Space in freed fastbin blocks (bytes): ",
+                   prev.fsmblks, curr.fsmblks);
+  LogCounterChange("Total allocated space (bytes): ",
+                   prev.uordblks, curr.uordblks);
+  LogCounterChange("Total free space (bytes): ",
+                   prev.fordblks, curr.fordblks);
+  LogCounterChange("Top-most, releasable space (bytes): ",
+                   prev.keepcost, curr.keepcost);
+
+  return curr;
+}
+
 int main(int argc, char **argv) {
   string model_file;
   string input_node;
@@ -64,6 +112,7 @@ int main(int argc, char **argv) {
   string output_file;
   string device;
   int round = 1;
+  int malloc_check_cycle = -1;
 
   std::vector<Flag> flag_list = {
       Flag("model", &model_file, "model file name"),
@@ -74,6 +123,8 @@ int main(int argc, char **argv) {
       Flag("output_file", &output_file, "output file name"),
       Flag("device", &device, "CPU/NEON"),
       Flag("round", &round, "round"),
+      Flag("malloc_check_cycle", &malloc_check_cycle,
+           "malloc debug check cycle, -1 to disable"),
   };
 
   string usage = Flags::Usage(argv[0], flag_list);
@@ -107,7 +158,8 @@ int main(int argc, char **argv) {
 
   DeviceType device_type = ParseDeviceType(device);
   VLOG(1) << "Device Type" << device_type;
-  int64_t input_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
+  int64_t input_size = std::accumulate(  // 64-bit seed: accumulate in int64_t
+      shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
   std::unique_ptr<float[]> input_data(new float[input_size]);
 
   // load input
@@ -136,8 +188,13 @@ int main(int argc, char **argv) {
   if (round > 0) {
     VLOG(0) << "Run model";
     t0 = utils::NowMicros();
+    struct mallinfo prev = mallinfo();
     for (int i = 0; i < round; ++i) {
       engine.Run(input_data.get(), shape, output_shape);
+      if (malloc_check_cycle >= 1 && i % malloc_check_cycle == 0) {
+        LOG(INFO) << "=== check malloc info change #" << i << " ===";
+        prev = LogMallinfoChange(prev);
+      }
     }
     t1 = utils::NowMicros();
     LOG(INFO) << "Avg duration: " << (t1 - t0) / round << " us";
@@ -146,9 +203,10 @@ int main(int argc, char **argv) {
   const float *output = engine.Run(input_data.get(), shape, output_shape);
   if (output != nullptr) {
     ofstream out_file(output_file, ios::binary);
-    int64_t output_size = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int64_t>());
-    out_file.write((const char *) (output),
-                   output_size * sizeof(float));
+    int64_t output_size =  // 64-bit seed keeps the product out of plain int
+        std::accumulate(output_shape.begin(), output_shape.end(), int64_t{1},
+                        std::multiplies<int64_t>());
+    out_file.write((const char *)(output), output_size * sizeof(float));
     out_file.flush();
     out_file.close();
     stringstream ss;
diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh
index d31d098626ad6d6531660daaa824e514147b7b30..645df8b586e255394900dbb6f6b5fa2b9e6f7675 100644
--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -10,6 +10,7 @@ if [ $# -lt 2 ];then
   exit -1
 fi
 
+VLOG_LEVEL=0
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_SOURCE_DIR=`/bin/pwd`
@@ -60,7 +61,7 @@ build_and_run()
   fi
 
   adb </dev/null shell MACE_TUNING=${tuning_flag} \
-    MACE_CPP_MIN_VLOG_LEVEL=0 \
+    MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
     MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
     MACE_KERNEL_PATH=$KERNEL_DIR \
     ${PHONE_DATA_DIR}/mace_run \
@@ -81,7 +82,7 @@ python tools/validate.py --generate_data true --random_seed 1 \
 
 echo "Step 2: Convert tf model to mace model and optimize memory"
 bazel build //mace/python/tools:tf_converter
-rm -rf ${CODEGEN_DIR}/models
+rm -rf ${MODEL_CODEGEN_DIR}
 mkdir -p ${MODEL_CODEGEN_DIR}
 bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
                                          --output=${MODEL_CODEGEN_DIR}/mace_gcn${IMAGE_SIZE}.cc \