diff --git a/mace/core/BUILD b/mace/core/BUILD
index 218fd1bd2354a9936d243764c9cdb113389ba817..7f974ba4e89aa14085fde004eb8da1413d4de124 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -7,6 +7,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load("//mace:mace.bzl", "if_android")
+
 cc_library(
     name = "core",
     srcs = glob([
@@ -19,6 +21,10 @@ cc_library(
     deps = [
         "//mace/proto:cc_proto",
     ],
+    linkopts = if_android([
+        "-llog",
+        "-pie",
+    ]),
 )
 
 # Main program for tests
diff --git a/mace/core/common.h b/mace/core/common.h
index c2c2931660275634f894948927a3c1dd7e909204..e5e07225ab1b21165fe7b3b7f2ca809824a7e740 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -12,7 +12,6 @@
 #include
 #include
-#include "mace/core/integral_types.h"
 #include "mace/core/logging.h"
 
 using std::set;
@@ -21,7 +20,7 @@ using std::string;
 using std::unique_ptr;
 using std::vector;
 
-typedef int64 TIndex;
+typedef int64_t index_t;
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
diff --git a/mace/core/integral_types.h b/mace/core/integral_types.h
deleted file mode 100644
index 72298201ef68403a93dbdb4d41087ad0f669e7a7..0000000000000000000000000000000000000000
--- a/mace/core/integral_types.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-
-#ifndef MACE_CORE_INTEGRAL_TYPES_H_
-#define MACE_CORE_INTEGRAL_TYPES_H_
-
-typedef int8_t int8;
-typedef int16_t int16;
-typedef int32_t int32;
-typedef int64_t int64;
-
-typedef uint8_t uint8;
-typedef uint16_t uint16;
-typedef uint32_t uint32;
-typedef uint64_t uint64;
-
-#endif  // MACE_CORE_INTEGRAL_TYPES_H_
diff --git a/mace/core/logging.cc b/mace/core/logging.cc
index 5e0982d58e5d38fa1117b9d35ba2bec8a55dc092..f01d0980241187b2fcc2acb829e3c4b79f30b8d4 100644
--- a/mace/core/logging.cc
+++ b/mace/core/logging.cc
@@ -69,18 +69,18 @@ void LogMessage::GenerateLogMessage() {
 
 namespace {
 
-// Parse log level (int64) from environment variable (char*)
-int64 LogLevelStrToInt(const char* tf_env_var_val) {
-  if (tf_env_var_val == nullptr) {
+// Parse log level (int64_t) from environment variable (char*)
+int64_t LogLevelStrToInt(const char* mace_env_var_val) {
+  if (mace_env_var_val == nullptr) {
     return 0;
   }
 
   // Ideally we would use env_var / safe_strto64, but it is
   // hard to use here without pulling in a lot of dependencies,
   // so we use std:istringstream instead
-  string min_log_level(tf_env_var_val);
+  string min_log_level(mace_env_var_val);
   std::istringstream ss(min_log_level);
-  int64 level;
+  int64_t level;
   if (!(ss >> level)) {
     // Invalid vlog level setting, set level to default (0)
     level = 0;
@@ -89,26 +89,26 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
   return level;
 }
 
-int64 MinLogLevelFromEnv() {
-  const char* tf_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
-  return LogLevelStrToInt(tf_env_var_val);
+int64_t MinLogLevelFromEnv() {
+  const char* mace_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
+  return LogLevelStrToInt(mace_env_var_val);
 }
 
-int64 MinVLogLevelFromEnv() {
-  const char* tf_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
-  return LogLevelStrToInt(tf_env_var_val);
+int64_t MinVLogLevelFromEnv() {
+  const char* mace_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
+  return LogLevelStrToInt(mace_env_var_val);
 }
 
 }  // namespace
 
 LogMessage::~LogMessage() {
   // Read the min log level once during the first call to logging.
-  static int64 min_log_level = MinLogLevelFromEnv();
+  static int64_t min_log_level = MinLogLevelFromEnv();
   if (severity_ >= min_log_level) GenerateLogMessage();
 }
 
-int64 LogMessage::MinVLogLevel() {
-  static int64 min_vlog_level = MinVLogLevelFromEnv();
+int64_t LogMessage::MinVLogLevel() {
+  static int64_t min_vlog_level = MinVLogLevelFromEnv();
   return min_vlog_level;
 }
diff --git a/mace/core/logging.h b/mace/core/logging.h
index c613a87d640618d689d6aadf04782f40a8172011..0787af3383d91074ae60c214f096923b8fc891d9 100644
--- a/mace/core/logging.h
+++ b/mace/core/logging.h
@@ -9,8 +9,6 @@
 #include
 #include
 
-#include "mace/core/integral_types.h"
-
 #undef ERROR
 
 namespace mace {
@@ -62,7 +60,7 @@ class LogMessage : public std::basic_ostringstream<char> {
   // Returns the minimum log level for VLOG statements.
   // E.g., if MinVLogLevel() is 2, then VLOG(2) statements will produce output,
   // but VLOG(3) will not. Defaults to 0.
-  static int64 MinVLogLevel();
+  static int64_t MinVLogLevel();
 
  protected:
   void GenerateLogMessage();
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 2e5086ac222a70503bf655ff9d92557369beccb4..a755577b65b7d3c5c80dd1da50b6dd4d256bccf8 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -6,8 +6,8 @@
 
 namespace mace {
 
-std::map<int32, OperatorRegistry*>* gDeviceTypeRegistry() {
-  static std::map<int32, OperatorRegistry*> g_device_type_registry;
+std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
+  static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
   return &g_device_type_registry;
 }
 
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 970404f6ed00da9b743cc2a1a7eb6559cd6a6a09..df488691bac3fd8de6aea9c98a7175f80b50f41e 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -44,7 +44,7 @@ class OperatorBase {
         *operator_def_, name, default_value);
   }
 
-  inline const Tensor *Input(TIndex idx) {
+  inline const Tensor *Input(index_t idx) {
     MACE_CHECK(idx < inputs_.size());
     return inputs_[idx];
   }
diff --git a/mace/core/serializer.cc b/mace/core/serializer.cc
index 310e76299db02eb8dae9fa2032f65a5cccd1c6e2..3e80e545b2a0aa23eb26906f588c9713beba046e 100644
--- a/mace/core/serializer.cc
+++ b/mace/core/serializer.cc
@@ -17,8 +17,8 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
                                            DeviceType type) {
   unique_ptr<Tensor> tensor(new Tensor(GetDeviceAllocator(type), proto.data_type()));
-  vector<TIndex> dims;
-  for (const TIndex d : proto.dims()) {
+  vector<index_t> dims;
+  for (const index_t d : proto.dims()) {
     dims.push_back(d);
   }
   tensor->Resize(dims);
@@ -33,31 +33,31 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
                      proto.double_data().size());
       break;
     case DT_INT32:
-      tensor->template Copy<int32>(proto.int32_data().data(),
+      tensor->template Copy<int32_t>(proto.int32_data().data(),
                      proto.int32_data().size());
       break;
     case DT_UINT8:
-      tensor->CopyWithCast<int32, uint8>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, uint8_t>(proto.int32_data().data(),
                      proto.int32_data().size());
       break;
    case DT_INT16:
-      tensor->CopyWithCast<int32, int16>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, int16_t>(proto.int32_data().data(),
                      proto.int32_data().size());
       break;
    case DT_INT8:
-      tensor->CopyWithCast<int32, int8>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, int8_t>(proto.int32_data().data(),
                      proto.int32_data().size());
       break;
    case DT_INT64:
-      tensor->Copy<int64>(proto.int64_data().data(),
+      tensor->Copy<int64_t>(proto.int64_data().data(),
                      proto.int64_data().size());
       break;
    case DT_UINT16:
-      tensor->CopyWithCast<int32, uint16>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, uint16_t>(proto.int32_data().data(),
                      proto.int32_data().size());
       break;
    case DT_BOOL:
-      tensor->CopyWithCast<int32, bool>(proto.int32_data().data(),
+      tensor->CopyWithCast<int32_t, bool>(proto.int32_data().data(),
                     proto.int32_data().size());
       break;
     case DT_STRING: {
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 77a44615aca803f22504340569a349da7835c648..1af32d3f2338f344332375d9cb67cbe23a4f119d 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -25,13 +25,13 @@ namespace mace {
   switch (TYPE_ENUM) {                  \
     CASE(float, SINGLE_ARG(STMTS))      \
     CASE(double, SINGLE_ARG(STMTS))     \
-    CASE(int32, SINGLE_ARG(STMTS))      \
-    CASE(uint8, SINGLE_ARG(STMTS))      \
-    CASE(uint16, SINGLE_ARG(STMTS))     \
-    CASE(int16, SINGLE_ARG(STMTS))      \
-    CASE(int8, SINGLE_ARG(STMTS))       \
+    CASE(int32_t, SINGLE_ARG(STMTS))    \
+    CASE(uint8_t, SINGLE_ARG(STMTS))    \
+    CASE(uint16_t, SINGLE_ARG(STMTS))   \
+    CASE(int16_t, SINGLE_ARG(STMTS))    \
+    CASE(int8_t, SINGLE_ARG(STMTS))     \
     CASE(string, SINGLE_ARG(STMTS))     \
-    CASE(int64, SINGLE_ARG(STMTS))      \
+    CASE(int64_t, SINGLE_ARG(STMTS))    \
     CASE(bool, SINGLE_ARG(STMTS))       \
     case DT_INVALID:                    \
       INVALID;                          \
@@ -64,17 +64,17 @@ class Tensor {
 
   inline DataType dtype() const { return dtype_; }
 
-  inline const vector<TIndex>& shape() const { return shape_; }
+  inline const vector<index_t>& shape() const { return shape_; }
 
-  inline TIndex dim_size() const { return shape_.size(); }
+  inline index_t dim_size() const { return shape_.size(); }
 
-  inline TIndex dim(TIndex index) const {
+  inline index_t dim(index_t index) const {
     MACE_CHECK(index < shape_.size(), "Exceeding ndim limit");
     MACE_CHECK(index >= 0, "Cannot have negative dimension index");
     return shape_[index];
   }
 
-  inline TIndex size() const { return size_; }
+  inline index_t size() const { return size_; }
 
   inline const void* raw_data() const {
     MACE_CHECK(data_.get() || size_ == 0);
@@ -108,9 +108,9 @@ class Tensor {
     return static_cast<T*>(raw_mutable_data());
   }
 
-  inline void Resize(const vector<TIndex>& shape) {
+  inline void Resize(const vector<index_t>& shape) {
     shape_ = shape;
-    TIndex size = NumElements();
+    index_t size = NumElements();
     if (size_ != size) {
       size_ = size;
       data_.reset();
@@ -126,14 +126,14 @@ class Tensor {
   }
 
   template <typename T>
-  inline void Copy(const T* src, TIndex size) {
+  inline void Copy(const T* src, index_t size) {
     MACE_CHECK(size == size_, "copy src and dst with different size.");
     CopyBytes(static_cast<const void*>(src), sizeof(T) * size);
   }
 
   template <typename SrcType, typename DstType>
   inline void CopyWithCast(const SrcType* src, size_t size) {
-    MACE_CHECK(static_cast<TIndex>(size) == size_, "copy src and dst with different size.");
+    MACE_CHECK(static_cast<index_t>(size) == size_, "copy src and dst with different size.");
     unique_ptr<DstType[]> buffer(new DstType[size]);
     for (size_t i = 0; i < size; ++i) {
       buffer[i] = static_cast<DstType>(src[i]);
@@ -161,15 +161,15 @@ class Tensor {
   }
 
  private:
-  inline int64 NumElements() const {
-    return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64>());
+  inline int64_t NumElements() const {
+    return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64_t>());
   }
 
   Allocator* alloc_;
-  TIndex size_;
+  index_t size_;
   DataType dtype_;
   std::shared_ptr<void> data_;
-  vector<TIndex> shape_;
+  vector<index_t> shape_;
 };
 
 }  // namespace tensor
diff --git a/mace/core/testing/env_time.h b/mace/core/testing/env_time.h
index 6be189a658ab489fdf59fcc4f666c71574ad468b..f07783c1f66e4551886276e30796001ae1fc1a52 100644
--- a/mace/core/testing/env_time.h
+++ b/mace/core/testing/env_time.h
@@ -16,10 +16,10 @@ namespace mace {
 
 namespace testing {
 
-inline int64 NowMicros() {
+inline int64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
-  return static_cast<int64>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 }
 
 }  // namespace testing
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 1eb976ec998833ffcf436dcf4744566d460f168f..885a9a63f70956428008291f29dc293245c7d37a 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -16,10 +16,10 @@ namespace testing {
 
 static std::vector<Benchmark*>* all_benchmarks = nullptr;
 static std::string label;
-static int64 bytes_processed;
-static int64 items_processed;
-static int64 accum_time = 0;
-static int64 start_time = 0;
+static int64_t bytes_processed;
+static int64_t items_processed;
+static int64_t accum_time = 0;
+static int64_t start_time = 0;
 
 Benchmark::Benchmark(const char* name, void (*fn)(int))
     : name_(name), num_args_(0), fn0_(fn) {
@@ -112,10 +112,10 @@ void Benchmark::Register() {
 }
 
 void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
-  static const int64 kMinIters = 100;
-  static const int64 kMaxIters = 1000000000;
+  static const int64_t kMinIters = 100;
+  static const int64_t kMaxIters = 1000000000;
   static const double kMinTime = 0.5;
-  int64 iters = kMinIters;
+  int64_t iters = kMinIters;
   while (true) {
     accum_time = 0;
     start_time = NowMicros();
@@ -142,13 +142,13 @@ void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
     double multiplier = 1.4 * kMinTime / std::max(seconds, 1e-9);
     multiplier = std::min(10.0, multiplier);
     if (multiplier <= 1.0) multiplier *= 2.0;
-    iters = std::max<int64>(multiplier * iters, iters + 1);
+    iters = std::max<int64_t>(multiplier * iters, iters + 1);
     iters = std::min(iters, kMaxIters);
   }
 }
 
-void BytesProcessed(int64 n) { bytes_processed = n; }
-void ItemsProcessed(int64 n) { items_processed = n; }
+void BytesProcessed(int64_t n) { bytes_processed = n; }
+void ItemsProcessed(int64_t n) { items_processed = n; }
 void StartTiming() {
   if (start_time == 0) start_time = NowMicros();
 }
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 44a352f54df4cae609b3955eb5343c9b78d34126..5800f5edb0912899b09fc95ebebb8a741e2a48e1 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -42,8 +42,8 @@ class Benchmark {
 };
 
 void RunBenchmarks();
-void BytesProcessed(int64);
-void ItemsProcessed(int64);
+void BytesProcessed(int64_t);
+void ItemsProcessed(int64_t);
 void StartTiming();
 void StopTiming();
 
diff --git a/mace/core/types.h b/mace/core/types.h
index 161be5a7103a9c8c69be3932b9997e7dbee51124..b174993d024587875d6b597cdcb7a19f9d79d154 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -42,16 +42,16 @@ struct EnumToDataType {};  // Specializations below
 MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
 MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
-MATCH_TYPE_AND_ENUM(int32, DT_INT32);
-MATCH_TYPE_AND_ENUM(uint16, DT_UINT16);
-MATCH_TYPE_AND_ENUM(uint8, DT_UINT8);
-MATCH_TYPE_AND_ENUM(int16, DT_INT16);
-MATCH_TYPE_AND_ENUM(int8, DT_INT8);
+MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
+MATCH_TYPE_AND_ENUM(uint16_t, DT_UINT16);
+MATCH_TYPE_AND_ENUM(uint8_t, DT_UINT8);
+MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
+MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
 MATCH_TYPE_AND_ENUM(string, DT_STRING);
-MATCH_TYPE_AND_ENUM(int64, DT_INT64);
+MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 
-static const int32 kint32max = ((int32)0x7FFFFFFF);
+static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
 
 }  // namespace mace
 
diff --git a/mace/examples/BUILD b/mace/examples/BUILD
index 4f4a7794e0cb00f7d8312299dd7572afd74e68d6..82915d74e05e02c0bbddc04163d7c4e53f12f22b 100644
--- a/mace/examples/BUILD
+++ b/mace/examples/BUILD
@@ -7,10 +7,6 @@ cc_binary(
         "helloworld.cc",
     ],
     copts = ["-std=c++11"],
-    linkopts = if_android([
-        "-pie",
-        "-llog",
-    ]),
     deps = [
         "//mace/core",
         "//mace/ops",
@@ -21,10 +17,6 @@ cc_test(
    name = "benchmark_example",
    srcs = ["benchmark_example.cc"],
    copts = ["-std=c++11"],
-    linkopts = if_android([
-        "-pie",
-        "-llog",
-    ]),
    linkstatic = 1,
    deps = [
        "//mace/core",
diff --git a/mace/examples/benchmark_example.cc b/mace/examples/benchmark_example.cc
index 106c6c3c4ccdf7dcc091a6c9f9bbc8c0c15d2611..50e5184b0bf384d81932466584cdbc688db40a21 100644
--- a/mace/examples/benchmark_example.cc
+++ b/mace/examples/benchmark_example.cc
@@ -6,7 +6,7 @@
 
 static void foo(int iters) {
   static const int N = 32;
-  const int64 tot = static_cast<int64>(iters) * N;
+  const int64_t tot = static_cast<int64_t>(iters) * N;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -26,7 +26,7 @@ BENCHMARK(foo);
 
 static void bar(int iters, int n) {
-  const int64 tot = static_cast<int64>(iters) * n;
+  const int64_t tot = static_cast<int64_t>(iters) * n;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index de8293e35421ac29031db5281d162de3999efb78..098e80a949d457406730e0f0146b45c03b75faee 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -18,6 +18,9 @@ cc_library(
         "//mace/core:core",
     ],
     copts = ['-std=c++11'],
+    linkopts = ["-fopenmp"] + if_android([
+        "-lm",
+    ]),
 )
 
 cc_test(
@@ -29,11 +32,9 @@ cc_test(
         "//mace/core:core",
     ],
     copts = ['-std=c++11'],
-    linkopts = ["-fopenmp"] + if_android([
+    linkopts = if_android([
         "-pie",
-        "-llog",
-        "-lm",
-    ]),
+    ]),
     linkstatic = 1,
     testonly = 1,
 )
@@ -47,11 +48,6 @@ cc_test(
         "//mace/core:test_benchmark_main",
     ],
     copts = ['-std=c++11'],
-    linkopts = ["-fopenmp"] + if_android([
-        "-pie",
-        "-llog",
-        "-lm",
-    ]),
     linkstatic = 1,
     testonly = 1,
 )
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 3f79ac69b468c623acbe2cdb6d9179bbe3906bda..30648eb8a15186198ec8b2c9fb98c04695bf4366 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -15,7 +15,7 @@ void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tenso
   int n = input_tensor.size();
   MACE_CHECK(n > 1);
   MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64 size = input_tensor[0]->size();
+  int64_t size = input_tensor[0]->size();
   vector<const T*> inputs(n);
   for (int i = 0; i < n; ++i) {
     inputs[i] = input_tensor[i]->data<T>();
@@ -24,7 +24,7 @@
   T* output = output_tensor->mutable_data<T>();
 
   for (int i = 0; i < n; ++i) {
-    for (int64 j = 0; j < size; ++j) {
+    for (int64_t j = 0; j < size; ++j) {
       output[j] += inputs[i][j];
     }
   }
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index d2899d762c6274d0de3cf821bd66b5a983c60e8d..84ca48d4a76bc477258ce0d9ec152d5f313709a9 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -30,9 +30,9 @@ struct BatchNormFunctor : public BatchNormFunctorBase<T> {
                   const T* offset,
                   const T* mean,
                   const T* var,
-                  const TIndex n,
-                  const TIndex channel,
-                  const TIndex sample_size,
+                  const index_t n,
+                  const index_t channel,
+                  const index_t sample_size,
                   T* output) {
     // Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
     // The calculation formula for inference is
@@ -42,15 +42,15 @@ struct BatchNormFunctor : public BatchNormFunctorBase<T> {
     // new_offset = \offset - mean * common_val;
     // Y = new_scale * X + new_offset;
     T new_scale, new_offset;
-    for (TIndex c = 0; c < channel; ++c) {
+    for (index_t c = 0; c < channel; ++c) {
       new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
       new_offset = offset[c] - mean[c] * new_scale;
-      TIndex pos = c * sample_size;
+      index_t pos = c * sample_size;
 
-      for (TIndex i = 0; i < n; ++i) {
+      for (index_t i = 0; i < n; ++i) {
         const T* input_sample_ptr = input + pos;
         T* output_sample_ptr = output + pos;
-        for (TIndex j = 0; j < sample_size; ++j) {
+        for (index_t j = 0; j < sample_size; ++j) {
           output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
         }
         pos += channel * sample_size;
diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc
index f63fed77b11847f3aacca8291c333699b0bd840a..4cec0270dbc6e1d9f55eb0db404965d9d1f1088e 100644
--- a/mace/kernels/benchmark/addn_benchmark.cc
+++ b/mace/kernels/benchmark/addn_benchmark.cc
@@ -11,7 +11,7 @@
 using namespace mace;
 using namespace mace::kernels;
 
 static void AddNBenchmark(int iters, int n, int type) {
-  const int64 tot = static_cast<int64>(iters) * n * 3;
+  const int64_t tot = static_cast<int64_t>(iters) * n * 3;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -35,7 +35,7 @@ static void AddNBenchmark(int iters, int n, int type) {
   float *input3 = input_tensor3.mutable_data<float>();
   float *output = output_tensor.mutable_data<float>();
 
-  for (int64 i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     input1[i] = nd(gen);
     input2[i] = nd(gen);
     input3[i] = nd(gen);
diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc
index 9276cadc737bba60a0fac81893dd5aa797d3f6a9..86858681ca29518f6ed98e46f58794d82c984057 100644
--- a/mace/kernels/benchmark/relu_benchmark.cc
+++ b/mace/kernels/benchmark/relu_benchmark.cc
@@ -11,7 +11,7 @@
 using namespace mace;
 using namespace mace::kernels;
 
 static void ReluBenchmark(int iters, int n, int type) {
-  const int64 tot = static_cast<int64>(iters) * n;
+  const int64_t tot = static_cast<int64_t>(iters) * n;
   mace::testing::ItemsProcessed(tot);
   mace::testing::BytesProcessed(tot * (sizeof(float)));
 
@@ -25,7 +25,7 @@ static void ReluBenchmark(int iters, int n, int type) {
   output_tensor.ResizeLike(input_tensor);
   float *input = input_tensor.mutable_data<float>();
   float *output = output_tensor.mutable_data<float>();
-  for (int64 i = 0; i < n; ++i) {
+  for (int64_t i = 0; i < n; ++i) {
     input[i] = nd(gen);
   }
 
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
index 3baab3c3b3dadb8570e0f7b4830fd9c14c1799fa..ad6f06e8df7c17dc189316a20be3be5586a212e6 100644
--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -14,7 +14,7 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
   int n = input_tensor.size();
   MACE_CHECK(n > 1);
   MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64 size = input_tensor[0]->size();
+  int64_t size = input_tensor[0]->size();
   output_tensor->ResizeLike(input_tensor[0]);
   float *output = output_tensor->mutable_data<float>();
   vector<const float *> inputs(n);
@@ -22,19 +22,19 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
     inputs[i] = input_tensor[i]->data<float>();
   }
 
-  int64 cost = size * n;
-  int64 groups = 1;
+  int64_t cost = size * n;
+  int64_t groups = 1;
   if (cost > kCostPerGroup) {
     groups = cost / kCostPerGroup;
   }
-  int64 element_per_group = size / groups;
+  int64_t element_per_group = size / groups;
 
 #pragma omp parallel for num_threads(1)  // no significant performance improve
-  for (int64 i = 0; i < size; i += element_per_group) {
-    int64 count = std::min(element_per_group, size - i);
+  for (int64_t i = 0; i < size; i += element_per_group) {
+    int64_t count = std::min(element_per_group, size - i);
     int nn = count >> 2;
     int remain = count - (nn << 2);
-    for (int64 j = 0; j < n; ++j) {
+    for (int64_t j = 0; j < n; ++j) {
       const float *inptr = inputs[j] + i;
       float *outptr = output + i;
       for (int k = 0; k < nn; ++k) {
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 9db63f68bd178330f5cfd94dc46b991ffea46a61..a306fdbc804e0c5995846fa89dd5bb681d31e1ed 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -34,18 +34,18 @@ struct BatchNormFunctor : public BatchNormFunctorBase
     int count = sample_size >> 2;
     int remain_count = sample_size - count;
-    for (TIndex c = 0; c < channel; ++c) {
+    for (index_t c = 0; c < channel; ++c) {
       new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
       new_offset = offset[c] - mean[c] * new_scale;
-      TIndex pos = c * sample_size;
+      index_t pos = c * sample_size;
 
       float32x4_t new_scale_f = vdupq_n_f32(new_scale);
       float32x4_t new_offset_f = vdupq_n_f32(new_offset);
-      for (TIndex i = 0; i < n; ++i) {
+      for (index_t i = 0; i < n; ++i) {
         const float* input_sample_ptr = input + pos;
         float* output_sample_ptr = output + pos;
 
-        for(TIndex j = 0; j < count; ++j) {
+        for(index_t j = 0; j < count; ++j) {
          float32x4_t input_f = vld1q_f32(input_sample_ptr);
          float32x4_t output_f = new_offset_f;
          output_f = vfmaq_f32(output_f, input_f, new_scale_f);
@@ -53,7 +53,7 @@ struct BatchNormFunctor : public BatchNormFunctorBase
-  int64 size = input_tensor->size();
+  int64_t size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
   const float *input = input_tensor->data<float>();
   float *output = output_tensor->mutable_data<float>();
 #pragma omp parallel for num_threads(1)  // no significant performance improve
-  for (int64 i = 0; i < size; i += kCostPerGroup) {
-    int64 count = std::min(static_cast<int64>(kCostPerGroup), size - i);
+  for (int64_t i = 0; i < size; i += kCostPerGroup) {
+    int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
     int nn = count >> 2;
     int remain = count - (nn << 2);
     const float *inptr = input + i;
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index 086f762b41e85c3ff7042086ba1b56d3607d30c2..d0de2f0b061524537479c9082ca250fba47e6c29 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -12,12 +12,12 @@ namespace kernels {
 
 template <typename T>
 void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
-  int64 size = input_tensor->size();
+  int64_t size = input_tensor->size();
   output_tensor->ResizeLike(input_tensor);
   const T *input = input_tensor->data<T>();
   T *output = output_tensor->mutable_data<T>();
 
-  for (int64 i = 0; i < size; ++i) {
+  for (int64_t i = 0; i < size; ++i) {
     output[i] = std::max(input[i], static_cast<T>(0));
   }
 }
diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc
index 8d1ca924b9b3ef8fecf96301007afa593cd54600..521fe9129b64a1e8f646c2124b3f56de32af677a 100644
--- a/mace/kernels/test/addn_neon_test.cc
+++ b/mace/kernels/test/addn_neon_test.cc
@@ -15,7 +15,7 @@ TEST(NeonTest, AddN) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  int64 count = 100000;
+  int64_t count = 100000;
   Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
   input_tensor1.Resize({100, 1000});
   Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
@@ -37,7 +37,7 @@ TEST(NeonTest, AddN) {
   float *output = output_tensor.mutable_data<float>();
   float *output_neon = output_tensor_neon.mutable_data<float>();
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     input1[i] = nd(gen);
     input2[i] = nd(gen);
     input3[i] = nd(gen);
@@ -48,7 +48,7 @@ TEST(NeonTest, AddN) {
   ASSERT_EQ(count, output_tensor.size());
   ASSERT_EQ(count, output_tensor_neon.size());
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
 }
diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc
index 40c1bc62d68a94820ac99d1140203c24dd412235..a16dc2692501017a494d25d5af9dab73be8c44db 100644
--- a/mace/kernels/test/relu_neon_test.cc
+++ b/mace/kernels/test/relu_neon_test.cc
@@ -15,7 +15,7 @@ TEST(NeonTest, Relu) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  int64 count = 100000;
+  int64_t count = 100000;
   Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
   input_tensor.Resize({100, 1000});
   Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
@@ -27,7 +27,7 @@ TEST(NeonTest, Relu) {
   float *output = output_tensor.mutable_data<float>();
   float *output_neon = output_tensor_neon.mutable_data<float>();
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     input[i] = nd(gen);
   }
 
@@ -36,7 +36,7 @@ TEST(NeonTest, Relu) {
   ASSERT_EQ(count, output_tensor.size());
   ASSERT_EQ(count, output_tensor_neon.size());
 
-  for (int64 i = 0; i < count; ++i) {
+  for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
 }
diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h
index 8a3c01b44354cbe991b41fe08d98b2302b1f4099..59c227c865b519b81c7e6d818a052336acd2e570 100644
--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -33,9 +33,9 @@ class BatchNormOp : public Operator<D, T> {
     Tensor* output = this->Output(0);
     output->ResizeLike(input);
 
-    const TIndex n = input->dim(0);
-    const TIndex channel = input->dim(1);
-    const TIndex sample_size = input->dim(2) * input->dim(3);
+    const index_t n = input->dim(0);
+    const index_t channel = input->dim(1);
+    const index_t sample_size = input->dim(2) * input->dim(3);
 
     const float* input_ptr = input->data<float>();
     const float* scale_ptr = scale->data<float>();
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 61085f7dd0ed090fb248db3f76037199d7538c78..0e96943c60085014bc01c65323882ebc0480249e 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -43,7 +43,7 @@ class OpsTestBase : public ::testing::Test {
   }
 
  public:
  template <typename T>
-  void AddInputFromArray(const char* name, const std::vector<TIndex>& shape, const std::vector<T>& data) {
+  void AddInputFromArray(const char* name, const std::vector<index_t>& shape, const std::vector<T>& data) {
    Tensor* input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
    input->Resize(shape);
    float* input_data = input->mutable_data<float>();
@@ -70,7 +70,7 @@ class OpsTestBase : public ::testing::Test {
 };
 
 template <typename T>
-Tensor CreateTensor(const std::vector<TIndex>& shape, const std::vector<T>& data) {
+Tensor CreateTensor(const std::vector<index_t>& shape, const std::vector<T>& data) {
   Tensor res(cpu_allocator(), DataTypeToEnum<T>::v());
   res.Resize(shape);
   float* input_data = res.mutable_data<float>();
@@ -90,7 +90,7 @@ inline std::string ShapeToString(const Tensor& x) {
   std::stringstream stream;
   for (int i = 0; i < x.dim_size(); i++) {
     if (i > 0) stream<<",";
-    int64 dim = x.dim(i);
+    int64_t dim = x.dim(i);
     if (dim < 0) {
       stream<<"?";
     } else {