Commit a3850281 authored by Liangliang He

Update data types and linking opts

Parent 68a335f1
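
In short, this commit replaces the project-local integer aliases (int8/int16/int32/int64 and the TIndex index type) with the standard fixed-width types from <cstdint> plus a single index_t alias, and moves the Android-specific link options (-llog, -pie) into //mace/core so downstream targets stop repeating them. A minimal sketch of the resulting type usage, based only on the hunks below; the checked_dim helper is illustrative, not part of the diff:

#include <cstdint>
#include <vector>

// After this commit the only project-level alias is the tensor index type;
// all other integers use the standard fixed-width names (int32_t, int64_t, ...).
typedef int64_t index_t;

// Illustrative helper mirroring how Tensor::dim() indexes its shape vector.
inline index_t checked_dim(const std::vector<index_t>& shape, index_t i) {
  return shape.at(static_cast<size_t>(i));  // at() throws if i is out of range
}
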
......@@ -7,6 +7,8 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android")
cc_library(
name = "core",
srcs = glob([
......@@ -19,6 +21,10 @@ cc_library(
deps = [
"//mace/proto:cc_proto",
],
linkopts = if_android([
"-llog",
"-pie",
]),
)
# Main program for tests
......
......@@ -12,7 +12,6 @@
#include <vector>
#include <algorithm>
#include "mace/core/integral_types.h"
#include "mace/core/logging.h"
using std::set;
......@@ -21,7 +20,7 @@ using std::string;
using std::unique_ptr;
using std::vector;
typedef int64 TIndex;
typedef int64_t index_t;
// Disable the copy and assignment operator for a class.
#ifndef DISABLE_COPY_AND_ASSIGN
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_CORE_INTEGRAL_TYPES_H_
#define MACE_CORE_INTEGRAL_TYPES_H_
typedef int8_t int8;
typedef int16_t int16;
typedef int32_t int32;
typedef int64_t int64;
typedef uint8_t uint8;
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint64_t uint64;
#endif // MACE_CORE_INTEGRAL_TYPES_H_
......@@ -69,18 +69,18 @@ void LogMessage::GenerateLogMessage() {
namespace {
// Parse log level (int64) from environment variable (char*)
int64 LogLevelStrToInt(const char* tf_env_var_val) {
if (tf_env_var_val == nullptr) {
// Parse log level (int64_t) from environment variable (char*)
int64_t LogLevelStrToInt(const char* mace_env_var_val) {
if (mace_env_var_val == nullptr) {
return 0;
}
// Ideally we would use env_var / safe_strto64, but it is
// hard to use here without pulling in a lot of dependencies,
// so we use std::istringstream instead
string min_log_level(tf_env_var_val);
string min_log_level(mace_env_var_val);
std::istringstream ss(min_log_level);
int64 level;
int64_t level;
if (!(ss >> level)) {
// Invalid vlog level setting, set level to default (0)
level = 0;
......@@ -89,26 +89,26 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
return level;
}
int64 MinLogLevelFromEnv() {
const char* tf_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
return LogLevelStrToInt(tf_env_var_val);
int64_t MinLogLevelFromEnv() {
const char* mace_env_var_val = getenv("MACE_CPP_MIN_LOG_LEVEL");
return LogLevelStrToInt(mace_env_var_val);
}
int64 MinVLogLevelFromEnv() {
const char* tf_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
return LogLevelStrToInt(tf_env_var_val);
int64_t MinVLogLevelFromEnv() {
const char* mace_env_var_val = getenv("MACE_CPP_MIN_VLOG_LEVEL");
return LogLevelStrToInt(mace_env_var_val);
}
} // namespace
LogMessage::~LogMessage() {
// Read the min log level once during the first call to logging.
static int64 min_log_level = MinLogLevelFromEnv();
static int64_t min_log_level = MinLogLevelFromEnv();
if (severity_ >= min_log_level) GenerateLogMessage();
}
int64 LogMessage::MinVLogLevel() {
static int64 min_vlog_level = MinVLogLevelFromEnv();
int64_t LogMessage::MinVLogLevel() {
static int64_t min_vlog_level = MinVLogLevelFromEnv();
return min_vlog_level;
}
......
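
As the logging.cc hunk above shows, both thresholds are read once from the environment (MACE_CPP_MIN_LOG_LEVEL and MACE_CPP_MIN_VLOG_LEVEL) and cached in function-local statics. A minimal sketch, not part of this commit, of setting them programmatically from a test driver; it assumes the usual INFO=0, WARNING=1, ERROR=2 severity numbering and a POSIX setenv:

#include <cstdlib>

int main() {
  // Suppress messages below severity 2 (INFO and WARNING); must run before the
  // first LOG() call, because the level is cached in a function-local static.
  setenv("MACE_CPP_MIN_LOG_LEVEL", "2", /*overwrite=*/1);
  // Show VLOG(1) but not VLOG(2) and above, per MinVLogLevel() semantics.
  setenv("MACE_CPP_MIN_VLOG_LEVEL", "1", /*overwrite=*/1);
  return 0;
}

Equivalent shell usage would export the same variables before launching the binary.
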
......@@ -9,8 +9,6 @@
#include <limits>
#include <string>
#include "mace/core/integral_types.h"
#undef ERROR
namespace mace {
......@@ -62,7 +60,7 @@ class LogMessage : public std::basic_ostringstream<char> {
// Returns the minimum log level for VLOG statements.
// E.g., if MinVLogLevel() is 2, then VLOG(2) statements will produce output,
// but VLOG(3) will not. Defaults to 0.
static int64 MinVLogLevel();
static int64_t MinVLogLevel();
protected:
void GenerateLogMessage();
......
......@@ -6,8 +6,8 @@
namespace mace {
std::map<int32, OperatorRegistry*>* gDeviceTypeRegistry() {
static std::map<int32, OperatorRegistry*> g_device_type_registry;
std::map<int32_t, OperatorRegistry*>* gDeviceTypeRegistry() {
static std::map<int32_t, OperatorRegistry*> g_device_type_registry;
return &g_device_type_registry;
}
......
......@@ -44,7 +44,7 @@ class OperatorBase {
*operator_def_, name, default_value);
}
inline const Tensor *Input(TIndex idx) {
inline const Tensor *Input(index_t idx) {
MACE_CHECK(idx < inputs_.size());
return inputs_[idx];
}
......
......@@ -17,8 +17,8 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
DeviceType type) {
unique_ptr<Tensor> tensor(new Tensor(GetDeviceAllocator(type),
proto.data_type()));
vector<TIndex> dims;
for (const TIndex d : proto.dims()) {
vector<index_t> dims;
for (const index_t d : proto.dims()) {
dims.push_back(d);
}
tensor->Resize(dims);
......@@ -33,31 +33,31 @@ unique_ptr<Tensor> Serializer::Deserialize(const TensorProto &proto,
proto.double_data().size());
break;
case DT_INT32:
tensor->template Copy<int32>(proto.int32_data().data(),
tensor->template Copy<int32_t>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_UINT8:
tensor->CopyWithCast<int32, uint8>(proto.int32_data().data(),
tensor->CopyWithCast<int32_t, uint8_t>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_INT16:
tensor->CopyWithCast<int32, int16>(proto.int32_data().data(),
tensor->CopyWithCast<int32_t, int16_t>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_INT8:
tensor->CopyWithCast<int32, int8>(proto.int32_data().data(),
tensor->CopyWithCast<int32_t, int8_t>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_INT64:
tensor->Copy<int64>(proto.int64_data().data(),
tensor->Copy<int64_t>(proto.int64_data().data(),
proto.int64_data().size());
break;
case DT_UINT16:
tensor->CopyWithCast<int32, uint16>(proto.int32_data().data(),
tensor->CopyWithCast<int32_t, uint16_t>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_BOOL:
tensor->CopyWithCast<int32, bool>(proto.int32_data().data(),
tensor->CopyWithCast<int32_t, bool>(proto.int32_data().data(),
proto.int32_data().size());
break;
case DT_STRING: {
......
......@@ -25,13 +25,13 @@ namespace mace {
switch (TYPE_ENUM) { \
CASE(float, SINGLE_ARG(STMTS)) \
CASE(double, SINGLE_ARG(STMTS)) \
CASE(int32, SINGLE_ARG(STMTS)) \
CASE(uint8, SINGLE_ARG(STMTS)) \
CASE(uint16, SINGLE_ARG(STMTS)) \
CASE(int16, SINGLE_ARG(STMTS)) \
CASE(int8, SINGLE_ARG(STMTS)) \
CASE(int32_t, SINGLE_ARG(STMTS)) \
CASE(uint8_t, SINGLE_ARG(STMTS)) \
CASE(uint16_t, SINGLE_ARG(STMTS)) \
CASE(int16_t, SINGLE_ARG(STMTS)) \
CASE(int8_t, SINGLE_ARG(STMTS)) \
CASE(string, SINGLE_ARG(STMTS)) \
CASE(int64, SINGLE_ARG(STMTS)) \
CASE(int64_t, SINGLE_ARG(STMTS)) \
CASE(bool, SINGLE_ARG(STMTS)) \
case DT_INVALID: \
INVALID; \
......@@ -64,17 +64,17 @@ class Tensor {
inline DataType dtype() const { return dtype_; }
inline const vector<TIndex>& shape() const { return shape_; }
inline const vector<index_t>& shape() const { return shape_; }
inline TIndex dim_size() const { return shape_.size(); }
inline index_t dim_size() const { return shape_.size(); }
inline TIndex dim(TIndex index) const {
inline index_t dim(index_t index) const {
MACE_CHECK(index < shape_.size(), "Exceeding ndim limit");
MACE_CHECK(index >= 0, "Cannot have negative dimension index");
return shape_[index];
}
inline TIndex size() const { return size_; }
inline index_t size() const { return size_; }
inline const void* raw_data() const {
MACE_CHECK(data_.get() || size_ == 0);
......@@ -108,9 +108,9 @@ class Tensor {
return static_cast<T*>(raw_mutable_data());
}
inline void Resize(const vector<TIndex>& shape) {
inline void Resize(const vector<index_t>& shape) {
shape_ = shape;
TIndex size = NumElements();
index_t size = NumElements();
if (size_ != size) {
size_ = size;
data_.reset();
......@@ -126,14 +126,14 @@ class Tensor {
}
template <typename T>
inline void Copy(const T* src, TIndex size) {
inline void Copy(const T* src, index_t size) {
MACE_CHECK(size == size_, "copy src and dst with different size.");
CopyBytes(static_cast<const void*>(src), sizeof(T) * size);
}
template <typename SrcType, typename DstType>
inline void CopyWithCast(const SrcType* src, size_t size) {
MACE_CHECK(static_cast<TIndex>(size) == size_, "copy src and dst with different size.");
MACE_CHECK(static_cast<index_t>(size) == size_, "copy src and dst with different size.");
unique_ptr<DstType[]> buffer(new DstType[size]);
for (size_t i = 0; i < size; ++i) {
buffer[i] = static_cast<DstType>(src[i]);
......@@ -161,15 +161,15 @@ class Tensor {
}
private:
inline int64 NumElements() const {
return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64>());
inline int64_t NumElements() const {
return std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int64_t>());
}
Allocator* alloc_;
TIndex size_;
index_t size_;
DataType dtype_;
std::shared_ptr<void> data_;
vector<TIndex> shape_;
vector<index_t> shape_;
};
} // namespace tensor
......
......@@ -16,10 +16,10 @@ namespace mace {
namespace testing {
inline int64 NowMicros() {
inline int64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<int64>(tv.tv_sec) * 1000000 + tv.tv_usec;
return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
} // namespace testing
......
......@@ -16,10 +16,10 @@ namespace testing {
static std::vector<Benchmark*>* all_benchmarks = nullptr;
static std::string label;
static int64 bytes_processed;
static int64 items_processed;
static int64 accum_time = 0;
static int64 start_time = 0;
static int64_t bytes_processed;
static int64_t items_processed;
static int64_t accum_time = 0;
static int64_t start_time = 0;
Benchmark::Benchmark(const char* name, void (*fn)(int))
: name_(name), num_args_(0), fn0_(fn) {
......@@ -112,10 +112,10 @@ void Benchmark::Register() {
}
void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
static const int64 kMinIters = 100;
static const int64 kMaxIters = 1000000000;
static const int64_t kMinIters = 100;
static const int64_t kMaxIters = 1000000000;
static const double kMinTime = 0.5;
int64 iters = kMinIters;
int64_t iters = kMinIters;
while (true) {
accum_time = 0;
start_time = NowMicros();
......@@ -142,13 +142,13 @@ void Benchmark::Run(int arg1, int arg2, int* run_count, double* run_seconds) {
double multiplier = 1.4 * kMinTime / std::max(seconds, 1e-9);
multiplier = std::min(10.0, multiplier);
if (multiplier <= 1.0) multiplier *= 2.0;
iters = std::max<int64>(multiplier * iters, iters + 1);
iters = std::max<int64_t>(multiplier * iters, iters + 1);
iters = std::min(iters, kMaxIters);
}
}
void BytesProcessed(int64 n) { bytes_processed = n; }
void ItemsProcessed(int64 n) { items_processed = n; }
void BytesProcessed(int64_t n) { bytes_processed = n; }
void ItemsProcessed(int64_t n) { items_processed = n; }
void StartTiming() {
if (start_time == 0) start_time = NowMicros();
}
......
......@@ -42,8 +42,8 @@ class Benchmark {
};
void RunBenchmarks();
void BytesProcessed(int64);
void ItemsProcessed(int64);
void BytesProcessed(int64_t);
void ItemsProcessed(int64_t);
void StartTiming();
void StopTiming();
......
......@@ -42,16 +42,16 @@ struct EnumToDataType {}; // Specializations below
MATCH_TYPE_AND_ENUM(float, DT_FLOAT);
MATCH_TYPE_AND_ENUM(double, DT_DOUBLE);
MATCH_TYPE_AND_ENUM(int32, DT_INT32);
MATCH_TYPE_AND_ENUM(uint16, DT_UINT16);
MATCH_TYPE_AND_ENUM(uint8, DT_UINT8);
MATCH_TYPE_AND_ENUM(int16, DT_INT16);
MATCH_TYPE_AND_ENUM(int8, DT_INT8);
MATCH_TYPE_AND_ENUM(int32_t, DT_INT32);
MATCH_TYPE_AND_ENUM(uint16_t, DT_UINT16);
MATCH_TYPE_AND_ENUM(uint8_t, DT_UINT8);
MATCH_TYPE_AND_ENUM(int16_t, DT_INT16);
MATCH_TYPE_AND_ENUM(int8_t, DT_INT8);
MATCH_TYPE_AND_ENUM(string, DT_STRING);
MATCH_TYPE_AND_ENUM(int64, DT_INT64);
MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
static const int32 kint32max = ((int32)0x7FFFFFFF);
static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
} // namespace mace
......
......@@ -7,10 +7,6 @@ cc_binary(
"helloworld.cc",
],
copts = ["-std=c++11"],
linkopts = if_android([
"-pie",
"-llog",
]),
deps = [
"//mace/core",
"//mace/ops",
......@@ -21,10 +17,6 @@ cc_test(
name = "benchmark_example",
srcs = ["benchmark_example.cc"],
copts = ["-std=c++11"],
linkopts = if_android([
"-pie",
"-llog",
]),
linkstatic = 1,
deps = [
"//mace/core",
......
......@@ -6,7 +6,7 @@
static void foo(int iters) {
static const int N = 32;
const int64 tot = static_cast<int64>(iters) * N;
const int64_t tot = static_cast<int64_t>(iters) * N;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
......@@ -26,7 +26,7 @@ BENCHMARK(foo);
static void bar(int iters, int n) {
const int64 tot = static_cast<int64>(iters) * n;
const int64_t tot = static_cast<int64_t>(iters) * n;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
......
......@@ -18,6 +18,9 @@ cc_library(
"//mace/core:core",
],
copts = ['-std=c++11'],
linkopts = ["-fopenmp"] + if_android([
"-lm",
]),
)
cc_test(
......@@ -29,11 +32,9 @@ cc_test(
"//mace/core:core",
],
copts = ['-std=c++11'],
linkopts = ["-fopenmp"] + if_android([
linkopts = if_android([
"-pie",
"-llog",
"-lm",
]),
]),
linkstatic = 1,
testonly = 1,
)
......@@ -47,11 +48,6 @@ cc_test(
"//mace/core:test_benchmark_main",
],
copts = ['-std=c++11'],
linkopts = ["-fopenmp"] + if_android([
"-pie",
"-llog",
"-lm",
]),
linkstatic = 1,
testonly = 1,
)
......@@ -15,7 +15,7 @@ void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tenso
int n = input_tensor.size();
MACE_CHECK(n > 1);
MACE_CHECK_NOTNULL(input_tensor[0]);
int64 size = input_tensor[0]->size();
int64_t size = input_tensor[0]->size();
vector<const T*> inputs(n);
for (int i = 0; i < n; ++i) {
inputs[i] = input_tensor[i]->data<T>();
......@@ -24,7 +24,7 @@ void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tenso
T* output = output_tensor->mutable_data<T>();
for (int i = 0; i < n; ++i) {
for (int64 j = 0; j < size; ++j) {
for (int64_t j = 0; j < size; ++j) {
output[j] += inputs[i][j];
}
}
......
......@@ -30,9 +30,9 @@ struct BatchNormFunctor : public BatchNormFunctorBase<D, T> {
const T* offset,
const T* mean,
const T* var,
const TIndex n,
const TIndex channel,
const TIndex sample_size,
const index_t n,
const index_t channel,
const index_t sample_size,
T* output) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
......@@ -42,15 +42,15 @@ struct BatchNormFunctor : public BatchNormFunctorBase<D, T> {
// new_offset = \offset - mean * common_val;
// Y = new_scale * X + new_offset;
T new_scale, new_offset;
for (TIndex c = 0; c < channel; ++c) {
for (index_t c = 0; c < channel; ++c) {
new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
new_offset = offset[c] - mean[c] * new_scale;
TIndex pos = c * sample_size;
index_t pos = c * sample_size;
for (TIndex i = 0; i < n; ++i) {
for (index_t i = 0; i < n; ++i) {
const T* input_sample_ptr = input + pos;
T* output_sample_ptr = output + pos;
for (TIndex j = 0; j < sample_size; ++j) {
for (index_t j = 0; j < sample_size; ++j) {
output_sample_ptr[j] = new_scale * input_sample_ptr[j] + new_offset;
}
pos += channel * sample_size;
......
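
For reference, the inference-time formula this functor implements (its comment is split across the hunk boundary above) folds the per-channel statistics into one scale and one offset; in the code's own names, with \(\varepsilon\) denoting variance_epsilon_:

\[
\text{new\_scale}_c = \frac{\text{scale}_c}{\sqrt{\text{var}_c + \varepsilon}}, \qquad
\text{new\_offset}_c = \text{offset}_c - \text{mean}_c \cdot \text{new\_scale}_c, \qquad
Y = \text{new\_scale}_c \cdot X + \text{new\_offset}_c .
\]
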
......@@ -11,7 +11,7 @@ using namespace mace;
using namespace mace::kernels;
static void AddNBenchmark(int iters, int n, int type) {
const int64 tot = static_cast<int64>(iters) * n * 3;
const int64_t tot = static_cast<int64_t>(iters) * n * 3;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
......@@ -35,7 +35,7 @@ static void AddNBenchmark(int iters, int n, int type) {
float *input3 = input_tensor3.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
for (int64 i = 0; i < n; ++i) {
for (int64_t i = 0; i < n; ++i) {
input1[i] = nd(gen);
input2[i] = nd(gen);
input3[i] = nd(gen);
......
......@@ -11,7 +11,7 @@ using namespace mace;
using namespace mace::kernels;
static void ReluBenchmark(int iters, int n, int type) {
const int64 tot = static_cast<int64>(iters) * n;
const int64_t tot = static_cast<int64_t>(iters) * n;
mace::testing::ItemsProcessed(tot);
mace::testing::BytesProcessed(tot * (sizeof(float)));
......@@ -25,7 +25,7 @@ static void ReluBenchmark(int iters, int n, int type) {
output_tensor.ResizeLike(input_tensor);
float *input = input_tensor.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
for (int64 i = 0; i < n; ++i) {
for (int64_t i = 0; i < n; ++i) {
input[i] = nd(gen);
}
......
......@@ -14,7 +14,7 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
int n = input_tensor.size();
MACE_CHECK(n > 1);
MACE_CHECK_NOTNULL(input_tensor[0]);
int64 size = input_tensor[0]->size();
int64_t size = input_tensor[0]->size();
output_tensor->ResizeLike(input_tensor[0]);
float *output = output_tensor->mutable_data<float>();
vector<const float *> inputs(n);
......@@ -22,19 +22,19 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
inputs[i] = input_tensor[i]->data<float>();
}
int64 cost = size * n;
int64 groups = 1;
int64_t cost = size * n;
int64_t groups = 1;
if (cost > kCostPerGroup) {
groups = cost / kCostPerGroup;
}
int64 element_per_group = size / groups;
int64_t element_per_group = size / groups;
#pragma omp parallel for num_threads(1) // no significant performance improve
for (int64 i = 0; i < size; i += element_per_group) {
int64 count = std::min(element_per_group, size - i);
for (int64_t i = 0; i < size; i += element_per_group) {
int64_t count = std::min(element_per_group, size - i);
int nn = count >> 2;
int remain = count - (nn << 2);
for (int64 j = 0; j < n; ++j) {
for (int64_t j = 0; j < n; ++j) {
const float *inptr = inputs[j] + i;
float *outptr = output + i;
for (int k = 0; k < nn; ++k) {
......
......@@ -34,18 +34,18 @@ struct BatchNormFunctor<DeviceType::NEON, T> : public BatchNormFunctorBase<Devic
T new_scale, new_offset;
int count = sample_size >> 2;
int remain_count = sample_size - count;
for (TIndex c = 0; c < channel; ++c) {
for (index_t c = 0; c < channel; ++c) {
new_scale = scale[c] / std::sqrt(var[c] + this->variance_epsilon_);
new_offset = offset[c] - mean[c] * new_scale;
TIndex pos = c * sample_size;
index_t pos = c * sample_size;
float32x4_t new_scale_f = vdupq_n_f32(new_scale);
float32x4_t new_offset_f = vdupq_n_f32(new_offset);
for (TIndex i = 0; i < n; ++i) {
for (index_t i = 0; i < n; ++i) {
const float* input_sample_ptr = input + pos;
float* output_sample_ptr = output + pos;
for(TIndex j = 0; j < count; ++j) {
for(index_t j = 0; j < count; ++j) {
float32x4_t input_f = vld1q_f32(input_sample_ptr);
float32x4_t output_f = new_offset_f;
output_f = vfmaq_f32(output_f, input_f, new_scale_f);
......@@ -53,7 +53,7 @@ struct BatchNormFunctor<DeviceType::NEON, T> : public BatchNormFunctorBase<Devic
input_sample_ptr += 4;
output_sample_ptr += 4;
}
for(TIndex j = 0; j < remain_count; ++j) {
for(index_t j = 0; j < remain_count; ++j) {
*output_sample_ptr = new_scale * *input_sample_ptr + new_offset;
++output_sample_ptr;
++input_sample_ptr;
......
......@@ -10,14 +10,14 @@ namespace kernels {
void NeonReluFuntion_float(const Tensor *input_tensor,
Tensor *output_tensor) {
int64 size = input_tensor->size();
int64_t size = input_tensor->size();
output_tensor->ResizeLike(input_tensor);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
#pragma omp parallel for num_threads(1) // no significant performance improve
for (int64 i = 0; i < size; i += kCostPerGroup) {
int64 count = std::min(static_cast<int64>(kCostPerGroup), size - i);
for (int64_t i = 0; i < size; i += kCostPerGroup) {
int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
int nn = count >> 2;
int remain = count - (nn << 2);
const float *inptr = input + i;
......
......@@ -12,12 +12,12 @@ namespace kernels {
template<typename T>
void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
int64 size = input_tensor->size();
int64_t size = input_tensor->size();
output_tensor->ResizeLike(input_tensor);
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
for (int64 i = 0; i < size; ++i) {
for (int64_t i = 0; i < size; ++i) {
output[i] = std::max(input[i], static_cast<T>(0));
}
}
......
......@@ -15,7 +15,7 @@ TEST(NeonTest, AddN) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
int64 count = 100000;
int64_t count = 100000;
Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
input_tensor1.Resize({100, 1000});
Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
......@@ -37,7 +37,7 @@ TEST(NeonTest, AddN) {
float *output = output_tensor.mutable_data<float>();
float *output_neon = output_tensor_neon.mutable_data<float>();
for (int64 i = 0; i < count; ++i) {
for (int64_t i = 0; i < count; ++i) {
input1[i] = nd(gen);
input2[i] = nd(gen);
input3[i] = nd(gen);
......@@ -48,7 +48,7 @@ TEST(NeonTest, AddN) {
ASSERT_EQ(count, output_tensor.size());
ASSERT_EQ(count, output_tensor_neon.size());
for (int64 i = 0; i < count; ++i) {
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
}
......
......@@ -15,7 +15,7 @@ TEST(NeonTest, Relu) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
int64 count = 100000;
int64_t count = 100000;
Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
input_tensor.Resize({100, 1000});
Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
......@@ -27,7 +27,7 @@ TEST(NeonTest, Relu) {
float *output = output_tensor.mutable_data<float>();
float *output_neon = output_tensor_neon.mutable_data<float>();
for (int64 i = 0; i < count; ++i) {
for (int64_t i = 0; i < count; ++i) {
input[i] = nd(gen);
}
......@@ -36,7 +36,7 @@ TEST(NeonTest, Relu) {
ASSERT_EQ(count, output_tensor.size());
ASSERT_EQ(count, output_tensor_neon.size());
for (int64 i = 0; i < count; ++i) {
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
}
......
......@@ -33,9 +33,9 @@ class BatchNormOp : public Operator<D, T> {
Tensor* output = this->Output(0);
output->ResizeLike(input);
const TIndex n = input->dim(0);
const TIndex channel = input->dim(1);
const TIndex sample_size = input->dim(2) * input->dim(3);
const index_t n = input->dim(0);
const index_t channel = input->dim(1);
const index_t sample_size = input->dim(2) * input->dim(3);
const float* input_ptr = input->data<float>();
const float* scale_ptr = scale->data<float>();
......
......@@ -43,7 +43,7 @@ class OpsTestBase : public ::testing::Test {
}
public:
template <typename T>
void AddInputFromArray(const char* name, const std::vector<TIndex>& shape, const std::vector<T>& data) {
void AddInputFromArray(const char* name, const std::vector<index_t>& shape, const std::vector<T>& data) {
Tensor* input = ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
input->Resize(shape);
float* input_data = input->mutable_data<float>();
......@@ -70,7 +70,7 @@ class OpsTestBase : public ::testing::Test {
};
template <typename T>
Tensor CreateTensor(const std::vector<TIndex>& shape, const std::vector<T>& data) {
Tensor CreateTensor(const std::vector<index_t>& shape, const std::vector<T>& data) {
Tensor res(cpu_allocator(), DataTypeToEnum<T>::v());
res.Resize(shape);
float* input_data = res.mutable_data<float>();
......@@ -90,7 +90,7 @@ inline std::string ShapeToString(const Tensor& x) {
std::stringstream stream;
for (int i = 0; i < x.dim_size(); i++) {
if (i > 0) stream<<",";
int64 dim = x.dim(i);
int64_t dim = x.dim(i);
if (dim < 0) {
stream<<"?";
} else {
......