提交 3f418e2d 编写于 作者: Y yejianwu

fix broken host run

上级 7c1711d8
......@@ -52,10 +52,10 @@ cc_library(
]),
deps = [
"//mace/utils",
"//mace/codegen:generated_version",
] + if_android([
":opencl_headers",
"//mace/codegen:generated_opencl",
"//mace/codegen:generated_version",
"@half//:half",
]) + if_production_mode([
"//mace/codegen:generated_tuning_params",
......
......@@ -79,7 +79,6 @@ extern void Register_AddN(OperatorRegistry *op_registry);
extern void Register_BatchNorm(OperatorRegistry *op_registry);
extern void Register_BatchToSpaceND(OperatorRegistry *op_registry);
extern void Register_BiasAdd(OperatorRegistry *op_registry);
extern void Register_BufferToImage(OperatorRegistry *op_registry);
extern void Register_ChannelShuffle(OperatorRegistry *op_registry);
extern void Register_Concat(OperatorRegistry *op_registry);
extern void Register_Conv2D(OperatorRegistry *op_registry);
......@@ -92,7 +91,6 @@ extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_FusedConv2D(OperatorRegistry *op_registry);
extern void Register_GlobalAvgPooling(OperatorRegistry *op_registry);
extern void Register_ImageToBuffer(OperatorRegistry *op_registry);
extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry);
extern void Register_Pad(OperatorRegistry *op_registry);
......@@ -111,6 +109,11 @@ extern void Register_SpaceToDepth(OperatorRegistry *op_registry);
extern void Register_Transpose(OperatorRegistry *op_registry);
extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry);
extern void Register_WinogradTransform(OperatorRegistry *op_registry);
#ifdef MACE_ENABLE_OPENCL
extern void Register_BufferToImage(OperatorRegistry *op_registry);
extern void Register_ImageToBuffer(OperatorRegistry *op_registry);
#endif // MACE_ENABLE_OPENCL
} // namespace ops
OperatorRegistry::OperatorRegistry() {
......@@ -120,7 +123,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_BatchNorm(this);
ops::Register_BatchToSpaceND(this);
ops::Register_BiasAdd(this);
ops::Register_BufferToImage(this);
ops::Register_ChannelShuffle(this);
ops::Register_Concat(this);
ops::Register_Conv2D(this);
......@@ -133,7 +135,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_FullyConnected(this);
ops::Register_FusedConv2D(this);
ops::Register_GlobalAvgPooling(this);
ops::Register_ImageToBuffer(this);
ops::Register_LocalResponseNorm(this);
ops::Register_MatMul(this);
ops::Register_Pad(this);
......@@ -152,6 +153,11 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_Transpose(this);
ops::Register_WinogradInverseTransform(this);
ops::Register_WinogradTransform(this);
#ifdef MACE_ENABLE_OPENCL
ops::Register_BufferToImage(this);
ops::Register_ImageToBuffer(this);
#endif // MACE_ENABLE_OPENCL
}
} // namespace mace
......@@ -16,6 +16,7 @@
#include <omp.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <algorithm>
#include <utility>
......@@ -86,7 +87,7 @@ void SetThreadAffinity(cpu_set_t mask) {
#if defined(__ANDROID__)
pid_t pid = gettid();
#else
pid_t pid = pthread_self();
pid_t pid = syscall(SYS_gettid);
#endif
int err = sched_setaffinity(pid, sizeof(mask), &mask);
MACE_CHECK(err == 0, "set affinity error: ", errno);
......
......@@ -14,20 +14,30 @@ cc_library(
srcs = glob(
[
"*.cc",
"opencl/*.cc",
"arm/*.cc",
],
exclude = [
"*_test.cc",
"arm/*_test.cc",
],
) + if_android(glob([
"opencl/*.cc",
],
exclude = [
"opencl/*_test.cc",
])),
hdrs = glob(
[
"*.h",
"arm/*.h",
],
),
hdrs = glob([
"*.h",
"opencl/*.h",
"arm/*.h",
]),
exclude = [
"buffer_to_image.h",
],
) + if_android(glob([
"opencl/*.h",
"buffer_to_image.h",
])),
copts = if_openmp_enabled(["-fopenmp"]) +
if_neon_enabled(["-DMACE_ENABLE_NEON"]) +
if_android_armv7(["-mfpu=neon"]) +
......
......@@ -21,10 +21,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -166,6 +169,7 @@ class ActivationFunctor<DeviceType::NEON, float> {
float relux_max_limit_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
public:
......@@ -186,6 +190,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
std::string tuning_key_prefix_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -23,9 +23,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -88,6 +91,7 @@ struct AddNFunctor {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct AddNFunctor<DeviceType::OPENCL, T> {
void operator()(const std::vector<const Tensor *> &input_tensors,
......@@ -99,6 +103,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -22,11 +22,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -159,7 +162,7 @@ struct BatchNormFunctor<DeviceType::NEON, float> : BatchNormFunctorBase {
StatsFuture *future);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
......@@ -179,6 +182,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -19,10 +19,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -68,6 +71,7 @@ void BiasAddFunctor<DeviceType::NEON, float>::operator()(
StatsFuture *future);
*/
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
......@@ -79,6 +83,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -60,6 +60,7 @@ struct ChannelShuffleFunctor {
const int groups_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
......@@ -72,6 +73,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
const int groups_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -19,11 +19,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -88,6 +91,7 @@ struct ConcatFunctor : ConcatFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
......@@ -100,6 +104,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -23,12 +23,15 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -457,6 +460,7 @@ struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
ScratchBuffer *scratch_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
......@@ -485,6 +489,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -20,9 +20,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -116,6 +119,7 @@ struct CWiseFunctor : CWiseFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
CWiseFunctor(const CWiseType type, const float coeff)
......@@ -130,6 +134,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -18,10 +18,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -112,6 +115,7 @@ struct DepthToSpaceOpFunctor {
bool d2s_;
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s)
......@@ -125,6 +129,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -23,11 +23,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -441,6 +444,7 @@ struct DepthwiseConv2dFunctor<DeviceType::NEON, float>
StatsFuture *future);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
: DepthwiseConv2dFunctorBase {
......@@ -468,6 +472,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -20,9 +20,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -105,6 +108,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
......@@ -120,6 +124,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -19,16 +19,19 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/opencl/helper.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
struct FullyConnectedBase {
FullyConnectedBase(const BufferType weight_type,
FullyConnectedBase(const int /*BufferType*/ weight_type,
const ActivationType activation,
const float relux_max_limit)
: weight_type_(weight_type),
......@@ -42,7 +45,7 @@ struct FullyConnectedBase {
template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(weight_type, activation, relux_max_limit) {}
......@@ -89,7 +92,7 @@ struct FullyConnectedFunctor : FullyConnectedBase {
template <>
struct FullyConnectedFunctor<DeviceType::NEON, float> : FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(weight_type, activation, relux_max_limit) {}
......@@ -101,9 +104,10 @@ struct FullyConnectedFunctor<DeviceType::NEON, float> : FullyConnectedBase {
StatsFuture *future);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
FullyConnectedFunctor(const int /*BufferType*/ weight_type,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(weight_type, activation, relux_max_limit) {}
......@@ -120,6 +124,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
std::vector<index_t> input_shape_;
std::unique_ptr<BufferBase> kernel_error_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -14,6 +14,7 @@
#include <math.h>
#include <algorithm>
#include <cstring>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
......
......@@ -10,10 +10,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......
......@@ -25,11 +25,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -70,6 +73,7 @@ struct MatMulFunctor {
}
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *A,
......@@ -81,6 +85,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -20,9 +20,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -79,6 +82,7 @@ struct PadFunctor : public PadFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase {
PadFunctor(const std::vector<int> &paddings,
......@@ -94,6 +98,7 @@ struct PadFunctor<DeviceType::OPENCL, T> : PadFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -21,10 +21,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
enum PoolingType {
......@@ -192,6 +195,7 @@ struct PoolingFunctor<DeviceType::NEON, float> : PoolingFunctorBase {
StatsFuture *future);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
......@@ -212,6 +216,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -18,9 +18,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......
......@@ -18,9 +18,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......
......@@ -20,9 +20,12 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -177,6 +180,7 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
: ResizeBilinearFunctorBase {
......@@ -190,6 +194,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -20,11 +20,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -81,6 +84,7 @@ struct SliceFunctor : SliceFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
......@@ -92,6 +96,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
uint32_t kwg_size_;
std::unique_ptr<BufferBase> kernel_error_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -21,11 +21,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/utils/utils.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -72,6 +75,7 @@ struct SoftmaxFunctor<DeviceType::NEON, float> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
......@@ -81,6 +85,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -19,10 +19,13 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -54,6 +57,7 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
SpaceToBatchFunctor(const std::vector<int> &paddings,
......@@ -71,6 +75,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> space_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -19,11 +19,14 @@
#include <vector>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace mace {
namespace kernels {
......@@ -52,6 +55,7 @@ struct WinogradTransformFunctor : WinogradTransformFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T>
: WinogradTransformFunctorBase {
......@@ -66,6 +70,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
struct WinogradInverseTransformFunctorBase {
WinogradInverseTransformFunctorBase(const int batch,
......@@ -104,6 +109,7 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
}
};
#ifdef MACE_ENABLE_OPENCL
template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
: WinogradInverseTransformFunctorBase {
......@@ -125,6 +131,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
std::unique_ptr<BufferBase> kernel_error_;
std::vector<index_t> input_shape_;
};
#endif // MACE_ENABLE_OPENCL
} // namespace kernels
} // namespace mace
......
......@@ -28,8 +28,14 @@ cc_library(
exclude = [
"*_test.cc",
"*_benchmark.cc",
"buffer_to_image.cc",
"image_to_buffer.cc",
],
),
) + if_android(
[
"buffer_to_image.cc",
"image_to_buffer.cc",
]),
hdrs = glob(
["*.h"],
exclude = ["ops_test_util.h"],
......
......@@ -24,6 +24,7 @@ void Register_Activation(OperatorRegistry *op_registry) {
.Build(),
ActivationOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_Activation(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_AddN(OperatorRegistry *op_registry) {
.Build(),
AddNOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_AddN(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::NEON)
......
......@@ -24,6 +24,7 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
.Build(),
BatchNormOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -18,6 +18,7 @@ namespace mace {
namespace ops {
void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -28,6 +29,7 @@ void Register_BatchToSpaceND(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BatchToSpaceNDOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,6 +24,7 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
.Build(),
BiasAddOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_BiasAdd(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BiasAddOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,16 +23,20 @@ void Register_ChannelShuffle(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
ChannelShuffleOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
ChannelShuffleOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,16 +23,21 @@ void Register_Concat(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
ConcatOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
.Build(),
Conv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::NEON)
......
......@@ -24,6 +24,7 @@ void Register_CWise(OperatorRegistry *op_registry) {
.Build(),
CWiseOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("CWise")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_CWise(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
CWiseOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,6 +24,7 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {
.Build(),
DepthToSpaceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_DepthToSpace(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
DepthToSpaceOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,6 +24,7 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.Build(),
DepthwiseConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::NEON)
......
......@@ -24,6 +24,7 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
.Build(),
EltwiseOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
EltwiseOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
.Build(),
FoldedBatchNormOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
.Build(),
FullyConnectedOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FullyConnectedOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::NEON)
......
......@@ -28,10 +28,9 @@ class FullyConnectedOp : public Operator<D, T> {
public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(static_cast<kernels::BufferType>(
OperatorBase::GetSingleArgument<int>(
"weight_type", static_cast<int>(
kernels::WEIGHT_WIDTH))),
functor_(OperatorBase::GetSingleArgument<int>(
"weight_type",
7 /*static_cast<int>(kernels::WEIGHT_WIDTH)*/),
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
......
......@@ -24,6 +24,7 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
.Build(),
FusedConv2dOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_MatMul(OperatorRegistry *op_registry) {
.Build(),
MatMulOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_MatMul(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
MatMulOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,6 +24,7 @@ void Register_Pad(OperatorRegistry *op_registry) {
.Build(),
PadOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -34,6 +35,7 @@ void Register_Pad(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
PadOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -23,22 +23,21 @@ void Register_Pooling(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::CPU)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
.Build(),
ResizeBilinearOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_ResizeBilinear(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
ResizeBilinearOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,16 +24,20 @@ void Register_Slice(OperatorRegistry *op_registry) {
.Build(),
SliceOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::OPENCL, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -24,6 +24,7 @@ void Register_Softmax(OperatorRegistry *op_registry) {
.Build(),
SoftmaxOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,8 @@ void Register_Softmax(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
......
......@@ -18,16 +18,19 @@ namespace mace {
namespace ops {
// Registers the SpaceToBatchND operator with the global registry.
// This op only has an OpenCL implementation here, so the entire body is
// guarded by MACE_ENABLE_OPENCL: on builds without OpenCL (e.g. host
// runs) this function compiles to a no-op instead of referencing a
// kernel that does not exist.
void Register_SpaceToBatchND(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
// float32 variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, float>);
// half-precision (fp16) variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
SpaceToBatchNDOp<DeviceType::OPENCL, half>);
#endif  // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -24,6 +24,7 @@ void Register_SpaceToDepth(OperatorRegistry *op_registry) {
.Build(),
SpaceToDepthOp<DeviceType::CPU, float>);
#ifdef MACE_ENABLE_OPENCL
REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -35,6 +36,7 @@ void Register_SpaceToDepth(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SpaceToDepthOp<DeviceType::OPENCL, half>);
#endif // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -18,16 +18,19 @@ namespace mace {
namespace ops {
// Registers the WinogradInverseTransform operator (the output-side half
// of the Winograd convolution pipeline). OpenCL is the only backend
// provided, so registration is compiled out entirely when
// MACE_ENABLE_OPENCL is not defined, keeping host-only builds linkable.
void Register_WinogradInverseTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
// float32 variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::OPENCL, float>);
// half-precision (fp16) variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
WinogradInverseTransformOp<DeviceType::OPENCL, half>);
#endif  // MACE_ENABLE_OPENCL
}
} // namespace ops
......
......@@ -18,16 +18,19 @@ namespace mace {
namespace ops {
// Registers the WinogradTransform operator (the input-side half of the
// Winograd convolution pipeline). Like its inverse counterpart, it is
// OpenCL-only, so the whole body is fenced with MACE_ENABLE_OPENCL and
// becomes a no-op on non-OpenCL (host) builds.
void Register_WinogradTransform(OperatorRegistry *op_registry) {
#ifdef MACE_ENABLE_OPENCL
// float32 variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
.Build(),
WinogradTransformOp<DeviceType::OPENCL, float>);
// half-precision (fp16) variant for the OPENCL device.
REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform")
.Device(DeviceType::OPENCL)
.TypeConstraint<half>("T")
.Build(),
WinogradTransformOp<DeviceType::OPENCL, half>);
#endif  // MACE_ENABLE_OPENCL
}
} // namespace ops
......
# Examples
load("//mace:mace.bzl", "if_openmp_enabled")
load("//mace:mace.bzl", "if_openmp_enabled", "if_android")
cc_binary(
name = "mace_run",
srcs = ["mace_run.cc"],
linkopts = if_openmp_enabled(["-fopenmp"]),
linkstatic = 1,
copts = if_android(["-DMACE_ENABLE_OPENCL"]),
deps = [
"//external:gflags_nothreads",
"//mace/codegen:generated_models",
......
......@@ -33,12 +33,15 @@
#include <numeric>
#include "gflags/gflags.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"
#include "mace/utils/env_time.h"
#include "mace/utils/logging.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/opencl_runtime.h"
#endif // MACE_ENABLE_OPENCL
// #include "mace/codegen/models/${MACE_MODEL_TAG}/${MACE_MODEL_TAG}.h" instead
namespace mace {
namespace MACE_MODEL_TAG {
......@@ -113,6 +116,7 @@ DeviceType ParseDeviceType(const std::string &device_str) {
}
}
#ifdef MACE_ENABLE_OPENCL
void WriteOpenCLPlatformInfo(const std::string &output_dir) {
std::string platform_info = OpenCLRuntime::Global()->platform_info();
const std::string cl_platform_info_file_name = output_dir
......@@ -126,6 +130,7 @@ void WriteOpenCLPlatformInfo(const std::string &output_dir) {
LOG(WARNING) << "Write opencl platform info failed.";
}
}
#endif // MACE_ENABLE_OPENCL
struct mallinfo LogMallinfoChange(struct mallinfo prev) {
struct mallinfo curr = mallinfo();
......@@ -225,11 +230,13 @@ bool RunModel(const std::vector<std::string> &input_names,
mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::OPENCL) {
mace::SetGPUHints(
static_cast<GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<GPUPriorityHint>(FLAGS_gpu_priority_hint));
}
#endif // MACE_ENABLE_OPENCL
const char *kernel_path = getenv("MACE_CL_PROGRAM_PATH");
const std::string kernel_file_path =
......@@ -318,9 +325,11 @@ bool RunModel(const std::vector<std::string> &input_names,
printf("time %11.3f %11.3f %11.3f %11.3f %11.3f\n", create_net_millis,
mace_engine_ctor_millis, init_millis, warmup_millis, model_run_millis);
#ifdef MACE_ENABLE_OPENCL
if (device_type == DeviceType::OPENCL) {
WriteOpenCLPlatformInfo(kernel_file_path);
}
#endif // MACE_ENABLE_OPENCL
for (size_t i = 0; i < output_count; ++i) {
std::string output_name =
......
......@@ -34,26 +34,30 @@ from ConfigParser import ConfigParser
def get_target_socs(configs):
available_socs = sh_commands.adb_get_all_socs()
target_socs = available_socs
if hasattr(configs, "target_socs"):
target_socs = set(configs["target_socs"])
target_socs = target_socs & available_socs
if FLAGS.target_socs != "all":
socs = set(FLAGS.target_socs.split(','))
target_socs = target_socs & socs
missing_socs = socs.difference(target_socs)
if len(missing_socs) > 0:
print(
"Error: devices with SoCs are not connected %s" % missing_socs)
if "host" in configs["target_abis"]:
return [""]
else:
available_socs = sh_commands.adb_get_all_socs()
target_socs = available_socs
if hasattr(configs, "target_socs"):
target_socs = set(configs["target_socs"])
target_socs = target_socs & available_socs
if FLAGS.target_socs != "all":
socs = set(FLAGS.target_socs.split(','))
target_socs = target_socs & socs
missing_socs = socs.difference(target_socs)
if len(missing_socs) > 0:
print(
"Error: devices with SoCs are not connected %s" %
missing_socs)
exit(1)
if not target_socs:
print("Error: no device to run")
exit(1)
if not target_socs:
print("Error: no device to run")
exit(1)
return target_socs
return target_socs
def get_data_and_device_type(runtime):
......@@ -339,8 +343,8 @@ def parse_args():
def process_models(project_name, configs, embed_model_data, vlog_level,
target_soc, target_abi, serialno, phone_data_dir,
option_args):
target_abi, phone_data_dir, option_args,
target_soc="", serialno="", device_name=""):
hexagon_mode = get_hexagon_mode(configs)
model_output_dirs = []
for model_name in configs["models"]:
......@@ -358,11 +362,16 @@ def process_models(project_name, configs, embed_model_data, vlog_level,
# Create model build directory
model_path_digest = md5sum(model_config["model_file_path"])
device_name = sh_commands.adb_get_device_name_by_serialno(serialno)
model_output_dir = "%s/%s/%s/%s/%s/%s_%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest, device_name.replace(' ', ''),
target_soc, target_abi)
if target_abi == "host":
model_output_dir = "%s/%s/%s/%s/%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest, target_abi)
else:
model_output_dir = "%s/%s/%s/%s/%s/%s_%s/%s" % (
FLAGS.output_dir, project_name, "build",
model_name, model_path_digest, device_name.replace(' ', ''),
target_soc, target_abi)
model_output_dirs.append(model_output_dir)
if FLAGS.mode == "build" or FLAGS.mode == "all":
......@@ -563,15 +572,19 @@ def main(unused_args):
print("Run on device: %s, %s, %s" % (
serialno, props["ro.board.platform"],
props["ro.product.model"]))
device_name = \
sh_commands.adb_get_device_name_by_serialno(
serialno)
process_models(project_name, configs, embed_model_data,
vlog_level, target_soc, target_abi,
serialno, phone_data_dir, option_args)
vlog_level, target_abi, phone_data_dir,
option_args, target_soc, serialno,
device_name)
else:
print("====================================================")
print("Run on host")
process_models(project_name, configs, embed_model_data,
vlog_level, target_soc, target_abi, '',
phone_data_dir, option_args)
vlog_level, target_abi, phone_data_dir,
option_args)
if FLAGS.mode == "build" or FLAGS.mode == "all":
sh_commands.packaging_lib(FLAGS.output_dir, project_name)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册