diff --git a/.gitignore b/.gitignore
index d4f869a452e161f7d355e325b69cec0b7f4fc903..c02dd020231b2d414fe60a112e20cffba202f7f4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,7 +3,6 @@ build/*
 cmake-build/
 cmake-build-debug/
 docs/_build/
-*.a
 
 .idea/
 .vscode/
diff --git a/mace/BUILD.bazel b/mace/BUILD.bazel
index 35c6b405e07a511a6e422e1639c7819f7c960a62..748af938613ec84275140c6dc4425f5b56ee248c 100644
--- a/mace/BUILD.bazel
+++ b/mace/BUILD.bazel
@@ -131,3 +131,14 @@ config_setting(
     },
     visibility = ["//visibility:public"],
 )
+
+config_setting(
+    name = "rpcmem_enabled",
+    define_values = {
+        "rpcmem": "true",  # matched by --define rpcmem=true
+    },
+    values = {
+        "crosstool_top": "//external:android/crosstool",  # Android builds
+    },
+    visibility = ["//visibility:public"],
+)
diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel
index f9cc96970974706e491ece91a0d0700b08b8c9d7..fcb7b20773fafa2c48b9e1636271308ec161d8cf 100644
--- a/mace/core/BUILD.bazel
+++ b/mace/core/BUILD.bazel
@@ -18,6 +18,7 @@ load(
     "if_opencl_enabled",
     "if_openmp_enabled",
     "if_quantize_enabled",
+    "if_rpcmem_enabled",
 )
 
 cc_library(
@@ -75,6 +76,8 @@ cc_library(
     ]) + if_android_armv7([
         "-mfpu=neon-fp16",
         "-mfloat-abi=softfp",
+    ]) + if_rpcmem_enabled([
+        "-DMACE_ENABLE_RPCMEM",
     ]),
     linkopts = ["-ldl"],
     deps = [
@@ -94,6 +97,8 @@ cc_library(
         "//third_party/hta",
     ]) + if_apu_enabled([
         "//third_party/apu:libapu-frontend",
+    ]) + if_rpcmem_enabled([
+        "//third_party/rpcmem",
     ]),
 )
 
diff --git a/mace/core/CMakeLists.txt b/mace/core/CMakeLists.txt
index 25dd168023fede667d4323ecd4c8cd62d3052b7a..75b74bb9005d321355332de46f5921ba7d638822 100644
--- a/mace/core/CMakeLists.txt
+++ b/mace/core/CMakeLists.txt
@@ -45,6 +45,10 @@ if(MACE_ENABLE_MTK_APU)
   set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} apu-frontend)
 endif(MACE_ENABLE_MTK_APU)
 
+if(MACE_ENABLE_RPCMEM)  # Qualcomm rpcmem (ION) support
+  set(EXTRA_LINK_LIBS ${EXTRA_LINK_LIBS} rpcmem)
+endif(MACE_ENABLE_RPCMEM)
+
 add_library(core STATIC ${CORE_SRCS})
 target_link_libraries(core PRIVATE
   proto
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 66987c46791d6500fa654a65afecbaea4800b16c..e4e7b35f0b58a9e98a70bfa6f3559ab1e039463b 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -47,16 +47,20 @@ class Allocator {
  public:
   Allocator() {}
   virtual ~Allocator() noexcept {}
-  virtual MaceStatus New(size_t nbytes, void **result) const = 0;
+  virtual MaceStatus New(size_t nbytes, void **result) = 0;
   virtual MaceStatus NewImage(const std::vector<size_t> &image_shape,
                               const DataType dt,
-                              void **result) const = 0;
-  virtual void Delete(void *data) const = 0;
-  virtual void DeleteImage(void *data) const = 0;
-  virtual void *Map(void *buffer, size_t offset, size_t nbytes) const = 0;
+                              void **result) = 0;
+  virtual void Delete(void *data) = 0;
+  virtual void DeleteImage(void *data) = 0;
+  virtual void *Map(void *buffer,
+                    size_t offset,
+                    size_t nbytes,
+                    bool finish_cmd_queue) const = 0;
   virtual void *MapImage(void *buffer,
                          const std::vector<size_t> &image_shape,
-                         std::vector<size_t> *mapped_image_pitch) const = 0;
+                         std::vector<size_t> *mapped_image_pitch,
+                         bool finish_cmd_queue) const = 0;
   virtual void Unmap(void *buffer, void *mapper_ptr) const = 0;
   virtual bool OnHost() const = 0;
 };
@@ -64,7 +68,7 @@ class Allocator {
 class CPUAllocator : public Allocator {
  public:
   ~CPUAllocator() override {}
-  MaceStatus New(size_t nbytes, void **result) const override {
+  MaceStatus New(size_t nbytes, void **result) override {
     VLOG(3) << "Allocate CPU buffer: " << nbytes;
     if (nbytes == 0) {
       return MaceStatus::MACE_SUCCESS;
@@ -82,7 +86,7 @@ class CPUAllocator : public Allocator {
 
   MaceStatus NewImage(const std::vector<size_t> &shape,
                       const DataType dt,
-                      void **result) const override {
+                      void **result) override {
     MACE_UNUSED(shape);
     MACE_UNUSED(dt);
     MACE_UNUSED(result);
@@ -90,24 +94,30 @@ class CPUAllocator : public Allocator {
     return MaceStatus::MACE_SUCCESS;
   }
 
-  void Delete(void *data) const override {
+  void Delete(void *data) override {
     MACE_CHECK_NOTNULL(data);
     VLOG(3) << "Free CPU buffer";
     free(data);
   }
-  void DeleteImage(void *data) const override {
+  void DeleteImage(void *data) override {
     LOG(FATAL) << "Free CPU image";
     free(data);
   };
-  void *Map(void *buffer, size_t offset, size_t nbytes) const override {
+  void *Map(void *buffer,
+            size_t offset,
+            size_t nbytes,
+            bool finish_cmd_queue) const override {
     MACE_UNUSED(nbytes);
+    MACE_UNUSED(finish_cmd_queue);
     return reinterpret_cast<char*>(buffer) + offset;
   }
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
-                 std::vector<size_t> *mapped_image_pitch) const override {
+                 std::vector<size_t> *mapped_image_pitch,
+                 bool finish_cmd_queue) const override {
     MACE_UNUSED(image_shape);
     MACE_UNUSED(mapped_image_pitch);
+    MACE_UNUSED(finish_cmd_queue);
     return buffer;
   }
   void Unmap(void *buffer, void *mapper_ptr) const override {
diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index d1f5f1a507ffde8f884b81096ea19b7ffd60ba73..49be42179b1e38a17b3f5c2f21f94c82edf88a68 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -54,11 +54,13 @@ class BufferBase {
 
   virtual void *Map(index_t offset,
                     index_t length,
-                    std::vector<size_t> *pitch) const = 0;
+                    std::vector<size_t> *pitch,
+                    bool finish_cmd_queue) const = 0;
 
   virtual void UnMap(void *mapped_ptr) const = 0;
 
-  virtual void Map(std::vector<size_t> *pitch) = 0;
+  virtual void Map(std::vector<size_t> *pitch,
+                   bool finish_cmd_queue = true) = 0;
 
   virtual void UnMap() = 0;
 
@@ -171,10 +173,13 @@ class Buffer : public BufferBase {
     return this->Allocate(nbytes);
   }
 
-  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+  void *Map(index_t offset,
+            index_t length,
+            std::vector<size_t> *pitch,
+            bool finish_cmd_queue) const {
     MACE_CHECK_NOTNULL(buf_);
     MACE_UNUSED(pitch);
-    return allocator_->Map(buf_, offset, length);
+    return allocator_->Map(buf_, offset, length, finish_cmd_queue);
   }
 
   void UnMap(void *mapped_ptr) const {
@@ -183,9 +188,9 @@ class Buffer : public BufferBase {
     allocator_->Unmap(buf_, mapped_ptr);
   }
 
-  void Map(std::vector<size_t> *pitch) {
+  void Map(std::vector<size_t> *pitch, bool finish_cmd_queue = true) {
     MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
-    mapped_buf_ = Map(0, size_, pitch);
+    mapped_buf_ = Map(0, size_, pitch, finish_cmd_queue);
   }
 
   void UnMap() {
@@ -300,10 +305,14 @@ class Image : public BufferBase {
     return allocator_->NewImage(shape, data_type, &buf_);
   }
 
-  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
+  void *Map(index_t offset,
+            index_t length,
+            std::vector<size_t> *pitch,
+            bool finish_cmd_queue) const {
     MACE_UNUSED(offset);
     MACE_UNUSED(length);
     MACE_UNUSED(pitch);
+    MACE_UNUSED(finish_cmd_queue);
     MACE_NOT_IMPLEMENTED;
     return nullptr;
   }
@@ -314,11 +323,11 @@ class Image : public BufferBase {
     allocator_->Unmap(buf_, mapped_ptr);
   }
 
-  void Map(std::vector<size_t> *pitch) {
+  void Map(std::vector<size_t> *pitch, bool finish_cmd_queue = true) {
     MACE_CHECK_NOTNULL(buf_);
     MACE_CHECK(mapped_buf_ == nullptr, "buf has been already mapped");
     MACE_CHECK_NOTNULL(pitch);
-    mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch);
+    mapped_buf_ = allocator_->MapImage(buf_, shape_, pitch, finish_cmd_queue);
   }
 
   void UnMap() {
@@ -434,18 +443,21 @@ class BufferSlice : public BufferBase {
     return MaceStatus::MACE_SUCCESS;
   }
 
-  void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
-    return buffer_->Map(offset_ + offset, length, pitch);
+  void *Map(index_t offset,
+            index_t length,
+            std::vector<size_t> *pitch,
+            bool finish_cmd_queue) const {
+    return buffer_->Map(offset_ + offset, length, pitch, finish_cmd_queue);
   }
 
   void UnMap(void *mapped_ptr) const {
     buffer_->UnMap(mapped_ptr);
   }
 
-  void Map(std::vector<size_t> *pitch) {
+  void Map(std::vector<size_t> *pitch, bool finish_cmd_queue = true) {
     MACE_CHECK_NOTNULL(buffer_);
     MACE_CHECK(mapped_buf_ == nullptr, "mapped buf is not null");
-    mapped_buf_ = buffer_->Map(offset_, size_, pitch);
+    mapped_buf_ = buffer_->Map(offset_, size_, pitch, finish_cmd_queue);
   }
 
   void UnMap() {
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index d999416959f96b58d9fc4c5a288a5cadb6065910..b0bf041b6883000057a63608a82761abfa2037a3 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -16,9 +16,11 @@
 
 #include "mace/core/runtime/opencl/opencl_allocator.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
+#ifdef MACE_ENABLE_RPCMEM
+#include "third_party/rpcmem/rpcmem.h"
+#endif  // MACE_ENABLE_RPCMEM
 
 namespace mace {
-
 namespace {
 
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
@@ -36,14 +38,29 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
       return 0;
   }
 }
+
+#ifdef MACE_ENABLE_RPCMEM
+std::once_flag ion_prepared;  // guards the one-time rpcmem setup below
+void PrepareQualcommION() {
+  rpcmem_init();
+  std::atexit(rpcmem_deinit);  // pair the init with a deinit at process exit
+}
+#endif  // MACE_ENABLE_RPCMEM
+
 }  // namespace
 
 OpenCLAllocator::OpenCLAllocator(
-    OpenCLRuntime *opencl_runtime):
-    opencl_runtime_(opencl_runtime) {}
+    OpenCLRuntime *opencl_runtime): opencl_runtime_(opencl_runtime) {
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    std::call_once(ion_prepared, PrepareQualcommION);  // runs at most once
+  }
+#endif  // MACE_ENABLE_RPCMEM
+}
 
 OpenCLAllocator::~OpenCLAllocator() {}
-MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
+
+MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) {
   if (nbytes == 0) {
     return MaceStatus::MACE_SUCCESS;
   }
@@ -53,10 +70,27 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
     return MaceStatus::MACE_OUT_OF_RESOURCES;
   }
 
-  cl_int error;
-  cl::Buffer *buffer = new cl::Buffer(opencl_runtime_->context(),
-                                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                      nbytes, nullptr, &error);
+  cl_int error = CL_SUCCESS;
+  cl::Buffer *buffer = nullptr;
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    cl_mem_ion_host_ptr ion_host;
+    CreateQualcommBufferIONHostPtr(nbytes, &ion_host);
+
+    buffer = new cl::Buffer(
+        opencl_runtime_->context(),
+        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
+        nbytes, &ion_host, &error);
+
+    cl_to_host_map_[static_cast<void *>(buffer)] = ion_host.ion_hostptr;
+  } else {
+#endif  // MACE_ENABLE_RPCMEM
+    buffer = new cl::Buffer(opencl_runtime_->context(),
+                            CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                            nbytes, nullptr, &error);
+#ifdef MACE_ENABLE_RPCMEM
+  }
+#endif  // MACE_ENABLE_RPCMEM
   if (error != CL_SUCCESS) {
     LOG(WARNING) << "Allocate OpenCL Buffer with "
                  << nbytes << " bytes failed because of "
@@ -72,7 +106,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
 
 MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                      const DataType dt,
-                                     void **result) const {
+                                     void **result) {
   MACE_CHECK(image_shape.size() == 2, "Image shape's size must equal 2");
   MACE_LATENCY_LOGGER(1, "Allocate OpenCL image: ",
                       image_shape[0], ", ", image_shape[1]);
@@ -82,12 +116,29 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   }
 
   cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
+  cl_int error = CL_SUCCESS;
+  cl::Image2D *cl_image = nullptr;
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    cl_mem_ion_host_ptr ion_host;
+    size_t pitch;
+    CreateQualcommImageIONHostPtr(image_shape, img_format, &pitch, &ion_host);
 
-  cl_int error;
-  cl::Image2D *cl_image =
-      new cl::Image2D(opencl_runtime_->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
-                      image_shape[0], image_shape[1], 0, nullptr, &error);
+    cl_image = new cl::Image2D(
+        opencl_runtime_->context(),
+        CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
+        img_format, image_shape[0], image_shape[1], pitch, &ion_host, &error);
+
+    cl_to_host_map_[static_cast<void *>(cl_image)] = ion_host.ion_hostptr;
+  } else {
+#endif  // MACE_ENABLE_RPCMEM
+    cl_image =
+        new cl::Image2D(opencl_runtime_->context(),
+                        CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
+                        image_shape[0], image_shape[1], 0, nullptr, &error);
+#ifdef MACE_ENABLE_RPCMEM
+  }
+#endif  // MACE_ENABLE_RPCMEM
   if (error != CL_SUCCESS) {
     LOG(WARNING) << "Allocate OpenCL image with shape: ["
                  << image_shape[0] << ", " << image_shape[1]
@@ -108,72 +159,180 @@ MaceStatus OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
   }
 }
 
-void OpenCLAllocator::Delete(void *buffer) const {
+void OpenCLAllocator::Delete(void *buffer) {
   MACE_LATENCY_LOGGER(1, "Free OpenCL buffer");
   if (buffer != nullptr) {
     cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
     delete cl_buffer;
+#ifdef MACE_ENABLE_RPCMEM
+    if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+      auto it = cl_to_host_map_.find(buffer);
+      MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL buffer not found!");
+      rpcmem_free(it->second);  // release the ION host allocation
+      cl_to_host_map_.erase(it);  // reuse the iterator; no second lookup
+    }
+#endif  // MACE_ENABLE_RPCMEM
   }
 }
 
-void OpenCLAllocator::DeleteImage(void *buffer) const {
+void OpenCLAllocator::DeleteImage(void *buffer) {
   MACE_LATENCY_LOGGER(1, "Free OpenCL image");
   if (buffer != nullptr) {
     cl::Image2D *cl_image = static_cast<cl::Image2D *>(buffer);
     delete cl_image;
+#ifdef MACE_ENABLE_RPCMEM
+    if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+      auto it = cl_to_host_map_.find(buffer);
+      MACE_CHECK(it != cl_to_host_map_.end(), "OpenCL image not found!");
+      rpcmem_free(it->second);  // release the ION host allocation
+      cl_to_host_map_.erase(it);  // reuse the iterator; no second lookup
+    }
+#endif  // MACE_ENABLE_RPCMEM
   }
 }
 
-void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
+void *OpenCLAllocator::Map(void *buffer,
+                           size_t offset,
+                           size_t nbytes,
+                           bool finish_cmd_queue) const {
   MACE_LATENCY_LOGGER(1, "Map OpenCL buffer");
-  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = opencl_runtime_->command_queue();
-  // TODO(heliangliang) Non-blocking call
-  cl_int error;
-  void *mapped_ptr =
-      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                             offset, nbytes, nullptr, nullptr, &error);
-  if (error != CL_SUCCESS) {
-    LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
-    mapped_ptr = nullptr;
+  void *mapped_ptr = nullptr;
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    auto it = cl_to_host_map_.find(buffer);
+    MACE_CHECK(it != cl_to_host_map_.end(), "Try to map unallocated Buffer!");
+    mapped_ptr = it->second;  // host pointer recorded by New()
+    // NOTE(review): offset/nbytes are unused on this path -- confirm callers.
+    if (finish_cmd_queue) {
+      opencl_runtime_->command_queue().finish();
+    }
+    // Only the write-back cache policy gets an explicit cache sync here.
+    if (opencl_runtime_->qcom_host_cache_policy() ==
+        CL_MEM_HOST_WRITEBACK_QCOM) {
+      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
+    }
+  } else {
+#endif  // MACE_ENABLE_RPCMEM
+    MACE_UNUSED(finish_cmd_queue);
+    auto cl_buffer = static_cast<cl::Buffer *>(buffer);
+    auto queue = opencl_runtime_->command_queue();
+    // TODO(heliangliang) Non-blocking call
+    cl_int error;
+    mapped_ptr =
+        queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                               offset, nbytes, nullptr, nullptr, &error);
+    if (error != CL_SUCCESS) {
+      LOG(ERROR) << "Map buffer failed, error: " << OpenCLErrorToString(error);
+    }
+#ifdef MACE_ENABLE_RPCMEM
   }
+#endif  // MACE_ENABLE_RPCMEM
   return mapped_ptr;
 }
 
 // TODO(liuqi) there is something wrong with half type.
 void *OpenCLAllocator::MapImage(void *buffer,
                                 const std::vector<size_t> &image_shape,
-                                std::vector<size_t> *mapped_image_pitch) const {
+                                std::vector<size_t> *mapped_image_pitch,
+                                bool finish_cmd_queue) const {
   MACE_LATENCY_LOGGER(1, "Map OpenCL Image");
   MACE_CHECK(image_shape.size() == 2) << "Just support map 2d image";
-  auto cl_image = static_cast<cl::Image2D *>(buffer);
-  std::array<size_t, 3> origin = {{0, 0, 0}};
-  std::array<size_t, 3> region = {{image_shape[0], image_shape[1], 1}};
-
-  mapped_image_pitch->resize(2);
-  cl_int error;
-  void *mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
-      *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
-      mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
-      nullptr, &error);
-  if (error != CL_SUCCESS) {
-    LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
-    mapped_ptr = nullptr;
+  void *mapped_ptr = nullptr;
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    // TODO(libin): mapped_image_pitch is left unset here; set it if needed
+    auto it = cl_to_host_map_.find(buffer);
+    MACE_CHECK(it != cl_to_host_map_.end(), "Try to map unallocated Image!");
+    mapped_ptr = it->second;  // host pointer recorded by NewImage()
+
+    if (finish_cmd_queue) {
+      opencl_runtime_->command_queue().finish();
+    }
+
+    if (opencl_runtime_->qcom_host_cache_policy() ==
+        CL_MEM_HOST_WRITEBACK_QCOM) {
+      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_START) == 0);
+    }
+  } else {
+#endif  // MACE_ENABLE_RPCMEM
+    MACE_UNUSED(finish_cmd_queue);
+    auto cl_image = static_cast<cl::Image2D *>(buffer);
+    std::array<size_t, 3> origin = {{0, 0, 0}};
+    std::array<size_t, 3> region = {{image_shape[0], image_shape[1], 1}};
+
+    mapped_image_pitch->resize(2);
+    cl_int error;
+    mapped_ptr = opencl_runtime_->command_queue().enqueueMapImage(
+        *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
+        mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
+        nullptr, &error);
+    if (error != CL_SUCCESS) {
+      LOG(ERROR) << "Map Image failed, error: " << OpenCLErrorToString(error);
+    }
+#ifdef MACE_ENABLE_RPCMEM
   }
+#endif  // MACE_ENABLE_RPCMEM
   return mapped_ptr;
 }
 
 void OpenCLAllocator::Unmap(void *buffer, void *mapped_ptr) const {
   MACE_LATENCY_LOGGER(1, "Unmap OpenCL buffer/Image");
-  auto cl_buffer = static_cast<cl::Buffer *>(buffer);
-  auto queue = opencl_runtime_->command_queue();
-  cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
-                                             nullptr, nullptr);
-  if (error != CL_SUCCESS) {
-    LOG(ERROR) << "Unmap buffer failed, error: " << OpenCLErrorToString(error);
+#ifdef MACE_ENABLE_RPCMEM
+  if (opencl_runtime_->ion_type() == IONType::QUALCOMM_ION) {
+    if (opencl_runtime_->qcom_host_cache_policy() ==
+        CL_MEM_HOST_WRITEBACK_QCOM) {
+      MACE_CHECK(rpcmem_sync_cache(mapped_ptr, RPCMEM_SYNC_END) == 0);
+    }  // no explicit sync is issued for other cache policies
+  } else {  // non-ION: release the mapping through the command queue
+#endif  // MACE_ENABLE_RPCMEM
+    auto cl_buffer = static_cast<cl::Buffer *>(buffer);
+    auto queue = opencl_runtime_->command_queue();
+    cl_int error = queue.enqueueUnmapMemObject(*cl_buffer, mapped_ptr,
+                                              nullptr, nullptr);
+    if (error != CL_SUCCESS) {
+      LOG(ERROR) << "Unmap buffer failed, error: "
+                 << OpenCLErrorToString(error);
+    }
+#ifdef MACE_ENABLE_RPCMEM
   }
+#endif  // MACE_ENABLE_RPCMEM
 }
 
 bool OpenCLAllocator::OnHost() const { return false; }
 
+#ifdef MACE_ENABLE_RPCMEM
+void OpenCLAllocator::CreateQualcommBufferIONHostPtr(
+    const size_t nbytes,  // requested size; device padding is added below
+    cl_mem_ion_host_ptr *ion_host) {  // out: populated before returning
+  void *host = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_FLAG_CACHED,
+                            nbytes + opencl_runtime_->qcom_ext_mem_padding());
+  MACE_CHECK_NOTNULL(host);
+  auto host_addr = reinterpret_cast<std::uintptr_t>(host);
+  auto page_size = opencl_runtime_->qcom_page_size();
+  MACE_CHECK(host_addr % page_size == 0, "ION memory address: ", host_addr,
+             " must be aligned to page size: ", page_size);
+  int fd = rpcmem_to_fd(host);  // stored below as ion_filedesc
+  MACE_CHECK(fd >= 0, "Invalid rpcmem file descriptor: ", fd);
+  // Describe the allocation as a Qualcomm ION host pointer for OpenCL.
+  ion_host->ext_host_ptr.allocation_type = CL_MEM_ION_HOST_PTR_QCOM;
+  ion_host->ext_host_ptr.host_cache_policy =
+      opencl_runtime_->qcom_host_cache_policy();
+  ion_host->ion_filedesc = fd;
+  ion_host->ion_hostptr = host;
+}
+
+void OpenCLAllocator::CreateQualcommImageIONHostPtr(
+    const std::vector<size_t> &shape,  // shape[0], shape[1]: width, height
+    const cl::ImageFormat &format,
+    size_t *pitch,  // out: row pitch reported by the device query below
+    cl_mem_ion_host_ptr *ion_host) {
+  cl_int error = clGetDeviceImageInfoQCOM(
+      opencl_runtime_->device().get(), shape[0], shape[1], &format,
+      CL_IMAGE_ROW_PITCH, sizeof(*pitch), pitch, nullptr);
+  MACE_CHECK(error == CL_SUCCESS, "clGetDeviceImageInfoQCOM failed, error: ",
+             OpenCLErrorToString(error));
+  // Back the image with (row pitch * shape[1]) bytes of ION memory.
+  CreateQualcommBufferIONHostPtr(*pitch * shape[1], ion_host);
+}
+#endif  // MACE_ENABLE_RPCMEM
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_allocator.h b/mace/core/runtime/opencl/opencl_allocator.h
index 9ee9c81de352614f209b234c278fc43c54e741e0..0c2783a137ffc89aacc52e5d72a0a0a05a53d1d5 100644
--- a/mace/core/runtime/opencl/opencl_allocator.h
+++ b/mace/core/runtime/opencl/opencl_allocator.h
@@ -16,6 +16,7 @@
 #define MACE_CORE_RUNTIME_OPENCL_OPENCL_ALLOCATOR_H_
 
 #include <memory>
+#include <unordered_map>
 #include <vector>
 
 #include "mace/core/allocator.h"
@@ -29,7 +30,7 @@ class OpenCLAllocator : public Allocator {
 
   ~OpenCLAllocator() override;
 
-  MaceStatus New(size_t nbytes, void **result) const override;
+  MaceStatus New(size_t nbytes, void **result) override;
 
   /*
    * Use Image2D with RGBA (128-bit) format to represent the image.
@@ -38,23 +39,37 @@ class OpenCLAllocator : public Allocator {
    */
   MaceStatus NewImage(const std::vector<size_t> &image_shape,
                       const DataType dt,
-                      void **result) const override;
+                      void **result) override;
 
-  void Delete(void *buffer) const override;
+  void Delete(void *buffer) override;
 
-  void DeleteImage(void *buffer) const override;
+  void DeleteImage(void *buffer) override;
 
-  void *Map(void *buffer, size_t offset, size_t nbytes) const override;
+  void *Map(void *buffer,
+            size_t offset,
+            size_t nbytes,
+            bool finish_cmd_queue) const override;
 
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
-                 std::vector<size_t> *mapped_image_pitch) const override;
+                 std::vector<size_t> *mapped_image_pitch,
+                 bool finish_cmd_queue) const override;
 
   void Unmap(void *buffer, void *mapped_ptr) const override;
 
   bool OnHost() const override;
 
  private:
+#ifdef MACE_ENABLE_RPCMEM
+  void CreateQualcommBufferIONHostPtr(const size_t nbytes,
+                                      cl_mem_ion_host_ptr *ion_host);
+  void CreateQualcommImageIONHostPtr(const std::vector<size_t> &shape,
+                                     const cl::ImageFormat &format,
+                                     size_t *pitch,
+                                     cl_mem_ion_host_ptr *ion_host);
+
+  std::unordered_map<void *, void *> cl_to_host_map_;
+#endif  // MACE_ENABLE_RPCMEM
   OpenCLRuntime *opencl_runtime_;
 };
 
diff --git a/mace/core/runtime/opencl/opencl_extension.h b/mace/core/runtime/opencl/opencl_extension.h
index da3ba8556ec93bc1a5eaf0738771a89ec1e384d4..dabf81fe30cabb36962065c8a4e3202d6b6568b3 100644
--- a/mace/core/runtime/opencl/opencl_extension.h
+++ b/mace/core/runtime/opencl/opencl_extension.h
@@ -37,4 +37,8 @@ typedef cl_uint cl_priority_hint;
 
 /* Accepted by clGetKernelWorkGroupInfo */
 #define CL_KERNEL_WAVE_SIZE_QCOM 0xAA02
+
+// Cache policy specifying io-coherence
+#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9
+
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_EXTENSION_H_
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 17f1dd5a89bcf4f6aaf507c38a3feae04dd618c5..cb2e650fb901347d33b1f555cf7f2708876ecaa7 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -230,6 +230,37 @@ GPUType ParseGPUType(const std::string &device_name) {
   }
 }
 
+#ifdef MACE_ENABLE_RPCMEM
+// Reports QUALCOMM_ION when the device extension list mentions
+// cl_qcom_ion_host_ptr, and NONE_ION otherwise.
+IONType ParseIONType(const std::string &device_extensions) {
+  constexpr const char *kQualcommIONStr = "cl_qcom_ion_host_ptr";
+
+  const bool has_qcom_ion =
+      device_extensions.find(kQualcommIONStr) != std::string::npos;
+  return has_qcom_ion ? IONType::QUALCOMM_ION : IONType::NONE_ION;
+}
+
+// Chooses the host cache policy: io-coherent when the device advertises
+// cl_qcom_ext_host_ptr_iocoherent, write-back otherwise.
+uint32_t ParseQcomHostCachePolicy(const std::string &device_extensions) {
+  constexpr const char *kQualcommIocoherentStr =
+      "cl_qcom_ext_host_ptr_iocoherent";
+  const bool iocoherent =
+      device_extensions.find(kQualcommIocoherentStr) != std::string::npos;
+  return iocoherent ? CL_MEM_HOST_IOCOHERENT_QCOM
+                    : CL_MEM_HOST_WRITEBACK_QCOM;
+}
+
+std::string QcomHostCachePolicyToString(uint32_t policy) {  // for logging
+  if (policy == CL_MEM_HOST_IOCOHERENT_QCOM)
+    return "CL_MEM_HOST_IOCOHERENT_QCOM";
+  if (policy == CL_MEM_HOST_WRITEBACK_QCOM)
+    return "CL_MEM_HOST_WRITEBACK_QCOM";
+  return MakeString("UNKNOWN: ", policy);  // not one of the known policies
+}
+#endif  // MACE_ENABLE_RPCMEM
+
 const char *kOpenCLPlatformInfoKey =
     "mace_opencl_precompiled_platform_info_key";
 }  // namespace
@@ -311,6 +342,35 @@ OpenCLRuntime::OpenCLRuntime(
         return;
       }
 
+#ifdef MACE_ENABLE_RPCMEM
+      const std::string device_extensions =
+          device.getInfo<CL_DEVICE_EXTENSIONS>();
+      ion_type_ = ParseIONType(device_extensions);
+      if (ion_type_ == IONType::QUALCOMM_ION) {
+        qcom_ext_mem_padding_ = 0;
+        cl_int err = device.getInfo(CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM,
+                                    &qcom_ext_mem_padding_);
+        if (err != CL_SUCCESS) {
+          LOG(ERROR) << "Failed to get CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM "
+                     << OpenCLErrorToString(err);
+        }
+
+        qcom_page_size_ = 4096;
+        err = device.getInfo(CL_DEVICE_PAGE_SIZE_QCOM, &qcom_page_size_);
+        if (err != CL_SUCCESS) {
+          LOG(ERROR) << "Failed to get CL_DEVICE_PAGE_SIZE_QCOM: "
+                     << OpenCLErrorToString(err);
+        }
+
+        qcom_host_cache_policy_ = ParseQcomHostCachePolicy(device_extensions);
+
+        VLOG(1) << "Using QUALCOMM ION buffer with padding size: "
+                << qcom_ext_mem_padding_ << ", page size: " << qcom_page_size_
+                << ", with host cache policy: "
+                << QcomHostCachePolicyToString(qcom_host_cache_policy_);
+      }
+#endif  // MACE_ENABLE_RPCMEM
+
       VLOG(1) << "Using device: " << device_name;
       break;
     }
@@ -776,6 +836,24 @@ GPUType OpenCLRuntime::gpu_type() const {
   return gpu_type_;
 }
 
+#ifdef MACE_ENABLE_RPCMEM
+IONType OpenCLRuntime::ion_type() const {
+  return ion_type_;  // set while selecting the device in the constructor
+}
+
+uint32_t OpenCLRuntime::qcom_ext_mem_padding() const {
+  return qcom_ext_mem_padding_;  // CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM
+}
+
+uint32_t OpenCLRuntime::qcom_page_size() const {
+  return qcom_page_size_;  // CL_DEVICE_PAGE_SIZE_QCOM; preset to 4096
+}
+
+uint32_t OpenCLRuntime::qcom_host_cache_policy() const {
+  return qcom_host_cache_policy_;  // IOCOHERENT or WRITEBACK QCOM constant
+}
+#endif  // MACE_ENABLE_RPCMEM
+
 const std::string OpenCLRuntime::platform_info() const {
   return platform_info_;
 }
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index f06f313ad973730d3d43fad739bb2224fd617496..2aefdde8b717f17b483164bf6239f6fb243a3b66 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -49,6 +49,12 @@ enum OpenCLVersion {
   CL_VER_2_1,
 };
 
+#ifdef MACE_ENABLE_RPCMEM
+enum IONType {
+  QUALCOMM_ION,  // device lists the cl_qcom_ion_host_ptr extension
+  NONE_ION,  // extension not listed
+};
+#endif  // MACE_ENABLE_RPCMEM
 
 const std::string OpenCLErrorToString(cl_int error);
 
@@ -86,6 +92,13 @@ class OpenCLRuntime {
   Tuner<uint32_t> *tuner();
   bool is_opencl_avaliable();
 
+#ifdef MACE_ENABLE_RPCMEM
+  IONType ion_type() const;
+  uint32_t qcom_ext_mem_padding() const;
+  uint32_t qcom_page_size() const;
+  uint32_t qcom_host_cache_policy() const;
+#endif  // MACE_ENABLE_RPCMEM
+
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetDeviceMaxMemAllocSize();
@@ -144,6 +157,13 @@ class OpenCLRuntime {
   bool out_of_range_check_;
   uint64_t device_global_mem_cache_size_;
   uint32_t device_compute_units_;
+
+#ifdef MACE_ENABLE_RPCMEM
+  IONType ion_type_ = IONType::NONE_ION;  // default when no device is probed
+  uint32_t qcom_ext_mem_padding_ = 0;  // bytes; overwritten by device query
+  uint32_t qcom_page_size_ = 4096;  // same fallback the constructor uses
+  uint32_t qcom_host_cache_policy_ = 0;  // assigned for QUALCOMM_ION devices
+#endif  // MACE_ENABLE_RPCMEM
 };
 
 class OpenCLProfilingTimer : public Timer {
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index 552c413ade4348e0e9a740ae00471778a92b8d26..41dd529b4d3a1d63f82481a25cd8785fa16f437e 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -216,6 +216,15 @@ class OpenCLLibrary final {
   using clGetImageInfoFunc =
       cl_int (*)(cl_mem, cl_image_info, size_t, void *, size_t *);
 
+  using clGetDeviceImageInfoQCOMFunc = cl_int (*)(cl_device_id,
+                                                  size_t,
+                                                  size_t,
+                                                  const cl_image_format *,
+                                                  cl_image_pitch_info_qcom,
+                                                  size_t,
+                                                  void *,
+                                                  size_t *);
+
 #define MACE_CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr
 
   MACE_CL_DEFINE_FUNC_PTR(clGetPlatformIDs);
@@ -265,6 +274,7 @@ class OpenCLLibrary final {
   MACE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
+  MACE_CL_DEFINE_FUNC_PTR(clGetDeviceImageInfoQCOM);
 
 #undef MACE_CL_DEFINE_FUNC_PTR
 
@@ -400,6 +410,7 @@ void *OpenCLLibrary::LoadFromPath(const std::string &path) {
   MACE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
+  MACE_CL_ASSIGN_FROM_DLSYM(clGetDeviceImageInfoQCOM);
 
 #undef MACE_CL_ASSIGN_FROM_DLSYM
 
@@ -802,6 +813,26 @@ CL_API_ENTRY cl_int clGetImageInfo(cl_mem image,
   }
 }
 
+CL_API_ENTRY cl_int clGetDeviceImageInfoQCOM(
+    cl_device_id device,
+    size_t image_width,
+    size_t image_height,
+    const cl_image_format *image_format,
+    cl_image_pitch_info_qcom param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret)
+    CL_EXT_SUFFIX__VERSION_1_1 {
+  auto func = mace::runtime::OpenCLLibrary::Get()->clGetDeviceImageInfoQCOM;
+  if (func != nullptr) {  // pointer defaults to nullptr until assigned
+    MACE_LATENCY_LOGGER(3, "clGetDeviceImageInfoQCOM");
+    return func(device, image_width, image_height, image_format, param_name,
+                param_value_size, param_value, param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;  // forwarding target was never assigned
+  }
+}
+
 // Command Queue APIs
 CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties(
     cl_context context,
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 92414a723140671f685c16fc5d326d8a8d07565d..dc7d24b49e3236a93a6dbb3ee9325d755c1aa5cc 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -426,10 +426,11 @@ class Tensor {
 
   class MappingGuard {
    public:
-    explicit MappingGuard(const Tensor *tensor) : tensor_(tensor) {
+    explicit MappingGuard(const Tensor *tensor, bool finish_cmd_queue = true) :
+        tensor_(tensor) {
       if (tensor_ != nullptr) {
         MACE_CHECK_NOTNULL(tensor_->buffer_);
-        tensor_->buffer_->Map(&mapped_image_pitch_);
+        tensor_->buffer_->Map(&mapped_image_pitch_, finish_cmd_queue);
       }
     }
 
diff --git a/mace/mace.bzl b/mace/mace.bzl
index 47d44edb38e90ebf61f6c1ed9d2dcff23126214d..6322e0357659405da23c7bffea80b252df4b341d 100644
--- a/mace/mace.bzl
+++ b/mace/mace.bzl
@@ -109,6 +109,12 @@ def if_quantize_enabled(a):
       "//conditions:default": [],
   })
 
+def if_rpcmem_enabled(a):  # select() yielding `a` when //mace:rpcmem_enabled
+  return select({
+      "//mace:rpcmem_enabled": a,
+      "//conditions:default": [],
+  })
+
 def mace_version_genrule():
   native.genrule(
       name = "mace_version_gen",
diff --git a/mace/ops/opencl/buffer_transformer.h b/mace/ops/opencl/buffer_transformer.h
index f3df8bc4452766b8a15d579f55aae09722c9a48e..0dcec529674a12ea54c56342c4730aed0b244c99 100644
--- a/mace/ops/opencl/buffer_transformer.h
+++ b/mace/ops/opencl/buffer_transformer.h
@@ -66,7 +66,9 @@ class OpenCLBufferTransformer {
                 << " with data type " << dt;
         internal_tensor->Resize(input->shape());
         const uint8_t *input_ptr = input->data<uint8_t>();
-        Tensor::MappingGuard guard(internal_tensor);
+        // Writing to the tensor from the CPU does not require finishing the
+        // OpenCL command queue; this can speed up mapping with ION buffers.
+        Tensor::MappingGuard guard(internal_tensor, false);
         uint8_t *internal_ptr = internal_tensor->mutable_data<uint8_t>();
         memcpy(internal_ptr, input_ptr, input->raw_size());
         // 2. convert the internal GPU Buffer to output.
diff --git a/test/ccunit/mace/ops/batch_norm_test.cc b/test/ccunit/mace/ops/batch_norm_test.cc
index 83c8219f9e788d24d268f89a3c0f9ff7288bcaf4..0a07fc64cf60ef44513fc9d6d547c1b26d105edb 100644
--- a/test/ccunit/mace/ops/batch_norm_test.cc
+++ b/test/ccunit/mace/ops/batch_norm_test.cc
@@ -134,14 +134,16 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
       .AddFloatArg("leakyrelu_coefficient", 0.1)
       .Finalize(net.NewOperatorDef());
 
+  net.Setup(DeviceType::GPU);
+
   // Tuning
   setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::GPU);
+  net.Run();
   unsetenv("MACE_TUNING");
 
   // Run on opencl
-  net.RunOp(DeviceType::GPU);
-  net.Sync();
+  net.Run();
+
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
                           1e-5, 1e-4);
 }
@@ -200,14 +202,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
       .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
       .Finalize(net.NewOperatorDef());
 
+  net.Setup(DeviceType::GPU);
+
   // Tuning
   setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::GPU);
+  net.Run();
   unsetenv("MACE_TUNING");
 
   // Run on opencl
-  net.RunOp(DeviceType::GPU);
-  net.Sync();
+  net.Run();
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
                           1e-1, 1e-2);
@@ -266,14 +269,15 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
+  net.Setup(DeviceType::GPU);
+
   // tuning
   setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::GPU);
+  net.Run();
   unsetenv("MACE_TUNING");
 
   // Run on opencl
-  net.RunOp(DeviceType::GPU);
-  net.Sync();
+  net.Run();
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
                           1e-5, 1e-4);
@@ -333,14 +337,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
       .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
       .Finalize(net.NewOperatorDef());
 
+  net.Setup(DeviceType::GPU);
+
   // tuning
   setenv("MACE_TUNING", "1", 1);
-  net.RunOp(DeviceType::GPU);
+  net.Run();
   unsetenv("MACE_TUNING");
 
   // Run on opencl
-  net.RunOp(DeviceType::GPU);
-  net.Sync();
+  net.Run();
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"),
                           1e-1, 1e-2);
diff --git a/test/ccunit/mace/ops/buffer_to_image_test.cc b/test/ccunit/mace/ops/buffer_to_image_test.cc
index 644283d405f2a712c58707b83e3070893e2d2ba2..b1252876b99098c65cc5c77d846adb2b463781c2 100644
--- a/test/ccunit/mace/ops/buffer_to_image_test.cc
+++ b/test/ccunit/mace/ops/buffer_to_image_test.cc
@@ -46,6 +46,9 @@ void TestBidirectionTransform(const OpenCLBufferType type,
       .Transform(&context, b2i_output,
                  type, MemoryType::GPU_BUFFER, 0, i2b_output);
 
+  net.Setup(DeviceType::GPU);
+  net.Sync();
+
   // Check
   ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                       1e-5);
@@ -187,6 +190,9 @@ void TestDiffTypeBidirectionTransform(const OpenCLBufferType type,
       .Transform(&context, b2i_output,
                  type, MemoryType::GPU_BUFFER, 0, i2b_output);
 
+  net.Setup(DeviceType::GPU);
+  net.Sync();
+
   // Check
   ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                           1e-3, 1e-6);
@@ -227,6 +233,9 @@ void TestStringHalfBidirectionTransform(const OpenCLBufferType type,
       .Transform(&context, b2i_output,
                  type, MemoryType::GPU_BUFFER, 0, i2b_output);
 
+  net.Setup(DeviceType::GPU);
+  net.Sync();
+
   // Check
   ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                          1e-3, 1e-6);
diff --git a/test/ccunit/mace/ops/buffer_transform_test.cc b/test/ccunit/mace/ops/buffer_transform_test.cc
index f29a2e012249d5214ddedeaf9320aec80e71120c..fe6d698722950a0a44fdb6e32e544156716a328e 100644
--- a/test/ccunit/mace/ops/buffer_transform_test.cc
+++ b/test/ccunit/mace/ops/buffer_transform_test.cc
@@ -59,6 +59,9 @@ void TestBidirectionTransform(const OpenCLBufferType type,
       .Transform(&context, bt_output,
                  type, MemoryType::GPU_BUFFER, 0, output);
 
+  net.Setup(DeviceType::GPU);
+  net.Sync();
+
   if (DataTypeToEnum<OrgType>::value == DataTypeToEnum<DstType>::value) {
     EXPECT_EQ(net.GetOutput("Input")->UnderlyingBuffer(),
               net.GetOutput("Output")->UnderlyingBuffer());
@@ -96,6 +99,9 @@ void TestArgumentTransform(const index_t input_size) {
                  OpenCLBufferType::ARGUMENT, MemoryType::GPU_BUFFER,
                  0, output);
 
+  net.Setup(DeviceType::GPU);
+  net.Sync();
+
   index_t expected_size = RoundUp<index_t>(input_size, 4);
   EXPECT_EQ(expected_size, output->buffer_shape()[0]);
 
diff --git a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
index 8a17c2d2c5e5d9ed0431005404b630efdfd2c974..5ee423d335b14eeda11c2148222e0e9fd854e925 100644
--- a/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
+++ b/test/ccunit/mace/ops/opencl/out_of_range_check_test.cc
@@ -110,7 +110,7 @@ MaceStatus BufferToImageOpImpl(OpContext *context,
   bool is_out_of_range = false;
   if (runtime->IsOutOfRangeCheckEnabled()) {
     oorc_flag->Map(nullptr);
-    is_out_of_range = *(oorc_flag->mutable_data<char>()) == 1 ? true : false;
+    is_out_of_range = *(oorc_flag->mutable_data<int>()) == 1 ? true : false;
     oorc_flag->UnMap();
   }
   return is_out_of_range ? MaceStatus::MACE_OUT_OF_RESOURCES
diff --git a/third_party/rpcmem/BUILD.bazel b/third_party/rpcmem/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..1a1a061cca7f2c99637bcec066c040763133a94b
--- /dev/null
+++ b/third_party/rpcmem/BUILD.bazel
@@ -0,0 +1,24 @@
+# These files are generated from the rpcmem project
+
+licenses(["notice"])
+
+exports_files(["license.txt"])
+
+load(
+    "//mace:mace.bzl",
+    "if_android_arm64",
+    "if_android_armv7",
+)
+
+cc_library(
+    name = "rpcmem",
+    srcs = if_android_armv7([
+        "armeabi-v7a/rpcmem.a",
+    ]) + if_android_arm64([
+        "arm64-v8a/rpcmem.a",
+    ]),
+    hdrs = [
+        "rpcmem.h",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/rpcmem/arm64-v8a/rpcmem.a b/third_party/rpcmem/arm64-v8a/rpcmem.a
new file mode 100644
index 0000000000000000000000000000000000000000..982714b75645f2a408f7fde000376cf32d18055b
Binary files /dev/null and b/third_party/rpcmem/arm64-v8a/rpcmem.a differ
diff --git a/third_party/rpcmem/armeabi-v7a/rpcmem.a b/third_party/rpcmem/armeabi-v7a/rpcmem.a
new file mode 100644
index 0000000000000000000000000000000000000000..faa1baa5f8e7a689e66ac126df0446c27d08e061
Binary files /dev/null and b/third_party/rpcmem/armeabi-v7a/rpcmem.a differ
diff --git a/third_party/rpcmem/license.txt b/third_party/rpcmem/license.txt
new file mode 100755
index 0000000000000000000000000000000000000000..dc6f86d29677f1fe8ca2e140371aad05d82b8166
--- /dev/null
+++ b/third_party/rpcmem/license.txt
@@ -0,0 +1,5 @@
+/*==============================================================================
+  Copyright (c) 2012-2013 Qualcomm Technologies, Inc.
+  All rights reserved. Qualcomm Proprietary and Confidential.
+  rpcmem.h and rpcmem.a are generated from Hexagon SDK and modified by Xiaomi, Inc.
+==============================================================================*/
diff --git a/third_party/rpcmem/rpcmem.cmake b/third_party/rpcmem/rpcmem.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..be9d04ff1ad188d92d43ffeb4a9c4f1d006460a5
--- /dev/null
+++ b/third_party/rpcmem/rpcmem.cmake
@@ -0,0 +1,10 @@
+set(RPCMEM_INSTALL_DIR  "${PROJECT_SOURCE_DIR}/third_party/rpcmem")
+set(RPCMEM_INCLUDE_DIR  "${RPCMEM_INSTALL_DIR}")
+
+include_directories(SYSTEM "${RPCMEM_INCLUDE_DIR}")
+
+set(RPCMEM_LIB "${RPCMEM_INSTALL_DIR}/${ANDROID_ABI}/rpcmem.a")
+add_library(rpcmem STATIC IMPORTED GLOBAL)
+set_target_properties(rpcmem PROPERTIES IMPORTED_LOCATION ${RPCMEM_LIB})
+
+install(FILES ${RPCMEM_LIB} DESTINATION lib)
diff --git a/third_party/rpcmem/rpcmem.h b/third_party/rpcmem/rpcmem.h
new file mode 100755
index 0000000000000000000000000000000000000000..f0296a2212d8fc57e64dbcee1c6cde3b5b0e821c
--- /dev/null
+++ b/third_party/rpcmem/rpcmem.h
@@ -0,0 +1,141 @@
+/*==============================================================================
+  Copyright (c) 2012-2013 Qualcomm Technologies, Inc.
+  All rights reserved. Qualcomm Proprietary and Confidential.
+==============================================================================*/
+
+#ifndef RPCMEM_H
+#define RPCMEM_H
+
+/**
+ * RPCMEM_DEFAULT_HEAP
+ * Dynamically select the heap to use.  This should be ok for most use cases.
+ */
+#define RPCMEM_DEFAULT_HEAP -1
+
+/**
+ * RPCMEM HEAP IDs
+ * SYSTEM HEAP:
+ *   - non-contiguous physical memory
+ *   - for sub-systems with SMMU
+ *   - recommended for HVX/CDSPs
+ * CONTIG HEAP:
+ *   - Contiguous physical memory
+ *   - limited memory
+ *   - for sub-systems without SMMU (ex. sDSP and mDSP)
+ */
+#define RPCMEM_HEAP_ID_SYSTEM         (25)
+#define RPCMEM_HEAP_ID_CONTIG         (22)
+#define RPCMEM_HEAP_ID_SECURE         (9)
+#define RPCMEM_HEAP_ID_SYSTEM_CONTIG  (21)
+
+/**
+ * RPCMEM_DEFAULT_FLAGS should allocate memory with the same properties
+ * as the ION_FLAG_CACHED flag
+ */
+#ifdef ION_FLAG_CACHED
+#define RPCMEM_DEFAULT_FLAGS ION_FLAG_CACHED
+#else
+#define RPCMEM_DEFAULT_FLAGS 1
+#endif
+
+/**
+ * RPCMEM_FLAG_UNCACHED
+ * ION_FLAG_CACHED should be defined as 1
+ */
+#define RPCMEM_FLAG_UNCACHED 0
+#define RPCMEM_FLAG_CACHED RPCMEM_DEFAULT_FLAGS
+
+/**
+ * examples:
+ *
+ * heap 22, uncached, 1kb
+ *    rpcmem_alloc(22, 0, 1024);
+ *    rpcmem_alloc(22, RPCMEM_FLAG_UNCACHED, 1024);
+ *
+ * heap 21, cached, 2kb
+ *    rpcmem_alloc(21, RPCMEM_FLAG_CACHED, 2048);
+ *    #include <ion.h>
+ *    rpcmem_alloc(21, ION_FLAG_CACHED, 2048);
+ *
+ * just give me the defaults, 2kb
+ *    rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, 2048);
+ *    rpcmem_alloc_def(2048);
+ *
+ * give me the default flags, but from heap 18, 4kb
+ *    rpcmem_alloc(18, RPCMEM_DEFAULT_FLAGS, 4096);
+ *
+ */
+#define ION_SECURE_FLAGS    ((1u << 31) | (1u << 19))  /* unsigned: 1 << 31 overflows int */
+
+/**
+ * To flag start/end for rpcmem_sync_cache
+ */
+#define RPCMEM_SYNC_START 0
+#define RPCMEM_SYNC_END   1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * call once to initialize the library
+ * NOTE: rpcmem_init is not thread safe
+ */
+void rpcmem_init(void);
+/**
+ * call once for cleanup
+ * NOTE: rpcmem_deinit is not thread safe
+ */
+void rpcmem_deinit(void);
+
+/**
+ * Allocate via ION a buffer of size
+ * @heapid, the heap id to use
+ * @flags, ion flags to use to when allocating
+ * @size, the buffer size to allocate
+ * @retval, 0 on failure, pointer to buffer on success
+ *
+ * For example:
+ *    buf = rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
+ */
+
+void* rpcmem_alloc(int heapid, unsigned int flags, int size);
+
+/**
+ * allocate with default settings
+ */
+ #if !defined(WINNT) && !defined (_WIN32_WINNT)
+__attribute__((unused))
+#endif
+static __inline void* rpcmem_alloc_def(int size) {
+   return rpcmem_alloc(RPCMEM_DEFAULT_HEAP, RPCMEM_DEFAULT_FLAGS, size);
+}
+
+/**
+ * free buffer, ignores invalid buffers
+ */
+void rpcmem_free(void* po);
+
+/**
+ * returns associated fd
+ */
+int rpcmem_to_fd(void* po);
+
+/**
+ * cache coherency management
+ */
+int rpcmem_sync_cache(void* po, unsigned int flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+/** these are deprecated
+ */
+#define RPCMEM_HEAP_DEFAULT     0x80000000
+#define RPCMEM_HEAP_NOREG       0x40000000
+#define RPCMEM_HEAP_UNCACHED    0x20000000
+#define RPCMEM_HEAP_NOVA        0x10000000
+#define RPCMEM_HEAP_NONCOHERENT 0x08000000
+
+#endif //RPCMEM_H
diff --git a/third_party/third_party.cmake b/third_party/third_party.cmake
index f800758725fc1385fcea0a9a62b12b6dc1b563e6..02cd3c1830e738664abd18b26fe9f12b10818f13 100644
--- a/third_party/third_party.cmake
+++ b/third_party/third_party.cmake
@@ -51,6 +51,12 @@ include(${PROJECT_SOURCE_DIR}/third_party/opencl-headers/opencl-headers.cmake)
 include(${PROJECT_SOURCE_DIR}/third_party/protobuf/protobuf.cmake)
 include(${PROJECT_SOURCE_DIR}/third_party/tflite/tflite.cmake)
 include(${PROJECT_SOURCE_DIR}/third_party/caffe/caffe.cmake)
+
+# rpcmem.cmake locates its prebuilt archive via ANDROID_ABI and installs it,
+# so only pull it in when the feature is explicitly enabled.
+if(MACE_ENABLE_RPCMEM)
+  include(${PROJECT_SOURCE_DIR}/third_party/rpcmem/rpcmem.cmake)
+endif(MACE_ENABLE_RPCMEM)
 
 if(MACE_ENABLE_HEXAGON_DSP)
   include(${PROJECT_SOURCE_DIR}/third_party/nnlib/nnlib.cmake)
diff --git a/tools/bazel-build-standalone-lib.sh b/tools/bazel-build-standalone-lib.sh
index 4b87f4b4bf5c1be3ba68e16227de4a607a795be2..8a07811373c996793e7652fab21379086c146945 100755
--- a/tools/bazel-build-standalone-lib.sh
+++ b/tools/bazel-build-standalone-lib.sh
@@ -34,12 +34,12 @@ mkdir -p $LIB_DIR/aarch64_linux_gnu/cpu_gpu
 
 # build shared libraries
 echo "build shared lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build shared lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
@@ -49,11 +49,11 @@ cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu_apu/
 cp third_party/apu/*so $LIB_DIR/arm64-v8a/cpu_gpu_apu/
 
 echo "build shared lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build shared lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_dynamic --define neon=true --define opencl=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
 cp bazel-bin/mace/libmace/libmace.so $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build shared lib for arm_linux_gnueabihf + cpu_gpu"
@@ -72,12 +72,12 @@ fi
 
 # build static libraries
 echo "build static lib for armeabi-v7a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 cp third_party/nnlib/armeabi-v7a/*so $LIB_DIR/armeabi-v7a/cpu_gpu_dsp/
 
 echo "build static lib for arm64-v8a + cpu_gpu_dsp"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define hexagon=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 cp third_party/nnlib/arm64-v8a/*so $LIB_DIR/arm64-v8a/cpu_gpu_dsp/
 
@@ -87,11 +87,11 @@ cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu_apu/
 cp third_party/apu/*so $LIB_DIR/arm64-v8a/cpu_gpu_apu/
 
 echo "build static lib for armeabi-v7a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define quantize=true --cpu=armeabi-v7a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define quantize=true --cpu=armeabi-v7a --define rpcmem=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/armeabi-v7a/cpu_gpu/
 
 echo "build static lib for arm64-v8a + cpu_gpu"
-bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define quantize=true --cpu=arm64-v8a
+bazel build --config android --config optimization mace/libmace:libmace_static --config symbol_hidden --define neon=true --define opencl=true --define quantize=true --cpu=arm64-v8a --define rpcmem=true
 cp bazel-genfiles/mace/libmace/libmace.a $LIB_DIR/arm64-v8a/cpu_gpu/
 
 echo "build static lib for arm_linux_gnueabihf + cpu_gpu"
diff --git a/tools/bazel_adb_run.py b/tools/bazel_adb_run.py
index 564401eba662b8f897512373e9c7cb8ded1a006b..1679f604d85cac55b47865a22fd87f021931a227 100644
--- a/tools/bazel_adb_run.py
+++ b/tools/bazel_adb_run.py
@@ -95,6 +95,11 @@ def parse_args():
         type=str2bool,
         default=True,
         help="Whether to use quantization ops")
+    parser.add_argument(
+        "--enable_rpcmem",
+        type=str2bool,
+        default=True,
+        help="Whether to use rpcmem")
     parser.add_argument(
         '--address_sanitizer',
         action="store_true",
@@ -164,6 +169,7 @@ def main(unused_args):
             toolchain=toolchain,
             enable_neon=FLAGS.enable_neon,
             enable_quantize=FLAGS.enable_quantize,
+            enable_rpcmem=FLAGS.enable_rpcmem,
             address_sanitizer=FLAGS.address_sanitizer,
             debug_mode=FLAGS.debug_mode)
         if FLAGS.run_target:
diff --git a/tools/cmake/cmake-build-arm64-v8a.sh b/tools/cmake/cmake-build-arm64-v8a.sh
index e1efeed241a8a6c3cbff6567664f5f7b7606cbd1..999b0b74b1fc7ef36c827203f870ef44f45f25a1 100755
--- a/tools/cmake/cmake-build-arm64-v8a.sh
+++ b/tools/cmake/cmake-build-arm64-v8a.sh
@@ -43,6 +43,7 @@ cmake -DANDROID_ABI="arm64-v8a" \
       -DMACE_ENABLE_TESTS=ON              \
       -DMACE_ENABLE_BENCHMARKS=ON         \
       -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE}        \
+      -DMACE_ENABLE_RPCMEM=ON                                 \
       -DCMAKE_INSTALL_PREFIX=install      \
       ../../..
 make -j6 VERBOSE=1 && make install
diff --git a/tools/cmake/cmake-build-armeabi-v7a.sh b/tools/cmake/cmake-build-armeabi-v7a.sh
index 12fab64f8dce0f9474a0a3cea2f8d1905f06a4fc..c98d196ce8791a50e5d3106cbea2a58fc27cc2dd 100755
--- a/tools/cmake/cmake-build-armeabi-v7a.sh
+++ b/tools/cmake/cmake-build-armeabi-v7a.sh
@@ -45,6 +45,7 @@ cmake -DANDROID_ABI="armeabi-v7a" \
       -DMACE_ENABLE_TESTS=ON                                 \
       -DMACE_ENABLE_BENCHMARKS=ON                            \
       -DMACE_ENABLE_CODE_MODE=${MACE_ENABLE_CODE_MODE}       \
+      -DMACE_ENABLE_RPCMEM=ON                                \
       -DCMAKE_INSTALL_PREFIX=install                         \
       ../../..
 make -j6 VERBOSE=1 && make install
diff --git a/tools/sh_commands.py b/tools/sh_commands.py
index 5c1c85694b1fc365aca39b2fc4570ef791661b60..831d015d1b3613ce5088ff91adb5aab36f3bae55 100644
--- a/tools/sh_commands.py
+++ b/tools/sh_commands.py
@@ -270,6 +270,7 @@ def bazel_build(target,
                 enable_neon=True,
                 enable_opencl=True,
                 enable_quantize=True,
+                enable_rpcmem=True,
                 address_sanitizer=False,
                 symbol_hidden=True,
                 debug_mode=False,
@@ -303,6 +304,8 @@ def bazel_build(target,
             "--define",
             "quantize=%s" % str(enable_quantize).lower(),
             "--define",
+            "rpcmem=%s" % str(enable_rpcmem).lower(),
+            "--define",
             "hexagon=%s" % str(enable_hexagon).lower(),
             "--define",
             "hta=%s" % str(enable_hta).lower(),