From 665b23e51bf14c8a7453d3b607d0dc34b4781795 Mon Sep 17 00:00:00 2001 From: gineshidalgo99 Date: Sat, 2 Feb 2019 19:39:51 -0500 Subject: [PATCH] Removed boost and caffe deps from headers --- .github/issue_template.md | 1 + README.md | 6 +- doc/installation.md | 6 +- doc/modules/python_module.md | 2 +- doc/release_notes.md | 1 + doc/released_features.md | 4 +- examples/tests/clTest.cpp | 38 +- examples/tests/resizeTest.cpp | 3 +- include/openpose/core/arrayCpuGpu.hpp | 100 +++ include/openpose/core/common.hpp | 1 + include/openpose/core/headers.hpp | 1 + include/openpose/core/macros.hpp | 34 +- .../openpose/net/bodyPartConnectorCaffe.hpp | 18 +- include/openpose/net/maximumCaffe.hpp | 18 +- include/openpose/net/net.hpp | 2 + include/openpose/net/netCaffe.hpp | 2 +- include/openpose/net/netOpenCv.hpp | 2 +- include/openpose/net/nmsCaffe.hpp | 20 +- include/openpose/net/resizeAndMergeCaffe.hpp | 20 +- src/openpose/core/CMakeLists.txt | 1 + src/openpose/core/arrayCpuGpu.cpp | 581 ++++++++++++++++++ src/openpose/face/faceExtractorCaffe.cpp | 24 +- src/openpose/hand/handExtractorCaffe.cpp | 42 +- src/openpose/net/bodyPartConnectorCaffe.cpp | 20 +- src/openpose/net/maximumCaffe.cpp | 28 +- src/openpose/net/netCaffe.cpp | 74 ++- src/openpose/net/netOpenCv.cpp | 6 +- src/openpose/net/nmsCaffe.cpp | 26 +- src/openpose/net/resizeAndMergeCaffe.cpp | 31 +- src/openpose/pose/poseExtractorCaffe.cpp | 64 +- 30 files changed, 943 insertions(+), 233 deletions(-) create mode 100644 include/openpose/core/arrayCpuGpu.hpp create mode 100644 src/openpose/core/arrayCpuGpu.cpp diff --git a/.github/issue_template.md b/.github/issue_template.md index 24ad1ed2..89e73fbc 100644 --- a/.github/issue_template.md +++ b/.github/issue_template.md @@ -48,6 +48,7 @@ You might select multiple topics, delete the rest: 3. **General configuration**: - **Installation mode**: CMake, sh script, manual Makefile installation, ... (Ubuntu); CMake, ... (Windows); ...? - **Operating system** (`lsb_release -a` in Ubuntu): + - **Operating system version** (e.g., Ubuntu 16, Windows 10, ...): - **Release or Debug mode**? (by default: release): - Compiler (`gcc --version` in Ubuntu or VS version in Windows): 5.4.0, ... (Ubuntu); VS2015 Enterprise Update 3, VS2017 community, ... (Windows); ...? 
diff --git a/README.md b/README.md index a43810ec..9f3c99e3 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ ----------------- -| | `Python (CUDA GPU)` | `Python (CPU)` | `CUDA GPU` | `CPU` | `Debug mode` | -| :---: | :---: | :---: | :---: |:---: | :---: | +| | `Python (CUDA GPU)` | `Python (CPU)` | `CUDA GPU` | `CPU` | `Debug mode` | +| :---: | :---: | :---: | :---: |:---: | :---: | | **`Linux`** | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/1)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/2)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/3)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/4)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/5)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | | **`MacOS`** | | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/6)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/7)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | [![Status](https://travis-matrix-badges.herokuapp.com/repos/CMU-Perceptual-Computing-Lab/openpose/branches/master/8)](https://travis-ci.org/CMU-Perceptual-Computing-Lab/openpose) | @@ -46,7 +46,7 @@ It is **authored by [Gines Hidalgo](https://www.gineshidalgo.com), [Zhe Cao](htt - Available: command-line demo, C++ wrapper, and C++ API. - [**Python API**](doc/modules/python_module.md). - [**Unity Plugin**](https://github.com/CMU-Perceptual-Computing-Lab/openpose_unity_plugin). - - CUDA (Nvidia GPU), OpenCL (AMD GPU), and CPU versions. + - CUDA (Nvidia GPU), OpenCL (AMD GPU), and CPU-only (no GPU) versions. - Training code included in the [**original CVPR 2017 GitHub repository**](https://github.com/ZheC/Multi-Person-Pose-Estimation). diff --git a/doc/installation.md b/doc/installation.md index 0b7ec274..07b5b527 100644 --- a/doc/installation.md +++ b/doc/installation.md @@ -48,14 +48,14 @@ This installation section is only intended if you plan to modify the OpenPose co ## Requirements and Dependencies - **Requirements** for the default configuration (you might need more resources with a greater `--net_resolution` and/or `scale_number` or less resources by reducing the net resolution and/or using the MPI and MPI_4 models): - - Nvidia GPU version: + - CUDA (Nvidia GPU) version: - NVIDIA graphics card with at least 1.6 GB available (the `nvidia-smi` command checks the available GPU memory in Ubuntu). - At least 2.5 GB of free RAM memory for BODY_25 model or 2 GB for COCO model (assuming cuDNN installed). - Highly recommended: cuDNN. - - AMD GPU version: + - OpenCL (AMD GPU) version: - Vega series graphics card - At least 2 GB of free RAM memory. - - CPU version: + - CPU-only (no GPU) version: - Around 8GB of free RAM memory. - Highly recommended: a CPU with at least 8 cores. 
- **Dependencies**: diff --git a/doc/modules/python_module.md b/doc/modules/python_module.md index 7aa3df6b..cde4f27b 100644 --- a/doc/modules/python_module.md +++ b/doc/modules/python_module.md @@ -18,7 +18,7 @@ This module exposes a Python API for OpenPose. It is effectively a wrapper that ## Compatibility The OpenPose Python module is compatible with both Python 2 and Python 3 (default and recommended). In addition, it will also run in all OpenPose compatible operating systems. It uses [Pybind11](https://github.com/pybind/pybind11) for mapping between C++ and Python datatypes. -To compile, enable `BUILD_PYTHON` in CMake-gui. In Windows, make sure you compile the whole solution (clicking the green play button does not compile the whole solution!). You can do that by right-click on the OpenPose project solution, and clicking in `Build solution`. +To compile, enable `BUILD_PYTHON` in CMake-gui. In Windows, make sure you compile the whole solution (clicking the green play button does not compile the whole solution!). You can do that by right-click on the OpenPose project solution, and clicking in `Build Solution` (or individually building the PyOpenPose module). Pybind selects the latest version of Python by default (Python 3). To use Python 2, change `PYTHON_EXECUTABLE` and `PYTHON_LIBRARY` flags in CMake-gui to your desired Python version. diff --git a/doc/release_notes.md b/doc/release_notes.md index 9184fcda..4b2c35cf 100644 --- a/doc/release_notes.md +++ b/doc/release_notes.md @@ -316,6 +316,7 @@ OpenPose Library - Release Notes 33. Added the flags `--face_detector` and `--hand_detector`, that enable the user to select the face/hand rectangle detector that is used for the later face/hand keypoint detection. It includes OpenCV (for face), and also allows the user to provide its own input. Flag `--hand_tracking` is removed and integrated into this flag too. 34. Maximum queue size per OpenPose thread is configurable through the Wrapper class. 35. Added pre-processing capabilities to Wrapper (WorkerType::PreProcessing), which will be run right after the image has been read. + 36. Removed boost::shared_ptr and caffe::Blob dependencies from the headers. No 3rdparty dependencies left on headers (except dim3 for CUDA). 2. Functions or parameters renamed: 1. By default, python example `tutorial_developer/python_2_pose_from_heatmaps.py` was using 2 scales starting at -1x736, changed to 1 scale at -1x368. 2. WrapperStructPose default parameters changed to match those of the OpenPose demo binary. diff --git a/doc/released_features.md b/doc/released_features.md index 17c36097..8e99fe16 100644 --- a/doc/released_features.md +++ b/doc/released_features.md @@ -1,8 +1,8 @@ OpenPose Library - All Released Features ==================================== -- Jan 2018: [**Unity plugin released**](https://github.com/CMU-Perceptual-Computing-Lab/openpose_unity_plugin)! -- Jan 2018: [**Improved Python API**](doc/modules/python_module.md) released! Including body, face, hands, and all the functionality of the C++ API! +- Jan 2019: [**Unity plugin released**](https://github.com/CMU-Perceptual-Computing-Lab/openpose_unity_plugin)! +- Jan 2019: [**Improved Python API**](doc/modules/python_module.md) released! Including body, face, hands, and all the functionality of the C++ API! - Dec 2018: [**Foot dataset**](https://cmu-perceptual-computing-lab.github.io/foot_keypoint_dataset) and [**new paper released**](https://arxiv.org/abs/1812.08008)! - Sep 2018: [**Experimental tracker**](./quick_start.md#tracking)! 
- Jun 2018: [**Combined body-foot model released! 40% faster and 5% more accurate**](./installation.md)! diff --git a/examples/tests/clTest.cpp b/examples/tests/clTest.cpp index 91f2368d..4f1391ec 100644 --- a/examples/tests/clTest.cpp +++ b/examples/tests/clTest.cpp @@ -5,6 +5,10 @@ #include // OpenPose dependencies #include +// Caffe dependencies +#ifdef USE_CAFFE + #include +#endif // OpenCL dependencies #ifdef USE_OPENCL #include @@ -72,7 +76,8 @@ DEFINE_string(image_path, "examples/media/COCO_val2014_00000000019 typedef cl::KernelFunctor ScaleFunctor; const std::string scaleKernelString = MULTI_LINE_STRING( - __kernel void scaleKernel(__global float* targetPtr, const int targetWidth, const int targetHeight, const float scale) + __kernel void scaleKernel(__global float* targetPtr, const int targetWidth, const int targetHeight, + const float scale) { int x = get_global_id(0); int y = get_global_id(1); @@ -111,7 +116,8 @@ int clTest() std::unique_ptr> upCaffeNet; caffe::Caffe::set_mode(caffe::Caffe::GPU); caffe::Caffe::SelectDevice(0, true); - upCaffeNet.reset(new caffe::Net{"models/pose/coco/pose_deploy_linevec.prototxt", caffe::TEST, caffe::Caffe::GetDefaultDevice()}); + upCaffeNet.reset(new caffe::Net{ + "models/pose/coco/pose_deploy_linevec.prototxt", caffe::TEST, caffe::Caffe::GetDefaultDevice()}); upCaffeNet->CopyTrainedLayersFrom("models/pose/coco/pose_iter_440000.caffemodel"); op::OpenCL::getInstance(0, CL_DEVICE_TYPE_GPU, true); @@ -125,19 +131,16 @@ int clTest() blob_proto.set_height(imgResize.size().width); blob_proto.set_width(imgResize.size().height); blob_proto.clear_data(); - for (int c = 0; c < 3; ++c) { - for (int h = 0; h < imgResize.size().height; ++h) { - for (int w = 0; w < imgResize.size().width; ++w) { + for (int c = 0; c < 3; ++c) + for (int h = 0; h < imgResize.size().height; ++h) + for (int w = 0; w < imgResize.size().width; ++w) blob_proto.add_data(imgResize.at(h, w)[c]); - } - } - } blob_proto.set_num(1); caffe::Blob* input_layer = upCaffeNet->input_blobs()[0]; input_layer->FromProto(blob_proto); upCaffeNet->Forward(0); - boost::shared_ptr > output_blob = upCaffeNet->blob_by_name("net_output"); + boost::shared_ptr> output_blob = upCaffeNet->blob_by_name("net_output"); // Test cl::Device& device = op::OpenCL::getInstance(0)->getDevice(); @@ -156,8 +159,9 @@ int clTest() // Read it // Read back image to GPU float* heatmaps = new float[output_blob->shape()[1] * output_blob->shape()[2] * output_blob->shape()[3]]; - op::OpenCL::getInstance(0)->getQueue().enqueueReadBuffer(outputBuffer, CL_TRUE, 0, - output_blob->shape()[1] * output_blob->shape()[2] * output_blob->shape()[3] * sizeof(float), heatmaps); + op::OpenCL::getInstance(0)->getQueue().enqueueReadBuffer( + outputBuffer, CL_TRUE, 0, + output_blob->shape()[1] * output_blob->shape()[2] * output_blob->shape()[3] * sizeof(float), heatmaps); int heatmapChannels = output_blob->shape()[1]; int shape = output_blob->shape()[2] * output_blob->shape()[3]; @@ -172,11 +176,11 @@ int clTest() } } #if defined(USE_OPENCL) && defined(CL_HPP_ENABLE_EXCEPTIONS) - catch (const cl::Error& e) - { - op::error(std::string(e.what()) + " : " + op::OpenCL::clErrorToString(e.err()) + " ID: " + - std::to_string(0), __LINE__, __FUNCTION__, __FILE__); - } + catch (const cl::Error& e) + { + op::error(std::string(e.what()) + " : " + op::OpenCL::clErrorToString(e.err()) + " ID: " + + std::to_string(0), __LINE__, __FUNCTION__, __FILE__); + } #endif catch (const std::exception& e) { @@ -223,6 +227,6 @@ int main() #else op::error("OpenPose must 
be compiled with the `USE_CAFFE` & `USE_OPENCL` macro definitions in order to run" " this functionality.", __LINE__, __FUNCTION__, __FILE__); - return 0; + return -1; #endif } diff --git a/examples/tests/resizeTest.cpp b/examples/tests/resizeTest.cpp index a19c826d..3eef40ed 100644 --- a/examples/tests/resizeTest.cpp +++ b/examples/tests/resizeTest.cpp @@ -15,7 +15,7 @@ cv::Mat gpuResize(cv::Mat& img, const cv::Size& newSize) { - #ifdef USE_CUDA + #if defined USE_CAFFE && defined USE_CUDA // Upload to Source to GPU float* cpuPtr = &img.at(0); float* gpuPtr; @@ -49,6 +49,7 @@ UNUSED(newSize); op::error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run" " this functionality.", __LINE__, __FUNCTION__, __FILE__); + return cv::Mat(); #endif } diff --git a/include/openpose/core/arrayCpuGpu.hpp b/include/openpose/core/arrayCpuGpu.hpp new file mode 100644 index 00000000..6a696ab6 --- /dev/null +++ b/include/openpose/core/arrayCpuGpu.hpp @@ -0,0 +1,100 @@ +#ifndef OPENPOSE_CORE_ARRAY_CPU_GPU_HPP +#define OPENPOSE_CORE_ARRAY_CPU_GPU_HPP + +#include // std::shared_ptr +#include +#include + +namespace op +{ + /** + * ArrayCpuGpu: Bind of caffe::Blob to avoid Caffe as dependency in the headers. + */ + template + class ArrayCpuGpu + { + public: + ArrayCpuGpu(); + explicit ArrayCpuGpu(const void* caffeBlobTPtr); + explicit ArrayCpuGpu(const int num, const int channels, const int height, const int width); + // explicit ArrayCpuGpu(const std::vector& shape); + + void Reshape(const int num, const int channels, const int height, const int width); + void Reshape(const std::vector& shape); + // // void Reshape(const BlobShape& shape); + // // void ReshapeLike(const Blob& other); + // void ReshapeLike(const ArrayCpuGpu& other); + std::string shape_string() const; + const std::vector& shape() const; + int shape(const int index) const; + int num_axes() const; + int count() const; + int count(const int start_axis, const int end_axis) const; + int count(const int start_axis) const; + + int CanonicalAxisIndex(const int axis_index) const; + + // int num() const; + // int channels() const; + // int height() const; + // int width() const; + // int LegacyShape(const int index) const; + + // int offset(const int n, const int c = 0, const int h = 0, const int w = 0) const; + // int offset(const std::vector& indices) const; + + // // void CopyFrom(const Blob& source, bool copy_diff = false, bool reshape = false); + // void CopyFrom(const ArrayCpuGpu& source, bool copy_diff = false, bool reshape = false); + + T data_at(const int n, const int c, const int h, const int w) const; + T diff_at(const int n, const int c, const int h, const int w) const; + // T data_at(const std::vector& index) const; // Caffe warning + // T diff_at(const std::vector& index) const; + + // // const boost::shared_ptr& data() const; + // // const boost::shared_ptr& diff() const; + + const T* cpu_data() const; + void set_cpu_data(T* data); + const int* gpu_shape() const; + const T* gpu_data() const; + void set_gpu_data(T* data); + const T* cpu_diff() const; + const T* gpu_diff() const; + T* mutable_cpu_data(); + T* mutable_gpu_data(); + T* mutable_cpu_diff(); + T* mutable_gpu_diff(); + void Update(); + // void FromProto(const BlobProto& proto, bool reshape = true); + // void ToProto(BlobProto* proto, bool write_diff = false) const; + + // T asum_data() const; + // T asum_diff() const; + // T sumsq_data() const; + // T sumsq_diff() const; + + // void scale_data(T scale_factor); + // void scale_diff(T 
scale_factor); + + // // void ShareData(const Blob& other); + // // void ShareDiff(const Blob& other); + + // // bool ShapeEquals(const BlobProto& other); + + private: + // PIMPL idiom + // http://www.cppsamples.com/common-tasks/pimpl.html + struct ImplArrayCpuGpu; + std::shared_ptr spImpl; + + // PIMP requires DELETE_COPY & destructor, or extra code + // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html + DELETE_COPY(ArrayCpuGpu); + }; + + // // Static methods + // OVERLOAD_C_OUT(ArrayCpuGpu) +} + +#endif // OPENPOSE_CORE_ARRAY_CPU_GPU_HPP diff --git a/include/openpose/core/common.hpp b/include/openpose/core/common.hpp index db270303..da05f9ed 100644 --- a/include/openpose/core/common.hpp +++ b/include/openpose/core/common.hpp @@ -8,6 +8,7 @@ #include // OpenPose most used classes #include +#include #include #include #include diff --git a/include/openpose/core/headers.hpp b/include/openpose/core/headers.hpp index 9c8dcaf0..c85f4512 100644 --- a/include/openpose/core/headers.hpp +++ b/include/openpose/core/headers.hpp @@ -3,6 +3,7 @@ // core module #include +#include #include #include #include diff --git a/include/openpose/core/macros.hpp b/include/openpose/core/macros.hpp index 052789d2..d4e99f83 100644 --- a/include/openpose/core/macros.hpp +++ b/include/openpose/core/macros.hpp @@ -54,6 +54,24 @@ const std::string OPEN_POSE_NAME_AND_VERSION = OPEN_POSE_NAME_STRING + " " + OPE template classType OP_API className; \ template classType OP_API className +// Instantiate a class with float and double specifications +#define COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, class) +#define COMPILE_TEMPLATE_FLOATING_TYPES_STRUCT(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, struct) +#define COMPILE_TEMPLATE_FLOATING_TYPES(className, classType) \ + char gInstantiationGuard##className; \ + template classType OP_API className; \ + template classType OP_API className + +// Instantiate a class with float and double specifications +#define COMPILE_TEMPLATE_FLOATING_INT_TYPES_CLASS(className) COMPILE_TEMPLATE_FLOATING_INT_TYPES(className, class) +#define COMPILE_TEMPLATE_FLOATING_INT_TYPES_STRUCT(className) COMPILE_TEMPLATE_FLOATING_INT_TYPES(className, struct) +#define COMPILE_TEMPLATE_FLOATING_INT_TYPES(className, classType) \ + char gInstantiationGuard##className; \ + template classType OP_API className; \ + template classType OP_API className; \ + template classType OP_API className; \ + template classType OP_API className + /** * cout operator overload calling toString() function * @return std::ostream containing output from toString() @@ -65,25 +83,9 @@ const std::string OPEN_POSE_NAME_AND_VERSION = OPEN_POSE_NAME_STRING + " " + OPE return ostream; \ } -// Instantiate a class with float and double specifications -#define COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, class) -#define COMPILE_TEMPLATE_FLOATING_TYPES_STRUCT(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, struct) -#define COMPILE_TEMPLATE_FLOATING_TYPES(className, classType) \ - char gInstantiationGuard##className; \ - template classType OP_API className; \ - template classType OP_API className - // PIMPL does not work if function arguments need the 3rd-party class. 
Alternative: // stackoverflow.com/questions/13978775/how-to-avoid-include-dependency-to-external-library?answertab=active#tab-top struct dim3; -namespace caffe -{ - template class Blob; -} -namespace boost -{ - template class shared_ptr; // E.g., boost::shared_ptr> -} // Compabitility for OpenCV 4.0 while preserving 2.4.X and 3.X compatibility // Note: diff --git a/include/openpose/net/bodyPartConnectorCaffe.hpp b/include/openpose/net/bodyPartConnectorCaffe.hpp index 51775332..4562ee29 100644 --- a/include/openpose/net/bodyPartConnectorCaffe.hpp +++ b/include/openpose/net/bodyPartConnectorCaffe.hpp @@ -17,7 +17,7 @@ namespace op virtual ~BodyPartConnectorCaffe(); - virtual void Reshape(const std::vector*>& bottom, const int gpuID = 0); + virtual void Reshape(const std::vector*>& bottom, const int gpuID = 0); virtual inline const char* type() const { return "BodyPartConnector"; } @@ -35,23 +35,23 @@ namespace op void setScaleNetToOutput(const T scaleNetToOutput); - virtual void Forward(const std::vector*>& bottom, Array& poseKeypoints, + virtual void Forward(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores); - virtual void Forward_cpu(const std::vector*>& bottom, Array& poseKeypoints, + virtual void Forward_cpu(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores); - virtual void Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, + virtual void Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores); - virtual void Forward_ocl(const std::vector*>& bottom, Array& poseKeypoints, + virtual void Forward_ocl(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores); - virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); - virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); private: PoseModel mPoseModel; diff --git a/include/openpose/net/maximumCaffe.hpp b/include/openpose/net/maximumCaffe.hpp index d1789490..8ecfcc4b 100644 --- a/include/openpose/net/maximumCaffe.hpp +++ b/include/openpose/net/maximumCaffe.hpp @@ -16,23 +16,23 @@ namespace op virtual ~MaximumCaffe(); - virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); + virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); - virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top); + virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top); virtual inline const char* type() const { return "Maximum"; } - virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, + const 
std::vector*>& bottom); - virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); private: std::array mBottomSize; diff --git a/include/openpose/net/net.hpp b/include/openpose/net/net.hpp index d595327a..5e44aaed 100644 --- a/include/openpose/net/net.hpp +++ b/include/openpose/net/net.hpp @@ -13,6 +13,8 @@ namespace op virtual void initializationOnThread() = 0; virtual void forwardPass(const Array& inputData) const = 0; + + virtual std::shared_ptr> getOutputBlobArray() const = 0; }; } diff --git a/include/openpose/net/netCaffe.hpp b/include/openpose/net/netCaffe.hpp index 699bf29f..ffe131e8 100644 --- a/include/openpose/net/netCaffe.hpp +++ b/include/openpose/net/netCaffe.hpp @@ -18,7 +18,7 @@ namespace op void forwardPass(const Array& inputNetData) const; - boost::shared_ptr> getOutputBlob() const; + std::shared_ptr> getOutputBlobArray() const; private: // PIMPL idiom diff --git a/include/openpose/net/netOpenCv.hpp b/include/openpose/net/netOpenCv.hpp index 173d0b50..3a767a7c 100644 --- a/include/openpose/net/netOpenCv.hpp +++ b/include/openpose/net/netOpenCv.hpp @@ -17,7 +17,7 @@ namespace op void forwardPass(const Array& inputNetData) const; - boost::shared_ptr> getOutputBlob() const; + std::shared_ptr> getOutputBlobArray() const; private: // PIMPL idiom diff --git a/include/openpose/net/nmsCaffe.hpp b/include/openpose/net/nmsCaffe.hpp index 44bf227e..f91eb85e 100644 --- a/include/openpose/net/nmsCaffe.hpp +++ b/include/openpose/net/nmsCaffe.hpp @@ -16,9 +16,9 @@ namespace op virtual ~NmsCaffe(); - virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); + virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); - virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top, + virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top, const int maxPeaks, const int outputChannels = -1, const int gpuID = 0); virtual inline const char* type() const { return "Nms"; } @@ -28,19 +28,19 @@ namespace op // Empirically gives better results (copied from Matlab original code) void setOffset(const Point& offset); - virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_ocl(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_ocl(const std::vector*>& bottom, const std::vector*>& top); - virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); - virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); private: T mThreshold; diff --git a/include/openpose/net/resizeAndMergeCaffe.hpp 
b/include/openpose/net/resizeAndMergeCaffe.hpp index 30168be6..b28b94a9 100644 --- a/include/openpose/net/resizeAndMergeCaffe.hpp +++ b/include/openpose/net/resizeAndMergeCaffe.hpp @@ -16,9 +16,9 @@ namespace op virtual ~ResizeAndMergeCaffe(); - virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); + virtual void LayerSetUp(const std::vector*>& bottom, const std::vector*>& top); - virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top, + virtual void Reshape(const std::vector*>& bottom, const std::vector*>& top, const T netFactor, const T scaleFactor, const bool mergeFirstDimension = true, const int gpuID = 0); @@ -26,19 +26,19 @@ namespace op void setScaleRatios(const std::vector& scaleRatios); - virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_cpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top); - virtual void Forward_ocl(const std::vector*>& bottom, const std::vector*>& top); + virtual void Forward_ocl(const std::vector*>& bottom, const std::vector*>& top); - virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); - virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom); + virtual void Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom); private: std::vector mTempGPUData; diff --git a/src/openpose/core/CMakeLists.txt b/src/openpose/core/CMakeLists.txt index 98916c06..2ddcc43a 100644 --- a/src/openpose/core/CMakeLists.txt +++ b/src/openpose/core/CMakeLists.txt @@ -1,6 +1,7 @@ set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;mm;CPP;cl) set(SOURCES_OP_CORE array.cpp + arrayCpuGpu.cpp cvMatToOpInput.cpp cvMatToOpOutput.cpp datum.cpp diff --git a/src/openpose/core/arrayCpuGpu.cpp b/src/openpose/core/arrayCpuGpu.cpp new file mode 100644 index 00000000..90e025b8 --- /dev/null +++ b/src/openpose/core/arrayCpuGpu.cpp @@ -0,0 +1,581 @@ +#ifdef USE_CAFFE + #include +#endif +#include +#include + +namespace op +{ + template + struct ArrayCpuGpu::ImplArrayCpuGpu + { + #ifdef USE_CAFFE + std::unique_ptr> upCaffeBlobT; + caffe::Blob* pCaffeBlobT; + #endif + }; + + const std::string constructorErrorMessage = "ArrayCpuGpu class only implemented for Caffe DL framework (enable" + " `USE_CAFFE` in CMake-GUI)."; + template + ArrayCpuGpu::ArrayCpuGpu() + { + try + { + #ifdef USE_CAFFE + spImpl.reset(new ImplArrayCpuGpu{}); + spImpl->upCaffeBlobT.reset(new caffe::Blob{}); + spImpl->pCaffeBlobT = spImpl->upCaffeBlobT.get(); + #else + error(constructorErrorMessage, __LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + ArrayCpuGpu::ArrayCpuGpu(const void* caffeBlobTPtr) + { + try + { + #ifdef USE_CAFFE + spImpl.reset(new ImplArrayCpuGpu{}); + spImpl->pCaffeBlobT = (caffe::Blob*)caffeBlobTPtr; + #else + UNUSED(caffeBlobTPtr); + error(constructorErrorMessage, __LINE__, __FUNCTION__, 
__FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + ArrayCpuGpu::ArrayCpuGpu(const int num, const int channels, const int height, const int width) + { + try + { + #ifdef USE_CAFFE + spImpl.reset(new ImplArrayCpuGpu{}); + spImpl->upCaffeBlobT.reset(new caffe::Blob{num, channels, height, width}); + spImpl->pCaffeBlobT = spImpl->upCaffeBlobT.get(); + #else + UNUSED(num); + UNUSED(channels); + UNUSED(height); + UNUSED(width); + error(constructorErrorMessage, __LINE__, __FUNCTION__, __FILE__); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + // template + // ArrayCpuGpu::ArrayCpuGpu(const std::vector& shape) + // { + // try + // { + // #ifdef USE_CAFFE + // spImpl.reset(new ImplArrayCpuGpu{}); + // spImpl->upCaffeBlobT.reset(new caffe::Blob{shape}); + // spImpl->pCaffeBlobT = spImpl->upCaffeBlobT.get(); + // #else + // error(constructorErrorMessage, __LINE__, __FUNCTION__, __FILE__); + // #endif + // } + // catch (const std::exception& e) + // { + // error(e.what(), __LINE__, __FUNCTION__, __FILE__); + // } + // } + + template + void ArrayCpuGpu::Reshape(const int num, const int channels, const int height, const int width) + { + try + { + #ifdef USE_CAFFE + spImpl->pCaffeBlobT->Reshape(num, channels, height, width); + #else + UNUSED(num); + UNUSED(channels); + UNUSED(height); + UNUSED(width); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + void ArrayCpuGpu::Reshape(const std::vector& shape) + { + try + { + #ifdef USE_CAFFE + spImpl->pCaffeBlobT->Reshape(shape); + #else + UNUSED(shape); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + std::string ArrayCpuGpu::shape_string() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->shape_string(); + #else + return ""; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return ""; + } + } + + std::vector DUMB_VECTOR; + template + const std::vector& ArrayCpuGpu::shape() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->shape(); + #else + return DUMB_VECTOR; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return DUMB_VECTOR; + } + } + + template + int ArrayCpuGpu::shape(const int index) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->shape(index); + #else + UNUSED(index); + return -1; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + int ArrayCpuGpu::num_axes() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->num_axes(); + #else + return -1; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + int ArrayCpuGpu::count() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->count(); + #else + return -1; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + int ArrayCpuGpu::count(const int start_axis, const int end_axis) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->count(start_axis, end_axis); + #else + UNUSED(start_axis); + UNUSED(end_axis); + return -1; + #endif + } + 
catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + int ArrayCpuGpu::count(const int start_axis) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->count(start_axis); + #else + UNUSED(start_axis); + return -1; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + int ArrayCpuGpu::CanonicalAxisIndex(const int axis_index) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->CanonicalAxisIndex(axis_index); + #else + UNUSED(axis_index); + return -1; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return -1; + } + } + + template + T ArrayCpuGpu::data_at(const int n, const int c, const int h, const int w) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->data_at(n, c, h, w); + #else + UNUSED(n); + UNUSED(c); + UNUSED(h); + UNUSED(w); + return T{0}; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return T{0}; + } + } + + template + T ArrayCpuGpu::diff_at(const int n, const int c, const int h, const int w) const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->diff_at(n, c, h, w); + #else + UNUSED(n); + UNUSED(c); + UNUSED(h); + UNUSED(w); + return T{0}; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return T{0}; + } + } + + // template + // T ArrayCpuGpu::data_at(const std::vector& index) const + // { + // try + // { + // #ifdef USE_CAFFE + // return spImpl->pCaffeBlobT->data_at(index); + // #else + // UNUSED(index); + // return T{0}; + // #endif + // } + // catch (const std::exception& e) + // { + // error(e.what(), __LINE__, __FUNCTION__, __FILE__); + // return T{0}; + // } + // } + + // template + // T ArrayCpuGpu::diff_at(const std::vector& index) const + // { + // try + // { + // #ifdef USE_CAFFE + // return spImpl->pCaffeBlobT->diff_at(index); + // #else + // UNUSED(index); + // return T{0}; + // #endif + // } + // catch (const std::exception& e) + // { + // error(e.what(), __LINE__, __FUNCTION__, __FILE__); + // return T{0}; + // } + // } + + template + const T* ArrayCpuGpu::cpu_data() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->cpu_data(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + void ArrayCpuGpu::set_cpu_data(T* data) + { + try + { + #ifdef USE_CAFFE + spImpl->pCaffeBlobT->set_cpu_data(data); + #else + UNUSED(data); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + const int* ArrayCpuGpu::gpu_shape() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->gpu_shape(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + const T* ArrayCpuGpu::gpu_data() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->gpu_data(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + void ArrayCpuGpu::set_gpu_data(T* data) + { + try + { + #ifdef USE_CAFFE + spImpl->pCaffeBlobT->set_gpu_data(data); + #else + UNUSED(data); + 
#endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + template + const T* ArrayCpuGpu::cpu_diff() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->cpu_diff(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + const T* ArrayCpuGpu::gpu_diff() const + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->gpu_diff(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + T* ArrayCpuGpu::mutable_cpu_data() + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->mutable_cpu_data(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + T* ArrayCpuGpu::mutable_gpu_data() + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->mutable_gpu_data(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + T* ArrayCpuGpu::mutable_cpu_diff() + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->mutable_cpu_diff(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + T* ArrayCpuGpu::mutable_gpu_diff() + { + try + { + #ifdef USE_CAFFE + return spImpl->pCaffeBlobT->mutable_gpu_diff(); + #else + return nullptr; + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return nullptr; + } + } + + template + void ArrayCpuGpu::Update() + { + try + { + #ifdef USE_CAFFE + spImpl->pCaffeBlobT->Update(); + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } + } + + COMPILE_TEMPLATE_FLOATING_INT_TYPES_CLASS(ArrayCpuGpu); +} diff --git a/src/openpose/face/faceExtractorCaffe.cpp b/src/openpose/face/faceExtractorCaffe.cpp index a3a82ddc..968e2dcf 100644 --- a/src/openpose/face/faceExtractorCaffe.cpp +++ b/src/openpose/face/faceExtractorCaffe.cpp @@ -22,9 +22,9 @@ namespace op std::shared_ptr> spResizeAndMergeCaffe; std::shared_ptr> spMaximumCaffe; // Init with thread - boost::shared_ptr> spCaffeNetOutputBlob; - std::shared_ptr> spHeatMapsBlob; - std::shared_ptr> spPeaksBlob; + std::shared_ptr> spCaffeNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; ImplFaceExtractorCaffe(const std::string& modelFolder, const int gpuId, const bool enableGoogleLogging) : netInitialized{false}, @@ -78,17 +78,19 @@ namespace op inline void reshapeFaceExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, std::shared_ptr>& maximumCaffe, - boost::shared_ptr>& caffeNetOutputBlob, - std::shared_ptr>& heatMapsBlob, - std::shared_ptr>& peaksBlob, + std::shared_ptr>& caffeNetOutputBlob, + std::shared_ptr>& heatMapsBlob, + std::shared_ptr>& peaksBlob, const int gpuID) { try { // HeatMaps extractor blob and layer const bool mergeFirstDimension = true; - resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()}, - FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID); + resizeAndMergeCaffe->Reshape( + std::vector*>{caffeNetOutputBlob.get()}, + std::vector*>{heatMapsBlob.get()}, + FACE_CCN_DECREASE_FACTOR, 
1.f, mergeFirstDimension, gpuID); // Pose extractor blob and layer maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}); // Cuda check @@ -153,9 +155,9 @@ namespace op cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Initialize blobs - upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlob(); - upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlobArray(); + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif diff --git a/src/openpose/hand/handExtractorCaffe.cpp b/src/openpose/hand/handExtractorCaffe.cpp index f92708c5..b1045c3d 100644 --- a/src/openpose/hand/handExtractorCaffe.cpp +++ b/src/openpose/hand/handExtractorCaffe.cpp @@ -1,4 +1,4 @@ -#if defined USE_CAFFE +#ifdef USE_CAFFE #include #endif #include // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR @@ -16,16 +16,16 @@ namespace op { struct HandExtractorCaffe::ImplHandExtractorCaffe { - #if defined USE_CAFFE + #ifdef USE_CAFFE bool netInitialized; const int mGpuId; std::shared_ptr spNetCaffe; std::shared_ptr> spResizeAndMergeCaffe; std::shared_ptr> spMaximumCaffe; // Init with thread - boost::shared_ptr> spCaffeNetOutputBlob; - std::shared_ptr> spHeatMapsBlob; - std::shared_ptr> spPeaksBlob; + std::shared_ptr> spCaffeNetOutputBlob; + std::shared_ptr> spHeatMapsBlob; + std::shared_ptr> spPeaksBlob; ImplHandExtractorCaffe(const std::string& modelFolder, const int gpuId, const bool enableGoogleLogging) : @@ -40,7 +40,7 @@ namespace op #endif }; - #if defined USE_CAFFE + #ifdef USE_CAFFE void cropFrame(Array& handImageCrop, cv::Mat& affineMatrix, const cv::Mat& cvInputData, const Rectangle& handRectangle, const int netInputSide, const Point& netOutputSize, const bool mirrorImage) @@ -162,17 +162,19 @@ namespace op inline void reshapeHandExtractorCaffe(std::shared_ptr>& resizeAndMergeCaffe, std::shared_ptr>& maximumCaffe, - boost::shared_ptr>& caffeNetOutputBlob, - std::shared_ptr>& heatMapsBlob, - std::shared_ptr>& peaksBlob, + std::shared_ptr>& caffeNetOutputBlob, + std::shared_ptr>& heatMapsBlob, + std::shared_ptr>& peaksBlob, const int gpuID) { try { // HeatMaps extractor blob and layer const bool mergeFirstDimension = true; - resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()}, - HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID); + resizeAndMergeCaffe->Reshape( + std::vector*>{caffeNetOutputBlob.get()}, + std::vector*>{heatMapsBlob.get()}, + HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID); // Pose extractor blob and layer maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}); // Cuda check @@ -194,7 +196,7 @@ namespace op const ScaleMode heatMapScaleMode, const bool enableGoogleLogging) : HandExtractorNet{netInputSize, netOutputSize, numberScales, rangeScales, heatMapTypes, heatMapScaleMode} - #if defined USE_CAFFE + #ifdef USE_CAFFE , upImpl{new ImplHandExtractorCaffe{modelFolder, gpuId, enableGoogleLogging}} #endif { @@ -228,19 +230,19 @@ namespace op { try { - #if defined USE_CAFFE + #ifdef USE_CAFFE // Logging log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); // Initialize Caffe net upImpl->spNetCaffe->initializationOnThread(); - #if defined USE_CUDA + #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Initialize blobs - upImpl->spCaffeNetOutputBlob = 
upImpl->spNetCaffe->getOutputBlob(); - upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; - #if defined USE_CUDA + upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlobArray(); + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Logging @@ -258,7 +260,7 @@ namespace op { try { - #if defined USE_CAFFE + #ifdef USE_CAFFE if (mEnabled && !handRectangles.empty()) { // Sanity check @@ -409,7 +411,7 @@ namespace op { try { - #if defined USE_CAFFE + #ifdef USE_CAFFE // 1. Deep net upImpl->spNetCaffe->forwardPass(mHandImageCrop); diff --git a/src/openpose/net/bodyPartConnectorCaffe.cpp b/src/openpose/net/bodyPartConnectorCaffe.cpp index 9ce8fcea..9312d962 100644 --- a/src/openpose/net/bodyPartConnectorCaffe.cpp +++ b/src/openpose/net/bodyPartConnectorCaffe.cpp @@ -53,7 +53,7 @@ namespace op } template - void BodyPartConnectorCaffe::Reshape(const std::vector*>& bottom, const int gpuID) + void BodyPartConnectorCaffe::Reshape(const std::vector*>& bottom, const int gpuID) { try { @@ -73,6 +73,7 @@ namespace op mGpuID = gpuID; #else UNUSED(bottom); + UNUSED(gpuID); #endif } catch (const std::exception& e) @@ -173,7 +174,7 @@ namespace op } template - void BodyPartConnectorCaffe::Forward(const std::vector*>& bottom, Array& poseKeypoints, + void BodyPartConnectorCaffe::Forward(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores) { try @@ -196,7 +197,7 @@ namespace op } template - void BodyPartConnectorCaffe::Forward_cpu(const std::vector*>& bottom, Array& poseKeypoints, + void BodyPartConnectorCaffe::Forward_cpu(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores) { try @@ -213,6 +214,7 @@ namespace op #else UNUSED(bottom); UNUSED(poseKeypoints); + UNUSED(poseScores); #endif } catch (const std::exception& e) @@ -222,7 +224,7 @@ namespace op } template - void BodyPartConnectorCaffe::Forward_ocl(const std::vector*>& bottom, Array& poseKeypoints, + void BodyPartConnectorCaffe::Forward_ocl(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores) { try @@ -298,7 +300,7 @@ namespace op } template - void BodyPartConnectorCaffe::Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, + void BodyPartConnectorCaffe::Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores) { try @@ -373,9 +375,9 @@ namespace op } template - void BodyPartConnectorCaffe::Backward_cpu(const std::vector*>& top, + void BodyPartConnectorCaffe::Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { @@ -393,9 +395,9 @@ namespace op } template - void BodyPartConnectorCaffe::Backward_gpu(const std::vector*>& top, + void BodyPartConnectorCaffe::Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { diff --git a/src/openpose/net/maximumCaffe.cpp b/src/openpose/net/maximumCaffe.cpp index b5488914..58eecebb 100644 --- a/src/openpose/net/maximumCaffe.cpp +++ b/src/openpose/net/maximumCaffe.cpp @@ -28,8 +28,8 @@ namespace op } template - void MaximumCaffe::LayerSetUp(const std::vector*>& bottom, - const std::vector*>& top) + void MaximumCaffe::LayerSetUp(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -50,8 +50,8 @@ namespace op } template - void MaximumCaffe::Reshape(const 
std::vector*>& bottom, - const std::vector*>& top) + void MaximumCaffe::Reshape(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -86,8 +86,8 @@ namespace op } template - void MaximumCaffe::Forward(const std::vector*>& bottom, - const std::vector*>& top) + void MaximumCaffe::Forward(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -108,8 +108,8 @@ namespace op } template - void MaximumCaffe::Forward_cpu(const std::vector*>& bottom, - const std::vector*>& top) + void MaximumCaffe::Forward_cpu(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -127,8 +127,8 @@ namespace op } template - void MaximumCaffe::Forward_gpu(const std::vector*>& bottom, - const std::vector*>& top) + void MaximumCaffe::Forward_gpu(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -148,9 +148,9 @@ namespace op } template - void MaximumCaffe::Backward_cpu(const std::vector*>& top, + void MaximumCaffe::Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { @@ -168,9 +168,9 @@ namespace op } template - void MaximumCaffe::Backward_gpu(const std::vector*>& top, + void MaximumCaffe::Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { diff --git a/src/openpose/net/netCaffe.cpp b/src/openpose/net/netCaffe.cpp index ed7c86dd..39a50417 100644 --- a/src/openpose/net/netCaffe.cpp +++ b/src/openpose/net/netCaffe.cpp @@ -44,43 +44,50 @@ namespace op mCaffeTrainedModel{caffeTrainedModel}, mLastBlobName{lastBlobName} { - const std::string message{".\nPossible causes:\n\t1. Not downloading the OpenPose trained models." - "\n\t2. Not running OpenPose from the same directory where the `model`" - " folder is located.\n\t3. Using paths with spaces."}; - if (!existFile(mCaffeProto)) - error("Prototxt file not found: " + mCaffeProto + message, __LINE__, __FUNCTION__, __FILE__); - if (!existFile(mCaffeTrainedModel)) - error("Caffe trained model file not found: " + mCaffeTrainedModel + message, - __LINE__, __FUNCTION__, __FILE__); - // Double if condition in order to speed up the program if it is called several times - if (enableGoogleLogging && !sGoogleLoggingInitialized) + try { - std::lock_guard lock{sMutexNetCaffe}; + const std::string message{".\nPossible causes:\n\t1. Not downloading the OpenPose trained models." + "\n\t2. Not running OpenPose from the same directory where the `model`" + " folder is located.\n\t3. 
Using paths with spaces."}; + if (!existFile(mCaffeProto)) + error("Prototxt file not found: " + mCaffeProto + message, __LINE__, __FUNCTION__, __FILE__); + if (!existFile(mCaffeTrainedModel)) + error("Caffe trained model file not found: " + mCaffeTrainedModel + message, + __LINE__, __FUNCTION__, __FILE__); + // Double if condition in order to speed up the program if it is called several times if (enableGoogleLogging && !sGoogleLoggingInitialized) - { - google::InitGoogleLogging("OpenPose"); - sGoogleLoggingInitialized = true; - } - } - #ifdef USE_OPENCL - // Initialize OpenCL - if (!sOpenCLInitialized) { std::lock_guard lock{sMutexNetCaffe}; - if (!sOpenCLInitialized) + if (enableGoogleLogging && !sGoogleLoggingInitialized) { - caffe::Caffe::set_mode(caffe::Caffe::GPU); - std::vector devices; - const int maxNumberGpu = OpenCL::getTotalGPU(); - for (auto i = 0; i < maxNumberGpu; i++) - devices.emplace_back(i); - caffe::Caffe::SetDevices(devices); - if (mGpuId >= maxNumberGpu) - error("Unexpected error. Please, notify us.", __LINE__, __FUNCTION__, __FILE__); - sOpenCLInitialized = true; + google::InitGoogleLogging("OpenPose"); + sGoogleLoggingInitialized = true; } } - #endif + #ifdef USE_OPENCL + // Initialize OpenCL + if (!sOpenCLInitialized) + { + std::lock_guard lock{sMutexNetCaffe}; + if (!sOpenCLInitialized) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + std::vector devices; + const int maxNumberGpu = OpenCL::getTotalGPU(); + for (auto i = 0; i < maxNumberGpu; i++) + devices.emplace_back(i); + caffe::Caffe::SetDevices(devices); + if (mGpuId >= maxNumberGpu) + error("Unexpected error. Please, notify us.", __LINE__, __FUNCTION__, __FILE__); + sOpenCLInitialized = true; + } + } + #endif + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + } } #endif }; @@ -113,10 +120,10 @@ namespace op try { #ifndef USE_CAFFE - UNUSED(netInputSize4D); UNUSED(caffeProto); UNUSED(caffeTrainedModel); UNUSED(gpuId); + UNUSED(enableGoogleLogging); UNUSED(lastBlobName); error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this" " functionality.", __LINE__, __FUNCTION__, __FILE__); @@ -166,6 +173,7 @@ namespace op #endif // Set spOutputBlob upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName); + // Sanity check if (upImpl->spOutputBlob == nullptr) error("The output blob is a nullptr. Did you use the same name than the prototxt? 
(Used: " + upImpl->mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__); @@ -227,12 +235,12 @@ namespace op } } - boost::shared_ptr> NetCaffe::getOutputBlob() const + std::shared_ptr> NetCaffe::getOutputBlobArray() const { try { #ifdef USE_CAFFE - return upImpl->spOutputBlob; + return std::make_shared>(upImpl->spOutputBlob.get()); #else return nullptr; #endif diff --git a/src/openpose/net/netOpenCv.cpp b/src/openpose/net/netOpenCv.cpp index e1fb86d7..63edb9da 100644 --- a/src/openpose/net/netOpenCv.cpp +++ b/src/openpose/net/netOpenCv.cpp @@ -31,7 +31,6 @@ namespace op // OpenCV DNN cv::dnn::Net mNet; cv::Mat mNetOutputBlob; - // std::shared_ptr> spOutputBlob; boost::shared_ptr> spOutputBlob; ImplNetOpenCv(const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId) : @@ -39,7 +38,6 @@ namespace op mCaffeProto{caffeProto}, mCaffeTrainedModel{caffeTrainedModel}, mNet{cv::dnn::readNetFromCaffe(caffeProto, caffeTrainedModel)}, - // spOutputBlob{std::make_shared>(1,1,1,1)} spOutputBlob{new caffe::Blob(1,1,1,1)} { const std::string message{".\nPossible causes:\n\t1. Not downloading the OpenPose trained models." @@ -139,12 +137,12 @@ namespace op } } - boost::shared_ptr> NetOpenCv::getOutputBlob() const + std::shared_ptr> NetOpenCv::getOutputBlobArray() const { try { #ifdef USE_OPEN_CV_DNN - return upImpl->spOutputBlob; + return std::make_shared>(upImpl->spOutputBlob.get()); #else return nullptr; #endif diff --git a/src/openpose/net/nmsCaffe.cpp b/src/openpose/net/nmsCaffe.cpp index d68f776c..89609acb 100644 --- a/src/openpose/net/nmsCaffe.cpp +++ b/src/openpose/net/nmsCaffe.cpp @@ -14,12 +14,12 @@ namespace op struct NmsCaffe::ImplNmsCaffe { #ifdef USE_CAFFE - caffe::Blob mKernelBlob; + ArrayCpuGpu mKernelBlob; std::array mBottomSize; std::array mTopSize; // Special Kernel for OpenCL NMS #if defined USE_CAFFE && defined USE_OPENCL - //std::shared_ptr> mKernelBlobT; + //std::shared_ptr> mKernelBlobT; uint8_t* mKernelGpuPtr; uint8_t* mKernelCpuPtr; #endif @@ -65,7 +65,7 @@ namespace op } template - void NmsCaffe::LayerSetUp(const std::vector*>& bottom, const std::vector*>& top) + void NmsCaffe::LayerSetUp(const std::vector*>& bottom, const std::vector*>& top) { try { @@ -86,7 +86,7 @@ namespace op } template - void NmsCaffe::Reshape(const std::vector*>& bottom, const std::vector*>& top, + void NmsCaffe::Reshape(const std::vector*>& bottom, const std::vector*>& top, const int maxPeaks, const int outputChannels, const int gpuID) { try @@ -127,6 +127,8 @@ namespace op UNUSED(bottom); UNUSED(top); UNUSED(maxPeaks); + UNUSED(outputChannels); + UNUSED(gpuID); #endif } catch (const std::exception& e) @@ -162,7 +164,7 @@ namespace op } template - void NmsCaffe::Forward(const std::vector*>& bottom, const std::vector*>& top) + void NmsCaffe::Forward(const std::vector*>& bottom, const std::vector*>& top) { try { @@ -184,7 +186,7 @@ namespace op } template - void NmsCaffe::Forward_cpu(const std::vector*>& bottom, const std::vector*>& top) + void NmsCaffe::Forward_cpu(const std::vector*>& bottom, const std::vector*>& top) { try { @@ -203,7 +205,7 @@ namespace op } template - void NmsCaffe::Forward_gpu(const std::vector*>& bottom, const std::vector*>& top) + void NmsCaffe::Forward_gpu(const std::vector*>& bottom, const std::vector*>& top) { try { @@ -224,7 +226,7 @@ namespace op } template - void NmsCaffe::Forward_ocl(const std::vector*>& bottom, const std::vector*>& top) + void NmsCaffe::Forward_ocl(const std::vector*>& bottom, const std::vector*>& top) { try { @@ -246,8 +248,8 @@ 
namespace op } template - void NmsCaffe::Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + void NmsCaffe::Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom) { try { @@ -265,8 +267,8 @@ namespace op } template - void NmsCaffe::Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + void NmsCaffe::Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, + const std::vector*>& bottom) { try { diff --git a/src/openpose/net/resizeAndMergeCaffe.cpp b/src/openpose/net/resizeAndMergeCaffe.cpp index 8e0d98c4..192cb087 100644 --- a/src/openpose/net/resizeAndMergeCaffe.cpp +++ b/src/openpose/net/resizeAndMergeCaffe.cpp @@ -33,8 +33,8 @@ namespace op } template - void ResizeAndMergeCaffe::LayerSetUp(const std::vector*>& bottom, - const std::vector*>& top) + void ResizeAndMergeCaffe::LayerSetUp(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -56,7 +56,7 @@ namespace op template void ResizeAndMergeCaffe::Reshape( - const std::vector*>& bottom, const std::vector*>& top, const T netFactor, + const std::vector*>& bottom, const std::vector*>& top, const T netFactor, const T scaleFactor, const bool mergeFirstDimension, const int gpuID) { try @@ -100,6 +100,7 @@ namespace op UNUSED(netFactor); UNUSED(scaleFactor); UNUSED(mergeFirstDimension); + UNUSED(gpuID); #endif } catch (const std::exception& e) @@ -122,8 +123,8 @@ namespace op } template - void ResizeAndMergeCaffe::Forward(const std::vector*>& bottom, - const std::vector*>& top) + void ResizeAndMergeCaffe::Forward(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -145,8 +146,8 @@ namespace op } template - void ResizeAndMergeCaffe::Forward_cpu(const std::vector*>& bottom, - const std::vector*>& top) + void ResizeAndMergeCaffe::Forward_cpu(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -168,8 +169,8 @@ namespace op } template - void ResizeAndMergeCaffe::Forward_gpu(const std::vector*>& bottom, - const std::vector*>& top) + void ResizeAndMergeCaffe::Forward_gpu(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -193,8 +194,8 @@ namespace op } template - void ResizeAndMergeCaffe::Forward_ocl(const std::vector*>& bottom, - const std::vector*>& top) + void ResizeAndMergeCaffe::Forward_ocl(const std::vector*>& bottom, + const std::vector*>& top) { try { @@ -218,9 +219,9 @@ namespace op } template - void ResizeAndMergeCaffe::Backward_cpu(const std::vector*>& top, + void ResizeAndMergeCaffe::Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { @@ -238,9 +239,9 @@ namespace op } template - void ResizeAndMergeCaffe::Backward_gpu(const std::vector*>& top, + void ResizeAndMergeCaffe::Backward_gpu(const std::vector*>& top, const std::vector& propagate_down, - const std::vector*>& bottom) + const std::vector*>& bottom) { try { diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp index 236bbfbe..7422a5a6 100644 --- a/src/openpose/pose/poseExtractorCaffe.cpp +++ b/src/openpose/pose/poseExtractorCaffe.cpp @@ -44,10 +44,10 @@ namespace op std::vector> mNetInput4DSizes; std::vector mScaleInputToNetInputs; // Init with thread - std::vector>> spCaffeNetOutputBlobs; - std::shared_ptr> spHeatMapsBlob; - std::shared_ptr> spPeaksBlob; - std::shared_ptr> spMaximumPeaksBlob; + std::vector>> 
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index 236bbfbe..7422a5a6 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -44,10 +44,10 @@ namespace op
         std::vector<std::array<int, 4>> mNetInput4DSizes;
         std::vector<double> mScaleInputToNetInputs;
         // Init with thread
-        std::vector<boost::shared_ptr<caffe::Blob<float>>> spCaffeNetOutputBlobs;
-        std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
-        std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
-        std::shared_ptr<caffe::Blob<float>> spMaximumPeaksBlob;
+        std::vector<std::shared_ptr<ArrayCpuGpu<float>>> spCaffeNetOutputBlobs;
+        std::shared_ptr<ArrayCpuGpu<float>> spHeatMapsBlob;
+        std::shared_ptr<ArrayCpuGpu<float>> spPeaksBlob;
+        std::shared_ptr<ArrayCpuGpu<float>> spMaximumPeaksBlob;

         ImplPoseExtractorCaffe(
             const PoseModel poseModel, const int gpuId, const std::string& modelFolder,
@@ -69,13 +69,13 @@ namespace op
     };

     #ifdef USE_CAFFE
-        std::vector<caffe::Blob<float>*> caffeNetSharedToPtr(
-            std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob)
+        std::vector<ArrayCpuGpu<float>*> arraySharedToPtr(
+            const std::vector<std::shared_ptr<ArrayCpuGpu<float>>>& caffeNetOutputBlob)
        {
            try
            {
                // Prepare spCaffeNetOutputBlobss
-                std::vector<caffe::Blob<float>*> caffeNetOutputBlobs(caffeNetOutputBlob.size());
+                std::vector<ArrayCpuGpu<float>*> caffeNetOutputBlobs(caffeNetOutputBlob.size());
                for (auto i = 0u ; i < caffeNetOutputBlobs.size() ; i++)
                    caffeNetOutputBlobs[i] = caffeNetOutputBlob[i].get();
                return caffeNetOutputBlobs;
@@ -87,26 +87,27 @@ namespace op
         }
     }

-    inline void reshapePoseExtractorCaffe(std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe,
-                                          std::shared_ptr<NmsCaffe<float>>& nmsCaffe,
-                                          std::shared_ptr<BodyPartConnectorCaffe<float>>& bodyPartConnectorCaffe,
-                                          std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
-                                          std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob,
-                                          std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
-                                          std::shared_ptr<caffe::Blob<float>>& peaksBlob,
-                                          std::shared_ptr<caffe::Blob<float>>& maximumPeaksBlob,
-                                          const float scaleInputToNetInput,
-                                          const PoseModel poseModel,
-                                          const int gpuID)
+    inline void reshapePoseExtractorCaffe(
+        std::shared_ptr<ResizeAndMergeCaffe<float>>& resizeAndMergeCaffe,
+        std::shared_ptr<NmsCaffe<float>>& nmsCaffe,
+        std::shared_ptr<BodyPartConnectorCaffe<float>>& bodyPartConnectorCaffe,
+        std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
+        std::vector<std::shared_ptr<ArrayCpuGpu<float>>>& caffeNetOutputBlobsShared,
+        std::shared_ptr<ArrayCpuGpu<float>>& heatMapsBlob,
+        std::shared_ptr<ArrayCpuGpu<float>>& peaksBlob,
+        std::shared_ptr<ArrayCpuGpu<float>>& maximumPeaksBlob,
+        const float scaleInputToNetInput,
+        const PoseModel poseModel,
+        const int gpuID)
     {
         try
         {
             // HeatMaps extractor blob and layer
             // Caffe modifies bottom - Heatmap gets resized
-            const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
-            resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
-                                         getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput, true,
-                                         gpuID);
+            const auto caffeNetOutputBlobs = arraySharedToPtr(caffeNetOutputBlobsShared);
+            resizeAndMergeCaffe->Reshape(
+                caffeNetOutputBlobs, {heatMapsBlob.get()},
+                getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput, true, gpuID);
             // Pose extractor blob and layer
             nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, getPoseMaxPeaks(),
                               getPoseNumberBodyParts(poseModel), gpuID);
@@ -127,7 +128,7 @@ namespace op

     void addCaffeNetOnThread(
         std::vector<std::shared_ptr<Net>>& net,
-        std::vector<boost::shared_ptr<caffe::Blob<float>>>& caffeNetOutputBlob,
+        std::vector<std::shared_ptr<ArrayCpuGpu<float>>>& caffeNetOutputBlob,
         const PoseModel poseModel, const int gpuId, const std::string& modelFolder,
         const std::string& protoTxtPath, const std::string& caffeModelPath, const bool enableGoogleLogging)
     {
@@ -147,8 +148,7 @@ namespace op
             // UNUSED(enableGoogleLogging);
             // Initializing them on the thread
             net.back()->initializationOnThread();
-            caffeNetOutputBlob.emplace_back(((NetCaffe*)net.back().get())->getOutputBlob());
-            // caffeNetOutputBlob.emplace_back(((NetOpenCv*)net.back().get())->getOutputBlob());
+            caffeNetOutputBlob.emplace_back((net.back().get())->getOutputBlobArray());
             // Sanity check
             if (net.size() != caffeNetOutputBlob.size())
Notify us.", __LINE__, __FUNCTION__, __FILE__); @@ -221,10 +221,10 @@ namespace op cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif // Initialize blobs - upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; - upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spHeatMapsBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spPeaksBlob = {std::make_shared>(1,1,1,1)}; if (TOP_DOWN_REFINEMENT) - upImpl->spMaximumPeaksBlob = {std::make_shared>(1,1,1,1)}; + upImpl->spMaximumPeaksBlob = {std::make_shared>(1,1,1,1)}; #ifdef USE_CUDA cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif @@ -295,7 +295,7 @@ namespace op } // 2. Resize heat maps + merge different scales // ~5ms (GPU) / ~20ms (CPU) - const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs); + const auto caffeNetOutputBlobs = arraySharedToPtr(upImpl->spCaffeNetOutputBlobs); const std::vector floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios); upImpl->spResizeAndMergeCaffe->Forward(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); @@ -417,7 +417,8 @@ namespace op // Re-Process image // 1. Caffe deep network upImpl->spNets.at(0)->forwardPass(inputNetDataRoi); - std::vector>> caffeNetOutputBlob{upImpl->spCaffeNetOutputBlobs[0]}; + std::vector>> caffeNetOutputBlob{ + upImpl->spCaffeNetOutputBlobs[0]}; // Reshape blobs if (!vectorsAreEqual(upImpl->mNetInput4DSizes.at(0), inputNetDataRoi.getSize())) { @@ -431,8 +432,7 @@ namespace op upImpl->mGpuId); } // 2. Resize heat maps + merge different scales - // const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs); - const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob); + const auto caffeNetOutputBlobs = arraySharedToPtr(caffeNetOutputBlob); // const std::vector floatScaleRatios( // scaleInputToNetInputs.begin(), scaleInputToNetInputs.end()); const std::vector floatScaleRatios{(float)scaleInputToNetInputs[0]}; -- GitLab