提交 5160a127 编写于 作者: G gineshidalgo99

CPU_ONLY by ONLY_CUDA & flags not required at deployment

上级 db3eaffe
### Posting rules
1. **Add** the **system configuration (all of it!), command and output** if you have some kind of error or performance question.
2. **No duplicated** posts.
3. **No** posts about **questions already answered / clearly explained in** the **documentation** (e.g. **no more low-speed nor out-of-memory questions**).
4. Set a **proper issue title**: add the Ubuntu/Windows word and be specific (e.g. do not simply call it: `Compile error`).
5. **No** questions about **training**. OpenPose only implements testing.
6. Only English comments.
Issues/comments that do not follow this will be **ignored or removed** with no further clarification.
1. **No** questions about **training**. OpenPose only implements testing.
2. **No** questions about **Caffe installation errors/issues**. Check [Caffe](http://caffe.berkeleyvision.org) documentation and help for those errors.
3. **Fill** the **Your System Configuration section (all of it!)** if you have some kind of error or performance question.
4. **No duplicated** posts.
5. **No** posts about **questions already answered / clearly explained in** the **documentation** (e.g. **no more low-speed nor out-of-memory questions**).
6. Set a **proper issue title**: add the Ubuntu/Windows word and be specific (e.g. do not simply call it: `Compile error`).
7. Only English comments.
Issues/comments which do not follow these rules will be **ignored or removed** with no further clarification.
### Issue summary
### Issue Summary
### Executed command (if any)
### Executed Command (if any)
Note: add `--logging_level 0` to get higher debug information.
### OpenPose output (if any)
### OpenPose Output (if any)
### Type of issue
### Type of Issue
You might select multiple topics, delete the rest:
- Compilation/installation error
- Execution error
......@@ -33,13 +34,12 @@ You might select multiple topics, delete the rest:
### Your system configuration
**Installation mode**: CMake or sh script or manual Makefile installation.
### Your System Configuration
**Operating system** (`lsb_release -a` in Ubuntu):
**Installation mode**: CMake, sh script, or manual Makefile installation (Ubuntu); VS2015, VS2017, CMake, ... (Windows)
**CUDA version** (`cat /usr/local/cuda/version.txt` in most cases):
**cuDNN version**:
**GPU model** (`nvidia-smi` in Ubuntu):
**Caffe version**: Default from OpenPose or custom version.
**OpenCV version**: installed with `apt-get install libopencv-dev` (Ubuntu) or default from OpenPose (Windows) or OpenCV 2.X or OpenCV 3.X. Specify the **full version** (e.g. 3.1 or 2.4.9)
Generation mode (only for Ubuntu): Makefile + Makefile.config (default, Ubuntu) or CMake (Ubuntu, Windows) or Visual Studio (Windows).
**OpenCV version**: pre-compiled `apt-get install libopencv-dev` (only Ubuntu); OpenPose default (only Windows); compiled from source: 2.4.9, 2.4.12, 3.1, 3.2, ...
Compiler (`gcc --version` in Ubuntu):
......@@ -28,9 +28,6 @@ set(BUILD_SHARED_LIBS ON)
# Turn on C++11
add_definitions(-std=c++11)
# OpenPose flags
add_definitions(-DUSE_CAFFE)
# C++ additional flags
set(OP_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Wpedantic -Wall -Wextra -Wfatal-errors")
......@@ -46,12 +43,21 @@ include(CMakeDependentOption)
if (${DL_FRAMEWORK} MATCHES "CAFFE")
CMAKE_DEPENDENT_OPTION(BUILD_CAFFE "Build Caffe as part of OpenPose." ON
"DL_FRAMEWORK" ON)
# OpenPose flags
add_definitions(-DUSE_CAFFE)
endif (${DL_FRAMEWORK} MATCHES "CAFFE")
# Set the acceleration library
set(GPU_MODE CUDA CACHE STRING "Select the acceleration GPU library or CPU otherwise.")
set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA)
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA OPENCL CPU_ONLY)
if (${GPU_MODE} MATCHES "CUDA")
# OpenPose flags
add_definitions(-DUSE_CUDA)
elseif (${GPU_MODE} MATCHES "CPU_ONLY")
# OpenPose flag for Caffe
add_definitions(-DCPU_ONLY)
endif ()
# Suboptions for GPU architectures
if (${GPU_MODE} MATCHES "CUDA")
......
......@@ -139,7 +139,7 @@ endif
CUDA_LIB_DIR += $(CUDA_DIR)/lib
INCLUDE_DIRS += $(BUILD_INCLUDE_DIR) ./src ./include
ifneq ($(CPU_ONLY), 1)
ifeq ($(USE_CUDA), 1)
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR)
LIBRARIES += cudart cublas curand
......@@ -251,7 +251,7 @@ endif
# libstdc++ for NVCC compatibility on OS X >= 10.9 with CUDA < 7.0
ifeq ($(OSX), 1)
CXX := /usr/bin/clang++
ifneq ($(CPU_ONLY), 1)
ifeq ($(USE_CUDA), 1)
CUDA_VERSION := $(shell $(CUDA_DIR)/bin/nvcc -V | grep -o 'release [0-9.]*' | tr -d '[a-z ]')
ifeq ($(shell echo | awk '{exit $(CUDA_VERSION) < 7.0;}'), 1)
CXXFLAGS += -stdlib=libstdc++
......@@ -300,12 +300,6 @@ else
COMMON_FLAGS += -DNDEBUG -O3
endif
# cuDNN acceleration configuration.
ifeq ($(USE_CUDNN), 1)
LIBRARIES += cudnn
COMMON_FLAGS += -DUSE_CUDNN
endif
# configure IO libraries
ifeq ($(USE_OPENCV), 1)
COMMON_FLAGS += -DUSE_OPENCV
......@@ -321,52 +315,11 @@ endif
endif
# CPU-only configuration
ifeq ($(CPU_ONLY), 1)
OBJS := $(CXX_OBJS)
ALL_WARNS := $(ALL_CXX_WARNS)
COMMON_FLAGS += -DCPU_ONLY
endif
# BLAS configuration (default = ATLAS)
BLAS ?= atlas
ifeq ($(BLAS), mkl)
# MKL
LIBRARIES += mkl_rt
COMMON_FLAGS += -DUSE_MKL
MKLROOT ?= /opt/intel/mkl
BLAS_INCLUDE ?= $(MKLROOT)/include
BLAS_LIB ?= $(MKLROOT)/lib $(MKLROOT)/lib/intel64
else ifeq ($(BLAS), open)
# OpenBLAS
LIBRARIES += openblas
ifeq ($(USE_CUDA), 1)
COMMON_FLAGS += -DUSE_CUDA
else
# ATLAS
ifeq ($(LINUX), 1)
ifeq ($(BLAS), atlas)
# Linux simply has cblas and atlas
LIBRARIES += cblas atlas
endif
else ifeq ($(OSX), 1)
# OS X packages atlas as the vecLib framework
LIBRARIES += cblas
# 10.10 has accelerate while 10.9 has veclib
XCODE_CLT_VER := $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables | grep 'version' | sed 's/[^0-9]*\([0-9]\).*/\1/')
XCODE_CLT_GEQ_7 := $(shell [ $(XCODE_CLT_VER) -gt 6 ] && echo 1)
XCODE_CLT_GEQ_6 := $(shell [ $(XCODE_CLT_VER) -gt 5 ] && echo 1)
ifeq ($(XCODE_CLT_GEQ_7), 1)
BLAS_INCLUDE ?= /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/$(shell ls /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/ | sort | tail -1)/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers
else ifeq ($(XCODE_CLT_GEQ_6), 1)
BLAS_INCLUDE ?= /System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
LDFLAGS += -framework Accelerate
else
BLAS_INCLUDE ?= /System/Library/Frameworks/vecLib.framework/Versions/Current/Headers/
LDFLAGS += -framework vecLib
endif
endif
COMMON_FLAGS += -DCPU_ONLY # For Caffe
endif
#'
INCLUDE_DIRS += $(BLAS_INCLUDE)
LIBRARY_DIRS += $(BLAS_LIB)
LIBRARY_DIRS += $(LIB_BUILD_DIR)
......
......@@ -107,13 +107,13 @@ Note: you should not need to modify the OpenPose source code nor examples. In th
### OpenPose Library
Your case if you want to change internal functions and/or extend its functionality. First, take a look at the [Demo](#demo) and [OpenPose Wrapper](#openpose-wrapper). Second, read the 2 following subsections: OpenPose Overview and Extending Functionality.
Your case if you want to change internal functions and/or extend its functionality.
1. OpenPose Overview: Learn the basics about the library source code in [doc/library_overview.md](doc/library_overview.md).
2. Extending Functionality: Learn how to extend the library in [doc/library_extend_functionality.md](doc/library_extend_functionality.md).
3. Adding An Extra Module: Learn how to add an extra module in [doc/library_add_new_module.md](doc/library_add_new_module.md).
1. Take a look at the [Demo](#demo) and [OpenPose Wrapper](#openpose-wrapper).
2. OpenPose Overview: Learn the basics about the library source code in [doc/library_overview.md](doc/library_overview.md).
3. Extending Functionality: Learn how to extend the library in [doc/library_extend_functionality.md](doc/library_extend_functionality.md).
4. Adding An Extra Module: Learn how to add an extra module in [doc/library_add_new_module.md](doc/library_add_new_module.md).
5. See the Doxygen documentation on [http://cmu-perceptual-computing-lab.github.io/openpose/html/index.html](http://cmu-perceptual-computing-lab.github.io/openpose/html/index.html) or build it from the source code.
......
......@@ -18,7 +18,7 @@ OpenPose - Installation and FAQ
## Operating Systems
- **Ubuntu** 14 and 16.
- **Windows** 8 and 10.
- **Nvidia Jetson TX2**, installation instructions in [doc/installation_jetson_tx2](./installation_jetson_tx2).
- **Nvidia Jetson TX2**, installation instructions in [doc/installation_jetson_tx2.md](./installation_jetson_tx2.md).
- OpenPose has also been used on **Windows 7**, **Mac**, **CentOS**, and **Nvidia Jetson (TK1 and TX1)** embedded systems. However, we do not officially support them at the moment.
......@@ -133,7 +133,7 @@ You just need to remove the OpenPose folder, by default called `openpose/`. E.g.
### Installation - Library
1. Install the pre-requisites:
1. Microsoft Visual Studio (VS) 2015 Enterprise Update 3. VS Enterprise Update 1 and VS 2017 will give some compiler errors, while VS 2015 Community has not been tested.
1. Microsoft Visual Studio (VS) 2015 Enterprise Update 3. If Visual Studio 2017 Community is desired, we do not support it, but it might be compiled by firstly [enabling CUDA 8.0](https://stackoverflow.com/questions/43745099/using-cuda-with-visual-studio-2017?answertab=active#tab-top). VS Enterprise Update 1 will give some compiler errors and VS 2015 Community has not been tested.
2. [CUDA 8](https://developer.nvidia.com/cuda-downloads): Install it on the default location, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`. Otherwise, modify the Visual Studio project solution accordingly. Install CUDA 8.0 after Visual Studio 2015 is installed to assure that the CUDA installation will generate all necessary files for VS. If CUDA was already installed, re-install it after installing VS!
3. [cuDNN 5.1](https://developer.nvidia.com/cudnn): Once you have downloaded it, just unzip it and copy (merge) the contents on the CUDA folder, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`.
2. Download the OpenPose dependencies and models (body, face and hand models) by double-clicking on `{openpose_path}\windows\download_3rdparty_and_models.bat`. Alternatively, you might prefer to download them manually:
......@@ -214,8 +214,8 @@ We only modified some Caffe compilation flags and minor details. You can use you
## Compiling without cuDNN
The [cuDNN](https://developer.nvidia.com/cudnn) library is not mandatory, but required for full keypoint detection accuracy. In case your graphics card is not compatible with cuDNN, you can disable it by:
- Ubuntu: Modifying the `Makefile.config` files in both the OpenPose and `3rdparty/caffe` folders, disabling `USE_CUDNN`.
- Windows: Compiling Caffe on your own without cuDNN support and removing the `USE_CUDN` define from the OpenPose project solution in Visual Studio.
- Ubuntu: Disable `USE_CUDNN` in the `Makefile.config` file in `3rdparty/caffe`, and recompiling Caffe.
- Windows: Compiling Caffe on your own without cuDNN support and replacing the [3rdparty/windows/caffe](../3rdparty/windows/caffe) folder by your own implementation.
Then, you would have to reduce the `--net_resolution` flag to fit the model into the GPU memory. You can try values like "640x320", "320x240", "320x160", or "160x80" to see your GPU memory capabilities. After finding the maximum approximate resolution that your GPU can handle without throwing an out-of-memory error, adjust the `net_resolution` ratio to your image or video to be processed (see the `--net_resolution` explanation from [doc/demo_overview.md](./demo_overview.md)).
......
......@@ -60,7 +60,7 @@ The program uses 3 cameras by default, but cameras can be added or removed from
- (Optional) Spinnaker SDK overview: https://www.ptgrey.com/spinnaker-sdk
6. Get the last OpenGL Glut library version for the rendering:
- Download the latest `MSVC Package` from http://www.transmissionzero.co.uk/software/freeglut-devel/
- Copy `{freeglutParentDirectory}\freeglut\bin\x64\` as `{OpenPoseDirectory}\3rdparty\windows\freeglut\bin\bin\`.
- Copy `{freeglutParentDirectory}\freeglut\bin\x64\` as `{OpenPoseDirectory}\3rdparty\windows\freeglut\bin\`.
- Copy `{freeglutParentDirectory}\freeglut\include\` as `{OpenPoseDirectory}\3rdparty\windows\freeglut\include\`.
- Copy `{freeglutParentDirectory}\freeglut\lib\x64\` as `{OpenPoseDirectory}\3rdparty\windows\freeglut\lib\`.
......@@ -78,7 +78,6 @@ We did not create an Ubuntu version. We did a very first version for Ubuntu 16
8. Get the required files from `{OpenPose path}/examples_beta/openpose3d/`. Check the Windows VS solution for more details.
9. Create a proper Makefile or CMake file to run it. The following code is part of an old QMake (Qt) file generated for the old version, you can ideally get all the flags and includes from it:
```
DEFINES += USE_CAFFE USE_CUDNN
INCLUDEPATH += \
$$PWD/include \
$$PWD/3rdparty/caffe/include \
......
......@@ -122,14 +122,18 @@ OpenPose Library - Release Notes
3. CvMatToOutput and Renderers allow to keep input resolution as output for images (core module).
3. New standalone face keypoint detector based on OpenCV face detector: much faster if body keypoint detection is not required but much less accurate.
4. Face and hand keypoint detectors now can return each keypoint heatmap.
5. COCO JSON file outputs 0 as score for non-detected keypoints.
6. Added example for OpenPose for user asynchronous output and cleaned all `tutorial_wrapper/` examples.
7. Added `-1` option for `net_resolution` in order to auto-select the best possible aspect ratio given the user input.
5. The flag `USE_CUDNN` is no longer required; `USE_CAFFE` and `USE_CUDA` (replacing the old `CPU_ONLY`) are no longer required to use the library, only to build it. In addition, Caffe and its dependencies have been removed from the OpenPose header files. Only OpenCV include and lib folders are required when building a project using OpenPose.
6. OpenPose successfully compiles if the flags `USE_CAFFE` and/or `USE_CUDA` are not enabled, although it will give an error saying they are required.
7. COCO JSON file outputs 0 as score for non-detected keypoints.
8. Added example for OpenPose for user asynchronous output and cleaned all `tutorial_wrapper/` examples.
9. Added `-1` option for `net_resolution` in order to auto-select the best possible aspect ratio given the user input.
2. Functions or parameters renamed:
1. OpenPose able to change its size and initial size:
1. Flag `resolution` renamed as `output_resolution`.
2. FrameDisplayer, GuiInfoAdder and Gui constructors arguments modified (gui module).
3. OpOutputToCvMat constructor removed (core module).
4. New Renders classes to split GpuRenderers from CpuRenderers.
5. Etc.
2. `CPU_ONLY` changed by `USE_CUDA` to keep format.
3. Main bugs fixed:
1. Ubuntu installer script now works even if Python pip was not installed previously.
......@@ -173,12 +173,13 @@ namespace op
wDatumProducer = std::make_shared<WDatumProducer<TDatumsPtr, TDatums>>(datumProducer);
// Input cvMat to OpenPose format
const auto cvMatToOpInput = std::make_shared<CvMatToOpInput>(
wrapperStructPose.netInputSize, wrapperStructPose.scalesNumber, wrapperStructPose.scaleGap
);
const auto cvMatToOpInput = std::make_shared<CvMatToOpInput>();
spWCvMatToOpInput = std::make_shared<WCvMatToOpInput<TDatumsPtr>>(cvMatToOpInput);
const auto cvMatToOpOutput = std::make_shared<CvMatToOpOutput>(finalOutputSize, displayGui);
spWCvMatToOpOutput = std::make_shared<WCvMatToOpOutput<TDatumsPtr>>(cvMatToOpOutput);
if (displayGui)
{
const auto cvMatToOpOutput = std::make_shared<CvMatToOpOutput>();
spWCvMatToOpOutput = std::make_shared<WCvMatToOpOutput<TDatumsPtr>>(cvMatToOpOutput);
}
// Hand extractor(s)
if (wrapperStructHand.enable)
......@@ -196,7 +197,7 @@ namespace op
spWPoses.at(gpuId) = {std::make_shared<WHandDetectorFromTxt<TDatumsPtr>>(handDetector)};
// Hand keypoint extractor
const auto netOutputSize = wrapperStructHand.netInputSize;
const auto handExtractor = std::make_shared<HandExtractor>(
const auto handExtractor = std::make_shared<HandExtractorCaffe>(
wrapperStructHand.netInputSize, netOutputSize, wrapperStructPose.modelFolder,
gpuId + gpuNumberStart, wrapperStructHand.scalesNumber, wrapperStructHand.scaleRange
);
......@@ -304,7 +305,7 @@ namespace op
try
{
// Security checks
if (spWCvMatToOpInput == nullptr || spWCvMatToOpOutput == nullptr)
if (spWCvMatToOpInput == nullptr)
error("Configure the WrapperHandFromJsonTest class before calling `start()`.", __LINE__, __FUNCTION__, __FILE__);
if (wDatumProducer == nullptr)
{
......@@ -325,9 +326,14 @@ namespace op
// If custom user Worker in same thread or producer on same thread
spWIdGenerator = std::make_shared<WIdGenerator<std::shared_ptr<TDatums>>>();
// OpenPose producer
mThreadManager.add(threadId++, {wDatumProducer, spWIdGenerator, spWCvMatToOpInput, spWCvMatToOpOutput}, queueIn++, queueOut++); // Thread 0 or 1, queues 0 -> 1
// Thread 0 or 1, queues 0 -> 1
if (spWCvMatToOpOutput == nullptr)
mThreadManager.add(threadId++, {wDatumProducer, spWIdGenerator, spWCvMatToOpInput}, queueIn++, queueOut++);
else
mThreadManager.add(threadId++, {wDatumProducer, spWIdGenerator, spWCvMatToOpInput, spWCvMatToOpOutput}, queueIn++, queueOut++);
// Pose estimation & rendering
if (!spWPoses.empty()) // Thread 1 or 2...X, queues 1 -> 2, X = 2 + #GPUs
// Thread 1 or 2...X, queues 1 -> 2, X = 2 + #GPUs
if (!spWPoses.empty())
{
for (auto& wPose : spWPoses)
mThreadManager.add(threadId++, wPose, queueIn, queueOut);
......@@ -336,10 +342,12 @@ namespace op
}
// If custom user Worker in same thread or producer on same thread
// Post processing workers + User post processing workers + Output workers
mThreadManager.add(threadId++, mergeWorkers(mPostProcessingWs, mOutputWs), queueIn++, queueOut++); // Thread 2 or 3, queues 2 -> 3
// Thread 2 or 3, queues 2 -> 3
mThreadManager.add(threadId++, mergeWorkers(mPostProcessingWs, mOutputWs), queueIn++, queueOut++);
// OpenPose GUI
// Thread Y+1, queues Q+1 -> Q+2
if (spWGui != nullptr)
mThreadManager.add(threadId++, spWGui, queueIn++, queueOut++); // Thread Y+1, queues Q+1 -> Q+2
mThreadManager.add(threadId++, spWGui, queueIn++, queueOut++);
log("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
......
......@@ -62,7 +62,8 @@ int openPoseTutorialPose1()
// Step 1 - Set logging level
// - 0 will output all the logging messages
// - 255 will output nothing
op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.", __LINE__, __FUNCTION__, __FILE__);
op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.",
__LINE__, __FUNCTION__, __FILE__);
op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level);
op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Read Google flags (user defined configuration)
......@@ -78,15 +79,18 @@ int openPoseTutorialPose1()
if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.)
op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.",
__LINE__, __FUNCTION__, __FILE__);
// Logging
op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Step 3 - Initialize all required classes
op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap};
op::CvMatToOpOutput cvMatToOpOutput{outputSize};
op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
op::CvMatToOpInput cvMatToOpInput;
op::CvMatToOpOutput cvMatToOpOutput;
op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel,
FLAGS_model_folder, FLAGS_num_gpu_start};
op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending, (float)FLAGS_alpha_pose};
op::PoseCpuRenderer poseRenderer{poseModel, (float)FLAGS_render_threshold, !FLAGS_disable_blending,
(float)FLAGS_alpha_pose};
op::OpOutputToCvMat opOutputToCvMat;
op::FrameDisplayer frameDisplayer{"OpenPose Tutorial - Example 1", outputSize};
// Step 4 - Initialize resources on desired thread (in this case single thread, i.e. we init resources here)
......@@ -95,22 +99,27 @@ int openPoseTutorialPose1()
// ------------------------- POSE ESTIMATION AND RENDERING -------------------------
// Step 1 - Read and load image, error if empty (possibly wrong path)
cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
// Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
if(inputImage.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Format input image to OpenPose input and output formats
op::Array<float> netInputArray;
std::vector<float> scaleRatios;
std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
// Step 2 - Get desired scale sizes
std::vector<double> scaleInputToNetInputs;
std::vector<op::Point<int>> netInputSizes;
double scaleInputToOutput;
op::Array<float> outputArray;
std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
// Step 3 - Estimate poseKeypoints
poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
op::Point<int> outputResolution;
std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
= scaleAndSizeExtractor.extract(imageSize);
// Step 3 - Format input image to OpenPose input and output formats
const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
// Step 4 - Estimate poseKeypoints
poseExtractorCaffe.forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints();
// Step 4 - Render poseKeypoints
// Step 5 - Render poseKeypoints
poseRenderer.renderPose(outputArray, poseKeypoints);
// Step 5 - OpenPose output format to cv::Mat
// Step 6 - OpenPose output format to cv::Mat
auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
// ------------------------- SHOWING RESULT AND CLOSING -------------------------
......
......@@ -67,7 +67,8 @@ int openPoseTutorialPose2()
// Step 1 - Set logging level
// - 0 will output all the logging messages
// - 255 will output nothing
op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.", __LINE__, __FUNCTION__, __FILE__);
op::check(0 <= FLAGS_logging_level && FLAGS_logging_level <= 255, "Wrong logging_level value.",
__LINE__, __FUNCTION__, __FILE__);
op::ConfigureLog::setPriorityThreshold((op::Priority)FLAGS_logging_level);
op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Read Google flags (user defined configuration)
......@@ -83,16 +84,18 @@ int openPoseTutorialPose2()
if (FLAGS_alpha_pose < 0. || FLAGS_alpha_pose > 1.)
op::error("Alpha value for blending must be in the range [0,1].", __LINE__, __FUNCTION__, __FILE__);
if (FLAGS_scale_gap <= 0. && FLAGS_scale_number > 1)
op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.", __LINE__, __FUNCTION__, __FILE__);
op::error("Incompatible flag configuration: scale_gap must be greater than 0 or scale_number = 1.",
__LINE__, __FUNCTION__, __FILE__);
// Logging
op::log("", op::Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Step 3 - Initialize all required classes
op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_scale_number, (float)FLAGS_scale_gap};
op::CvMatToOpOutput cvMatToOpOutput{outputSize};
std::shared_ptr<op::PoseExtractor> poseExtractorPtr = std::make_shared<op::PoseExtractorCaffe>(netInputSize, netOutputSize, outputSize,
FLAGS_scale_number, poseModel,
FLAGS_model_folder, FLAGS_num_gpu_start);
op::PoseGpuRenderer poseGpuRenderer{netOutputSize, poseModel, poseExtractorPtr, (float)FLAGS_render_threshold,
op::ScaleAndSizeExtractor scaleAndSizeExtractor(netInputSize, outputSize, FLAGS_scale_number, FLAGS_scale_gap);
op::CvMatToOpInput cvMatToOpInput;
op::CvMatToOpOutput cvMatToOpOutput;
auto poseExtractorPtr = std::make_shared<op::PoseExtractorCaffe>(
netInputSize, netOutputSize, outputSize, FLAGS_scale_number, poseModel, FLAGS_model_folder, FLAGS_num_gpu_start
);
op::PoseGpuRenderer poseGpuRenderer{poseModel, poseExtractorPtr, (float)FLAGS_render_threshold,
!FLAGS_disable_blending, (float)FLAGS_alpha_pose, (float)FLAGS_alpha_heatmap};
poseGpuRenderer.setElementToRender(FLAGS_part_to_show);
op::OpOutputToCvMat opOutputToCvMat;
......@@ -103,23 +106,28 @@ int openPoseTutorialPose2()
// ------------------------- POSE ESTIMATION AND RENDERING -------------------------
// Step 1 - Read and load image, error if empty (possibly wrong path)
cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR); // Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
// Alternative: cv::imread(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
cv::Mat inputImage = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_COLOR);
if(inputImage.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Format input image to OpenPose input and output formats
op::Array<float> netInputArray;
std::vector<float> scaleRatios;
std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
const op::Point<int> imageSize{inputImage.cols, inputImage.rows};
// Step 2 - Get desired scale sizes
std::vector<double> scaleInputToNetInputs;
std::vector<op::Point<int>> netInputSizes;
double scaleInputToOutput;
op::Array<float> outputArray;
std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
// Step 3 - Estimate poseKeypoints
poseExtractorPtr->forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
op::Point<int> outputResolution;
std::tie(scaleInputToNetInputs, netInputSizes, scaleInputToOutput, outputResolution)
= scaleAndSizeExtractor.extract(imageSize);
// Step 3 - Format input image to OpenPose input and output formats
const auto netInputArray = cvMatToOpInput.createArray(inputImage, scaleInputToNetInputs, netInputSizes);
auto outputArray = cvMatToOpOutput.createArray(inputImage, scaleInputToOutput, outputResolution);
// Step 4 - Estimate poseKeypoints
poseExtractorPtr->forwardPass(netInputArray, imageSize, scaleInputToNetInputs);
const auto poseKeypoints = poseExtractorPtr->getPoseKeypoints();
const auto scaleNetToOutput = poseExtractorPtr->getScaleNetToOutput();
// Step 4 - Render pose
// Step 5 - Render pose
poseGpuRenderer.renderPose(outputArray, poseKeypoints, scaleNetToOutput);
// Step 5 - OpenPose output format to cv::Mat
// Step 6 - OpenPose output format to cv::Mat
auto outputImage = opOutputToCvMat.formatToCvMat(outputArray);
// ------------------------- SHOWING RESULT AND CLOSING -------------------------
......
......@@ -192,7 +192,7 @@ public:
{
cv::imshow("User worker GUI", datumsPtr->at(0).cvOutputData);
// Display image and sleeps at least 1 ms (it usually sleeps ~5-10 msec to display the image)
key = cv::waitKey(1);
key = (char)cv::waitKey(1);
}
else
op::log("Nullptr or empty datumsPtr found.", op::Priority::High, __LINE__, __FUNCTION__, __FILE__);
......
......@@ -318,7 +318,7 @@ public:
// Display rendered output image
cv::imshow("User worker GUI", datumsPtr->at(0).cvOutputData);
// Display image and sleeps at least 1 ms (it usually sleeps ~5-10 msec to display the image)
const char key = cv::waitKey(1);
const char key = (char)cv::waitKey(1);
if (key == 27)
this->stop();
}
......
......@@ -232,7 +232,7 @@ public:
{
cv::imshow("User worker GUI", datumsPtr->at(0).cvOutputData);
// Display image and sleeps at least 1 ms (it usually sleeps ~5-10 msec to display the image)
key = cv::waitKey(1);
key = (char)cv::waitKey(1);
}
else
op::log("Nullptr or empty datumsPtr found.", op::Priority::High, __LINE__, __FUNCTION__, __FILE__);
......
......@@ -3,7 +3,7 @@
// Std library most used classes
#include <array>
#include <memory> // std::shared_ptr
#include <memory> // std::shared_ptr, std::unique_ptr
#include <string>
#include <vector>
// OpenPose most used classes
......
#ifndef OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
#define OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
#include <utility> // std::pair
#include <opencv2/core/core.hpp> // cv::Mat
#include <openpose/core/common.hpp>
......@@ -10,14 +9,8 @@ namespace op
class OP_API CvMatToOpInput
{
public:
CvMatToOpInput(const Point<int>& netInputResolution, const int scaleNumber = 1, const float scaleGap = 0.25);
std::pair<Array<float>, std::vector<float>> format(const cv::Mat& cvInputData) const;
private:
const int mScaleNumber;
const float mScaleGap;
const std::vector<int> mInputNetSize4D;
Array<float> createArray(const cv::Mat& cvInputData, const std::vector<double>& scaleInputToNetInputs,
const std::vector<Point<int>>& netInputSizes) const;
};
}
......
......@@ -9,14 +9,8 @@ namespace op
class OP_API CvMatToOpOutput
{
public:
// Use outputResolution <= {0,0} to keep input resolution
CvMatToOpOutput(const Point<int>& outputResolution = Point<int>{0, 0}, const bool generateOutput = true);
std::tuple<double, Array<float>> format(const cv::Mat& cvInputData) const;
private:
const bool mGenerateOutput;
const std::vector<int> mOutputSize3D;
Array<float> createArray(const cv::Mat& cvInputData, const double scaleInputToOutput,
const Point<int>& outputResolution) const;
};
}
......
......@@ -8,17 +8,21 @@ namespace op
{
/**
* Datum: The OpenPose Basic Piece of Information Between Threads
* Datum is one of the main OpenPose classes/structs. The workers and threads share by default a std::shared_ptr<std::vector<Datum>>. It contains
* all the parameters that the different workers and threads need to exchange.
* Datum is one of the main OpenPose classes/structs. The workers and threads share by default a
* std::shared_ptr<std::vector<Datum>>. It contains all the parameters that the different workers and threads need
* to exchange.
*/
struct OP_API Datum
{
// -------------------------------------------------- ID parameters -------------------------------------------------- //
// ---------------------------------------- ID parameters ---------------------------------------- //
unsigned long long id; /**< Datum ID. Internally used to sort the Datums if multi-threading is used. */
std::string name; /**< Name used when saving the data to disk (e.g. `write_images` or `write_keypoint` flags in the demo). */
/**
* Name used when saving the data to disk (e.g. `write_images` or `write_keypoint` flags in the demo).
*/
std::string name;
// -------------------------------------------------- Input image and rendered version parameters -------------------------------------------------- //
// ------------------------------ Input image and rendered version parameters ------------------------------ //
/**
* Original image to be processed in cv::Mat uchar format.
* Size: (input_width x input_height) x 3 channels
......@@ -27,8 +31,10 @@ namespace op
/**
* Original image to be processed in Array<float> format.
* It has been resized to the net input resolution, as well as reformatted Array<float> format to be compatible with the net.
* In case of >1 scales, then each scale is right- and bottom-padded to fill the greatest resolution. The scales are sorted from bigger to smaller.
* It has been resized to the net input resolution, as well as reformatted Array<float> format to be compatible
* with the net.
* In case of >1 scales, then each scale is right- and bottom-padded to fill the greatest resolution. The
* scales are sorted from bigger to smaller.
* Size: #scales x 3 x input_net_height x input_net_width
*/
Array<float> inputNetData;
......@@ -49,7 +55,7 @@ namespace op
*/
cv::Mat cvOutputData;
// -------------------------------------------------- Resulting Array<float> data parameters -------------------------------------------------- //
// ------------------------------ Resulting Array<float> data parameters ------------------------------ //
/**
* Body pose (x,y,score) locations for each person in the image.
* It has been resized to the desired output resolution (e.g. `resolution` flag in the demo).
......@@ -60,10 +66,14 @@ namespace op
/**
* Body pose heatmaps (body parts, background and/or PAFs) for the whole image.
* This parameters is by default empty and disabled for performance. Each group (body parts, background and PAFs) can be individually enabled.
* #heatmaps = #body parts (if enabled) + 1 (if background enabled) + 2 x #PAFs (if enabled). Each PAF has 2 consecutive channels, one for x- and one for y-coordinates.
* Order heatmaps: body parts + background (as appears in POSE_BODY_PART_MAPPING) + (x,y) channel of each PAF (sorted as appears in POSE_BODY_PART_PAIRS). See `pose/poseParameters.hpp`.
* The user can choose the heatmaps normalization: ranges [0, 1], [-1, 1] or [0, 255]. Check the `heatmaps_scale` flag in the examples/tutorial_wrapper/ for more details.
         * This parameter is by default empty and disabled for performance. Each group (body parts, background and
* PAFs) can be individually enabled.
* #heatmaps = #body parts (if enabled) + 1 (if background enabled) + 2 x #PAFs (if enabled). Each PAF has 2
* consecutive channels, one for x- and one for y-coordinates.
* Order heatmaps: body parts + background (as appears in POSE_BODY_PART_MAPPING) + (x,y) channel of each PAF
* (sorted as appears in POSE_BODY_PART_PAIRS). See `pose/poseParameters.hpp`.
* The user can choose the heatmaps normalization: ranges [0, 1], [-1, 1] or [0, 255]. Check the
* `heatmaps_scale` flag in the examples/tutorial_wrapper/ for more details.
* Size: #heatmaps x output_net_height x output_net_width
*/
Array<float> poseHeatMaps;
......@@ -111,29 +121,55 @@ namespace op
*/
std::array<Array<float>, 2> handHeatMaps;
// -------------------------------------------------- Other parameters -------------------------------------------------- //
float scaleInputToOutput; /**< Scale ratio between the input Datum::cvInputData and the output Datum::cvOutputData. */
// ---------------------------------------- Other parameters ---------------------------------------- //
/**
* Scale ratio between the input Datum::cvInputData and the net input size.
*/
std::vector<double> scaleInputToNetInputs;
/**
* Size(s) (width x height) of the image(s) fed to the pose deep net.
* The size of the std::vector corresponds to the number of scales.
*/
std::vector<Point<int>> netInputSizes;
/**
* Scale ratio between the input Datum::cvInputData and the output Datum::cvOutputData.
*/
double scaleInputToOutput;
float scaleNetToOutput; /**< Scale ratio between the net output and the final output Datum::cvOutputData. */
/**
* Size (width x height) of the image returned by the deep net.
*/
Point<int> netOutputSize;
std::vector<float> scaleRatios; /**< Scale ratios between each scale (e.g. flag `scale_number`). Used to resize the different scales. */
/**
* Scale ratio between the net output and the final output Datum::cvOutputData.
*/
double scaleNetToOutput;
std::pair<int, std::string> elementRendered; /**< Pair with the element key id POSE_BODY_PART_MAPPING on `pose/poseParameters.hpp` and its mapped value (e.g. 1 and "Neck"). */
/**
* Pair with the element key id POSE_BODY_PART_MAPPING on `pose/poseParameters.hpp` and its mapped value (e.g.
* 1 and "Neck").
*/
std::pair<int, std::string> elementRendered;
// -------------------------------------------------- Functions -------------------------------------------------- //
// ---------------------------------------- Functions ---------------------------------------- //
/**
* Default constructor struct.
* It simply initializes the struct, id is temporary set to 0 and each other variable is assigned to its default value.
         * It simply initializes the struct, id is temporarily set to 0 and each other variable is assigned to its
* default value.
*/
explicit Datum();
/**
* Copy constructor.
* It performs `fast copy`: For performance purpose, copying a Datum or Array<T> or cv::Mat just copies the reference, it still shares the same internal data.
* It performs `fast copy`: For performance purpose, copying a Datum or Array<T> or cv::Mat just copies the
* reference, it still shares the same internal data.
* Modifying the copied element will modify the original one.
* Use clone() for a slower but real copy, similarly to cv::Mat and Array<T>.
* @param datum Datum to be copied.
......@@ -172,7 +208,8 @@ namespace op
/**
* Clone function.
* Similar to cv::Mat::clone and Array<T>::clone.
* It performs a real but slow copy of the data, i.e., even if the copied element is modified, the original one is not.
* It performs a real but slow copy of the data, i.e., even if the copied element is modified, the original
* one is not.
* @return The resulting Datum.
*/
Datum clone() const;
......@@ -181,7 +218,7 @@ namespace op
// -------------------------------------------------- Comparison operators -------------------------------------------------- //
// ---------------------------------------- Comparison operators ---------------------------------------- //
/**
* Less comparison operator.
* @param datum Datum to be compared.
......
......@@ -21,9 +21,11 @@
#include <openpose/core/renderer.hpp>
#include <openpose/core/resizeAndMergeBase.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/core/scaleAndSizeExtractor.hpp>
#include <openpose/core/wCvMatToOpInput.hpp>
#include <openpose/core/wCvMatToOpOutput.hpp>
#include <openpose/core/wKeypointScaler.hpp>
#include <openpose/core/wOpOutputToCvMat.hpp>
#include <openpose/core/wScaleAndSizeExtractor.hpp>
#endif // OPENPOSE_CORE_HEADERS_HPP
......@@ -11,9 +11,9 @@ namespace op
public:
explicit KeypointScaler(const ScaleMode scaleMode);
void scale(Array<float>& arrayToScale, const float scaleInputToOutput, const float scaleNetToOutput, const Point<int>& producerSize) const;
void scale(Array<float>& arrayToScale, const double scaleInputToOutput, const double scaleNetToOutput, const Point<int>& producerSize) const;
void scale(std::vector<Array<float>>& arraysToScale, const float scaleInputToOutput, const float scaleNetToOutput, const Point<int>& producerSize) const;
void scale(std::vector<Array<float>>& arraysToScale, const double scaleInputToOutput, const double scaleNetToOutput, const Point<int>& producerSize) const;
private:
const ScaleMode mScaleMode;
......
......@@ -26,10 +26,9 @@
className(const className&) = delete; \
className& operator=(const className&) = delete
// Instantiate a class with all the basic types
#define COMPILE_TEMPLATE_BASIC_TYPES_CLASS(className) COMPILE_TEMPLATE_BASIC_TYPES(className, class)
#define COMPILE_TEMPLATE_BASIC_TYPES_STRUCT(className) COMPILE_TEMPLATE_BASIC_TYPES(className, struct)
#define COMPILE_TEMPLATE_BASIC_TYPES(className, classType) \
template classType OP_API className<char>; \
template classType OP_API className<signed char>; \
......@@ -46,6 +45,22 @@
template classType OP_API className<double>; \
template classType OP_API className<long double>
// Instantiate a class with float and double specifications
#define COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, class)
#define COMPILE_TEMPLATE_FLOATING_TYPES_STRUCT(className) COMPILE_TEMPLATE_FLOATING_TYPES(className, struct)
#define COMPILE_TEMPLATE_FLOATING_TYPES(className, classType) \
char gInstantiationGuard##className; \
template classType OP_API className<float>; \
template classType OP_API className<double>
// PIMPL does not work if function arguments need the 3rd-party class. Alternative:
// stackoverflow.com/questions/13978775/how-to-avoid-include-dependency-to-external-library?answertab=active#tab-top
struct dim3;
namespace caffe
{
template <typename T> class Blob;
}
// Includes at the end, since this macro file does not need them, but the files that include this
// one do. However, keeping the includes at the beginning might create a circular include problem.
#include <memory> // std::shared_ptr
......
......@@ -6,10 +6,12 @@
namespace op
{
template <typename T>
OP_API void maximumCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void maximumCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize);
template <typename T>
OP_API void maximumGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void maximumGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize);
}
#endif // OPENPOSE_CORE_MAXIMUM_BASE_HPP
#ifdef USE_CAFFE
#ifndef OPENPOSE_CORE_MAXIMUM_CAFFE_HPP
#define OPENPOSE_CORE_MAXIMUM_CAFFE_HPP
#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
namespace op
{
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the compatibility with any generic Caffe version,
// we keep this 'layer' inside our library rather than in the Caffe code.
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep
// the compatibility with any generic Caffe version, we keep this 'layer' inside our library rather than in the
// Caffe code.
template <typename T>
class OP_API MaximumCaffe
{
......@@ -25,9 +24,11 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
private:
std::array<int, 4> mBottomSize;
......@@ -36,4 +37,3 @@ namespace op
}
#endif // OPENPOSE_CORE_MAXIMUM_CAFFE_HPP
#endif
#ifdef USE_CAFFE
#ifndef OPENPOSE_CORE_NET_CAFFE_HPP
#define OPENPOSE_CORE_NET_CAFFE_HPP
#include <caffe/net.hpp>
#include <openpose/core/common.hpp>
#include <openpose/core/net.hpp>
......@@ -11,7 +9,8 @@ namespace op
class OP_API NetCaffe : public Net
{
public:
NetCaffe(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId = 0,
NetCaffe(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto,
const std::string& caffeTrainedModel, const int gpuId = 0,
const std::string& lastBlobName = "net_output");
virtual ~NetCaffe();
......@@ -29,20 +28,15 @@ namespace op
boost::shared_ptr<caffe::Blob<float>> getOutputBlob() const;
private:
// Init with constructor
const int mGpuId;
const std::array<int, 4> mNetInputSize4D;
const unsigned long mNetInputMemory;
const std::string mCaffeProto;
const std::string mCaffeTrainedModel;
const std::string mLastBlobName;
// Init with thread
std::unique_ptr<caffe::Net<float>> upCaffeNet;
boost::shared_ptr<caffe::Blob<float>> spOutputBlob;
// PIMPL idiom
// http://www.cppsamples.com/common-tasks/pimpl.html
struct ImplNetCaffe;
std::unique_ptr<ImplNetCaffe> upImpl;
// PIMP requires DELETE_COPY & destructor, or extra code
// http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html
DELETE_COPY(NetCaffe);
};
}
#endif // OPENPOSE_CORE_NET_CAFFE_HPP
#endif
#ifdef USE_CAFFE
#ifndef OPENPOSE_CORE_NMS_CAFFE_HPP
#define OPENPOSE_CORE_NMS_CAFFE_HPP
#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
namespace op
{
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the compatibility with any generic Caffe version,
// we keep this 'layer' inside our library rather than in the Caffe code.
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep
// the compatibility with any generic Caffe version, we keep this 'layer' inside our library rather than in the
// Caffe code.
template <typename T>
class OP_API NmsCaffe
{
public:
explicit NmsCaffe();
virtual ~NmsCaffe();
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const int maxPeaks);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks);
virtual inline const char* type() const { return "Nms"; }
......@@ -27,17 +29,24 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
private:
T mThreshold;
caffe::Blob<int> mKernelBlob;
std::array<int, 4> mBottomSize;
std::array<int, 4> mTopSize;
// PIMPL idiom
// http://www.cppsamples.com/common-tasks/pimpl.html
struct ImplNmsCaffe;
std::unique_ptr<ImplNmsCaffe> upImpl;
// PIMP requires DELETE_COPY & destructor, or extra code
// http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html
DELETE_COPY(NmsCaffe);
};
}
#endif // OPENPOSE_CORE_NMS_CAFFE_HPP
#endif
......@@ -7,11 +7,11 @@ namespace op
{
template <typename T>
OP_API void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const std::vector<T>& scaleRatios = {1});
const std::vector<T>& scaleInputToNetInputs = {1.f});
template <typename T>
OP_API void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const std::vector<T>& scaleRatios = {1});
const std::vector<T>& scaleInputToNetInputs = {1.f});
}
#endif // OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
#ifdef USE_CAFFE
#ifndef OPENPOSE_CORE_RESIZE_AND_MERGE_CAFFE_HPP
#define OPENPOSE_CORE_RESIZE_AND_MERGE_CAFFE_HPP
#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
// PIMPL does not work here. Alternative:
// stackoverflow.com/questions/13978775/how-to-avoid-include-dependency-to-external-library?answertab=active#tab-top
namespace caffe
{
template <typename T> class Blob;
}
namespace op
{
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the
// compatibility with any generic Caffe version,
// we keep this 'layer' inside our library rather than in the Caffe code.
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep
// the compatibility with any generic Caffe version, we keep this 'layer' inside our library rather than in the
// Caffe code.
template <typename T>
class OP_API ResizeAndMergeCaffe
{
......@@ -45,4 +50,3 @@ namespace op
}
#endif // OPENPOSE_CORE_RESIZE_AND_MERGE_CAFFE_HPP
#endif
#ifndef OPENPOSE_CORE_SCALE_AND_SIZE_EXTRACTOR_HPP
#define OPENPOSE_CORE_SCALE_AND_SIZE_EXTRACTOR_HPP

#include <tuple>
#include <openpose/core/common.hpp>

namespace op
{
    /**
     * Computes, for a given input image resolution, the per-scale resize ratios and net input sizes, as well as
     * the input-to-output scale ratio and the net output size. The resulting tuple fields map one-to-one onto the
     * Datum members scaleInputToNetInputs, netInputSizes, scaleInputToOutput and netOutputSize.
     */
    class OP_API ScaleAndSizeExtractor
    {
    public:
        /**
         * Constructor of the ScaleAndSizeExtractor class.
         * @param netInputResolution Resolution (width x height) at which images are fed to the deep net.
         * @param outputResolution Desired resolution of the final output.
         * @param scaleNumber Number of scales to process per image (1 by default).
         * @param scaleGap Gap between consecutive scale ratios (0.25 by default). Only meaningful when
         * scaleNumber > 1. NOTE(review): exact interpretation lives in the .cpp implementation — confirm there.
         */
        ScaleAndSizeExtractor(const Point<int>& netInputResolution, const Point<int>& outputResolution,
                              const int scaleNumber = 1, const double scaleGap = 0.25);

        /**
         * Computes the scales and sizes for an input image resolution.
         * @param inputResolution Size (width x height) of the input image.
         * @return std::tuple with, in order: the scale ratios input-to-net-input (one per scale), the net input
         * sizes (one per scale), the scale ratio input-to-output, and the net output size.
         */
        std::tuple<std::vector<double>, std::vector<Point<int>>, double, Point<int>> extract(
            const Point<int>& inputResolution) const;

    private:
        const Point<int> mNetInputResolution;
        const Point<int> mOutputSize;
        const int mScaleNumber;
        const double mScaleGap;
    };
}

#endif // OPENPOSE_CORE_SCALE_AND_SIZE_EXTRACTOR_HPP
......@@ -29,7 +29,6 @@ namespace op
// Implementation
#include <openpose/utilities/openCv.hpp>
#include <openpose/utilities/pointerContainer.hpp>
namespace op
{
......@@ -57,10 +56,12 @@ namespace op
const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
// cv::Mat -> float*
for (auto& tDatum : *tDatums)
std::tie(tDatum.inputNetData, tDatum.scaleRatios) = spCvMatToOpInput->format(tDatum.cvInputData);
tDatum.inputNetData = spCvMatToOpInput->createArray(tDatum.cvInputData,
tDatum.scaleInputToNetInputs,
tDatum.netInputSizes);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,10 +59,11 @@ namespace op
const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
// cv::Mat -> float*
for (auto& tDatum : tDatumsNoPtr)
std::tie(tDatum.scaleInputToOutput, tDatum.outputData) = spCvMatToOpOutput->format(tDatum.cvInputData);
tDatum.outputData = spCvMatToOpOutput->createArray(tDatum.cvInputData, tDatum.scaleInputToOutput,
tDatum.netOutputSize);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -56,11 +56,11 @@ namespace op
for (auto& tDatum : *tDatums)
{
std::vector<Array<float>> arraysToScale{tDatum.poseKeypoints, tDatum.handKeypoints[0], tDatum.handKeypoints[1], tDatum.faceKeypoints};
spKeypointScaler->scale(arraysToScale, (float)tDatum.scaleInputToOutput, (float)tDatum.scaleNetToOutput, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
spKeypointScaler->scale(arraysToScale, tDatum.scaleInputToOutput, tDatum.scaleNetToOutput, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
}
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.cvOutputData = spOpOutputToCvMat->formatToCvMat(tDatum.outputData);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
#ifndef OPENPOSE_CORE_W_SCALE_AND_SIZE_EXTRACTOR_HPP
#define OPENPOSE_CORE_W_SCALE_AND_SIZE_EXTRACTOR_HPP

#include <openpose/core/common.hpp>
#include <openpose/core/scaleAndSizeExtractor.hpp>
#include <openpose/thread/worker.hpp>

namespace op
{
    /**
     * Worker wrapper around ScaleAndSizeExtractor: for every Datum in the queue, it fills in
     * scaleInputToNetInputs, netInputSizes, scaleInputToOutput and netOutputSize from the resolution of
     * Datum::cvInputData.
     */
    template<typename TDatums>
    class WScaleAndSizeExtractor : public Worker<TDatums>
    {
    public:
        explicit WScaleAndSizeExtractor(const std::shared_ptr<ScaleAndSizeExtractor>& scaleAndSizeExtractor);

        void initializationOnThread();

        void work(TDatums& tDatums);

    private:
        const std::shared_ptr<ScaleAndSizeExtractor> spScaleAndSizeExtractor;

        DELETE_COPY(WScaleAndSizeExtractor);
    };
}

// Implementation
#include <openpose/utilities/pointerContainer.hpp>
namespace op
{
    template<typename TDatums>
    WScaleAndSizeExtractor<TDatums>::WScaleAndSizeExtractor(
        const std::shared_ptr<ScaleAndSizeExtractor>& scaleAndSizeExtractor) :
        spScaleAndSizeExtractor{scaleAndSizeExtractor}
    {
    }

    template<typename TDatums>
    void WScaleAndSizeExtractor<TDatums>::initializationOnThread()
    {
        // No per-thread state is required by this worker
    }

    template<typename TDatums>
    void WScaleAndSizeExtractor<TDatums>::work(TDatums& tDatums)
    {
        try
        {
            if (checkNoNullNorEmpty(tDatums))
            {
                // Debugging log
                dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
                // Profiling speed
                const auto timerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
                // Image resolution -> scale ratios and net input/output sizes
                auto& datums = *tDatums;
                for (auto& datum : datums)
                {
                    const Point<int> imageResolution{datum.cvInputData.cols, datum.cvInputData.rows};
                    std::tie(datum.scaleInputToNetInputs, datum.netInputSizes, datum.scaleInputToOutput,
                             datum.netOutputSize) = spScaleAndSizeExtractor->extract(imageResolution);
                }
                // Profiling speed
                Profiler::timerEnd(timerKey);
                Profiler::printAveragedTimeMsOnIterationX(timerKey, __LINE__, __FUNCTION__, __FILE__);
                // Debugging log
                dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
            }
        }
        catch (const std::exception& e)
        {
            this->stop();
            tDatums = nullptr;
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

    COMPILE_TEMPLATE_DATUM(WScaleAndSizeExtractor);
}

#endif // OPENPOSE_CORE_W_SCALE_AND_SIZE_EXTRACTOR_HPP
......@@ -11,7 +11,7 @@ namespace op
public:
explicit FaceDetector(const PoseModel poseModel);
std::vector<Rectangle<float>> detectFaces(const Array<float>& poseKeypoints, const float scaleInputToOutput) const;
std::vector<Rectangle<float>> detectFaces(const Array<float>& poseKeypoints, const double scaleInputToOutput) const;
private:
const unsigned int mNeck;
......
#ifndef OPENPOSE_FACE_FACE_EXTRACTOR_HPP
#define OPENPOSE_FACE_FACE_EXTRACTOR_HPP
#include <atomic>
#include <thread>
#include <opencv2/core/core.hpp> // cv::Mat
#include <openpose/core/common.hpp>
#include <openpose/core/maximumCaffe.hpp>
#include <openpose/core/net.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/core/enumClasses.hpp>
namespace op
{
/**
* Face keypoint extractor class.
*/
class OP_API FaceExtractor
{
public:
/**
* Constructor of the FaceExtractor class.
* @param netInputSize Size at which the cropped image (where the face is located) is resized.
         * @param netOutputSize Size of the final results. At the moment, it must be equal to netInputSize.
*/
explicit FaceExtractor(const Point<int>& netInputSize, const Point<int>& netOutputSize,
const std::string& modelFolder, const int gpuId,
const std::vector<HeatMapType>& heatMapTypes = {},
const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
void initializationOnThread();
/**
* Virtual destructor of the HandExtractor class.
* Required to allow inheritance.
*/
virtual ~FaceExtractor();
void forwardPass(const std::vector<Rectangle<float>>& faceRectangles, const cv::Mat& cvInputData,
const float scaleInputToOutput);
/**
* This function must be call before using any other function. It must also be called inside the thread in
* which the functions are going to be used.
*/
void initializationOnThread();
Array<float> getFaceKeypoints() const;
/**
* This function extracts the face keypoints for each detected face in the image.
* @param faceRectangles location of the faces in the image. It is a length-variable std::vector, where
* each index corresponds to a different person in the image. Internally, a op::Rectangle<float>
* (similar to cv::Rect for floating values) with the position of that face (or 0,0,0,0 if
* some face is missing, e.g. if a specific person has only half of the body inside the image).
* @param cvInputData Original image in cv::Mat format and BGR format.
* @param scaleInputToOutput Desired scale of the final keypoints. Set to 1 if the desired size is the
* cvInputData size.
*/
virtual void forwardPass(const std::vector<Rectangle<float>>& faceRectangles, const cv::Mat& cvInputData,
const double scaleInputToOutput) = 0;
Array<float> getHeatMaps() const;
private:
/**
* This function returns the face keypoins. VERY IMPORTANT: use getFaceKeypoints().clone() if the keypoints are
* going to be edited in a different thread.
* @return A Array with all the face keypoints. It follows the pose structure, i.e. the first dimension
* corresponds to all the people in the image, the second to each specific keypoint, and the third one to
* (x, y, score).
*/
Array<float> getFaceKeypoints() const;
protected:
const Point<int> mNetOutputSize;
std::shared_ptr<Net> spNet;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
Array<float> mFaceImageCrop;
Array<float> mFaceKeypoints;
// HeatMaps parameters
Array<float> mHeatMaps;
const ScaleMode mHeatMapScaleMode;
const std::vector<HeatMapType> mHeatMapTypes;
Array<float> mHeatMaps;
virtual void netInitializationOnThread() = 0;
private:
// Init with thread
boost::shared_ptr<caffe::Blob<float>> spCaffeNetOutputBlob;
std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
std::thread::id mThreadId;
void checkThread() const;
......
#ifndef OPENPOSE_FACE_FACE_EXTRACTOR_CAFFE_HPP
#define OPENPOSE_FACE_FACE_EXTRACTOR_CAFFE_HPP

#include <opencv2/core/core.hpp> // cv::Mat
#include <openpose/core/common.hpp>
#include <openpose/core/enumClasses.hpp>
#include <openpose/face/faceExtractor.hpp>

namespace op
{
    /**
     * Face keypoint extractor class for Caffe framework.
     */
    class OP_API FaceExtractorCaffe : public FaceExtractor
    {
    public:
        /**
         * Constructor of the FaceExtractorCaffe class.
         * @param netInputSize Size at which the cropped image (where the face is located) is resized.
         * @param netOutputSize Size of the final results. At the moment, it is expected to be equal to
         * netInputSize — confirm against the .cpp implementation.
         * @param modelFolder Folder where the face Caffe model files are located.
         * @param gpuId GPU device id on which the net runs.
         * @param heatMapTypes Heatmap groups to record (empty by default, i.e. disabled).
         * @param heatMapScale Normalization applied to the recorded heatmaps (ScaleMode::ZeroToOne by default).
         */
        explicit FaceExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize,
                                    const std::string& modelFolder, const int gpuId,
                                    const std::vector<HeatMapType>& heatMapTypes = {},
                                    const ScaleMode heatMapScale = ScaleMode::ZeroToOne);

        virtual ~FaceExtractorCaffe();

        /**
         * This function must be called before using any other function. It must also be called inside the thread
         * in which the functions are going to be used.
         */
        void netInitializationOnThread();

        /**
         * This function extracts the face keypoints for each detected face in the image.
         * @param faceRectangles location of the faces in the image. It is a length-variable std::vector, where
         * each index corresponds to a different person in the image. Internally, a op::Rectangle<float>
         * (similar to cv::Rect for floating values) with the position of that face (or 0,0,0,0 if
         * some face is missing, e.g. if a specific person has only half of the body inside the image).
         * @param cvInputData Original image in cv::Mat format and BGR format.
         * @param scaleInputToOutput Desired scale of the final keypoints. Set to 1 if the desired size is the
         * cvInputData size.
         */
        void forwardPass(const std::vector<Rectangle<float>>& faceRectangles, const cv::Mat& cvInputData,
                         const double scaleInputToOutput);

    private:
        // PIMPL idiom
        // http://www.cppsamples.com/common-tasks/pimpl.html
        struct ImplFaceExtractorCaffe;
        std::unique_ptr<ImplFaceExtractorCaffe> upImpl;

        // PIMPL requires DELETE_COPY & destructor, or extra code
        // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html
        DELETE_COPY(FaceExtractorCaffe);
    };
}

#endif // OPENPOSE_FACE_FACE_EXTRACTOR_CAFFE_HPP
......@@ -5,6 +5,7 @@
#include <openpose/face/faceDetector.hpp>
#include <openpose/face/faceDetectorOpenCV.hpp>
#include <openpose/face/faceExtractor.hpp>
#include <openpose/face/faceExtractorCaffe.hpp>
#include <openpose/face/faceParameters.hpp>
#include <openpose/face/faceCpuRenderer.hpp>
#include <openpose/face/faceGpuRenderer.hpp>
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.faceRectangles = spFaceDetector->detectFaces(tDatum.poseKeypoints, tDatum.scaleInputToOutput);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.faceRectangles = spFaceDetectorOpenCV->detectFaces(tDatum.cvInputData);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -64,7 +64,7 @@ namespace op
}
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -60,7 +60,7 @@ namespace op
spFaceRenderer->renderFace(tDatum.outputData, tDatum.faceKeypoints);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -63,7 +63,7 @@ namespace op
spCocoJsonSaver->record(tDatum.poseKeypoints, tDatum.name);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -65,7 +65,7 @@ namespace op
spKeypointSaver->saveKeypoints(keypointVector, fileName, "face");
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -70,7 +70,7 @@ namespace op
spKeypointSaver->saveKeypoints(keypointVector, fileName, "hand_right");
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -64,7 +64,7 @@ namespace op
spImageSaver->saveImages(cvOutputDatas, fileName);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -76,7 +76,7 @@ namespace op
}
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -65,7 +65,7 @@ namespace op
spKeypointSaver->saveKeypoints(keypointVector, fileName, "pose");
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -63,7 +63,7 @@ namespace op
spVideoSaver->write(cvOutputDatas);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -12,7 +12,8 @@ namespace op
public:
GuiInfoAdder(const int numberGpus, const bool guiEnabled = false);
void addInfo(cv::Mat& cvOutputData, const Array<float>& poseKeypoints, const unsigned long long id, const std::string& elementRenderedName);
void addInfo(cv::Mat& cvOutputData, const int numberPeople, const unsigned long long id,
const std::string& elementRenderedName);
private:
// Const variables
......
......@@ -75,7 +75,7 @@ namespace op
if (!tDatumsNoPtr.empty())
{
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
}
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
......
......@@ -56,10 +56,12 @@ namespace op
const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
// Add GUI components to frame
for (auto& tDatum : *tDatums)
spGuiInfoAdder->addInfo(tDatum.cvOutputData, tDatum.poseKeypoints, tDatum.id, tDatum.elementRendered.second);
spGuiInfoAdder->addInfo(tDatum.cvOutputData, std::max(tDatum.poseKeypoints.getSize(0),
tDatum.faceKeypoints.getSize(0)),
tDatum.id, tDatum.elementRendered.second);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -14,9 +14,9 @@ namespace op
public:
explicit HandDetector(const PoseModel poseModel);
std::vector<std::array<Rectangle<float>, 2>> detectHands(const Array<float>& poseKeypoints, const float scaleInputToOutput) const;
std::vector<std::array<Rectangle<float>, 2>> detectHands(const Array<float>& poseKeypoints, const double scaleInputToOutput) const;
std::vector<std::array<Rectangle<float>, 2>> trackHands(const Array<float>& poseKeypoints, const float scaleInputToOutput);
std::vector<std::array<Rectangle<float>, 2>> trackHands(const Array<float>& poseKeypoints, const double scaleInputToOutput);
void updateTracker(const std::array<Array<float>, 2>& handKeypoints, const unsigned long long id);
......
#ifndef OPENPOSE_HAND_HAND_EXTRACTOR_HPP
#define OPENPOSE_HAND_HAND_EXTRACTOR_HPP
#include <atomic>
#include <thread>
#include <opencv2/core/core.hpp> // cv::Mat
#include <openpose/core/enumClasses.hpp>
#include <openpose/core/common.hpp>
#include <openpose/core/maximumCaffe.hpp>
#include <openpose/core/net.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/core/enumClasses.hpp>
namespace op
{
......@@ -22,18 +18,21 @@ namespace op
* Constructor of the HandExtractor class.
* @param netInputSize Size at which the cropped image (where the hand is located) is resized.
* @param netOutputSize Size of the final results. At the moment, it must be equal than netOutputSize.
* @param modelFolder Folder where the models are located.
* @param gpuId The GPU index (0-based) which the deep net will use.
* @param numberScales Number of scales to run. The more scales, the slower it will be but possibly also more
* accurate.
* @param rangeScales The range between the smaller and bigger scale.
*/
explicit HandExtractor(const Point<int>& netInputSize, const Point<int>& netOutputSize,
const std::string& modelFolder, const int gpuId,
const unsigned short numberScales = 1, const float rangeScales = 0.4f,
const std::vector<HeatMapType>& heatMapTypes = {},
const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
/**
* Virtual destructor of the HandExtractor class.
* Required to allow inheritance.
*/
virtual ~HandExtractor();
/**
* This function must be called before using any other function. It must also be called inside the thread in
* which the functions are going to be used.
......@@ -42,7 +41,7 @@ namespace op
/**
* This function extracts the hand keypoints for each detected hand in the image.
* @param fpsMode handRectangles Location of the hands in the image. It is a length-variable std::vector, where
* @param handRectangles location of the hands in the image. It is a length-variable std::vector, where
* each index corresponds to a different person in the image. Internally the std::vector, a std::array of 2
* elements: index 0 and 1 for left and right hand respectively. Inside each array element, a
* op::Rectangle<float> (similar to cv::Rect for floating values) with the position of that hand (or 0,0,0,0 if
......@@ -51,45 +50,39 @@ namespace op
* @param scaleInputToOutput Desired scale of the final keypoints. Set to 1 if the desired size is the
* cvInputData size.
*/
void forwardPass(const std::vector<std::array<Rectangle<float>, 2>> handRectangles, const cv::Mat& cvInputData,
const float scaleInputToOutput);
virtual void forwardPass(const std::vector<std::array<Rectangle<float>, 2>> handRectangles,
const cv::Mat& cvInputData,
const double scaleInputToOutput) = 0;
std::array<Array<float>, 2> getHeatMaps() const;
/**
* This function returns the hand keypoints. VERY IMPORTANT: use getHandKeypoints().clone() if the keypoints are
* going to be edited in a different thread.
* @return And std::array with all the left hand keypoints (index 0) and all the right ones (index 1). Each
* @return A std::array with all the left hand keypoints (index 0) and all the right ones (index 1). Each
* Array<float> follows the pose structure, i.e. the first dimension corresponds to all the people in the
* image, the second to each specific keypoint, and the third one to (x, y, score).
*/
std::array<Array<float>, 2> getHandKeypoints() const;
std::array<Array<float>, 2> getHeatMaps() const;
private:
protected:
const std::pair<unsigned short, float> mMultiScaleNumberAndRange;
const Point<int> mNetOutputSize;
std::shared_ptr<Net> spNet;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
Array<float> mHandImageCrop;
std::array<Array<float>, 2> mHandKeypoints;
// HeatMaps parameters
const ScaleMode mHeatMapScaleMode;
const std::vector<HeatMapType> mHeatMapTypes;
std::array<Array<float>, 2> mHeatMaps;
virtual void netInitializationOnThread() = 0;
private:
// Init with thread
boost::shared_ptr<caffe::Blob<float>> spCaffeNetOutputBlob;
std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
std::thread::id mThreadId;
void checkThread() const;
void detectHandKeypoints(Array<float>& handCurrent, const float scaleInputToOutput, const int person,
const cv::Mat& affineMatrix);
Array<float> getHeatMapsFromLastPass() const;
DELETE_COPY(HandExtractor);
};
}
......
#ifndef OPENPOSE_HAND_HAND_EXTRACTOR_CAFFE_HPP
#define OPENPOSE_HAND_HAND_EXTRACTOR_CAFFE_HPP
#include <opencv2/core/core.hpp> // cv::Mat
#include <openpose/core/common.hpp>
#include <openpose/core/enumClasses.hpp>
#include <openpose/hand/handExtractor.hpp>
namespace op
{
    /**
     * Hand keypoint extractor class for Caffe framework.
     * Concrete implementation of the abstract HandExtractor that runs the hand keypoint
     * deep net through Caffe. Caffe-specific state is hidden behind a PIMPL so that this
     * header does not leak Caffe headers to its users.
     */
    class OP_API HandExtractorCaffe : public HandExtractor
    {
    public:
        /**
         * Constructor of the HandExtractorCaffe class.
         * @param netInputSize Size at which the cropped image (where the hand is located) is resized.
         * @param netOutputSize Size of the final results. At the moment, it must be equal to netInputSize
         * (the original comment said "equal than netOutputSize", which is self-referential — presumably
         * netInputSize was meant; TODO confirm against the implementation).
         * @param modelFolder Folder where the models are located.
         * @param gpuId The GPU index (0-based) which the deep net will use.
         * @param numberScales Number of scales to run. The more scales, the slower it will be but possibly also more
         * accurate.
         * @param rangeScales The range between the smaller and bigger scale.
         * @param heatMapTypes Which heat maps to record (empty vector disables heat map recording).
         * @param heatMapScale Scale mode in which the recorded heat map values are expressed
         * (default ScaleMode::ZeroToOne).
         */
        explicit HandExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize,
                                    const std::string& modelFolder, const int gpuId,
                                    const unsigned short numberScales = 1, const float rangeScales = 0.4f,
                                    const std::vector<HeatMapType>& heatMapTypes = {},
                                    const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
        /**
         * Virtual destructor of the HandExtractorCaffe class.
         * Required to allow inheritance, and required by the PIMPL idiom (the destructor must be
         * defined where ImplHandExtractorCaffe is complete).
         */
        virtual ~HandExtractorCaffe();
        /**
         * This function must be called before using any other function. It must also be called inside the thread in
         * which the functions are going to be used.
         * Implements the pure virtual HandExtractor::netInitializationOnThread.
         */
        void netInitializationOnThread();
        /**
         * This function extracts the hand keypoints for each detected hand in the image.
         * Implements the pure virtual HandExtractor::forwardPass.
         * @param handRectangles location of the hands in the image. It is a length-variable std::vector, where
         * each index corresponds to a different person in the image. Internally the std::vector, a std::array of 2
         * elements: index 0 and 1 for left and right hand respectively. Inside each array element, a
         * op::Rectangle<float> (similar to cv::Rect for floating values) with the position of that hand (or 0,0,0,0 if
         * some hand is missing, e.g. if a specific person has only half of the body inside the image).
         * @param cvInputData Original image in cv::Mat format and BGR format.
         * @param scaleInputToOutput Desired scale of the final keypoints. Set to 1 if the desired size is the
         * cvInputData size.
         */
        // NOTE(review): handRectangles is taken by (const) value, which copies the vector on every call;
        // const& would avoid the copy, but the signature must stay in sync with the base-class pure
        // virtual HandExtractor::forwardPass — change both together if desired.
        void forwardPass(const std::vector<std::array<Rectangle<float>, 2>> handRectangles, const cv::Mat& cvInputData,
                         const double scaleInputToOutput);
    private:
        // PIMPL idiom
        // http://www.cppsamples.com/common-tasks/pimpl.html
        struct ImplHandExtractorCaffe;
        std::unique_ptr<ImplHandExtractorCaffe> upImpl;
        // Presumably runs the net on the cropped hand of `person` (warped via affineMatrix) and writes the
        // keypoints, scaled by scaleInputToOutput, into handCurrent — body not visible here, confirm in the .cpp.
        void detectHandKeypoints(Array<float>& handCurrent, const double scaleInputToOutput, const int person,
                                 const cv::Mat& affineMatrix);
        // Presumably gathers the heat maps produced by the most recent forwardPass — body not visible here.
        Array<float> getHeatMapsFromLastPass() const;
        // PIMPL requires DELETE_COPY & destructor, or extra code
        // http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html
        DELETE_COPY(HandExtractorCaffe);
    };
}
#endif // OPENPOSE_HAND_HAND_EXTRACTOR_CAFFE_HPP
......@@ -5,6 +5,7 @@
#include <openpose/hand/handDetector.hpp>
#include <openpose/hand/handDetectorFromTxt.hpp>
#include <openpose/hand/handExtractor.hpp>
#include <openpose/hand/handExtractorCaffe.hpp>
#include <openpose/hand/handParameters.hpp>
#include <openpose/hand/handCpuRenderer.hpp>
#include <openpose/hand/handGpuRenderer.hpp>
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.handRectangles = spHandDetector->detectHands(tDatum.poseKeypoints, tDatum.scaleInputToOutput);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.handRectangles = spHandDetectorFromTxt->detectHands();
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,7 +59,7 @@ namespace op
tDatum.handRectangles = spHandDetector->trackHands(tDatum.poseKeypoints, tDatum.scaleInputToOutput);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -59,7 +59,7 @@ namespace op
spHandDetector->updateTracker(tDatum.handKeypoints, tDatum.id);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -67,7 +67,7 @@ namespace op
}
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -60,7 +60,7 @@ namespace op
spHandRenderer->renderHand(tDatum.outputData, tDatum.handKeypoints);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -7,12 +7,16 @@
namespace op
{
template <typename T>
OP_API void connectBodyPartsCpu(Array<T>& poseKeypoints, const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
const int interMinAboveThreshold, const T interThreshold, const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);
OP_API void connectBodyPartsCpu(Array<T>& poseKeypoints, const T* const heatMapPtr, const T* const peaksPtr,
const PoseModel poseModel, const Point<int>& heatMapSize, const int maxPeaks,
const int interMinAboveThreshold, const T interThreshold, const int minSubsetCnt,
const T minSubsetScore, const T scaleFactor = 1.f);
template <typename T>
OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, T* posePtr, const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const int interMinAboveThreshold, const T interThreshold, const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);
OP_API void connectBodyPartsGpu(Array<T>& poseKeypoints, T* posePtr, const T* const heatMapPtr,
const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
const int maxPeaks, const int interMinAboveThreshold, const T interThreshold,
const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f);
}
#endif // OPENPOSE_POSE_BODY_PARTS_CONNECTOR_HPP
#ifdef USE_CAFFE
#ifndef OPENPOSE_POSE_BODY_PART_CONNECTOR_CAFFE_HPP
#define OPENPOSE_POSE_BODY_PART_CONNECTOR_CAFFE_HPP
#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
#include <openpose/pose/enumClasses.hpp>
// PIMPL does not work here. Alternative:
// stackoverflow.com/questions/13978775/how-to-avoid-include-dependency-to-external-library?answertab=active#tab-top
namespace caffe
{
template <typename T> class Blob;
}
namespace op
{
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the compatibility with any generic Caffe version,
// we keep this 'layer' inside our library rather than in the Caffe code.
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep
// the compatibility with any generic Caffe version, we keep this 'layer' inside our library rather than in the
// Caffe code.
template <typename T>
class OP_API BodyPartConnectorCaffe
{
......@@ -36,11 +42,14 @@ namespace op
virtual void Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, Array<T>& poseKeypoints);
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, Array<T>& poseKeypoints);
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
Array<T>& poseKeypoints);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
private:
PoseModel mPoseModel;
......@@ -58,4 +67,3 @@ namespace op
}
#endif // OPENPOSE_POSE_BODY_PART_CONNECTOR_CAFFE_HPP
#endif
......@@ -12,19 +12,23 @@ namespace op
class OP_API PoseExtractor
{
public:
PoseExtractor(const Point<int>& netOutputSize, const Point<int>& outputSize, const PoseModel poseModel, const std::vector<HeatMapType>& heatMapTypes = {},
PoseExtractor(const Point<int>& netOutputSize, const Point<int>& outputSize, const PoseModel poseModel,
const std::vector<HeatMapType>& heatMapTypes = {},
const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
virtual ~PoseExtractor();
void initializationOnThread();
virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f}) = 0;
virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize,
const std::vector<double>& scaleRatios = {1.f}) = 0;
virtual const float* getHeatMapCpuConstPtr() const = 0;
virtual const float* getHeatMapGpuConstPtr() const = 0;
virtual std::vector<int> getHeatMapSize() const = 0;
Array<float> getHeatMaps() const;
virtual const float* getPoseGpuConstPtr() const = 0;
......
#ifdef USE_CAFFE
#ifndef OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP
#define OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP
#include <caffe/blob.hpp>
#include <openpose/core/common.hpp>
#include <openpose/core/net.hpp>
#include <openpose/core/nmsCaffe.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/pose/bodyPartConnectorCaffe.hpp>
#include <openpose/pose/enumClasses.hpp>
#include <openpose/pose/poseExtractor.hpp>
......@@ -16,37 +10,37 @@ namespace op
class OP_API PoseExtractorCaffe : public PoseExtractor
{
public:
PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize, const Point<int>& outputSize, const int scaleNumber,
const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes = {},
PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize,
const Point<int>& outputSize, const int scaleNumber, const PoseModel poseModel,
const std::string& modelFolder, const int gpuId,
const std::vector<HeatMapType>& heatMapTypes = {},
const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
virtual ~PoseExtractorCaffe();
void netInitializationOnThread();
void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f});
void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize,
const std::vector<double>& scaleRatios = {1.f});
const float* getHeatMapCpuConstPtr() const;
const float* getHeatMapGpuConstPtr() const;
std::vector<int> getHeatMapSize() const;
const float* getPoseGpuConstPtr() const;
private:
const float mResizeScale;
std::shared_ptr<Net> spNet;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<NmsCaffe<float>> spNmsCaffe;
std::shared_ptr<BodyPartConnectorCaffe<float>> spBodyPartConnectorCaffe;
// Init with thread
boost::shared_ptr<caffe::Blob<float>> spCaffeNetOutputBlob;
std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
std::shared_ptr<caffe::Blob<float>> spPeaksBlob;
std::shared_ptr<caffe::Blob<float>> spPoseBlob;
// PIMPL idiom
// http://www.cppsamples.com/common-tasks/pimpl.html
struct ImplPoseExtractorCaffe;
std::unique_ptr<ImplPoseExtractorCaffe> upImpl;
// PIMP requires DELETE_COPY & destructor, or extra code
// http://oliora.github.io/2015/12/29/pimpl-and-rule-of-zero.html
DELETE_COPY(PoseExtractorCaffe);
};
}
#endif // OPENPOSE_POSE_POSE_EXTRACTOR_CAFFE_HPP
#endif
......@@ -13,9 +13,9 @@ namespace op
class OP_API PoseGpuRenderer : public GpuRenderer, public PoseRenderer
{
public:
PoseGpuRenderer(const Point<int>& heatMapsSize, const PoseModel poseModel,
const std::shared_ptr<PoseExtractor>& poseExtractor, const float renderThreshold,
const bool blendOriginalFrame = true, const float alphaKeypoint = POSE_DEFAULT_ALPHA_KEYPOINT,
PoseGpuRenderer(const PoseModel poseModel, const std::shared_ptr<PoseExtractor>& poseExtractor,
const float renderThreshold, const bool blendOriginalFrame = true,
const float alphaKeypoint = POSE_DEFAULT_ALPHA_KEYPOINT,
const float alphaHeatMap = POSE_DEFAULT_ALPHA_HEAT_MAP,
const unsigned int elementToRender = 0u);
......@@ -27,7 +27,6 @@ namespace op
const float scaleNetToOutput = -1.f);
private:
const Point<int> mHeatMapsSize;
const std::shared_ptr<PoseExtractor> spPoseExtractor;
// Init with thread
float* pGpuPose; // GPU aux memory
......
......@@ -58,14 +58,16 @@ namespace op
// Extract people pose
for (auto& tDatum : *tDatums)
{
spPoseExtractor->forwardPass(tDatum.inputNetData, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows}, tDatum.scaleRatios);
spPoseExtractor->forwardPass(tDatum.inputNetData,
Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows},
tDatum.scaleInputToNetInputs);
tDatum.poseHeatMaps = spPoseExtractor->getHeatMaps().clone();
tDatum.poseKeypoints = spPoseExtractor->getPoseKeypoints().clone();
tDatum.scaleNetToOutput = spPoseExtractor->getScaleNetToOutput();
}
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -60,7 +60,7 @@ namespace op
tDatum.elementRendered = spPoseRenderer->renderPose(tDatum.outputData, tDatum.poseKeypoints, (float)tDatum.scaleNetToOutput);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -60,7 +60,7 @@ namespace op
this->stop();
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Return TDatums
......
......@@ -106,7 +106,7 @@ namespace op
{
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__);
// Debugging log
dLog("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -2,8 +2,6 @@
#define OPENPOSE_UTILITIES_CUDA_HPP
#include <utility> // std::pair
#include <cuda.h>
#include <cuda_runtime.h>
#include <openpose/core/common.hpp>
namespace op
......@@ -14,14 +12,14 @@ namespace op
OP_API int getGpuNumber();
inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads = CUDA_NUM_THREADS)
inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired,
const unsigned int numberCudaThreads = CUDA_NUM_THREADS)
{
return (totalRequired + numberCudaThreads - 1) / numberCudaThreads;
}
OP_API dim3 getNumberCudaBlocks(const Point<int>& frameSize, const dim3 numberCudaThreads = dim3{ CUDA_NUM_THREADS, CUDA_NUM_THREADS, 1 });
OP_API std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const Point<int>& frameSize);
OP_API void getNumberCudaThreadsAndBlocks(dim3& numberCudaThreads, dim3& numberCudaBlocks,
const Point<int>& frameSize);
}
#endif // OPENPOSE_UTILITIES_CUDA_HPP
......@@ -14,7 +14,7 @@
// const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
// // functions to do...
// Profiler::timerEnd(profilerKey);
// Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
// Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, NUMBER_ITERATIONS);
namespace op
{
......
......@@ -214,6 +214,7 @@ namespace op
std::vector<TWorker> mUserInputWs;
TWorker wDatumProducer;
TWorker spWIdGenerator;
TWorker spWScaleAndSizeExtractor;
TWorker spWCvMatToOpInput;
TWorker spWCvMatToOpOutput;
std::vector<std::vector<TWorker>> spWPoses;
......@@ -581,8 +582,8 @@ namespace op
error("Net input size cannot be -1x-1.", __LINE__, __FUNCTION__, __FILE__);
else if (poseNetInputSize.x == -1 || poseNetInputSize.y == -1)
{
if (producerSize.area() <= 0)
error("Net resolution cannot be -1 for image_dir, only for video and webcam.",
if (producerSize.x <= 0 || producerSize.y <= 0)
error("Net resolution cannot be -1 for image_dir, only for video, webcam, and IP camera.",
__LINE__, __FUNCTION__, __FILE__);
else if (poseNetInputSize.x == -1)
poseNetInputSize.x = 16 * intRound(
......@@ -593,6 +594,10 @@ namespace op
poseNetInputSize.x * producerSize.y / (float) producerSize.x / 16.f
);
}
// Security checks
if ((poseNetInputSize.x > 0 && poseNetInputSize.x % 16 != 0)
|| (poseNetInputSize.y > 0 && poseNetInputSize.y % 16 != 0))
error("Net input resolution must be multiples of 16.", __LINE__, __FUNCTION__, __FILE__);
// Producer
if (wrapperStructInput.producerSharedPtr != nullptr)
......@@ -606,12 +611,29 @@ namespace op
else
wDatumProducer = nullptr;
// Get input scales and sizes
const auto scaleAndSizeExtractor = std::make_shared<ScaleAndSizeExtractor>(
poseNetInputSize, finalOutputSize, wrapperStructPose.scalesNumber, wrapperStructPose.scaleGap
);
spWScaleAndSizeExtractor = std::make_shared<WScaleAndSizeExtractor<TDatumsPtr>>(scaleAndSizeExtractor);
// Input cvMat to OpenPose input & output format
const auto cvMatToOpInput = std::make_shared<CvMatToOpInput>();
spWCvMatToOpInput = std::make_shared<WCvMatToOpInput<TDatumsPtr>>(cvMatToOpInput);
if (renderOutput)
{
const auto cvMatToOpOutput = std::make_shared<CvMatToOpOutput>();
spWCvMatToOpOutput = std::make_shared<WCvMatToOpOutput<TDatumsPtr>>(cvMatToOpOutput);
}
// Pose estimators & renderers
const Point<int>& poseNetOutputSize = poseNetInputSize;
std::vector<std::shared_ptr<PoseExtractor>> poseExtractors;
std::vector<std::shared_ptr<PoseGpuRenderer>> poseGpuRenderers;
std::shared_ptr<PoseCpuRenderer> poseCpuRenderer;
std::vector<TWorker> cpuRenderers;
spWPoses.clear();
spWPoses.resize(gpuNumber);
if (wrapperStructPose.enable)
{
// Pose estimators
......@@ -634,11 +656,11 @@ namespace op
// GPU rendering
if (renderOutputGpu)
{
for (auto gpuId = 0u; gpuId < poseExtractors.size(); gpuId++)
for (const auto& poseExtractor : poseExtractors)
{
poseGpuRenderers.emplace_back(std::make_shared<PoseGpuRenderer>(
poseNetOutputSize, wrapperStructPose.poseModel, poseExtractors[gpuId],
wrapperStructPose.renderThreshold, wrapperStructPose.blendOriginalFrame, alphaKeypoint,
wrapperStructPose.poseModel, poseExtractor, wrapperStructPose.renderThreshold,
wrapperStructPose.blendOriginalFrame, alphaKeypoint,
alphaHeatMap, wrapperStructPose.defaultPartToRender
));
}
......@@ -651,26 +673,13 @@ namespace op
cpuRenderers.emplace_back(std::make_shared<WPoseRenderer<TDatumsPtr>>(poseCpuRenderer));
}
}
}
log("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
log("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Input cvMat to OpenPose format
const auto cvMatToOpInput = std::make_shared<CvMatToOpInput>(
poseNetInputSize, wrapperStructPose.scalesNumber, wrapperStructPose.scaleGap
);
spWCvMatToOpInput = std::make_shared<WCvMatToOpInput<TDatumsPtr>>(cvMatToOpInput);
const auto cvMatToOpOutput = std::make_shared<CvMatToOpOutput>(finalOutputSize, renderOutput);
spWCvMatToOpOutput = std::make_shared<WCvMatToOpOutput<TDatumsPtr>>(cvMatToOpOutput);
// Pose extractor(s)
if (wrapperStructPose.enable)
{
// Pose extractor(s)
spWPoses.resize(poseExtractors.size());
for (auto i = 0u; i < spWPoses.size(); i++)
spWPoses.at(i) = {std::make_shared<WPoseExtractor<TDatumsPtr>>(poseExtractors.at(i))};
}
else
spWPoses.resize(gpuNumber);
// Face extractor(s)
......@@ -693,7 +702,9 @@ namespace op
{
// 1 FaceDetectorOpenCV per thread, OpenCV face detector is not thread-safe
const auto faceDetectorOpenCV = std::make_shared<FaceDetectorOpenCV>(modelFolder);
spWPoses.at(gpu).emplace_back(std::make_shared<WFaceDetectorOpenCV<TDatumsPtr>>(faceDetectorOpenCV));
spWPoses.at(gpu).emplace_back(
std::make_shared<WFaceDetectorOpenCV<TDatumsPtr>>(faceDetectorOpenCV)
);
}
}
// Face keypoint extractor
......@@ -701,7 +712,7 @@ namespace op
{
// Face keypoint extractor
const auto netOutputSize = wrapperStructFace.netInputSize;
const auto faceExtractor = std::make_shared<FaceExtractor>(
const auto faceExtractor = std::make_shared<FaceExtractorCaffe>(
wrapperStructFace.netInputSize, netOutputSize, modelFolder,
gpu + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
);
......@@ -726,7 +737,7 @@ namespace op
spWPoses.at(gpu).emplace_back(std::make_shared<WHandDetector<TDatumsPtr>>(handDetector));
// Hand keypoint extractor
const auto netOutputSize = wrapperStructHand.netInputSize;
const auto handExtractor = std::make_shared<HandExtractor>(
const auto handExtractor = std::make_shared<HandExtractorCaffe>(
wrapperStructHand.netInputSize, netOutputSize, modelFolder,
gpu + gpuNumberStart, wrapperStructHand.scalesNumber, wrapperStructHand.scaleRange,
wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
......@@ -884,7 +895,8 @@ namespace op
if (!wrapperStructOutput.writeVideo.empty() && wrapperStructInput.producerSharedPtr != nullptr)
{
if (finalOutputSize.x <= 0 || finalOutputSize.y <= 0)
error("Video can only be recorded if outputSize is known.", __LINE__, __FUNCTION__, __FILE__);
error("Video can only be recorded if outputSize is fixed (e.g. video, webcam, IP camera),"
"but not for a image directory.", __LINE__, __FUNCTION__, __FILE__);
const auto originalVideoFps = (wrapperStructInput.producerSharedPtr->get(CV_CAP_PROP_FPS) > 0.
? wrapperStructInput.producerSharedPtr->get(CV_CAP_PROP_FPS) : 30.);
const auto videoSaver = std::make_shared<VideoSaver>(
......@@ -1103,6 +1115,7 @@ namespace op
// Reset
mUserInputWs.clear();
wDatumProducer = nullptr;
spWScaleAndSizeExtractor = nullptr;
spWCvMatToOpInput = nullptr;
spWCvMatToOpOutput = nullptr;
spWPoses.clear();
......@@ -1126,7 +1139,7 @@ namespace op
// The less number of queues -> the less lag
// Security checks
if (spWCvMatToOpInput == nullptr || spWCvMatToOpOutput == nullptr)
if (spWScaleAndSizeExtractor == nullptr || spWCvMatToOpInput == nullptr)
error("Configure the Wrapper class before calling `start()`.", __LINE__, __FUNCTION__, __FILE__);
if ((wDatumProducer == nullptr) == (mUserInputWs.empty())
&& mThreadManagerMode != ThreadManagerMode::Asynchronous
......@@ -1159,8 +1172,12 @@ namespace op
mThreadManager.add(mThreadId, mUserInputWs, queueIn++, queueOut++);
threadIdPP();
// Thread 1, queues 1 -> 2
mThreadManager.add(mThreadId, {spWIdGenerator, spWCvMatToOpInput, spWCvMatToOpOutput}, queueIn++,
queueOut++);
if (spWCvMatToOpOutput == nullptr)
mThreadManager.add(mThreadId, {spWIdGenerator, spWScaleAndSizeExtractor, spWCvMatToOpInput},
queueIn++, queueOut++);
else
mThreadManager.add(mThreadId, {spWIdGenerator, spWScaleAndSizeExtractor, spWCvMatToOpInput,
spWCvMatToOpOutput}, queueIn++, queueOut++);
}
// If custom user Worker in same thread or producer on same thread
else
......@@ -1177,7 +1194,12 @@ namespace op
&& mThreadManagerMode != ThreadManagerMode::AsynchronousIn)
error("No input selected.", __LINE__, __FUNCTION__, __FILE__);
workersAux = mergeWorkers(workersAux, {spWIdGenerator, spWCvMatToOpInput, spWCvMatToOpOutput});
if (spWCvMatToOpOutput == nullptr)
workersAux = mergeWorkers(workersAux, {spWIdGenerator, spWScaleAndSizeExtractor,
spWCvMatToOpInput});
else
workersAux = mergeWorkers(workersAux, {spWIdGenerator, spWScaleAndSizeExtractor,
spWCvMatToOpInput, spWCvMatToOpOutput});
// Thread 0 or 1, queues 0 -> 1
mThreadManager.add(mThreadId, workersAux, queueIn++, queueOut++);
}
......
......@@ -19,7 +19,8 @@ cuda_add_library(core
renderer.cpp
resizeAndMergeBase.cpp
resizeAndMergeBase.cu
resizeAndMergeCaffe.cpp)
resizeAndMergeCaffe.cpp
scaleAndSizeExtractor)
target_link_libraries(core ${Caffe_LIBS})
if (BUILD_CAFFE)
......
......@@ -4,24 +4,9 @@
namespace op
{
CvMatToOpInput::CvMatToOpInput(const Point<int>& netInputResolution, const int scaleNumber, const float scaleGap) :
mScaleNumber{scaleNumber},
mScaleGap{scaleGap},
mInputNetSize4D{{mScaleNumber, 3, netInputResolution.y, netInputResolution.x}}
{
try
{
// Security checks
if (netInputResolution.x % 16 != 0 || netInputResolution.y % 16 != 0)
error("Net input resolution must be multiples of 16.", __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
std::pair<Array<float>, std::vector<float>> CvMatToOpInput::format(const cv::Mat& cvInputData) const
Array<float> CvMatToOpInput::createArray(const cv::Mat& cvInputData,
const std::vector<double>& scaleInputToNetInputs,
const std::vector<Point<int>>& netInputSizes) const
{
try
{
......@@ -30,38 +15,26 @@ namespace op
error("Wrong input element (empty cvInputData).", __LINE__, __FUNCTION__, __FILE__);
if (cvInputData.channels() != 3)
error("Input images must be 3-channel BGR.", __LINE__, __FUNCTION__, __FILE__);
if (scaleInputToNetInputs.size() != netInputSizes.size())
error("scaleInputToNetInputs.size() != netInputSizes.size().", __LINE__, __FUNCTION__, __FILE__);
// inputNetData - Reescale keeping aspect ratio and transform to float the input deep net image
Array<float> inputNetData{mInputNetSize4D};
std::vector<float> scaleRatios(mScaleNumber, 1.f);
const auto numberScales = (int)scaleInputToNetInputs.size();
Array<float> inputNetData{{numberScales, 3, netInputSizes.at(0).y, netInputSizes.at(0).x}};
std::vector<double> scaleRatios(numberScales, 1.f);
const auto inputNetDataOffset = inputNetData.getVolume(1, 3);
for (auto i = 0; i < mScaleNumber; i++)
for (auto i = 0; i < numberScales; i++)
{
const auto currentScale = 1.f - i*mScaleGap;
if (currentScale < 0.f || 1.f < currentScale)
error("All scales must be in the range [0, 1], i.e. 0 <= 1-scale_number*scale_gap <= 1", __LINE__, __FUNCTION__, __FILE__);
const auto netInputWidth = inputNetData.getSize(3);
const auto targetWidth = fastTruncate(intRound(netInputWidth * currentScale) / 16 * 16, 1, netInputWidth);
const auto netInputHeight = inputNetData.getSize(2);
const auto targetHeight = fastTruncate(intRound(netInputHeight * currentScale) / 16 * 16, 1, netInputHeight);
const Point<int> targetSize{targetWidth, targetHeight};
const auto scale = resizeGetScaleFactor(Point<int>{cvInputData.cols, cvInputData.rows}, targetSize);
const cv::Mat frameWithNetSize = resizeFixedAspectRatio(cvInputData, scale, Point<int>{netInputWidth, netInputHeight});
const cv::Mat frameWithNetSize = resizeFixedAspectRatio(cvInputData, scaleInputToNetInputs[i],
netInputSizes[i]);
// Fill inputNetData
uCharCvMatToFloatPtr(inputNetData.getPtr() + i * inputNetDataOffset, frameWithNetSize, true);
// Fill scaleRatios
scaleRatios[i] = {(float)scale};
if (i > 0)
scaleRatios[i] /= scaleRatios[0];
}
scaleRatios.at(0) /= scaleRatios[0];
return std::make_pair(inputNetData, scaleRatios);
return inputNetData;
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
return std::make_pair(Array<float>{}, std::vector<float>{});
return Array<float>{};
}
}
}
......@@ -3,13 +3,7 @@
namespace op
{
CvMatToOpOutput::CvMatToOpOutput(const Point<int>& outputResolution, const bool generateOutput) :
mGenerateOutput{generateOutput},
mOutputSize3D{3, outputResolution.y, outputResolution.x}
{
}
std::tuple<double, Array<float>> CvMatToOpOutput::format(const cv::Mat& cvInputData) const
Array<float> CvMatToOpOutput::createArray(const cv::Mat& cvInputData, const double scaleInputToOutput, const Point<int>& outputResolution) const
{
try
{
......@@ -18,38 +12,18 @@ namespace op
error("Wrong input element (empty cvInputData).", __LINE__, __FUNCTION__, __FILE__);
if (cvInputData.channels() != 3)
error("Input images must be 3-channel BGR.", __LINE__, __FUNCTION__, __FILE__);
// scaleInputToOutput - Scale between input and desired output size
double scaleInputToOutput;
Point<int> outputResolution;
// Output = mOutputSize3D size
if (mOutputSize3D[1] > 0 && mOutputSize3D[2] > 0)
{
outputResolution = Point<int>{mOutputSize3D[2], mOutputSize3D[1]};
scaleInputToOutput = resizeGetScaleFactor(Point<int>{cvInputData.cols, cvInputData.rows},
outputResolution);
}
// Output = input size
else
{
outputResolution = Point<int>{cvInputData.cols, cvInputData.rows};
scaleInputToOutput = 1.;
}
// outputData - Reescale keeping aspect ratio and transform to float the output image
Array<float> outputData;
if (mGenerateOutput)
{
const cv::Mat frameWithOutputSize = resizeFixedAspectRatio(cvInputData, scaleInputToOutput,
outputResolution);
outputData.reset({3, outputResolution.y, outputResolution.x});
uCharCvMatToFloatPtr(outputData.getPtr(), frameWithOutputSize, false);
}
const cv::Mat frameWithOutputSize = resizeFixedAspectRatio(cvInputData, scaleInputToOutput,
outputResolution);
Array<float> outputData({3, outputResolution.y, outputResolution.x});
uCharCvMatToFloatPtr(outputData.getPtr(), frameWithOutputSize, false);
// Return result
return std::make_tuple(scaleInputToOutput, outputData);
return outputData;
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
return std::make_tuple(0., Array<float>{});
return Array<float>{};
}
}
}
......@@ -25,9 +25,10 @@ namespace op
handRectangles{datum.handRectangles},
handKeypoints(datum.handKeypoints), // Parentheses instead of braces to avoid error in GCC 4.8
// Other parameters
scaleInputToNetInputs{datum.scaleInputToNetInputs},
netInputSizes{datum.netInputSizes},
scaleInputToOutput{datum.scaleInputToOutput},
scaleNetToOutput{datum.scaleNetToOutput},
scaleRatios{datum.scaleRatios},
elementRendered{datum.elementRendered}
{
}
......@@ -53,9 +54,10 @@ namespace op
handRectangles = datum.handRectangles,
handKeypoints = datum.handKeypoints,
// Other parameters
scaleInputToNetInputs = datum.scaleInputToNetInputs;
netInputSizes = datum.netInputSizes;
scaleInputToOutput = datum.scaleInputToOutput;
scaleNetToOutput = datum.scaleNetToOutput;
scaleRatios = datum.scaleRatios;
elementRendered = datum.elementRendered;
// Return
return *this;
......@@ -92,7 +94,8 @@ namespace op
std::swap(handRectangles, datum.handRectangles);
std::swap(handKeypoints, datum.handKeypoints);
// Other parameters
std::swap(scaleRatios, datum.scaleRatios);
std::swap(scaleInputToNetInputs, datum.scaleInputToNetInputs);
std::swap(netInputSizes, datum.netInputSizes);
std::swap(elementRendered, datum.elementRendered);
}
catch (const std::exception& e)
......@@ -122,9 +125,8 @@ namespace op
std::swap(handRectangles, datum.handRectangles);
std::swap(handKeypoints, datum.handKeypoints);
// Other parameters
scaleInputToOutput = datum.scaleInputToOutput;
scaleNetToOutput = datum.scaleNetToOutput;
std::swap(scaleRatios, datum.scaleRatios);
std::swap(scaleInputToNetInputs, datum.scaleInputToNetInputs);
std::swap(netInputSizes, datum.netInputSizes);
std::swap(elementRendered, datum.elementRendered);
// Return
return *this;
......@@ -163,9 +165,10 @@ namespace op
datum.handKeypoints[0] = handKeypoints[0].clone();
datum.handKeypoints[1] = handKeypoints[1].clone();
// Other parameters
datum.scaleInputToNetInputs = scaleInputToNetInputs;
datum.netInputSizes = netInputSizes;
datum.scaleInputToOutput = scaleInputToOutput;
datum.scaleNetToOutput = scaleNetToOutput;
datum.scaleRatios = scaleRatios;
datum.elementRendered = elementRendered;
// Return
return std::move(datum);
......
......@@ -6,4 +6,5 @@ namespace op
DEFINE_TEMPLATE_DATUM(WCvMatToOpOutput);
DEFINE_TEMPLATE_DATUM(WKeypointScaler);
DEFINE_TEMPLATE_DATUM(WOpOutputToCvMat);
DEFINE_TEMPLATE_DATUM(WScaleAndSizeExtractor);
}
#ifndef CPU_ONLY
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#endif
......@@ -6,26 +6,26 @@
namespace op
{
void checkAndIncreaseGpuMemory(std::shared_ptr<float*>& gpuMemoryPtr,
std::shared_ptr<std::atomic<unsigned long long>>& currentVolumePtr,
const unsigned long long memoryVolume)
{
try
#ifdef USE_CUDA
void checkAndIncreaseGpuMemory(std::shared_ptr<float*>& gpuMemoryPtr,
std::shared_ptr<std::atomic<unsigned long long>>& currentVolumePtr,
const unsigned long long memoryVolume)
{
#ifndef CPU_ONLY
try
{
if (*currentVolumePtr < memoryVolume)
{
*currentVolumePtr = memoryVolume;
cudaFree(*gpuMemoryPtr);
cudaMalloc((void**)(gpuMemoryPtr.get()), *currentVolumePtr * sizeof(float));
}
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
#endif
GpuRenderer::GpuRenderer(const float renderThreshold, const float alphaKeypoint,
const float alphaHeatMap, const bool blendOriginalFrame,
......@@ -44,7 +44,7 @@ namespace op
{
try
{
#ifndef CPU_ONLY
#ifdef USE_CUDA
if (mIsLastRenderer)
cudaFree(*spGpuMemory);
#endif
......@@ -97,7 +97,7 @@ namespace op
{
try
{
#ifndef CPU_ONLY
#ifdef USE_CUDA
if (!*spGpuMemoryAllocated)
{
checkAndIncreaseGpuMemory(spGpuMemory, spVolume, memoryVolume);
......@@ -105,8 +105,10 @@ namespace op
*spGpuMemoryAllocated = true;
}
#else
error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
UNUSED(cpuMemory);
UNUSED(memoryVolume);
error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
......@@ -119,7 +121,7 @@ namespace op
{
try
{
#ifndef CPU_ONLY
#ifdef USE_CUDA
if (*spGpuMemoryAllocated && mIsLastRenderer)
{
if (*spVolume < memoryVolume)
......@@ -129,8 +131,10 @@ namespace op
*spGpuMemoryAllocated = false;
}
#else
error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
UNUSED(cpuMemory);
UNUSED(memoryVolume);
error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
......
......@@ -8,7 +8,8 @@ namespace op
{
}
void KeypointScaler::scale(Array<float>& arrayToScale, const float scaleInputToOutput, const float scaleNetToOutput, const Point<int>& producerSize) const
void KeypointScaler::scale(Array<float>& arrayToScale, const double scaleInputToOutput,
const double scaleNetToOutput, const Point<int>& producerSize) const
{
try
{
......@@ -21,7 +22,8 @@ namespace op
}
}
void KeypointScaler::scale(std::vector<Array<float>>& arrayToScalesToScale, const float scaleInputToOutput, const float scaleNetToOutput, const Point<int>& producerSize) const
void KeypointScaler::scale(std::vector<Array<float>>& arrayToScalesToScale, const double scaleInputToOutput,
const double scaleNetToOutput, const Point<int>& producerSize) const
{
try
{
......@@ -30,15 +32,15 @@ namespace op
// InputResolution
if (mScaleMode == ScaleMode::InputResolution)
for (auto& arrayToScale : arrayToScalesToScale)
scaleKeypoints(arrayToScale, 1.f/scaleInputToOutput);
scaleKeypoints(arrayToScale, float(1./scaleInputToOutput));
// NetOutputResolution
else if (mScaleMode == ScaleMode::NetOutputResolution)
for (auto& arrayToScale : arrayToScalesToScale)
scaleKeypoints(arrayToScale, 1.f/scaleNetToOutput);
scaleKeypoints(arrayToScale, float(1./scaleNetToOutput));
// [0,1]
else if (mScaleMode == ScaleMode::ZeroToOne)
{
const auto scale = 1.f/scaleInputToOutput;
const auto scale = float(1./scaleInputToOutput);
const auto scaleX = scale / ((float)producerSize.x - 1.f);
const auto scaleY = scale / ((float)producerSize.y - 1.f);
for (auto& arrayToScale : arrayToScalesToScale)
......@@ -47,7 +49,7 @@ namespace op
// [-1,1]
else if (mScaleMode == ScaleMode::PlusMinusOne)
{
const auto scale = 2.f/scaleInputToOutput;
const auto scale = float(2./scaleInputToOutput);
const auto scaleX = (scale / ((float)producerSize.x - 1.f));
const auto scaleY = (scale / ((float)producerSize.y - 1.f));
const auto offset = -1.f;
......
......@@ -4,12 +4,12 @@
namespace op
{
template <typename T>
void maximumCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
void maximumCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize)
{
try
{
UNUSED(targetPtr);
UNUSED(kernelPtr);
UNUSED(sourcePtr);
UNUSED(targetSize);
UNUSED(sourceSize);
......@@ -25,14 +25,14 @@ namespace op
// const auto numberParts = targetSize[2];
// const auto numberSubparts = targetSize[3];
// // log("sourceSize[0]: " + std::to_string(sourceSize[0])); // = 1
// // log("sourceSize[1]: " + std::to_string(sourceSize[1])); // = #body parts + bck = 22 (hands) or 71 (face)
// // log("sourceSize[2]: " + std::to_string(sourceSize[2])); // = 368 = height
// // log("sourceSize[3]: " + std::to_string(sourceSize[3])); // = 368 = width
// // log("targetSize[0]: " + std::to_string(targetSize[0])); // = 1
// // log("targetSize[1]: " + std::to_string(targetSize[1])); // = 1
// // log("targetSize[2]: " + std::to_string(targetSize[2])); // = 21(hands) or 70 (face)
// // log("targetSize[3]: " + std::to_string(targetSize[3])); // = 3 = [x, y, score]
// // log("sourceSize[0]: " + std::to_string(sourceSize[0])); // = 1
// // log("sourceSize[1]: " + std::to_string(sourceSize[1])); // = #body_parts+bck=22(hands) or 71(face)
// // log("sourceSize[2]: " + std::to_string(sourceSize[2])); // = 368 = height
// // log("sourceSize[3]: " + std::to_string(sourceSize[3])); // = 368 = width
// // log("targetSize[0]: " + std::to_string(targetSize[0])); // = 1
// // log("targetSize[1]: " + std::to_string(targetSize[1])); // = 1
// // log("targetSize[2]: " + std::to_string(targetSize[2])); // = 21(hands) or 70 (face)
// // log("targetSize[3]: " + std::to_string(targetSize[3])); // = 3 = [x, y, score]
// // log(" ");
// for (auto n = 0; n < num; n++)
// {
......@@ -45,7 +45,8 @@ namespace op
// auto* targetPtrOffsetted = targetPtr + (offsetChannel + part) * numberSubparts;
// const auto* const sourcePtrOffsetted = sourcePtr + (offsetChannel + part) * imageOffset;
// // Option a - 6.3 fps
// const auto sourceIndexIterator = thrust::max_element(thrust::host, sourcePtrOffsetted, sourcePtrOffsetted + imageOffset);
// const auto sourceIndexIterator = thrust::max_element(thrust::host, sourcePtrOffsetted,
// sourcePtrOffsetted + imageOffset);
// const auto sourceIndex = (int)(sourceIndexIterator - sourcePtrOffsetted);
// targetPtrOffsetted[0] = sourceIndex % width;
// targetPtrOffsetted[1] = sourceIndex / width;
......@@ -60,6 +61,8 @@ namespace op
}
}
template void maximumCpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template void maximumCpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template void maximumCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize);
template void maximumCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize);
}
#ifdef USE_CAFFE
#include <caffe/blob.hpp>
#endif
#include <openpose/core/maximumBase.hpp>
#include <openpose/core/maximumCaffe.hpp>
......@@ -7,17 +9,34 @@ namespace op
// Default constructor.
// MaximumCaffe is only functional when OpenPose is built with Caffe support;
// without USE_CAFFE it fails fast at construction time instead of at first use.
template <typename T>
MaximumCaffe<T>::MaximumCaffe()
{
    try
    {
        #ifndef USE_CAFFE
            // No Caffe at build time -> the class cannot do anything useful; abort construction.
            error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this"
                  " functionality.", __LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        // Re-raise through the project-wide error handler with location info.
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
template <typename T>
void MaximumCaffe<T>::LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void MaximumCaffe<T>::LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 1", __LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CAFFE
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 1", __LINE__, __FUNCTION__, __FILE__);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -26,26 +45,34 @@ namespace op
}
template <typename T>
void MaximumCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void MaximumCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
#ifdef USE_CAFFE
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
// Bottom shape
std::vector<int> bottomShape = bottomBlob->shape();
// Bottom shape
std::vector<int> bottomShape = bottomBlob->shape();
// Top shape
std::vector<int> topShape{bottomShape};
topShape[1] = 1; // Unnecessary
topShape[2] = bottomShape[1]-1; // Number parts + bck - 1
topShape[3] = 3; // X, Y, score
topBlob->Reshape(topShape);
// Top shape
std::vector<int> topShape{bottomShape};
topShape[1] = 1; // Unnecessary
topShape[2] = bottomShape[1]-1; // Number parts + bck - 1
topShape[3] = 3; // X, Y, score
topBlob->Reshape(topShape);
// Array sizes
mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1), topBlob->shape(2), topBlob->shape(3)};
mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1), bottomBlob->shape(2), bottomBlob->shape(3)};
// Array sizes
mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1), topBlob->shape(2),
topBlob->shape(3)};
mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1), bottomBlob->shape(2),
bottomBlob->shape(3)};
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -54,11 +81,17 @@ namespace op
}
template <typename T>
void MaximumCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void MaximumCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
maximumGpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize);
#ifdef USE_CAFFE
maximumCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -67,11 +100,19 @@ namespace op
}
template <typename T>
void MaximumCaffe<T>::Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void MaximumCaffe<T>::Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
maximumGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize);
#if defined USE_CAFFE && defined USE_CUDA
maximumGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
......@@ -80,14 +121,18 @@ namespace op
}
template <typename T>
void MaximumCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void MaximumCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top,
const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
}
catch (const std::exception& e)
{
......@@ -96,14 +141,20 @@ namespace op
}
template <typename T>
void MaximumCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void MaximumCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top,
const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
#endif
}
catch (const std::exception& e)
{
......@@ -111,7 +162,5 @@ namespace op
}
}
INSTANTIATE_CLASS(MaximumCaffe);
COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(MaximumCaffe);
}
#endif
#ifdef USE_CAFFE
#include <numeric> // std::accumulate
#ifdef USE_CAFFE
#include <caffe/net.hpp>
#endif
#include <openpose/utilities/cuda.hpp>
#include <openpose/core/netCaffe.hpp>
namespace op
{
NetCaffe::NetCaffe(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto, const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) :
mGpuId{gpuId},
// mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++
mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]},
mNetInputMemory{std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1, std::multiplies<int>()) * sizeof(float)},
mCaffeProto{caffeProto},
mCaffeTrainedModel{caffeTrainedModel},
mLastBlobName{lastBlobName}
// PImpl struct for NetCaffe: keeps all Caffe/boost types out of the public header
// so that client code can be built without the Caffe (and CUDA) headers.
// The whole body is compiled away when USE_CAFFE is undefined.
struct NetCaffe::ImplNetCaffe
{
    #ifdef USE_CAFFE
        // Initialized in the constructor (immutable afterwards)
        const int mGpuId;
        const std::array<int, 4> mNetInputSize4D;       // NCHW dimensions of the net input blob
        const unsigned long mNetInputMemory;            // input blob size in bytes (floats)
        const std::string mCaffeProto;                  // path to deploy prototxt
        const std::string mCaffeTrainedModel;           // path to .caffemodel weights
        const std::string mLastBlobName;                // name of the output blob to read results from
        // Initialized lazily on the worker thread (see initializationOnThread)
        std::unique_ptr<caffe::Net<float>> upCaffeNet;
        boost::shared_ptr<caffe::Blob<float>> spOutputBlob;

        ImplNetCaffe(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto,
                     const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName) :
            mGpuId{gpuId},
            // mNetInputSize4D{netInputSize4D}, // This line crashes on some devices with old G++
            mNetInputSize4D{netInputSize4D[0], netInputSize4D[1], netInputSize4D[2], netInputSize4D[3]},
            // Byte count = product of the 4 blob dimensions times sizeof(float)
            mNetInputMemory{sizeof(float) * std::accumulate(mNetInputSize4D.begin(), mNetInputSize4D.end(), 1,
                                                            std::multiplies<int>())},
            mCaffeProto{caffeProto},
            mCaffeTrainedModel{caffeTrainedModel},
            mLastBlobName{lastBlobName}
        {
        }
    #endif
};
// Constructor: with Caffe support it only stores the configuration in the PImpl
// (the heavy network loading is deferred to the worker thread); without Caffe
// support it errors out immediately.
// @param netInputSize4D   NCHW dimensions of the network input blob.
// @param caffeProto       Path to the deploy prototxt file.
// @param caffeTrainedModel Path to the trained .caffemodel file.
// @param gpuId            GPU device index to run the net on.
// @param lastBlobName     Name of the blob that holds the network output.
NetCaffe::NetCaffe(const std::array<int, 4>& netInputSize4D, const std::string& caffeProto,
                   const std::string& caffeTrainedModel, const int gpuId, const std::string& lastBlobName)
    #ifdef USE_CAFFE
        // Member init list itself is conditional: upImpl only exists in Caffe builds.
        : upImpl{new ImplNetCaffe{netInputSize4D, caffeProto, caffeTrainedModel, gpuId, lastBlobName}}
    #endif
{
    try
    {
        #ifndef USE_CAFFE
            // Silence unused-parameter warnings in non-Caffe builds before failing.
            UNUSED(netInputSize4D);
            UNUSED(caffeProto);
            UNUSED(caffeTrainedModel);
            UNUSED(gpuId);
            UNUSED(lastBlobName);
            error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this"
                  " functionality.", __LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
NetCaffe::~NetCaffe()
......@@ -24,19 +68,23 @@ namespace op
{
try
{
// Initialize net
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(mGpuId);
upCaffeNet.reset(new caffe::Net<float>{mCaffeProto, caffe::TEST});
upCaffeNet->CopyTrainedLayersFrom(mCaffeTrainedModel);
upCaffeNet->blobs()[0]->Reshape({mNetInputSize4D[0], mNetInputSize4D[1], mNetInputSize4D[2], mNetInputSize4D[3]});
upCaffeNet->Reshape();
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Set spOutputBlob
spOutputBlob = upCaffeNet->blob_by_name(mLastBlobName);
if (spOutputBlob == nullptr)
error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: " + mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CAFFE
// Initialize net
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(upImpl->mGpuId);
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
upImpl->upCaffeNet->blobs()[0]->Reshape({upImpl->mNetInputSize4D[0], upImpl->mNetInputSize4D[1],
upImpl->mNetInputSize4D[2], upImpl->mNetInputSize4D[3]});
upImpl->upCaffeNet->Reshape();
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Set spOutputBlob
upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName);
if (upImpl->spOutputBlob == nullptr)
error("The output blob is a nullptr. Did you use the same name than the prototxt? (Used: "
+ upImpl->mLastBlobName + ").", __LINE__, __FUNCTION__, __FILE__);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
......@@ -48,7 +96,11 @@ namespace op
{
try
{
return upCaffeNet->blobs().at(0)->mutable_cpu_data();
#ifdef USE_CAFFE
return upImpl->upCaffeNet->blobs().at(0)->mutable_cpu_data();
#else
return nullptr;
#endif
}
catch (const std::exception& e)
{
......@@ -61,7 +113,11 @@ namespace op
{
try
{
return upCaffeNet->blobs().at(0)->mutable_gpu_data();
#ifdef USE_CAFFE
return upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
#else
return nullptr;
#endif
}
catch (const std::exception& e)
{
......@@ -74,15 +130,26 @@ namespace op
{
try
{
// Copy frame data to GPU memory
if (inputData != nullptr)
{
auto* gpuImagePtr = upCaffeNet->blobs().at(0)->mutable_gpu_data();
cudaMemcpy(gpuImagePtr, inputData, mNetInputMemory, cudaMemcpyHostToDevice);
}
// Perform deep network forward pass
upCaffeNet->ForwardFrom(0);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CAFFE
// Copy frame data to GPU memory
if (inputData != nullptr)
{
#ifdef USE_CUDA
auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
cudaMemcpy(gpuImagePtr, inputData, upImpl->mNetInputMemory, cudaMemcpyHostToDevice);
#else
auto* cpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_cpu_data();
std::copy(inputData,
inputData + upImpl->mNetInputMemory/sizeof(float),
cpuImagePtr);
#endif
}
// Perform deep network forward pass
upImpl->upCaffeNet->ForwardFrom(0);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#else
UNUSED(inputData);
#endif
}
catch (const std::exception& e)
{
......@@ -94,7 +161,11 @@ namespace op
{
try
{
return spOutputBlob;
#ifdef USE_CAFFE
return upImpl->spOutputBlob;
#else
return nullptr;
#endif
}
catch (const std::exception& e)
{
......@@ -103,5 +174,3 @@ namespace op
}
}
}
#endif
#ifdef USE_CAFFE
#include <caffe/blob.hpp>
#endif
#include <openpose/core/nmsBase.hpp>
#include <openpose/core/nmsCaffe.hpp>
namespace op
{
template <typename T>
NmsCaffe<T>::NmsCaffe()
struct NmsCaffe<T>::ImplNmsCaffe
{
#ifdef USE_CAFFE
caffe::Blob<int> mKernelBlob;
std::array<int, 4> mBottomSize;
std::array<int, 4> mTopSize;
#endif
ImplNmsCaffe(){};
};
// Constructor: allocates the PImpl holder; in non-Caffe builds it fails fast,
// since NMS is implemented on top of Caffe blobs.
template <typename T>
NmsCaffe<T>::NmsCaffe() :
    upImpl{new ImplNmsCaffe{}}
{
    try
    {
        #ifndef USE_CAFFE
            // No Caffe at build time -> refuse construction with an explanatory error.
            error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this"
                  " functionality.", __LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Out-of-line (empty) destructor: presumably required so that the smart pointer to
// the incomplete ImplNmsCaffe type is destroyed where the type is fully defined
// (standard PImpl idiom) — TODO confirm upImpl's declared type in the header.
template <typename T>
NmsCaffe<T>::~NmsCaffe()
{
}
......@@ -14,10 +45,15 @@ namespace op
{
try
{
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 1", __LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CAFFE
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 1", __LINE__, __FUNCTION__, __FILE__);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -26,27 +62,36 @@ namespace op
}
template <typename T>
void NmsCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const int maxPeaks)
void NmsCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks)
{
try
{
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
#ifdef USE_CAFFE
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
// Bottom shape
std::vector<int> bottomShape = bottomBlob->shape();
// Bottom shape
std::vector<int> bottomShape = bottomBlob->shape();
// Top shape
std::vector<int> topShape{bottomShape};
topShape[1] = bottomShape[1]-1; // Number parts + bck - 1
topShape[2] = maxPeaks+1; // # maxPeaks + 1
topShape[3] = 3; // X, Y, score
topBlob->Reshape(topShape);
mKernelBlob.Reshape(bottomShape);
// Top shape
std::vector<int> topShape{bottomShape};
topShape[1] = bottomShape[1]-1; // Number parts + bck - 1
topShape[2] = maxPeaks+1; // # maxPeaks + 1
topShape[3] = 3; // X, Y, score
topBlob->Reshape(topShape);
upImpl->mKernelBlob.Reshape(bottomShape);
// Array sizes
mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1), topBlob->shape(2), topBlob->shape(3)};
mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1), bottomBlob->shape(2), bottomBlob->shape(3)};
// Array sizes
upImpl->mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1),
topBlob->shape(2), topBlob->shape(3)};
upImpl->mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1),
bottomBlob->shape(2), bottomBlob->shape(3)};
#else
UNUSED(bottom);
UNUSED(top);
UNUSED(maxPeaks);
#endif
}
catch (const std::exception& e)
{
......@@ -72,7 +117,13 @@ namespace op
{
try
{
nmsGpu(top.at(0)->mutable_cpu_data(), mKernelBlob.mutable_cpu_data(), bottom.at(0)->cpu_data(), mThreshold, mTopSize, mBottomSize);
#ifdef USE_CAFFE
nmsCpu(top.at(0)->mutable_cpu_data(), upImpl->mKernelBlob.mutable_cpu_data(), bottom.at(0)->cpu_data(),
mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -85,7 +136,15 @@ namespace op
{
try
{
nmsGpu(top.at(0)->mutable_gpu_data(), mKernelBlob.mutable_gpu_data(), bottom.at(0)->gpu_data(), mThreshold, mTopSize, mBottomSize);
#if defined USE_CAFFE && defined USE_CUDA
nmsGpu(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlob.mutable_gpu_data(),
bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
......@@ -94,14 +153,17 @@ namespace op
}
template <typename T>
void NmsCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void NmsCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
}
catch (const std::exception& e)
{
......@@ -110,14 +172,17 @@ namespace op
}
template <typename T>
void NmsCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void NmsCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
}
catch (const std::exception& e)
{
......@@ -125,7 +190,5 @@ namespace op
}
}
INSTANTIATE_CLASS(NmsCaffe);
COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(NmsCaffe);
}
#endif
......@@ -5,13 +5,13 @@ namespace op
{
template <typename T>
void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleInputToNetInputs)
{
try
{
UNUSED(targetPtr);
UNUSED(sourcePtr);
UNUSED(scaleRatios);
UNUSED(scaleInputToNetInputs);
UNUSED(targetSize);
UNUSED(sourceSize);
error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
......@@ -61,7 +61,7 @@ namespace op
}
template void resizeAndMergeCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleInputToNetInputs);
template void resizeAndMergeCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleInputToNetInputs);
}
......@@ -25,7 +25,7 @@ namespace op
}
template <typename T>
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T* scaleRatios,
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T* scaleInputToNetInputs,
const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
......@@ -38,8 +38,8 @@ namespace op
// targetPixel = -1000.f; // For fastMax
for (auto n = 0; n < num; n++)
{
const auto currentWidth = sourceWidth * scaleRatios[n];
const auto currentHeight = sourceHeight * scaleRatios[n];
const auto currentWidth = sourceWidth * scaleInputToNetInputs[n] / scaleInputToNetInputs[0];
const auto currentHeight = sourceHeight * scaleInputToNetInputs[n] / scaleInputToNetInputs[0];
const auto scaleWidth = targetWidth / currentWidth;
const auto scaleHeight = targetHeight / currentHeight;
......@@ -58,7 +58,7 @@ namespace op
template <typename T>
void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleInputToNetInputs)
{
try
{
......@@ -92,24 +92,24 @@ namespace op
// Multi-scale merging
else
{
// If scale_number > 1 --> scaleRatios must be set
if (scaleRatios.size() != num)
// If scale_number > 1 --> scaleInputToNetInputs must be set
if (scaleInputToNetInputs.size() != num)
error("The scale ratios size must be equal than the number of scales.", __LINE__, __FUNCTION__, __FILE__);
const auto maxScales = 10;
if (scaleRatios.size() > maxScales)
if (scaleInputToNetInputs.size() > maxScales)
error("The maximum number of scales is " + std::to_string(maxScales) + ".", __LINE__, __FUNCTION__, __FILE__);
// Copy scaleRatios
T* scaleRatiosGpuPtr;
cudaMalloc((void**)&scaleRatiosGpuPtr, maxScales * sizeof(T));
cudaMemcpy(scaleRatiosGpuPtr, scaleRatios.data(), scaleRatios.size() * sizeof(T), cudaMemcpyHostToDevice);
// Copy scaleInputToNetInputs
T* scaleInputToNetInputsPtr;
cudaMalloc((void**)&scaleInputToNetInputsPtr, maxScales * sizeof(T));
cudaMemcpy(scaleInputToNetInputsPtr, scaleInputToNetInputs.data(), scaleInputToNetInputs.size() * sizeof(T), cudaMemcpyHostToDevice);
// Perform resize + merging
const auto sourceNumOffset = channels * sourceChannelOffset;
for (auto c = 0 ; c < channels ; c++)
resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset,
sourcePtr + c * sourceChannelOffset, sourceNumOffset,
num, scaleRatiosGpuPtr, sourceWidth, sourceHeight, targetWidth, targetHeight);
num, scaleInputToNetInputsPtr, sourceWidth, sourceHeight, targetWidth, targetHeight);
// Free memory
cudaFree(scaleRatiosGpuPtr);
cudaFree(scaleInputToNetInputsPtr);
}
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
......@@ -121,7 +121,7 @@ namespace op
}
template void resizeAndMergeGpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleInputToNetInputs);
template void resizeAndMergeGpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleInputToNetInputs);
}
#ifdef USE_CAFFE
#include <caffe/blob.hpp>
#endif
#include <openpose/core/resizeAndMergeBase.hpp>
#include <openpose/utilities/fastMath.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
......@@ -9,17 +11,34 @@ namespace op
ResizeAndMergeCaffe<T>::ResizeAndMergeCaffe() :
mScaleRatios{1}
{
try
{
#ifndef USE_CAFFE
error("OpenPose must be compiled with the `USE_CAFFE` macro definition in order to use this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template <typename T>
void ResizeAndMergeCaffe<T>::LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void ResizeAndMergeCaffe<T>::LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 2", __LINE__, __FUNCTION__, __FILE__);
#ifdef USE_CAFFE
if (top.size() != 1)
error("top.size() != 1", __LINE__, __FUNCTION__, __FILE__);
if (bottom.size() != 1)
error("bottom.size() != 2", __LINE__, __FUNCTION__, __FILE__);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -28,24 +47,34 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const float factor, const bool mergeFirstDimension)
void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top, const float factor,
const bool mergeFirstDimension)
{
try
{
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
#ifdef USE_CAFFE
auto bottomBlob = bottom.at(0);
auto topBlob = top.at(0);
// Top shape
auto topShape = bottomBlob->shape();
topShape[0] = (mergeFirstDimension ? 1 : bottomBlob->shape(0));
topShape[2] = intRound(topShape[2] * factor);
topShape[3] = intRound(topShape[3] * factor);
topBlob->Reshape(topShape);
// Top shape
auto topShape = bottomBlob->shape();
topShape[0] = (mergeFirstDimension ? 1 : bottomBlob->shape(0));
topShape[2] = intRound(topShape[2] * factor);
topShape[3] = intRound(topShape[3] * factor);
topBlob->Reshape(topShape);
// Array sizes
mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1), topBlob->shape(2), topBlob->shape(3)};
mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1), bottomBlob->shape(2), bottomBlob->shape(3)};
// Array sizes
mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1), topBlob->shape(2),
topBlob->shape(3)};
mBottomSize = std::array<int, 4>{bottomBlob->shape(0), bottomBlob->shape(1),
bottomBlob->shape(2), bottomBlob->shape(3)};
#else
UNUSED(bottom);
UNUSED(top);
UNUSED(factor);
UNUSED(mergeFirstDimension);
#endif
}
catch (const std::exception& e)
{
......@@ -67,11 +96,18 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void ResizeAndMergeCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize, mScaleRatios);
#ifdef USE_CAFFE
resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize,
mScaleRatios);
#else
UNUSED(bottom);
UNUSED(top);
#endif
}
catch (const std::exception& e)
{
......@@ -80,11 +116,20 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
void ResizeAndMergeCaffe<T>::Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize, mScaleRatios);
#if defined USE_CAFFE && defined USE_CUDA
resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize,
mScaleRatios);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
......@@ -93,7 +138,8 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top,
const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
......@@ -101,7 +147,9 @@ namespace op
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
}
catch (const std::exception& e)
{
......@@ -118,7 +166,9 @@ namespace op
UNUSED(top);
UNUSED(propagate_down);
UNUSED(bottom);
NOT_IMPLEMENTED;
#ifdef USE_CAFFE
NOT_IMPLEMENTED;
#endif
}
catch (const std::exception& e)
{
......@@ -126,7 +176,5 @@ namespace op
}
}
INSTANTIATE_CLASS(ResizeAndMergeCaffe);
COMPILE_TEMPLATE_FLOATING_TYPES_CLASS(ResizeAndMergeCaffe);
}
#endif
#include <openpose/utilities/fastMath.hpp>
#include <openpose/utilities/openCv.hpp> // resizeGetScaleFactor
#include <openpose/core/scaleAndSizeExtractor.hpp>
namespace op
{
// Constructor: stores the multi-scale pyramid configuration and validates it.
// @param netInputResolution  Desired net input size; each positive dimension must be a multiple of 16.
// @param outputResolution    Desired output size; non-positive values mean "same as input" (see extract()).
// @param scaleNumber         Number of pyramid scales (>= 1).
// @param scaleGap            Scale decrement between consecutive pyramid levels (> 0).
// Throws (via error()) on any invalid argument.
ScaleAndSizeExtractor::ScaleAndSizeExtractor(const Point<int>& netInputResolution,
                                             const Point<int>& outputResolution, const int scaleNumber,
                                             const double scaleGap) :
    mNetInputResolution{netInputResolution},
    mOutputSize{outputResolution},
    mScaleNumber{scaleNumber},
    mScaleGap{scaleGap}
{
    try
    {
        // Security checks
        // Net resolution must align to 16-pixel multiples (CNN stride requirement); negative/zero
        // dimensions are allowed here (auto-sized elsewhere), hence the "> 0" pre-condition.
        if ((netInputResolution.x > 0 && netInputResolution.x % 16 != 0)
            || (netInputResolution.y > 0 && netInputResolution.y % 16 != 0))
            error("Net input resolution must be multiples of 16.", __LINE__, __FUNCTION__, __FILE__);
        if (scaleNumber < 1)
            error("There must be at least 1 scale.", __LINE__, __FUNCTION__, __FILE__);
        if (scaleGap <= 0.)
            error("The gap between scales must be strictly positive.", __LINE__, __FUNCTION__, __FILE__);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
std::tuple<std::vector<double>, std::vector<Point<int>>, double, Point<int>> ScaleAndSizeExtractor::extract(
const Point<int>& inputResolution) const
{
try
{
// Security checks
if (inputResolution.area() <= 0)
error("Wrong input element (empty cvInputData).", __LINE__, __FUNCTION__, __FILE__);
// scaleRatios & sizes - Reescale keeping aspect ratio
std::vector<double> scaleRatios(mScaleNumber, 1.f);
std::vector<Point<int>> sizes(mScaleNumber);
for (auto i = 0; i < mScaleNumber; i++)
{
const auto currentScale = 1. - i*mScaleGap;
if (currentScale < 0. || 1. < currentScale)
error("All scales must be in the range [0, 1], i.e. 0 <= 1-scale_number*scale_gap <= 1",
__LINE__, __FUNCTION__, __FILE__);
const auto targetWidth = fastTruncate(intRound(mNetInputResolution.x * currentScale) / 16 * 16, 1,
mNetInputResolution.x);
const auto targetHeight = fastTruncate(intRound(mNetInputResolution.y * currentScale) / 16 * 16, 1,
mNetInputResolution.y);
const Point<int> targetSize{targetWidth, targetHeight};
scaleRatios[i] = resizeGetScaleFactor(inputResolution, targetSize);
sizes[i] = mNetInputResolution;
}
// scaleInputToOutput - Scale between input and desired output size
Point<int> outputResolution;
double scaleInputToOutput;
// Output = mOutputSize3D size
if (mOutputSize.x > 0 && mOutputSize.y > 0)
{
outputResolution = mOutputSize;
scaleInputToOutput = resizeGetScaleFactor(inputResolution, outputResolution);
}
// Output = input size
else
{
outputResolution = inputResolution;
scaleInputToOutput = 1.;
}
// Return result
return std::make_tuple(scaleRatios, sizes, scaleInputToOutput, outputResolution);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
return std::make_tuple(std::vector<double>{}, std::vector<Point<int>>{}, 1., Point<int>{});
}
}
}
......@@ -3,6 +3,7 @@ set(SOURCES
faceDetector.cpp
faceDetectorOpenCV.cpp
faceExtractor.cpp
faceExtractorCaffe.cpp
faceCpuRenderer.cpp
faceGpuRenderer.cpp
renderFace.cpp
......
......@@ -108,7 +108,7 @@ namespace op
}
}
std::vector<Rectangle<float>> FaceDetector::detectFaces(const Array<float>& poseKeypoints, const float scaleInputToOutput) const
std::vector<Rectangle<float>> FaceDetector::detectFaces(const Array<float>& poseKeypoints, const double scaleInputToOutput) const
{
try
{
......@@ -119,7 +119,7 @@ namespace op
// Otherwise, get face position(s)
if (!poseKeypoints.empty())
for (auto person = 0 ; person < numberPeople ; person++)
faceRectangles.at(person) = getFaceFromPoseKeypoints(poseKeypoints, person, mNeck, mNose, mLEar, mREar, mLEye, mREye, threshold) / scaleInputToOutput;
faceRectangles.at(person) = getFaceFromPoseKeypoints(poseKeypoints, person, mNeck, mNose, mLEar, mREar, mLEye, mREye, threshold) / (float)scaleInputToOutput;
return faceRectangles;
}
catch (const std::exception& e)
......
#include <opencv2/opencv.hpp> // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR
#include <openpose/core/netCaffe.hpp>
#include <openpose/face/faceParameters.hpp>
#include <openpose/utilities/check.hpp>
#include <openpose/utilities/cuda.hpp>
#include <openpose/utilities/fastMath.hpp>
#include <openpose/utilities/openCv.hpp>
#include <openpose/face/faceExtractor.hpp>
namespace op
{
// Copies the face heat maps for `person` from GPU memory into the CPU-side `heatMaps` array
// and rescales the values according to `heatMapScaleMode`.
// @param heatMaps          Destination array; the slice for `person` is overwritten.
// @param person            Index of the person whose heat maps are copied.
// @param heatMapScaleMode  PlusMinusOne -> [-1,1]; UnsignedChar -> [0,255]; otherwise clamp to [0,1].
// @param heatMapsGpuPtr    Source GPU pointer holding FACE_NUMBER_PARTS channels for this person.
// Fix: removed the dead local `totalOffset`, which was accumulated but never read.
void updateFaceHeatMapsForPerson(Array<float>& heatMaps, const int person, const ScaleMode heatMapScaleMode,
                                 const float* heatMapsGpuPtr)
{
    try
    {
        // Per-channel volume and total volume of one person's face heat maps
        const auto channelOffset = heatMaps.getVolume(2, 3);
        const auto volumeBodyParts = FACE_NUMBER_PARTS * channelOffset;
        auto* heatMapsPtr = &heatMaps.getPtr()[person*volumeBodyParts];
        // Copy face parts from GPU to CPU memory
        cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
        // Change from [0,1] to [-1,1]
        if (heatMapScaleMode == ScaleMode::PlusMinusOne)
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]) * 2.f - 1.f;
        // [0, 255]
        else if (heatMapScaleMode == ScaleMode::UnsignedChar)
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = (float)intRound(fastTruncate(heatMapsPtr[i]) * 255.f);
        // Avoid values outside original range
        else
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
FaceExtractor::FaceExtractor(const Point<int>& netInputSize, const Point<int>& netOutputSize,
const std::string& modelFolder, const int gpuId,
const std::vector<HeatMapType>& heatMapTypes, const ScaleMode heatMapScale) :
mNetOutputSize{netOutputSize},
spNet{std::make_shared<NetCaffe>(std::array<int,4>{1, 3, mNetOutputSize.y, mNetOutputSize.x},
modelFolder + FACE_PROTOTXT, modelFolder + FACE_TRAINED_MODEL, gpuId)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
spMaximumCaffe{std::make_shared<MaximumCaffe<float>>()},
mFaceImageCrop{mNetOutputSize.area()*3},
mHeatMapScaleMode{heatMapScale},
mHeatMapTypes{heatMapTypes}
......@@ -77,153 +34,18 @@ namespace op
}
}
void FaceExtractor::initializationOnThread()
FaceExtractor::~FaceExtractor()
{
try
{
// Logging
log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Get thread id
mThreadId = {std::this_thread::get_id()};
// Caffe net
spNet->initializationOnThread();
spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob();
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// HeatMaps extractor blob and layer
spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
const bool mergeFirstDimension = true;
spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()},
FACE_CCN_DECREASE_FACTOR, mergeFirstDimension);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Pose extractor blob and layer
spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
spMaximumCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Logging
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
void FaceExtractor::forwardPass(const std::vector<Rectangle<float>>& faceRectangles, const cv::Mat& cvInputData,
const float scaleInputToOutput)
void FaceExtractor::initializationOnThread()
{
try
{
if (!faceRectangles.empty())
{
// Security checks
if (cvInputData.empty())
error("Empty cvInputData.", __LINE__, __FUNCTION__, __FILE__);
// Fix parameters
const auto netInputSide = fastMin(mNetOutputSize.x, mNetOutputSize.y);
// Set face size
const auto numberPeople = (int)faceRectangles.size();
mFaceKeypoints.reset({numberPeople, (int)FACE_NUMBER_PARTS, 3}, 0);
// HeatMaps: define size
if (!mHeatMapTypes.empty())
mHeatMaps.reset({numberPeople, (int)FACE_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x});
// // Debugging
// cv::Mat cvInputDataCopy = cvInputData.clone();
// Extract face keypoints for each person
for (auto person = 0 ; person < numberPeople ; person++)
{
const auto& faceRectangle = faceRectangles.at(person);
// Only consider faces with a minimum pixel area
const auto minFaceSize = fastMin(faceRectangle.width, faceRectangle.height);
// // Debugging -> red rectangle
// log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
// cv::rectangle(cvInputDataCopy,
// cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
// cv::Point{(int)faceRectangle.bottomRight().x, (int)faceRectangle.bottomRight().y},
// cv::Scalar{0,0,255}, 2);
// Get parts
if (minFaceSize > 40)
{
// // Debugging -> green rectangle overwriting red one
// log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
// cv::rectangle(cvInputDataCopy,
// cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
// cv::Point{(int)faceRectangle.bottomRight().x,
// (int)faceRectangle.bottomRight().y},
// cv::Scalar{0,255,0}, 2);
// Resize and shift image to face rectangle positions
const auto faceSize = fastMax(faceRectangle.width, faceRectangle.height);
const double scaleFace = faceSize / (double)netInputSide;
cv::Mat Mscaling = cv::Mat::eye(2, 3, CV_64F);
Mscaling.at<double>(0,0) = scaleFace;
Mscaling.at<double>(1,1) = scaleFace;
Mscaling.at<double>(0,2) = faceRectangle.x;
Mscaling.at<double>(1,2) = faceRectangle.y;
cv::Mat faceImage;
cv::warpAffine(cvInputData, faceImage, Mscaling, cv::Size{mNetOutputSize.x, mNetOutputSize.y},
CV_INTER_LINEAR | CV_WARP_INVERSE_MAP, cv::BORDER_CONSTANT, cv::Scalar(0,0,0));
// cv::Mat -> float*
uCharCvMatToFloatPtr(mFaceImageCrop.getPtr(), faceImage, true);
// // Debugging
// if (person < 5)
// cv::imshow("faceImage" + std::to_string(person), faceImage);
// 1. Caffe deep network
auto* inputDataGpuPtr = spNet->getInputDataGpuPtr();
cudaMemcpy(inputDataGpuPtr, mFaceImageCrop.getPtr(), mNetOutputSize.area() * 3 * sizeof(float),
cudaMemcpyHostToDevice);
spNet->forwardPass();
// 2. Resize heat maps + merge different scales
#ifndef CPU_ONLY
spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#else
spResizeAndMergeCaffe->Forward_cpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()});
#endif
// 3. Get peaks by Non-Maximum Suppression
#ifndef CPU_ONLY
spMaximumCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#else
spMaximumCaffe->Forward_cpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
#endif
const auto* facePeaksPtr = spPeaksBlob->mutable_cpu_data();
for (auto part = 0 ; part < mFaceKeypoints.getSize(1) ; part++)
{
const auto xyIndex = part * mFaceKeypoints.getSize(2);
const auto x = facePeaksPtr[xyIndex];
const auto y = facePeaksPtr[xyIndex + 1];
const auto score = facePeaksPtr[xyIndex + 2];
const auto baseIndex = mFaceKeypoints.getSize(2)
* (part + person * mFaceKeypoints.getSize(1));
mFaceKeypoints[baseIndex] = (float)(scaleInputToOutput * (Mscaling.at<double>(0,0) * x
+ Mscaling.at<double>(0,1) * y
+ Mscaling.at<double>(0,2)));
mFaceKeypoints[baseIndex+1] = (float)(scaleInputToOutput * (Mscaling.at<double>(1,0) * x
+ Mscaling.at<double>(1,1) * y
+ Mscaling.at<double>(1,2)));
mFaceKeypoints[baseIndex+2] = score;
}
// HeatMaps: storing
if (!mHeatMapTypes.empty())
updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode,
spHeatMapsBlob->gpu_data());
}
}
// // Debugging
// cv::imshow("AcvInputDataCopy", cvInputDataCopy);
}
else
mFaceKeypoints.reset();
// Get thread id
mThreadId = {std::this_thread::get_id()};
// Deep net initialization
netInitializationOnThread();
}
catch (const std::exception& e)
{
......
#if defined USE_CAFFE && defined USE_CUDA
#include <caffe/blob.hpp>
#endif
#include <opencv2/opencv.hpp> // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR
#include <openpose/core/maximumCaffe.hpp>
#include <openpose/core/netCaffe.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/face/faceParameters.hpp>
#include <openpose/utilities/cuda.hpp>
#include <openpose/utilities/fastMath.hpp>
#include <openpose/utilities/openCv.hpp>
#include <openpose/face/faceExtractorCaffe.hpp>
namespace op
{
// PIMPL struct holding the Caffe/CUDA-only state of FaceExtractorCaffe, so that the public
// header stays free of Caffe/CUDA dependencies. Empty when compiled without USE_CAFFE & USE_CUDA.
struct FaceExtractorCaffe::ImplFaceExtractorCaffe
{
    #if defined USE_CAFFE && defined USE_CUDA
        // Face detection network and the post-processing layers applied to its output
        std::shared_ptr<NetCaffe> spNetCaffe;
        std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
        std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
        // Init with thread
        // Blobs wired together on the worker thread (see netInitializationOnThread);
        // spCaffeNetOutputBlob is boost::shared_ptr because Caffe's API returns that type.
        boost::shared_ptr<caffe::Blob<float>> spCaffeNetOutputBlob;
        std::shared_ptr<caffe::Blob<float>> spHeatMapsBlob;
        std::shared_ptr<caffe::Blob<float>> spPeaksBlob;

        // Builds the Caffe net (1 x 3 x H x W input) from the face prototxt/caffemodel in
        // `modelFolder`, bound to GPU `gpuId`, plus the resize-merge and maximum layers.
        ImplFaceExtractorCaffe(const Point<int>& netOutputSize,
                               const std::string& modelFolder, const int gpuId) :
            spNetCaffe{std::make_shared<NetCaffe>(std::array<int,4>{1, 3, netOutputSize.y, netOutputSize.x},
                                                  modelFolder + FACE_PROTOTXT, modelFolder + FACE_TRAINED_MODEL,
                                                  gpuId)},
            spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
            spMaximumCaffe{std::make_shared<MaximumCaffe<float>>()}
        {
        }
    #endif
};
#if defined USE_CAFFE && defined USE_CUDA
    // Copies the face heat maps for `person` from GPU memory into the CPU-side `heatMaps` array
    // and rescales the values according to `heatMapScaleMode`.
    // @param heatMaps          Destination array; the slice for `person` is overwritten.
    // @param person            Index of the person whose heat maps are copied.
    // @param heatMapScaleMode  PlusMinusOne -> [-1,1]; UnsignedChar -> [0,255]; otherwise clamp to [0,1].
    // @param heatMapsGpuPtr    Source GPU pointer holding FACE_NUMBER_PARTS channels for this person.
    // Fix: removed the dead local `totalOffset`, which was accumulated but never read.
    void updateFaceHeatMapsForPerson(Array<float>& heatMaps, const int person, const ScaleMode heatMapScaleMode,
                                     const float* heatMapsGpuPtr)
    {
        try
        {
            // Per-channel volume and total volume of one person's face heat maps
            const auto channelOffset = heatMaps.getVolume(2, 3);
            const auto volumeBodyParts = FACE_NUMBER_PARTS * channelOffset;
            auto* heatMapsPtr = &heatMaps.getPtr()[person*volumeBodyParts];
            // Copy face parts from GPU to CPU memory
            cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
            // Change from [0,1] to [-1,1]
            if (heatMapScaleMode == ScaleMode::PlusMinusOne)
                for (auto i = 0u ; i < volumeBodyParts ; i++)
                    heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]) * 2.f - 1.f;
            // [0, 255]
            else if (heatMapScaleMode == ScaleMode::UnsignedChar)
                for (auto i = 0u ; i < volumeBodyParts ; i++)
                    heatMapsPtr[i] = (float)intRound(fastTruncate(heatMapsPtr[i]) * 255.f);
            // Avoid values outside original range
            else
                for (auto i = 0u ; i < volumeBodyParts ; i++)
                    heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }
#endif
// Constructor: forwards the common configuration to the FaceExtractor base and, when Caffe+CUDA
// are available, allocates the PIMPL holding the Caffe net and post-processing layers.
// When compiled without USE_CAFFE & USE_CUDA, construction fails with an explanatory error.
// @param netInputSize  Net input resolution (forwarded to base).
// @param netOutputSize Net output resolution; also used to size the Caffe net input.
// @param modelFolder   Folder containing the face prototxt and caffemodel files.
// @param gpuId         GPU device the net is bound to.
// @param heatMapTypes  Heat maps to keep per person (empty -> none stored).
// @param heatMapScale  Output range for stored heat maps (see updateFaceHeatMapsForPerson).
FaceExtractorCaffe::FaceExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize,
                                       const std::string& modelFolder, const int gpuId,
                                       const std::vector<HeatMapType>& heatMapTypes,
                                       const ScaleMode heatMapScale) :
    FaceExtractor{netInputSize, netOutputSize, heatMapTypes, heatMapScale}
    // upImpl only exists in Caffe+CUDA builds; mNetOutputSize is set by the base ctor above
    #if defined USE_CAFFE && defined USE_CUDA
        , upImpl{new ImplFaceExtractorCaffe{mNetOutputSize, modelFolder, gpuId}}
    #endif
{
    try
    {
        #if !defined USE_CAFFE || !defined USE_CUDA
            // Silence unused-parameter warnings in builds without Caffe/CUDA support
            UNUSED(netInputSize);
            UNUSED(netOutputSize);
            UNUSED(modelFolder);
            UNUSED(gpuId);
            UNUSED(heatMapTypes);
            UNUSED(heatMapScale);
            error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
                  " this functionality.", __LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Empty out-of-line destructor; presumably required so the smart-pointer PIMPL member (upImpl)
// is destroyed in this translation unit, where ImplFaceExtractorCaffe is complete — TODO confirm
// against the header's declaration of upImpl.
FaceExtractorCaffe::~FaceExtractorCaffe()
{
}
// Per-thread deep-net initialization (called from the worker thread): loads the Caffe net and
// wires the post-processing pipeline: net output blob -> resize/merge -> heat maps blob ->
// maximum layer -> peaks blob. No-op in builds without USE_CAFFE & USE_CUDA.
void FaceExtractorCaffe::netInitializationOnThread()
{
    try
    {
        #if defined USE_CAFFE && defined USE_CUDA
            // Logging
            log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
            // Caffe net
            upImpl->spNetCaffe->initializationOnThread();
            upImpl->spCaffeNetOutputBlob = upImpl->spNetCaffe->getOutputBlob();
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            // HeatMaps extractor blob and layer
            // Blob starts at minimal size (1,1,1,1); Reshape resizes it from the net output shape
            upImpl->spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
            const bool mergeFirstDimension = true;
            upImpl->spResizeAndMergeCaffe->Reshape({upImpl->spCaffeNetOutputBlob.get()},
                                                   {upImpl->spHeatMapsBlob.get()},
                                                   FACE_CCN_DECREASE_FACTOR, mergeFirstDimension);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            // Pose extractor blob and layer
            upImpl->spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
            upImpl->spMaximumCaffe->Reshape({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            // Logging
            log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Extracts face keypoints for every detected face rectangle in the input image.
// Pipeline per person: warp the face region to the net input size -> Caffe forward pass ->
// resize/merge heat maps -> maximum layer (peak extraction) -> map peaks back to output-image
// coordinates via the inverse warp and scaleInputToOutput. Results go to mFaceKeypoints
// (numberPeople x FACE_NUMBER_PARTS x 3: x, y, score) and, if requested, mHeatMaps.
// @param faceRectangles   One rectangle per person; assumed in input-image coordinates — TODO confirm.
// @param cvInputData      Input image (must be non-empty if faceRectangles is not).
// @param scaleInputToOutput Scale applied to map keypoints from input to output resolution.
// Requires compilation with USE_CAFFE & USE_CUDA; otherwise this function is a no-op.
void FaceExtractorCaffe::forwardPass(const std::vector<Rectangle<float>>& faceRectangles,
                                     const cv::Mat& cvInputData,
                                     const double scaleInputToOutput)
{
    try
    {
        #if defined USE_CAFFE && defined USE_CUDA
            if (!faceRectangles.empty())
            {
                // Security checks
                if (cvInputData.empty())
                    error("Empty cvInputData.", __LINE__, __FUNCTION__, __FILE__);
                // Fix parameters
                const auto netInputSide = fastMin(mNetOutputSize.x, mNetOutputSize.y);
                // Set face size
                const auto numberPeople = (int)faceRectangles.size();
                mFaceKeypoints.reset({numberPeople, (int)FACE_NUMBER_PARTS, 3}, 0);
                // HeatMaps: define size
                if (!mHeatMapTypes.empty())
                    mHeatMaps.reset({numberPeople, (int)FACE_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x});
                // // Debugging
                // cv::Mat cvInputDataCopy = cvInputData.clone();
                // Extract face keypoints for each person
                for (auto person = 0 ; person < numberPeople ; person++)
                {
                    const auto& faceRectangle = faceRectangles.at(person);
                    // Only consider faces with a minimum pixel area
                    const auto minFaceSize = fastMin(faceRectangle.width, faceRectangle.height);
                    // // Debugging -> red rectangle
                    // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
                    // cv::rectangle(cvInputDataCopy,
                    //               cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
                    //               cv::Point{(int)faceRectangle.bottomRight().x,
                    //                         (int)faceRectangle.bottomRight().y},
                    //               cv::Scalar{0,0,255}, 2);
                    // Get parts
                    // Faces whose shorter rectangle side is <= 40 px are skipped (too small to be reliable)
                    if (minFaceSize > 40)
                    {
                        // // Debugging -> green rectangle overwriting red one
                        // log(std::to_string(cvInputData.cols) + " " + std::to_string(cvInputData.rows));
                        // cv::rectangle(cvInputDataCopy,
                        //               cv::Point{(int)faceRectangle.x, (int)faceRectangle.y},
                        //               cv::Point{(int)faceRectangle.bottomRight().x,
                        //                         (int)faceRectangle.bottomRight().y},
                        //               cv::Scalar{0,255,0}, 2);
                        // Resize and shift image to face rectangle positions
                        // Affine matrix Mscaling maps net-input coordinates back to input-image
                        // coordinates; warpAffine is called with CV_WARP_INVERSE_MAP, so it crops
                        // and scales the face region into a mNetOutputSize image.
                        const auto faceSize = fastMax(faceRectangle.width, faceRectangle.height);
                        const double scaleFace = faceSize / (double)netInputSide;
                        cv::Mat Mscaling = cv::Mat::eye(2, 3, CV_64F);
                        Mscaling.at<double>(0,0) = scaleFace;
                        Mscaling.at<double>(1,1) = scaleFace;
                        Mscaling.at<double>(0,2) = faceRectangle.x;
                        Mscaling.at<double>(1,2) = faceRectangle.y;

                        cv::Mat faceImage;
                        cv::warpAffine(cvInputData, faceImage, Mscaling,
                                       cv::Size{mNetOutputSize.x, mNetOutputSize.y},
                                       CV_INTER_LINEAR | CV_WARP_INVERSE_MAP,
                                       cv::BORDER_CONSTANT, cv::Scalar(0,0,0));

                        // cv::Mat -> float*
                        uCharCvMatToFloatPtr(mFaceImageCrop.getPtr(), faceImage, true);

                        // // Debugging
                        // if (person < 5)
                        // cv::imshow("faceImage" + std::to_string(person), faceImage);

                        // 1. Caffe deep network
                        // Upload the prepared crop to the net's GPU input buffer and run inference
                        auto* inputDataGpuPtr = upImpl->spNetCaffe->getInputDataGpuPtr();
                        cudaMemcpy(inputDataGpuPtr, mFaceImageCrop.getPtr(),
                                   mNetOutputSize.area() * 3 * sizeof(float),
                                   cudaMemcpyHostToDevice);
                        upImpl->spNetCaffe->forwardPass();

                        // 2. Resize heat maps + merge different scales
                        #ifdef USE_CUDA
                            upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
                                                                       {upImpl->spHeatMapsBlob.get()});
                            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                        #else
                            upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
                                                                       {upImpl->spHeatMapsBlob.get()});
                        #endif

                        // 3. Get peaks by Non-Maximum Suppression
                        #ifdef USE_CUDA
                            upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()},
                                                                {upImpl->spPeaksBlob.get()});
                            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                        #else
                            upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()},
                                                                {upImpl->spPeaksBlob.get()});
                        #endif

                        // Map each peak (net coordinates) to output-image coordinates via Mscaling
                        // and scaleInputToOutput; score is copied through unchanged
                        const auto* facePeaksPtr = upImpl->spPeaksBlob->mutable_cpu_data();
                        for (auto part = 0 ; part < mFaceKeypoints.getSize(1) ; part++)
                        {
                            const auto xyIndex = part * mFaceKeypoints.getSize(2);
                            const auto x = facePeaksPtr[xyIndex];
                            const auto y = facePeaksPtr[xyIndex + 1];
                            const auto score = facePeaksPtr[xyIndex + 2];
                            const auto baseIndex = mFaceKeypoints.getSize(2)
                                                 * (part + person * mFaceKeypoints.getSize(1));
                            mFaceKeypoints[baseIndex] = (float)(scaleInputToOutput
                                                                * (Mscaling.at<double>(0,0) * x
                                                                   + Mscaling.at<double>(0,1) * y
                                                                   + Mscaling.at<double>(0,2)));
                            mFaceKeypoints[baseIndex+1] = (float)(scaleInputToOutput
                                                                  * (Mscaling.at<double>(1,0) * x
                                                                     + Mscaling.at<double>(1,1) * y
                                                                     + Mscaling.at<double>(1,2)));
                            mFaceKeypoints[baseIndex+2] = score;
                        }
                        // HeatMaps: storing
                        if (!mHeatMapTypes.empty())
                            updateFaceHeatMapsForPerson(mHeatMaps, person, mHeatMapScaleMode,
                                                        upImpl->spHeatMapsBlob->gpu_data());
                    }
                }
                // // Debugging
                // cv::imshow("AcvInputDataCopy", cvInputDataCopy);
            }
            else
                mFaceKeypoints.reset();
        #else
            UNUSED(faceRectangles);
            UNUSED(cvInputData);
            UNUSED(scaleInputToOutput);
        #endif
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
}
#ifndef CPU_ONLY
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#endif
......@@ -19,7 +19,7 @@ namespace op
try
{
// Free CUDA pointers - Note that if pointers are 0 (i.e. nullptr), no operation is performed.
#ifndef CPU_ONLY
#ifdef USE_CUDA
cudaFree(pGpuFace);
#endif
}
......@@ -35,7 +35,7 @@ namespace op
{
log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// GPU memory allocation for rendering
#ifndef CPU_ONLY
#ifdef USE_CUDA
cudaMalloc((void**)(&pGpuFace), POSE_MAX_PEOPLE * FACE_NUMBER_PARTS * 3 * sizeof(float));
#endif
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
......@@ -54,7 +54,7 @@ namespace op
if (outputData.empty())
error("Empty Array<float> outputData.", __LINE__, __FUNCTION__, __FILE__);
// GPU rendering
#ifndef CPU_ONLY
#ifdef USE_CUDA
const auto elementRendered = spElementToRender->load(); // I prefer std::round(T&) over intRound(T) for std::atomic
const auto numberPeople = faceKeypoints.getSize(0);
const Point<int> frameSize{outputData.getSize(2), outputData.getSize(1)};
......@@ -73,11 +73,11 @@ namespace op
// GPU memory to CPU if last renderer
gpuToCpuMemoryIfLastRenderer(outputData.getPtr(), outputData.getVolume());
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// CPU_ONLY mode
#else
error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
UNUSED(outputData);
UNUSED(faceKeypoints);
error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
......
......@@ -6,13 +6,12 @@
namespace op
{
const dim3 THREADS_PER_BLOCK{128, 128, 1};
__constant__ const unsigned int PART_PAIRS_GPU[] = FACE_PAIRS_RENDER_GPU;
__constant__ const float COLORS[] = {FACE_COLORS_RENDER_GPU};
__global__ void renderFaceParts(float* targetPtr, const int targetWidth, const int targetHeight,
const float* const facePtr, const int numberPeople, const float threshold,
const float alphaColorToAdd)
const float* const facePtr, const int numberPeople,
const float threshold, const float alphaColorToAdd)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
......@@ -43,10 +42,11 @@ namespace op
{
if (numberPeople > 0)
{
const auto numBlocks = getNumberCudaBlocks(frameSize, THREADS_PER_BLOCK);
renderFaceParts<<<THREADS_PER_BLOCK, numBlocks>>>(framePtr, frameSize.x, frameSize.y, facePtr,
numberPeople, renderThreshold,
alphaColorToAdd);
dim3 threadsPerBlock;
dim3 numBlocks;
getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, facePtr,
numberPeople, renderThreshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
}
......
......@@ -58,7 +58,7 @@ namespace op
{
}
void GuiInfoAdder::addInfo(cv::Mat& cvOutputData, const Array<float>& poseKeypoints, const unsigned long long id,
void GuiInfoAdder::addInfo(cv::Mat& cvOutputData, const int numberPeople, const unsigned long long id,
const std::string& elementRenderedName)
{
try
......@@ -101,8 +101,9 @@ namespace op
putTextOnCvMat(cvOutputData, "Frame: " + std::to_string(id),
{borderMargin, (int)(cvOutputData.rows - borderMargin)}, white, false);
// Number people
putTextOnCvMat(cvOutputData, "People: " + std::to_string(poseKeypoints.getSize(0)),
{(int)(cvOutputData.cols - borderMargin), (int)(cvOutputData.rows - borderMargin)}, white, true);
putTextOnCvMat(cvOutputData, "People: " + std::to_string(numberPeople),
{(int)(cvOutputData.cols - borderMargin), (int)(cvOutputData.rows - borderMargin)},
white, true);
}
catch (const std::exception& e)
{
......
......@@ -3,6 +3,7 @@ set(SOURCES
handDetector.cpp
handDetectorFromTxt.cpp
handExtractor.cpp
handExtractorCaffe.cpp
handCpuRenderer.cpp
handGpuRenderer.cpp
renderHand.cpp
......
......@@ -130,7 +130,7 @@ namespace op
{
}
std::vector<std::array<Rectangle<float>, 2>> HandDetector::detectHands(const Array<float>& poseKeypoints, const float scaleInputToOutput) const
std::vector<std::array<Rectangle<float>, 2>> HandDetector::detectHands(const Array<float>& poseKeypoints, const double scaleInputToOutput) const
{
try
{
......@@ -148,8 +148,8 @@ namespace op
mPoseIndexes[(int)PosePart::LShoulder], mPoseIndexes[(int)PosePart::RWrist],
mPoseIndexes[(int)PosePart::RElbow], mPoseIndexes[(int)PosePart::RShoulder], threshold
);
handRectangles.at(person).at(0) /= scaleInputToOutput;
handRectangles.at(person).at(1) /= scaleInputToOutput;
handRectangles.at(person).at(0) /= (float) scaleInputToOutput;
handRectangles.at(person).at(1) /= (float) scaleInputToOutput;
}
}
return handRectangles;
......@@ -161,7 +161,7 @@ namespace op
}
}
std::vector<std::array<Rectangle<float>, 2>> HandDetector::trackHands(const Array<float>& poseKeypoints, const float scaleInputToOutput)
std::vector<std::array<Rectangle<float>, 2>> HandDetector::trackHands(const Array<float>& poseKeypoints, const double scaleInputToOutput)
{
try
{
......
#include <limits> // std::numeric_limits
#include <opencv2/opencv.hpp> // CV_WARP_INVERSE_MAP, CV_INTER_LINEAR
#include <openpose/core/netCaffe.hpp>
#include <openpose/hand/handParameters.hpp>
#include <openpose/utilities/check.hpp>
#include <openpose/utilities/cuda.hpp>
#include <openpose/utilities/fastMath.hpp>
#include <openpose/utilities/keypoint.hpp>
#include <openpose/utilities/openCv.hpp>
#include <openpose/hand/handExtractor.hpp>
namespace op
{
// Warps the input frame into the hand-network input crop and converts it to normalized floats.
// The 2x3 affine transform used for cropping is returned through `affineMatrix` so that the
// keypoints detected on the crop can later be mapped back into original-image coordinates.
void cropFrame(Array<float>& handImageCrop, cv::Mat& affineMatrix, const cv::Mat& cvInputData,
               const Rectangle<float>& handRectangle, const int netInputSide,
               const Point<int>& netOutputSize, const bool mirrorImage)
{
    try
    {
        // Scale from the net input side to the hand rectangle size
        const auto scale = handRectangle.width / (float)netInputSide;
        affineMatrix = cv::Mat::eye(2,3,CV_64F);
        // Mirroring negates the horizontal scale and shifts the origin by the rectangle width
        affineMatrix.at<double>(0,0) = (mirrorImage ? -scale : scale);
        affineMatrix.at<double>(1,1) = scale;
        affineMatrix.at<double>(0,2) = (mirrorImage ? handRectangle.x + handRectangle.width
                                                    : handRectangle.x);
        affineMatrix.at<double>(1,2) = handRectangle.y;
        // Crop the frame (inverse mapping, linear interpolation, black padding)
        cv::Mat handImage;
        cv::warpAffine(cvInputData, handImage, affineMatrix, cv::Size{netOutputSize.x, netOutputSize.y},
                       CV_INTER_LINEAR | CV_WARP_INVERSE_MAP, cv::BORDER_CONSTANT, cv::Scalar{0,0,0});
        // cv::Mat -> float*
        uCharCvMatToFloatPtr(handImageCrop.getPtr(), handImage, true);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Maps the peaks detected on the cropped hand image back to output-frame coordinates and
// stores them as (x, y, score) triplets per part into `handCurrent` for the given person.
void connectKeypoints(Array<float>& handCurrent, const float scaleInputToOutput, const int person,
                      const cv::Mat& affineMatrix, const float* handPeaks)
{
    try
    {
        const auto numberParts = handCurrent.getSize(1);
        const auto xyzChannels = handCurrent.getSize(2);
        for (auto part = 0 ; part < numberParts ; part++)
        {
            // Peak triplet for this part: (x, y, score) on the cropped image
            const auto* peak = &handPeaks[part * xyzChannels];
            const auto x = peak[0];
            const auto y = peak[1];
            const auto score = peak[2];
            // Undo the crop affine transform, then rescale from net input to output resolution
            const auto baseIndex = xyzChannels * (person * numberParts + part);
            handCurrent[baseIndex]   = (float)(scaleInputToOutput
                                               * (affineMatrix.at<double>(0,0)*x
                                                  + affineMatrix.at<double>(0,1)*y
                                                  + affineMatrix.at<double>(0,2)));
            handCurrent[baseIndex+1] = (float)(scaleInputToOutput
                                               * (affineMatrix.at<double>(1,0)*x
                                                  + affineMatrix.at<double>(1,1)*y
                                                  + affineMatrix.at<double>(1,2)));
            handCurrent[baseIndex+2] = score;
        }
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
// Builds a squared hand bounding box from the detected keypoints. The box is re-centered on
// the keypoints and enlarged by `increaseRatio`; when a valid rectangle from the previous
// frame is provided, the side length is smoothed against 85% of the previous one instead.
Rectangle<float> getHandRectangle(Array<float>& handCurrent, const int person, const float increaseRatio,
                                  const int handNumberParts, const float thresholdRectangle,
                                  const Rectangle<float>& previousHandRectangle = Rectangle<float>{})
{
    try
    {
        // Tight box around the keypoints scoring above thresholdRectangle
        auto handRectangle = getKeypointsRectangle(handCurrent, person, handNumberParts, thresholdRectangle);
        // Side of the square: largest dimension scaled up by increaseRatio
        auto sideLength = increaseRatio * fastMax(handRectangle.width, handRectangle.height);
        // Temporal smoothing w.r.t. the previous frame rectangle (if any)
        const auto previousIsValid = (previousHandRectangle.width > 0 && previousHandRectangle.height > 0);
        if (previousIsValid)
            sideLength = fastMax(handRectangle.width,
                                 0.85f * fastMax(previousHandRectangle.width, previousHandRectangle.height));
        // Re-center into a square of that side
        handRectangle.recenter(sideLength, sideLength);
        return handRectangle;
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        return Rectangle<float>{};
    }
}
// Copies this person's hand heat maps from GPU memory into the CPU `heatMaps` array and
// rescales the values according to `heatMapScaleMode`.
// @param heatMaps Destination array; this person's slice starts at person * HAND_NUMBER_PARTS
//        * heatMaps.getVolume(2, 3).
// @param person Person index whose slice of heatMaps is written.
// @param heatMapScaleMode Target range: PlusMinusOne -> [-1,1], UnsignedChar -> rounded
//        [0,255], otherwise values are only truncated (fastTruncate).
// @param heatMapsGpuPtr Device pointer holding HAND_NUMBER_PARTS heat maps for this person.
void updateHandHeatMapsForPerson(Array<float>& heatMaps, const int person, const ScaleMode heatMapScaleMode,
                                 const float* heatMapsGpuPtr)
{
    try
    {
        // Size of one person's slice (HAND_NUMBER_PARTS maps of width*height each)
        const auto channelOffset = heatMaps.getVolume(2, 3);
        const auto volumeBodyParts = HAND_NUMBER_PARTS * channelOffset;
        auto* heatMapsPtr = &heatMaps.getPtr()[person*volumeBodyParts];
        // Copy hand parts from device to host
        // (fix: removed the dead `totalOffset` accumulator the old code updated but never read)
        cudaMemcpy(heatMapsPtr, heatMapsGpuPtr, volumeBodyParts * sizeof(float), cudaMemcpyDeviceToHost);
        // Change from [0,1] to [-1,1]
        if (heatMapScaleMode == ScaleMode::PlusMinusOne)
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]) * 2.f - 1.f;
        // [0, 255]
        else if (heatMapScaleMode == ScaleMode::UnsignedChar)
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = (float)intRound(fastTruncate(heatMapsPtr[i]) * 255.f);
        // Avoid values outside original range
        else
            for (auto i = 0u ; i < volumeBodyParts ; i++)
                heatMapsPtr[i] = fastTruncate(heatMapsPtr[i]);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
HandExtractor::HandExtractor(const Point<int>& netInputSize, const Point<int>& netOutputSize,
const std::string& modelFolder, const int gpuId, const unsigned short numberScales,
const float rangeScales, const std::vector<HeatMapType>& heatMapTypes,
const ScaleMode heatMapScale) :
const unsigned short numberScales, const float rangeScales,
const std::vector<HeatMapType>& heatMapTypes, const ScaleMode heatMapScale) :
mMultiScaleNumberAndRange{std::make_pair(numberScales, rangeScales)},
mNetOutputSize{netOutputSize},
spNet{std::make_shared<NetCaffe>(std::array<int,4>{1, 3, mNetOutputSize.y, mNetOutputSize.x},
modelFolder + HAND_PROTOTXT, modelFolder + HAND_TRAINED_MODEL, gpuId)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
spMaximumCaffe{std::make_shared<MaximumCaffe<float>>()},
mHandImageCrop{mNetOutputSize.area()*3},
mHeatMapScaleMode{heatMapScale},
mHeatMapTypes{heatMapTypes}
......@@ -165,161 +36,18 @@ namespace op
}
}
void HandExtractor::initializationOnThread()
HandExtractor::~HandExtractor()
{
try
{
// Logging
log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// Get thread id
mThreadId = {std::this_thread::get_id()};
// Caffe net
spNet->initializationOnThread();
spCaffeNetOutputBlob = ((NetCaffe*)spNet.get())->getOutputBlob();
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// HeatMaps extractor blob and layer
spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
const bool mergeFirstDimension = true;
spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()},
HAND_CCN_DECREASE_FACTOR, mergeFirstDimension);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Pose extractor blob and layer
spPeaksBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
spMaximumCaffe->Reshape({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Logging
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
void HandExtractor::forwardPass(const std::vector<std::array<Rectangle<float>, 2>> handRectangles,
const cv::Mat& cvInputData,
const float scaleInputToOutput)
void HandExtractor::initializationOnThread()
{
try
{
if (!handRectangles.empty())
{
// Security checks
if (cvInputData.empty())
error("Empty cvInputData.", __LINE__, __FUNCTION__, __FILE__);
// Fix parameters
const auto netInputSide = fastMin(mNetOutputSize.x, mNetOutputSize.y);
// Set hand size
const auto numberPeople = (int)handRectangles.size();
mHandKeypoints[0].reset({numberPeople, (int)HAND_NUMBER_PARTS, 3}, 0);
mHandKeypoints[1].reset(mHandKeypoints[0].getSize(), 0);
// HeatMaps: define size
if (!mHeatMapTypes.empty())
{
mHeatMaps[0].reset({numberPeople, (int)HAND_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x});
mHeatMaps[1].reset({numberPeople, (int)HAND_NUMBER_PARTS, mNetOutputSize.y, mNetOutputSize.x});
}
// // Debugging
// cv::Mat cvInputDataCopied = cvInputData.clone();
// Extract hand keypoints for each person
for (auto hand = 0 ; hand < 2 ; hand++)
{
// Parameters
auto& handCurrent = mHandKeypoints[hand];
const bool mirrorImage = (hand == 0);
for (auto person = 0 ; person < numberPeople ; person++)
{
const auto& handRectangle = handRectangles.at(person).at(hand);
// Only consider faces with a minimum pixel area
const auto minHandSize = fastMin(handRectangle.width, handRectangle.height);
// // Debugging -> red rectangle
// if (handRectangle.width > 0)
// cv::rectangle(cvInputDataCopied,
// cv::Point{intRound(handRectangle.x), intRound(handRectangle.y)},
// cv::Point{intRound(handRectangle.x + handRectangle.width),
// intRound(handRectangle.y + handRectangle.height)},
// cv::Scalar{(hand * 255.f),0.f,255.f}, 2);
// Get parts
if (minHandSize > 1 && handRectangle.area() > 10)
{
// Single-scale detection
if (mMultiScaleNumberAndRange.first == 1)
{
// // Debugging -> green rectangle overwriting red one
// if (handRectangle.width > 0)
// cv::rectangle(cvInputDataCopied,
// cv::Point{intRound(handRectangle.x), intRound(handRectangle.y)},
// cv::Point{intRound(handRectangle.x + handRectangle.width),
// intRound(handRectangle.y + handRectangle.height)},
// cv::Scalar{(hand * 255.f),255.f,0.f}, 2);
// Parameters
cv::Mat affineMatrix;
// Resize image to hands positions + cv::Mat -> float*
cropFrame(mHandImageCrop, affineMatrix, cvInputData, handRectangle, netInputSide,
mNetOutputSize, mirrorImage);
// Deep net + Estimate keypoint locations
detectHandKeypoints(handCurrent, scaleInputToOutput, person, affineMatrix);
}
// Multi-scale detection
else
{
const auto handPtrArea = handCurrent.getSize(1) * handCurrent.getSize(2);
auto* handCurrentPtr = handCurrent.getPtr() + person * handPtrArea;
const auto numberScales = mMultiScaleNumberAndRange.first;
const auto initScale = 1.f - mMultiScaleNumberAndRange.second / 2.f;
for (auto i = 0 ; i < numberScales ; i++)
{
// Get current scale
const auto scale = initScale
+ mMultiScaleNumberAndRange.second * i / (numberScales-1.f);
// Process hand
Array<float> handEstimated({1, handCurrent.getSize(1), handCurrent.getSize(2)}, 0);
const auto handRectangleScale = recenter(
handRectangle,
(float)(intRound(handRectangle.width * scale) / 2 * 2),
(float)(intRound(handRectangle.height * scale) / 2 * 2)
);
// // Debugging -> blue rectangle
// cv::rectangle(cvInputDataCopied,
// cv::Point{intRound(handRectangleScale.x),
// intRound(handRectangleScale.y)},
// cv::Point{intRound(handRectangleScale.x
// + handRectangleScale.width),
// intRound(handRectangleScale.y
// + handRectangleScale.height)},
// cv::Scalar{255,0,0}, 2);
// Parameters
cv::Mat affineMatrix;
// Resize image to hands positions + cv::Mat -> float*
cropFrame(mHandImageCrop, affineMatrix, cvInputData, handRectangleScale,
netInputSide, mNetOutputSize, mirrorImage);
// Deep net + Estimate keypoint locations
detectHandKeypoints(handEstimated, scaleInputToOutput, 0, affineMatrix);
if (i == 0
|| getAverageScore(handEstimated, 0) > getAverageScore(handCurrent, person))
std::copy(handEstimated.getConstPtr(),
handEstimated.getConstPtr() + handPtrArea, handCurrentPtr);
}
}
// HeatMaps: storing
if (!mHeatMapTypes.empty())
updateHandHeatMapsForPerson(mHeatMaps[hand], person, mHeatMapScaleMode,
spHeatMapsBlob->gpu_data());
}
}
}
// // Debugging
// cv::imshow("cvInputDataCopied", cvInputDataCopied);
}
else
{
mHandKeypoints[0].reset();
mHandKeypoints[1].reset();
}
// Get thread id
mThreadId = {std::this_thread::get_id()};
// Deep net initialization
netInitializationOnThread();
}
catch (const std::exception& e)
{
......@@ -368,41 +96,4 @@ namespace op
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
// Runs the Caffe hand network on the current crop (mHandImageCrop) and writes the detected
// keypoints for `person` into `handCurrent`, mapped back to output-frame coordinates via
// `affineMatrix` and `scaleInputToOutput`.
// Fix: the GPU/CPU branches were guarded with `#ifndef CPU_ONLY`, inconsistent with the
// `#ifdef USE_CUDA` guards used everywhere else in this code base; unified on USE_CUDA.
void HandExtractor::detectHandKeypoints(Array<float>& handCurrent, const float scaleInputToOutput,
                                        const int person, const cv::Mat& affineMatrix)
{
    try
    {
        // Deep net
        // 1. Caffe deep network: upload crop to device and run forward pass
        auto* inputDataGpuPtr = spNet->getInputDataGpuPtr();
        cudaMemcpy(inputDataGpuPtr, mHandImageCrop.getConstPtr(), mNetOutputSize.area() * 3 * sizeof(float),
                   cudaMemcpyHostToDevice);
        spNet->forwardPass();
        // 2. Resize heat maps + merge different scales
        #ifdef USE_CUDA
            spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()});
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        #else
            spResizeAndMergeCaffe->Forward_cpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()});
        #endif
        // 3. Get peaks by Non-Maximum Suppression
        #ifdef USE_CUDA
            spMaximumCaffe->Forward_gpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        #else
            spMaximumCaffe->Forward_cpu({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
        #endif
        // Estimate keypoint locations
        connectKeypoints(handCurrent, scaleInputToOutput, person, affineMatrix, spPeaksBlob->mutable_cpu_data());
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
    }
}
}
此差异已折叠。
#ifndef CPU_ONLY
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#endif
......@@ -19,7 +19,7 @@ namespace op
try
{
// Free CUDA pointers - Note that if pointers are 0 (i.e. nullptr), no operation is performed.
#ifndef CPU_ONLY
#ifdef USE_CUDA
cudaFree(pGpuHand);
#endif
}
......@@ -35,7 +35,7 @@ namespace op
{
log("Starting initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
// GPU memory allocation for rendering
#ifndef CPU_ONLY
#ifdef USE_CUDA
cudaMalloc((void**)(&pGpuHand), HAND_MAX_HANDS * HAND_NUMBER_PARTS * 3 * sizeof(float));
#endif
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
......@@ -56,7 +56,7 @@ namespace op
if (handKeypoints[0].getSize(0) != handKeypoints[1].getSize(0))
error("Wrong hand format: handKeypoints.getSize(0) != handKeypoints.getSize(1).", __LINE__, __FUNCTION__, __FILE__);
// GPU rendering
#ifndef CPU_ONLY
#ifdef USE_CUDA
const auto elementRendered = spElementToRender->load(); // I prefer std::round(T&) over intRound(T) for std::atomic
const auto numberPeople = handKeypoints[0].getSize(0);
const Point<int> frameSize{outputData.getSize(2), outputData.getSize(1)};
......@@ -76,11 +76,11 @@ namespace op
// GPU memory to CPU if last renderer
gpuToCpuMemoryIfLastRenderer(outputData.getPtr(), outputData.getVolume());
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// CPU_ONLY mode
#else
error("GPU rendering not available if `CPU_ONLY` is set.", __LINE__, __FUNCTION__, __FILE__);
UNUSED(outputData);
UNUSED(handKeypoints);
error("OpenPose must be compiled with the `USE_CUDA` macro definitions in order to run this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
......
......@@ -9,8 +9,6 @@ namespace op
__constant__ const unsigned int PART_PAIRS_GPU[] = HAND_PAIRS_RENDER_GPU;
__constant__ const float COLORS[] = {HAND_COLORS_RENDER_GPU};
__global__ void renderHandsParts(float* targetPtr, const int targetWidth, const int targetHeight,
const float* const handsPtr, const int numberHands,
const float threshold, const float alphaColorToAdd)
......@@ -46,7 +44,7 @@ namespace op
{
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, handsPtr,
numberHands, renderThreshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册