diff --git a/README.md b/README.md
index 556314dbb7cc0eec9bf97a4d8ca2fcfc17e5b26d..2b36976b43200a10c4c7729bb186b87f329b496e 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ OpenPose is freely available for free non-commercial use, and may be redistribut
 
 Library main functionality:
 
-* Multi-person 15 or **18-keypoint body pose** estimation and rendering. **Running time invariant of number of people** on the image.
+* Multi-person 15 or **18-keypoint body pose** estimation and rendering. **Running time invariant to the number of people** on the image.
 
 * Multi-person **2x21-keypoint hand** estimation and rendering. Note: In this initial version, **running time** linearly **depends** on the **number of people** on the image. **Coming soon (in around 1-5 weeks)!**
 
@@ -76,8 +76,6 @@ The pose estimation work is based on the C++ code from [the ECCV 2016 demo](http
     2. [OpenPose Wrapper](#openpose-wrapper)
     3. [OpenPose Library](#openpose-library)
 4. [Output](#output)
-    1. [Output Format](#output-format)
-    2. [Reading Saved Results](#reading-saved-results)
 5. [OpenPose Benchmark](#openpose-benchmark)
 6. [Send Us Your Feedback!](#send-us-your-feedback)
 7. [Citation](#citation)
diff --git a/doc/release_notes.md b/doc/release_notes.md
index a1eee70d7a451f3f0bd71d8b14d5f826fe30f5fc..e474c3b0ec73f7eca253536dbebcc76829cc730c 100644
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -41,5 +41,16 @@ OpenPose Library - Release Notes
 3. Main bugs fixed:
     1. All visualization functions moved to same thread, so it works with most OpenCV custom compiled versions.
     2. Fixed error on debug mode: `Too many resources requested for launch`.
-    3. Bug in Array::getConstCvMat() if mVolume=0, now returning empty cv::Mat.
-    4. Bug: `--process_real_time` threw error with webcam.
+
+
+
+## Current version (future OpenPose 1.0.0rc4)
+1. Main improvements:
+    1. Check() functions give more feedback.
+    2. Improved documentation.
+2. Functions or parameters renamed:
+    1. Added `Datum::scaleRatios` to save the relative scale ratios when multi-scale is used.
+3. Main bugs fixed:
+    1. Fixed bug in Array::getConstCvMat() if mVolume=0, now returning empty cv::Mat.
+    2. Fixed bug: `--process_real_time` threw error with webcam.
+    3. Fixed bug: Face not working with output resolution different to input.
diff --git a/examples/tutorial_pose/1_extract_from_image.cpp b/examples/tutorial_pose/1_extract_from_image.cpp
index c23fb89ced7ddc764436655647d9b8a2abf1859e..1018bef790ec005d36a7737c70e9ea4495605a1a 100644
--- a/examples/tutorial_pose/1_extract_from_image.cpp
+++ b/examples/tutorial_pose/1_extract_from_image.cpp
@@ -28,7 +28,7 @@ DEFINE_int32(logging_level,             3,              "The logging level. Inte
 DEFINE_string(image_path,               "examples/media/COCO_val2014_000000000192.jpg",     "Process the desired image.");
 // OpenPose
 DEFINE_string(model_pose,               "COCO",         "Model to be used (e.g. COCO, MPI, MPI_4_layers).");
-DEFINE_string(model_folder,             "models/",      "Folder where the pose models (COCO and MPI) are located.");
+DEFINE_string(model_folder,             "models/",      "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
 DEFINE_string(net_resolution,           "656x368",      "Multiples of 16. If it is increased, the accuracy usually increases. If it is decreased, the speed increases.");
 DEFINE_string(resolution,               "1280x720",     "The image resolution (display). Use \"-1x-1\" to force the program to use the default images resolution.");
 DEFINE_int32(num_gpu_start,             0,              "GPU device start number.");
@@ -98,7 +98,7 @@ int openPoseTutorialPose1()
     // Step 3 - Initialize all required classes
     op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_num_scales, (float)FLAGS_scale_gap};
     op::CvMatToOpOutput cvMatToOpOutput{outputSize};
-    op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_num_scales, (float)FLAGS_scale_gap, poseModel,
+    op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_num_scales, poseModel,
                                               FLAGS_model_folder, FLAGS_num_gpu_start};
     op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_alpha_pose};
     op::OpOutputToCvMat opOutputToCvMat{outputSize};
@@ -114,12 +114,14 @@ int openPoseTutorialPose1()
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
     // Step 2 - Format input image to OpenPose input and output formats
-    const auto netInputArray = cvMatToOpInput.format(inputImage);
+    op::Array<float> netInputArray;
+    std::vector<float> scaleRatios;
+    std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
     double scaleInputToOutput;
     op::Array<float> outputArray;
     std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
     // Step 3 - Estimate poseKeypoints
-    poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows});
+    poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
     const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints();
     // Step 4 - Render poseKeypoints
     poseRenderer.renderPose(outputArray, poseKeypoints);
diff --git a/examples/tutorial_pose/2_extract_pose_or_heatmat_from_image.cpp b/examples/tutorial_pose/2_extract_pose_or_heatmat_from_image.cpp
index 52fadcaf22faacbfc5b6fad76c468540a8df4bec..4f55a07289266275ddd62e985a090f4f02e6b8f1 100644
--- a/examples/tutorial_pose/2_extract_pose_or_heatmat_from_image.cpp
+++ b/examples/tutorial_pose/2_extract_pose_or_heatmat_from_image.cpp
@@ -28,7 +28,7 @@ DEFINE_int32(logging_level,             3,              "The logging level. Inte
 DEFINE_string(image_path,               "examples/media/COCO_val2014_000000000192.jpg",     "Process the desired image.");
 // OpenPose
 DEFINE_string(model_pose,               "COCO",         "Model to be used (e.g. COCO, MPI, MPI_4_layers).");
-DEFINE_string(model_folder,             "models/",      "Folder where the pose models (COCO and MPI) are located.");
+DEFINE_string(model_folder,             "models/",      "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
 DEFINE_string(net_resolution,           "656x368",      "Multiples of 16. If it is increased, the accuracy usually increases. If it is decreased, the speed increases.");
 DEFINE_string(resolution,               "1280x720",     "The image resolution (display). Use \"-1x-1\" to force the program to use the default images resolution.");
 DEFINE_int32(num_gpu_start,             0,              "GPU device start number.");
@@ -101,8 +101,7 @@ int openPoseTutorialPose2()
     op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_num_scales, (float)FLAGS_scale_gap};
     op::CvMatToOpOutput cvMatToOpOutput{outputSize};
     std::shared_ptr<op::PoseExtractor> poseExtractorPtr = std::make_shared<op::PoseExtractorCaffe>(netInputSize, netOutputSize, outputSize, FLAGS_num_scales,
-                                                                                                   (float)FLAGS_scale_gap, poseModel,
-                                                                                                   FLAGS_model_folder, FLAGS_num_gpu_start);
+                                                                                                   poseModel, FLAGS_model_folder, FLAGS_num_gpu_start);
     op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, poseExtractorPtr, (float)FLAGS_alpha_pose, (float)FLAGS_alpha_heatmap};
     poseRenderer.setElementToRender(FLAGS_part_to_show);
     op::OpOutputToCvMat opOutputToCvMat{outputSize};
@@ -118,12 +117,14 @@ int openPoseTutorialPose2()
     if(inputImage.empty())
         op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
     // Step 2 - Format input image to OpenPose input and output formats
-    const auto netInputArray = cvMatToOpInput.format(inputImage);
+    op::Array<float> netInputArray;
+    std::vector<float> scaleRatios;
+    std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
     double scaleInputToOutput;
     op::Array<float> outputArray;
     std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
     // Step 3 - Estimate poseKeypoints
-    poseExtractorPtr->forwardPass(netInputArray, {inputImage.cols, inputImage.rows});
+    poseExtractorPtr->forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
     const auto poseKeypoints = poseExtractorPtr->getPoseKeypoints();
     const auto scaleNetToOutput = poseExtractorPtr->getScaleNetToOutput();
     // Step 4 - Render pose
diff --git a/include/openpose/core/cvMatToOpInput.hpp b/include/openpose/core/cvMatToOpInput.hpp
index 50fc2e1d91ef841d3b07bf4fbd7c2cc24d00caf5..4f87bbf4ce57680c49e4923951592a1475b8b4bd 100644
--- a/include/openpose/core/cvMatToOpInput.hpp
+++ b/include/openpose/core/cvMatToOpInput.hpp
@@ -1,6 +1,7 @@
 #ifndef OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
 #define OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
 
+#include <utility> // std::pair
 #include <vector>
 #include <opencv2/core/core.hpp> // cv::Mat
 #include "array.hpp"
@@ -13,7 +14,7 @@ namespace op
     public:
         CvMatToOpInput(const Point<int>& netInputResolution, const int scaleNumber = 1, const float scaleGap = 0.25);
 
-        Array<float> format(const cv::Mat& cvInputData) const;
+        std::pair<Array<float>, std::vector<float>> format(const cv::Mat& cvInputData) const;
 
     private:
         const int mScaleNumber;
diff --git a/include/openpose/core/datum.hpp b/include/openpose/core/datum.hpp
index 01eaaba697f4118d8a99e65417d04fd716c5109b..a948e03768dce3983974b118a5197e29d25c4ddb 100644
--- a/include/openpose/core/datum.hpp
+++ b/include/openpose/core/datum.hpp
@@ -74,7 +74,7 @@ namespace op
 
         /**
          * Face detection locations (x,y,width,height) for each person in the image.
-         * It has been resized to the same resolution as `poseKeypoints`.
+         * It is resized to cvInputData.size().
          * Size: #people
          */
         std::vector<Rectangle<float>> faceRectangles;
@@ -86,6 +86,13 @@ namespace op
          */
         Array<float> faceKeypoints;
 
+        /**
+         * Hand detection locations (x,y,width,height): 2 rectangles (one std::array) for each person in the image.
+         * It is resized to cvInputData.size().
+         * Size: #people
+         */
+        std::vector<std::array<Rectangle<float>, 2>> handRectangles;
+
         /**
          * Experimental (NOT IMPLEMENTED YET)
          * Hands code is in development phase. Not included in this version.
@@ -98,6 +105,8 @@ namespace op
 
         float scaleNetToOutput; /**< Scale ratio between the net output and the final output Datum::cvOutputData. */
 
+        std::vector<float> scaleRatios; /**< Scale ratio of each scale with respect to the first one, one entry per scale (e.g. flag `num_scales`). Used to resize the different scales. */
+
         std::pair<int, std::string> elementRendered; /**< Pair with the element key id POSE_BODY_PART_MAPPING on `pose/poseParameters.hpp` and its mapped value (e.g. 1 and "Neck"). */
 
 
@@ -167,7 +176,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator <(const Datum& datum) const
+        inline bool operator<(const Datum& datum) const
         {
             return id < datum.id;
         }
@@ -176,7 +185,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator >(const Datum& datum) const
+        inline bool operator>(const Datum& datum) const
         {
             return id > datum.id;
         }
@@ -185,7 +194,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator <=(const Datum& datum) const
+        inline bool operator<=(const Datum& datum) const
         {
             return id <= datum.id;
         }
@@ -194,7 +203,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator >=(const Datum& datum) const
+        inline bool operator>=(const Datum& datum) const
         {
             return id >= datum.id;
         }
@@ -203,7 +212,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator ==(const Datum& datum) const
+        inline bool operator==(const Datum& datum) const
         {
             return id == datum.id;
         }
@@ -212,7 +221,7 @@ namespace op
          * @param datum Datum to be compared.
          * @result Whether the instance satisfies the condition with respect to datum.
          */
-        inline bool operator !=(const Datum& datum) const
+        inline bool operator!=(const Datum& datum) const
         {
             return id != datum.id;
         }
diff --git a/include/openpose/core/resizeAndMergeBase.hpp b/include/openpose/core/resizeAndMergeBase.hpp
index b36d51a53356edade57afc5666e4ec05a1cf83bd..f4cd03af6674150c11385c59a12135177895cef7 100644
--- a/include/openpose/core/resizeAndMergeBase.hpp
+++ b/include/openpose/core/resizeAndMergeBase.hpp
@@ -2,14 +2,17 @@
 #define OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
 
 #include <array>
+#include <vector>
 
 namespace op
 {
     template <typename T>
-    void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap = 0.f);
+    void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                           const std::vector<T>& scaleRatios = {1});
 
     template <typename T>
-    void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap = 0.f);
+    void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                           const std::vector<T>& scaleRatios = {1});
 }
 
 #endif // OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
diff --git a/include/openpose/core/resizeAndMergeCaffe.hpp b/include/openpose/core/resizeAndMergeCaffe.hpp
index 2e1d1ef644d2e6299ff54b4bee6085639eb6fff7..a2f253f72eed0adbf42c8b57566addb3d3e8a9be 100644
--- a/include/openpose/core/resizeAndMergeCaffe.hpp
+++ b/include/openpose/core/resizeAndMergeCaffe.hpp
@@ -3,12 +3,14 @@
 #define OPENPOSE_CORE_RESIZE_AND_MERGE_CAFFE_HPP
 
 #include <array>
+#include <vector>
 #include <caffe/blob.hpp>
 #include <openpose/utilities/macros.hpp>
 
 namespace op
 {
-    // It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the compatibility with any generic Caffe version,
+    // It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the
+    // compatibility with any generic Caffe version,
     // we keep this 'layer' inside our library rather than in the Caffe code.
     template <typename T>
     class ResizeAndMergeCaffe
@@ -18,22 +20,25 @@ namespace op
 
         virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
 
-        virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const float factor, const bool mergeFirstDimension = true);
+        virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
+                             const float factor, const bool mergeFirstDimension = true);
 
         virtual inline const char* type() const { return "ResizeAndMerge"; }
 
-        void setScaleGap(const T scaleGap);
+        void setScaleRatios(const std::vector<T>& scaleRatios);
 
         virtual void Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
 
         virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
 
-        virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
+        virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
+                                  const std::vector<caffe::Blob<T>*>& bottom);
 
-        virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
+        virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
+                                  const std::vector<caffe::Blob<T>*>& bottom);
 
     private:
-        T mScaleGap;
+        std::vector<T> mScaleRatios;
         std::array<int, 4> mBottomSize;
         std::array<int, 4> mTopSize;
 
diff --git a/include/openpose/core/wCvMatToOpInput.hpp b/include/openpose/core/wCvMatToOpInput.hpp
index 06bd7cecb5646486848f3d35d53f2c6092160ccd..5ad46c5db87bab9ae6d86a8f639a9c1d649a4e07 100644
--- a/include/openpose/core/wCvMatToOpInput.hpp
+++ b/include/openpose/core/wCvMatToOpInput.hpp
@@ -60,7 +60,7 @@ namespace op
                 const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
                 // cv::Mat -> float*
                 for (auto& tDatum : *tDatums)
-                    tDatum.inputNetData = spCvMatToOpInput->format(tDatum.cvInputData);
+                    std::tie(tDatum.inputNetData, tDatum.scaleRatios) = spCvMatToOpInput->format(tDatum.cvInputData);
                 // Profiling speed
                 Profiler::timerEnd(profilerKey);
                 Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
diff --git a/include/openpose/pose/poseExtractor.hpp b/include/openpose/pose/poseExtractor.hpp
index 306699ca22dd905f48fc4143aa04c1c82f3e6c06..c76c48c6b94eab4f7f8b49643a52525b32272ba6 100644
--- a/include/openpose/pose/poseExtractor.hpp
+++ b/include/openpose/pose/poseExtractor.hpp
@@ -22,7 +22,7 @@ namespace op
 
         void initializationOnThread();
 
-        virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize) = 0;
+        virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f}) = 0;
 
         virtual const float* getHeatMapCpuConstPtr() const = 0;
 
diff --git a/include/openpose/pose/poseExtractorCaffe.hpp b/include/openpose/pose/poseExtractorCaffe.hpp
index 9dc465db8d9e7eb8df6ecb9e87dd66c6e04b2a78..57b1e37c1d46679de3bbc36dbdbff571d6bd3d73 100644
--- a/include/openpose/pose/poseExtractorCaffe.hpp
+++ b/include/openpose/pose/poseExtractorCaffe.hpp
@@ -20,14 +20,14 @@ namespace op
     {
     public:
         PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize, const Point<int>& outputSize, const int scaleNumber,
-                           const float scaleGap, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes = {},
+                           const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes = {},
                            const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
 
         virtual ~PoseExtractorCaffe();
 
         void netInitializationOnThread();
 
-        void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize);
+        void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f});
 
         const float* getHeatMapCpuConstPtr() const;
 
@@ -36,6 +36,7 @@ namespace op
         const float* getPoseGpuConstPtr() const;
 
     private:
+        const float mResizeScale;
         std::shared_ptr<Net> spNet;
         std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
         std::shared_ptr<NmsCaffe<float>> spNmsCaffe;
diff --git a/include/openpose/pose/wPoseExtractor.hpp b/include/openpose/pose/wPoseExtractor.hpp
index d2c54569c1edc10b31556ed1528859843a2d5acd..dfab2a9e5c6fa41e6d3219dbe8271739c7c1bb98 100644
--- a/include/openpose/pose/wPoseExtractor.hpp
+++ b/include/openpose/pose/wPoseExtractor.hpp
@@ -61,7 +61,7 @@ namespace op
                 // Extract people pose
                 for (auto& tDatum : *tDatums)
                 {
-                    spPoseExtractor->forwardPass(tDatum.inputNetData, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
+                    spPoseExtractor->forwardPass(tDatum.inputNetData, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows}, tDatum.scaleRatios);
                     tDatum.poseHeatMaps = spPoseExtractor->getHeatMaps();
                     tDatum.poseKeypoints = spPoseExtractor->getPoseKeypoints();
                     tDatum.scaleNetToOutput = spPoseExtractor->getScaleNetToOutput();
diff --git a/include/openpose/utilities/cuda.hu b/include/openpose/utilities/cuda.hu
index ae1630cc3419c7d9ee52a7b2499925865b01186f..5e060006823f89fa72d714ce2117f5f6bf7dd86e 100644
--- a/include/openpose/utilities/cuda.hu
+++ b/include/openpose/utilities/cuda.hu
@@ -81,7 +81,8 @@ namespace op
 
     // Cubic interpolation
     template <typename T>
-    inline __device__ void cubicSequentialData(int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource, const int width, const int height)
+    inline __device__ void cubicSequentialData(int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource,
+                                               const int width, const int height)
     {
         xIntArray[1] = fastTruncate(int(xSource + 1e-5), 0, width - 1);
         xIntArray[0] = fastMax(0, xIntArray[1] - 1);
@@ -97,7 +98,7 @@ namespace op
     }
 
     template <typename T>
-    inline __device__ T cubicInterpolation(const T v0, const T v1, const T v2, const T v3, const T dx)
+    inline __device__ T cubicInterpolate(const T v0, const T v1, const T v2, const T v3, const T dx)
     {
         // http://www.paulinternet.nl/?page=bicubic
         // const auto a = (-0.5f * v0 + 1.5f * v1 - 1.5f * v2 + 0.5f * v3);
@@ -108,10 +109,12 @@ namespace op
                 + (v0 - 2.5f * v1 + 2.f * v2 - 0.5f * v3) * dx * dx
                 - 0.5f * (v0 - v2) * dx // + (-0.5f * v0 + 0.5f * v2) * dx
                 + v1;
+        // return v1 + 0.5f * dx * (v2 - v0 + dx * (2.f * v0 - 5.f * v1 + 4.f * v2 - v3 + dx * (3.f * (v1 - v2) + v3 - v0)));
     }
 
     template <typename T>
-    inline __device__ T cubicResize(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, const int widthSourcePtr)
+    inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
+                                           const int heightSource, const int widthSourcePtr)
     {
         int xIntArray[4];
         int yIntArray[4];
@@ -122,16 +125,17 @@ namespace op
         T temp[4];
         for (unsigned char i = 0; i < 4; i++)
         {
-            const int offset = yIntArray[i]*widthSourcePtr;
-            temp[i] = cubicInterpolation(sourcePtr[offset + xIntArray[0]], sourcePtr[offset + xIntArray[1]], sourcePtr[offset + xIntArray[2]], sourcePtr[offset + xIntArray[3]], dx);
+            const auto offset = yIntArray[i]*widthSourcePtr;
+            temp[i] = cubicInterpolate(sourcePtr[offset + xIntArray[0]], sourcePtr[offset + xIntArray[1]],
+                                       sourcePtr[offset + xIntArray[2]], sourcePtr[offset + xIntArray[3]], dx);
         }
-        return cubicInterpolation(temp[0], temp[1], temp[2], temp[3], dy);
+        return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
     }
 
     template <typename T>
     inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
     {
-        return (1 - alphaValue2) * value1 + alphaValue2 * value2;
+        return (1.f - alphaValue2) * value1 + alphaValue2 * value2;
     }
 
     template <typename T>
diff --git a/include/openpose/wrapper/wrapper.hpp b/include/openpose/wrapper/wrapper.hpp
index 18af1cff50400acc66b404db14a5607d89463b6d..9739c1d494d1607b885366ab1caa649d51f8f189 100644
--- a/include/openpose/wrapper/wrapper.hpp
+++ b/include/openpose/wrapper/wrapper.hpp
@@ -552,13 +552,13 @@ namespace op
                 wDatumProducer = nullptr;
 
             // Pose estimators
-            const Point<int>& netOutputSize = wrapperStructPose.netInputSize;
+            const Point<int>& poseNetOutputSize = wrapperStructPose.netInputSize;
             std::vector<std::shared_ptr<PoseExtractor>> poseExtractors;
             for (auto gpuId = 0; gpuId < gpuNumber; gpuId++)
                 poseExtractors.emplace_back(std::make_shared<PoseExtractorCaffe>(
-                    wrapperStructPose.netInputSize, netOutputSize, finalOutputSize, wrapperStructPose.scalesNumber,
-                    wrapperStructPose.scaleGap, wrapperStructPose.poseModel, wrapperStructPose.modelFolder,
-                    gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
+                    wrapperStructPose.netInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber,
+                    wrapperStructPose.poseModel, wrapperStructPose.modelFolder, gpuId + gpuNumberStart,
+                    wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
                 ));
 
             // Pose renderers
@@ -572,7 +572,7 @@ namespace op
                 for (auto gpuId = 0; gpuId < poseExtractors.size(); gpuId++)
                 {
                     poseRenderers.emplace_back(std::make_shared<PoseRenderer>(
-                        netOutputSize, finalOutputSize, wrapperStructPose.poseModel, poseExtractors[gpuId],
+                        poseNetOutputSize, finalOutputSize, wrapperStructPose.poseModel, poseExtractors[gpuId],
                         wrapperStructPose.blendOriginalFrame, alphaKeypoint,
                         alphaHeatMap, wrapperStructPose.defaultPartToRender
                     ));
@@ -678,7 +678,7 @@ namespace op
             // Re-scale pose if desired
             if (wrapperStructPose.keypointScale != ScaleMode::OutputResolution
                 && (wrapperStructPose.keypointScale != ScaleMode::InputResolution || (finalOutputSize != producerSize))
-                && (wrapperStructPose.keypointScale != ScaleMode::NetOutputResolution || (finalOutputSize != netOutputSize)))
+                && (wrapperStructPose.keypointScale != ScaleMode::NetOutputResolution || (finalOutputSize != poseNetOutputSize)))
             {
                 auto keypointScaler = std::make_shared<KeypointScaler>(wrapperStructPose.keypointScale);
                 mPostProcessingWs.emplace_back(std::make_shared<WKeypointScaler<TDatumsPtr>>(keypointScaler));
diff --git a/src/openpose/core/cvMatToOpInput.cpp b/src/openpose/core/cvMatToOpInput.cpp
index 225864c713ccb6217a278416012f8da8bae4efb6..d0570378e955f89edec13e2331984e6b8cae86df 100644
--- a/src/openpose/core/cvMatToOpInput.cpp
+++ b/src/openpose/core/cvMatToOpInput.cpp
@@ -10,9 +10,19 @@ namespace op
         mScaleGap{scaleGap},
         mInputNetSize4D{{mScaleNumber, 3, netInputResolution.y, netInputResolution.x}}
     {
+        try
+        {
+            // Security checks
+            if (netInputResolution.x % 16 != 0 || netInputResolution.y % 16 != 0)
+                error("Net input resolution must be multiples of 16.", __LINE__, __FUNCTION__, __FILE__);
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+        }
     }
 
-    Array<float> CvMatToOpInput::format(const cv::Mat& cvInputData) const
+    std::pair<Array<float>, std::vector<float>> CvMatToOpInput::format(const cv::Mat& cvInputData) const
     {
         try
         {
@@ -22,29 +32,35 @@ namespace op
 
             // inputNetData - Reescale keeping aspect ratio and transform to float the input deep net image
             Array<float> inputNetData{mInputNetSize4D};
+            std::vector<float> scaleRatios(mScaleNumber, 1.f);
             const auto inputNetDataOffset = inputNetData.getVolume(1, 3);
             for (auto i = 0; i < mScaleNumber; i++)
             {
-                const auto requestedScale = 1.f - i*mScaleGap;
-                if (requestedScale > 1.f)
-                    error("All scales must be <= 1, i.e. 1-num_scales*scale_gap <= 1", __LINE__, __FUNCTION__, __FILE__);
+                const auto currentScale = 1.f - i*mScaleGap;
+                if (currentScale < 0.f || 1.f < currentScale)
+                    error("All scales must be in the range [0, 1], i.e. 0 <= 1-num_scales*scale_gap <= 1", __LINE__, __FUNCTION__, __FILE__);
 
                 const auto netInputWidth = inputNetData.getSize(3);
-                const auto targetWidth  = fastTruncate(16 * intRound(netInputWidth * requestedScale / 16.), 1, netInputWidth/16*16);
+                const auto targetWidth  = fastTruncate(intRound(netInputWidth * currentScale) / 16 * 16, 1, netInputWidth);
                 const auto netInputHeight = inputNetData.getSize(2);
-                const auto targetHeight  = fastTruncate(16 * intRound(netInputHeight * requestedScale / 16.), 1, netInputHeight/16*16);
+                const auto targetHeight  = fastTruncate(intRound(netInputHeight * currentScale) / 16 * 16, 1, netInputHeight);
                 const Point<int> targetSize{targetWidth, targetHeight};
                 const auto scale = resizeGetScaleFactor(Point<int>{cvInputData.cols, cvInputData.rows}, targetSize);
                 const cv::Mat frameWithNetSize = resizeFixedAspectRatio(cvInputData, scale, Point<int>{netInputWidth, netInputHeight});
+                // Fill inputNetData
                 uCharCvMatToFloatPtr(inputNetData.getPtr() + i * inputNetDataOffset, frameWithNetSize, true);
+                // Fill scaleRatios
+                scaleRatios[i] = scale;
+                if (i > 0)
+                    scaleRatios[i] /= scaleRatios[0];
             }
-
-            return inputNetData;
+            scaleRatios.at(0) /= scaleRatios[0];
+            return std::make_pair(inputNetData, scaleRatios);
         }
         catch (const std::exception& e)
         {
             error(e.what(), __LINE__, __FUNCTION__, __FILE__);
-            return Array<float>{};
+            return std::make_pair(Array<float>{}, std::vector<float>{});
         }
     }
 }
diff --git a/src/openpose/core/datum.cpp b/src/openpose/core/datum.cpp
index 950cdc6bb8fbf6f0a0d78248818cbdfa9f7bff61..ef2f1444038c5b54259cbbcd931a19356e7a8440 100644
--- a/src/openpose/core/datum.cpp
+++ b/src/openpose/core/datum.cpp
@@ -22,10 +22,12 @@ namespace op
         poseHeatMaps{datum.poseHeatMaps},
         faceRectangles{datum.faceRectangles},
         faceKeypoints{datum.faceKeypoints},
+        handRectangles{datum.handRectangles},
         handKeypoints{datum.handKeypoints},
         // Other parameters
         scaleInputToOutput{datum.scaleInputToOutput},
         scaleNetToOutput{datum.scaleNetToOutput},
+        scaleRatios{datum.scaleRatios},
         elementRendered{datum.elementRendered}
     {
     }
@@ -48,10 +50,12 @@ namespace op
             poseHeatMaps = datum.poseHeatMaps,
             faceRectangles = datum.faceRectangles,
             faceKeypoints = datum.faceKeypoints,
+            handRectangles = datum.handRectangles,
             handKeypoints = datum.handKeypoints,
             // Other parameters
             scaleInputToOutput = datum.scaleInputToOutput;
             scaleNetToOutput = datum.scaleNetToOutput;
+            scaleRatios = datum.scaleRatios;
             elementRendered = datum.elementRendered;
             // Return
             return *this;
@@ -85,8 +89,10 @@ namespace op
             std::swap(poseHeatMaps, datum.poseHeatMaps);
             std::swap(faceRectangles, datum.faceRectangles);
             std::swap(faceKeypoints, datum.faceKeypoints);
+            std::swap(handRectangles, datum.handRectangles);
             std::swap(handKeypoints, datum.handKeypoints);
             // Other parameters
+            std::swap(scaleRatios, datum.scaleRatios);
             std::swap(elementRendered, datum.elementRendered);
         }
         catch (const std::exception& e)
@@ -113,10 +119,12 @@ namespace op
             std::swap(poseHeatMaps, datum.poseHeatMaps);
             std::swap(faceRectangles, datum.faceRectangles);
             std::swap(faceKeypoints, datum.faceKeypoints);
+            std::swap(handRectangles, datum.handRectangles);
             std::swap(handKeypoints, datum.handKeypoints);
             // Other parameters
             scaleInputToOutput = datum.scaleInputToOutput;
             scaleNetToOutput = datum.scaleNetToOutput;
+            std::swap(scaleRatios, datum.scaleRatios);
             std::swap(elementRendered, datum.elementRendered);
             // Return
             return *this;
@@ -151,10 +159,12 @@ namespace op
             datum.poseHeatMaps = poseHeatMaps.clone();
             datum.faceRectangles = faceRectangles;
             datum.faceKeypoints = faceKeypoints.clone();
+            datum.handRectangles = handRectangles;
             datum.handKeypoints = handKeypoints.clone();
             // Other parameters
             datum.scaleInputToOutput = scaleInputToOutput;
             datum.scaleNetToOutput = scaleNetToOutput;
+            datum.scaleRatios = scaleRatios;
             datum.elementRendered = elementRendered;
             // Return
             return std::move(datum);
diff --git a/src/openpose/core/resizeAndMergeBase.cpp b/src/openpose/core/resizeAndMergeBase.cpp
index bfcf2ea761ba50e6aec6eda18d7d08c6a128b716..463074c89deadbc3303e7bc0c3a9a8eb37fc7700 100644
--- a/src/openpose/core/resizeAndMergeBase.cpp
+++ b/src/openpose/core/resizeAndMergeBase.cpp
@@ -6,13 +6,14 @@
 namespace op
 {
     template <typename T>
-    void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap)
+    void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
+                           const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
     {
         try
         {
             UNUSED(targetPtr);
             UNUSED(sourcePtr);
-            UNUSED(scaleGap);
+            UNUSED(scaleRatios);
             UNUSED(targetSize);
             UNUSED(sourceSize);
             error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
@@ -61,6 +62,8 @@ namespace op
         }
     }
 
-    template void resizeAndMergeCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const float scaleGap);
-    template void resizeAndMergeCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const double scaleGap);
+    template void resizeAndMergeCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
+                                    const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
+    template void resizeAndMergeCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
+                                    const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
 }
diff --git a/src/openpose/core/resizeAndMergeBase.cu b/src/openpose/core/resizeAndMergeBase.cu
index 98038462f89cfec13e3662054b834a2c2d057cc3..3e57c958749db31208e0c79c94350385452b0efc 100644
--- a/src/openpose/core/resizeAndMergeBase.cu
+++ b/src/openpose/core/resizeAndMergeBase.cu
@@ -8,7 +8,8 @@ namespace op
     const auto THREADS_PER_BLOCK_1D = 16u;
 
     template <typename T>
-    __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight)
+    __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+                                 const int targetHeight)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -20,12 +21,12 @@ namespace op
             const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
             const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
 
-            targetPtr[y*targetWidth+x] = cubicResize(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+            targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
         }
     }
 
     template <typename T>
-    __global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T scaleGap,
+    __global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T* scaleRatios,
                                          const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
@@ -38,17 +39,17 @@ namespace op
             // targetPixel = -1000.f; // For fastMax
             for (auto n = 0; n < num; n++)
             {
-                const auto numberScale = 1 - n * scaleGap;
-                const auto widthPaddedSource = int(sourceWidth * numberScale);
-                const auto heightPaddedSource = int(sourceHeight * numberScale);
+                const auto currentWidth = sourceWidth * scaleRatios[n];
+                const auto currentHeight = sourceHeight * scaleRatios[n];
 
-                const auto scaleWidth = targetWidth / T(widthPaddedSource);
-                const auto scaleHeight = targetHeight / T(heightPaddedSource);
+                const auto scaleWidth = targetWidth / currentWidth;
+                const auto scaleHeight = targetHeight / currentHeight;
                 const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
                 const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
 
                 const T* const sourcePtrN = sourcePtr + n * sourceNumOffset;
-                const auto interpolated = cubicResize(sourcePtrN, xSource, ySource, widthPaddedSource, heightPaddedSource, sourceWidth);
+                const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth),
+                                                             intRound(currentHeight), sourceWidth);
                 targetPixel += interpolated;
                 // targetPixel = fastMax(targetPixel, interpolated);
             }
@@ -57,7 +58,8 @@ namespace op
     }
 
     template <typename T>
-    void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap)
+    void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
+                           const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
     {
         try
         {
@@ -73,21 +75,42 @@ namespace op
             const auto sourceChannelOffset = sourceHeight * sourceWidth;
             const auto targetChannelOffset = targetWidth * targetHeight;
 
+            // No multi-scale merging
             if (targetSize[0] > 1)
             {
                 for (auto n = 0; n < num; n++)
-                    for (auto c = 0; c < channels; c++)
-                        resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + (n*channels + c) * targetChannelOffset, sourcePtr + (n*channels + c) * sourceChannelOffset,
+                {
+                    const auto offsetBase = n*channels;
+                    for (auto c = 0 ; c < channels ; c++)
+                    {
+                        const auto offset = offsetBase + c;
+                        resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + offset * targetChannelOffset,
+                                                                     sourcePtr + offset * sourceChannelOffset,
                                                                      sourceWidth, sourceHeight, targetWidth, targetHeight);
+                    }
+                }
             }
+            // Multi-scale merging
             else
             {
-                if (scaleGap <= 0.f && num != targetSize[0])
-                    error("The scale gap must be greater than 0.", __LINE__, __FUNCTION__, __FILE__);
+                // If num_scales > 1 --> scaleRatios must be set
+                if (scaleRatios.size() != num)
+                    error("The scale ratios size must be equal than the number of scales.", __LINE__, __FUNCTION__, __FILE__);
+                const auto maxScales = 10;
+                if (scaleRatios.size() > maxScales)
+                    error("The maximum number of scales is " + std::to_string(maxScales) + ".", __LINE__, __FUNCTION__, __FILE__);
+                // Copy scaleRatios
+                T* scaleRatiosGpuPtr;
+                cudaMalloc((void**)&scaleRatiosGpuPtr, maxScales * sizeof(T));
+                cudaMemcpy(scaleRatiosGpuPtr, scaleRatios.data(), scaleRatios.size() * sizeof(T), cudaMemcpyHostToDevice);
+                // Perform resize + merging
                 const auto sourceNumOffset = channels * sourceChannelOffset;
-                for (auto c = 0; c < channels; c++)
-                    resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset, sourcePtr + c * sourceChannelOffset, sourceNumOffset,
-                                                                         num, scaleGap, sourceWidth, sourceHeight, targetWidth, targetHeight);
+                for (auto c = 0 ; c < channels ; c++)
+                    resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset,
+                                                                         sourcePtr + c * sourceChannelOffset, sourceNumOffset,
+                                                                         num, scaleRatiosGpuPtr, sourceWidth, sourceHeight, targetWidth, targetHeight);
+                // Free memory
+                cudaFree(scaleRatiosGpuPtr);
             }
 
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -98,6 +121,8 @@ namespace op
         }
     }
 
-    template void resizeAndMergeGpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const float scaleGap);
-    template void resizeAndMergeGpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const double scaleGap);
+    template void resizeAndMergeGpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
+                                    const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
+    template void resizeAndMergeGpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
+                                    const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
 }
diff --git a/src/openpose/core/resizeAndMergeCaffe.cpp b/src/openpose/core/resizeAndMergeCaffe.cpp
index 2f30afc8421800bed5a5f7d53885027285e0aeb9..91a2658b95c4f0522a19ba6fe355858ac84cd125 100644
--- a/src/openpose/core/resizeAndMergeCaffe.cpp
+++ b/src/openpose/core/resizeAndMergeCaffe.cpp
@@ -8,7 +8,8 @@
 namespace op
 {
     template <typename T>
-    ResizeAndMergeCaffe<T>::ResizeAndMergeCaffe()
+    ResizeAndMergeCaffe<T>::ResizeAndMergeCaffe() :
+        mScaleRatios{1}
     {
     }
 
@@ -29,7 +30,8 @@ namespace op
     }
 
     template <typename T>
-    void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const float factor, const bool mergeFirstDimension)
+    void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
+                                         const float factor, const bool mergeFirstDimension)
     {
         try
         {
@@ -54,11 +56,11 @@ namespace op
     }
 
     template <typename T>
-    void ResizeAndMergeCaffe<T>::setScaleGap(const T scaleGap)
+    void ResizeAndMergeCaffe<T>::setScaleRatios(const std::vector<T>& scaleRatios)
     {
         try
         {
-            mScaleGap = {scaleGap};
+            mScaleRatios = {scaleRatios};
         }
         catch (const std::exception& e)
         {
@@ -71,7 +73,7 @@ namespace op
     {
         try
         {
-            resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize, mScaleGap);
+            resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize, mScaleRatios);
         }
         catch (const std::exception& e)
         {
@@ -84,7 +86,7 @@ namespace op
     {
         try
         {
-            resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize, mScaleGap);
+            resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize, mScaleRatios);
         }
         catch (const std::exception& e)
         {
@@ -93,7 +95,8 @@ namespace op
     }
 
     template <typename T>
-    void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
+    void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
+                                              const std::vector<caffe::Blob<T>*>& bottom)
     {
         try
         {
@@ -109,7 +112,8 @@ namespace op
     }
 
     template <typename T>
-    void ResizeAndMergeCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
+    void ResizeAndMergeCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
+                                              const std::vector<caffe::Blob<T>*>& bottom)
     {
         try
         {
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index 2834ae618f00ada75f8342ee4d10beab4b7ecee5..11f9b91a77a0f05df80c8599fc74648b3856995a 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -11,9 +11,10 @@
 namespace op
 {
     PoseExtractorCaffe::PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize, const Point<int>& outputSize, const int scaleNumber,
-                                           const float scaleGap, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes,
+                                           const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes,
                                            const ScaleMode heatMapScale) :
         PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale},
+        mResizeScale{mNetOutputSize.x / (float)netInputSize.x},
         spNet{std::make_shared<NetCaffe>(std::array<int,4>{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x},
                                          modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)},
         spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
@@ -22,9 +23,10 @@ namespace op
     {
         try
         {
-            checkE(netOutputSize.x, netInputSize.x, "Net input and output size must be equal.", __LINE__, __FUNCTION__, __FILE__);
-            checkE(netOutputSize.y, netInputSize.y, "Net input and output size must be equal.", __LINE__, __FUNCTION__, __FILE__);
-            spResizeAndMergeCaffe->setScaleGap(scaleGap);
+            const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x;
+            const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y);
+            if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6)
+                error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)
         {
@@ -49,7 +51,7 @@ namespace op
 
             // HeatMaps extractor blob and layer
             spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
-            spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]);
+            spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
 
             // Pose extractor blob and layer
@@ -71,7 +73,7 @@ namespace op
         }
     }
 
-    void PoseExtractorCaffe::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize)
+    void PoseExtractorCaffe::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios)
     {
         try
         {
@@ -83,6 +85,7 @@ namespace op
             spNet->forwardPass(inputNetData.getConstPtr());                                                     // ~79.3836ms
 
             // 2. Resize heat maps + merge different scales
+            spResizeAndMergeCaffe->setScaleRatios(scaleRatios);
             #ifndef CPU_ONLY
                 spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()});       // ~5ms
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
diff --git a/src/openpose/pose/poseRenderGpu.cu b/src/openpose/pose/poseRenderGpu.cu
index 7a5cea0cee7de65eede0d0bef02228bc6222ab6b..b01c7be914c90a5219d94748632c5e1a01392a33 100644
--- a/src/openpose/pose/poseRenderGpu.cu
+++ b/src/openpose/pose/poseRenderGpu.cu
@@ -244,7 +244,7 @@ namespace op
             const auto ySource = (y + 0.5f) / scaleToKeepRatio - 0.5f;
             const auto heatMapOffset = part * widthHeatMap * heightHeatMap;
             const auto* const heatMapPtrOffsetted = heatMapPtr + heatMapOffset;
-            const auto interpolatedValue = cubicResize(heatMapPtrOffsetted, xSource, ySource, widthHeatMap, heightHeatMap, widthHeatMap);
+            const auto interpolatedValue = bicubicInterpolate(heatMapPtrOffsetted, xSource, ySource, widthHeatMap, heightHeatMap, widthHeatMap);
 
             float rgbColor[3];
             getColorHeatMap(rgbColor, interpolatedValue, 0.f, 1.f);