Commit 3b15986a authored by gineshidalgo99

Bug fixed: multi-scale was introducing offset

Parent 204615ef
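In short, this commit stops assuming a fixed `scale_gap` inside the resize-and-merge step and instead records the actual per-scale resize ratios (`Datum::scaleRatios`) when the input is formatted, passing them through to the pose extractor. A minimal sketch of that idea in plain C++ (hypothetical helper, loosely mirroring the `CvMatToOpInput::format` hunk further down; not the library API):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Sketch: compute per-scale ratios relative to the first (largest) scale.
// Simplification: assumes the width is the limiting dimension; the real code
// also accounts for the input image size via resizeGetScaleFactor.
std::vector<float> computeRelativeScaleRatios(const int numScales, const float scaleGap,
                                              const int netInputWidth, const int imageWidth)
{
    std::vector<float> scaleRatios(numScales, 1.f);
    for (auto i = 0; i < numScales; i++)
    {
        const auto currentScale = 1.f - i * scaleGap;                 // e.g. 1.0, 0.85, 0.70, ...
        // Snap the target width down to a multiple of 16 (the net stride)
        const auto targetWidth = std::max(16, int(std::round(netInputWidth * currentScale)) / 16 * 16);
        // Absolute resize factor from the original image to this scale
        scaleRatios[i] = targetWidth / float(imageWidth);
    }
    // Express each ratio relative to the first scale, so scaleRatios[0] == 1.f
    for (auto i = numScales - 1; i >= 0; i--)
        scaleRatios[i] /= scaleRatios[0];
    return scaleRatios;
}
```

Because the merge kernel now receives these measured ratios instead of re-deriving each scale's extent from `scale_gap`, the rounding to multiples of 16 no longer introduces an offset between scales.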
......@@ -23,7 +23,7 @@ OpenPose is freely available for free non-commercial use, and may be redistribut
Library main functionality:
* Multi-person 15 or **18-keypoint body pose** estimation and rendering. **Running time invariant of number of people** on the image.
* Multi-person 15 or **18-keypoint body pose** estimation and rendering. **Running time invariant to number of people** on the image.
* Multi-person **2x21-keypoint hand** estimation and rendering. Note: In this initial version, **running time** linearly **depends** on the **number of people** on the image. **Coming soon (in around 1-5 weeks)!**
......@@ -76,8 +76,6 @@ The pose estimation work is based on the C++ code from [the ECCV 2016 demo](http
2. [OpenPose Wrapper](#openpose-wrapper)
3. [OpenPose Library](#openpose-library)
4. [Output](#output)
1. [Output Format](#output-format)
2. [Reading Saved Results](#reading-saved-results)
5. [OpenPose Benchmark](#openpose-benchmark)
6. [Send Us Your Feedback!](#send-us-your-feedback)
7. [Citation](#citation)
......
......@@ -41,5 +41,16 @@ OpenPose Library - Release Notes
3. Main bugs fixed:
1. All visualization functions moved to same thread, so it works with most OpenCV custom compiled versions.
2. Fixed error on debug mode: `Too many resources requested for launch`.
3. Bug in Array::getConstCvMat() if mVolume=0, now returning empty cv::Mat.
4. Bug: `--process_real_time` threw error with webcam.
## Current version (future OpenPose 1.0.0rc4)
1. Main improvements:
1. Check() functions give more feedback.
2. Improved documentation.
2. Functions or parameters renamed:
1. `Datum::scaleRatios` to save the relative scale ratio when multi-scale.
3. Main bugs fixed:
1. Fixed bug in Array::getConstCvMat() if mVolume=0, now returning empty cv::Mat.
2. Fixed bug: `--process_real_time` threw error with webcam.
3. Fixed bug: Face not working with output resolution different to input.
......@@ -28,7 +28,7 @@ DEFINE_int32(logging_level, 3, "The logging level. Inte
DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image.");
// OpenPose
DEFINE_string(model_pose, "COCO", "Model to be used (e.g. COCO, MPI, MPI_4_layers).");
DEFINE_string(model_folder, "models/", "Folder where the pose models (COCO and MPI) are located.");
DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy usually increases. If it is decreased, the speed increases.");
DEFINE_string(resolution, "1280x720", "The image resolution (display). Use \"-1x-1\" to force the program to use the default images resolution.");
DEFINE_int32(num_gpu_start, 0, "GPU device start number.");
......@@ -98,7 +98,7 @@ int openPoseTutorialPose1()
// Step 3 - Initialize all required classes
op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_num_scales, (float)FLAGS_scale_gap};
op::CvMatToOpOutput cvMatToOpOutput{outputSize};
op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_num_scales, (float)FLAGS_scale_gap, poseModel,
op::PoseExtractorCaffe poseExtractorCaffe{netInputSize, netOutputSize, outputSize, FLAGS_num_scales, poseModel,
FLAGS_model_folder, FLAGS_num_gpu_start};
op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, nullptr, (float)FLAGS_alpha_pose};
op::OpOutputToCvMat opOutputToCvMat{outputSize};
......@@ -114,12 +114,14 @@ int openPoseTutorialPose1()
if(inputImage.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Format input image to OpenPose input and output formats
const auto netInputArray = cvMatToOpInput.format(inputImage);
op::Array<float> netInputArray;
std::vector<float> scaleRatios;
std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
double scaleInputToOutput;
op::Array<float> outputArray;
std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
// Step 3 - Estimate poseKeypoints
poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows});
poseExtractorCaffe.forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
const auto poseKeypoints = poseExtractorCaffe.getPoseKeypoints();
// Step 4 - Render poseKeypoints
poseRenderer.renderPose(outputArray, poseKeypoints);
......
......@@ -28,7 +28,7 @@ DEFINE_int32(logging_level, 3, "The logging level. Inte
DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image.");
// OpenPose
DEFINE_string(model_pose, "COCO", "Model to be used (e.g. COCO, MPI, MPI_4_layers).");
DEFINE_string(model_folder, "models/", "Folder where the pose models (COCO and MPI) are located.");
DEFINE_string(model_folder, "models/", "Folder path (absolute or relative) where the models (pose, face, ...) are located.");
DEFINE_string(net_resolution, "656x368", "Multiples of 16. If it is increased, the accuracy usually increases. If it is decreased, the speed increases.");
DEFINE_string(resolution, "1280x720", "The image resolution (display). Use \"-1x-1\" to force the program to use the default images resolution.");
DEFINE_int32(num_gpu_start, 0, "GPU device start number.");
......@@ -101,8 +101,7 @@ int openPoseTutorialPose2()
op::CvMatToOpInput cvMatToOpInput{netInputSize, FLAGS_num_scales, (float)FLAGS_scale_gap};
op::CvMatToOpOutput cvMatToOpOutput{outputSize};
std::shared_ptr<op::PoseExtractor> poseExtractorPtr = std::make_shared<op::PoseExtractorCaffe>(netInputSize, netOutputSize, outputSize, FLAGS_num_scales,
(float)FLAGS_scale_gap, poseModel,
FLAGS_model_folder, FLAGS_num_gpu_start);
poseModel, FLAGS_model_folder, FLAGS_num_gpu_start);
op::PoseRenderer poseRenderer{netOutputSize, outputSize, poseModel, poseExtractorPtr, (float)FLAGS_alpha_pose, (float)FLAGS_alpha_heatmap};
poseRenderer.setElementToRender(FLAGS_part_to_show);
op::OpOutputToCvMat opOutputToCvMat{outputSize};
......@@ -118,12 +117,14 @@ int openPoseTutorialPose2()
if(inputImage.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
// Step 2 - Format input image to OpenPose input and output formats
const auto netInputArray = cvMatToOpInput.format(inputImage);
op::Array<float> netInputArray;
std::vector<float> scaleRatios;
std::tie(netInputArray, scaleRatios) = cvMatToOpInput.format(inputImage);
double scaleInputToOutput;
op::Array<float> outputArray;
std::tie(scaleInputToOutput, outputArray) = cvMatToOpOutput.format(inputImage);
// Step 3 - Estimate poseKeypoints
poseExtractorPtr->forwardPass(netInputArray, {inputImage.cols, inputImage.rows});
poseExtractorPtr->forwardPass(netInputArray, {inputImage.cols, inputImage.rows}, scaleRatios);
const auto poseKeypoints = poseExtractorPtr->getPoseKeypoints();
const auto scaleNetToOutput = poseExtractorPtr->getScaleNetToOutput();
// Step 4 - Render pose
......
#ifndef OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
#define OPENPOSE_CORE_CV_MAT_TO_OP_INPUT_HPP
#include <utility> // std::pair
#include <vector>
#include <opencv2/core/core.hpp> // cv::Mat
#include "array.hpp"
......@@ -13,7 +14,7 @@ namespace op
public:
CvMatToOpInput(const Point<int>& netInputResolution, const int scaleNumber = 1, const float scaleGap = 0.25);
Array<float> format(const cv::Mat& cvInputData) const;
std::pair<Array<float>, std::vector<float>> format(const cv::Mat& cvInputData) const;
private:
const int mScaleNumber;
......
......@@ -74,7 +74,7 @@ namespace op
/**
* Face detection locations (x,y,width,height) for each person in the image.
* It has been resized to the same resolution as `poseKeypoints`.
* It is resized to cvInputData.size().
* Size: #people
*/
std::vector<Rectangle<float>> faceRectangles;
......@@ -86,6 +86,13 @@ namespace op
*/
Array<float> faceKeypoints;
/**
* Hand detection locations (x,y,width,height) for each person in the image.
* It is resized to cvInputData.size().
* Size: #people
*/
std::vector<std::array<Rectangle<float>, 2>> handRectangles;
/**
* Experimental (NOT IMPLEMENTED YET)
* Hands code is in development phase. Not included in this version.
......@@ -98,6 +105,8 @@ namespace op
float scaleNetToOutput; /**< Scale ratio between the net output and the final output Datum::cvOutputData. */
std::vector<float> scaleRatios; /**< Scale ratios between each scale (e.g. flag `num_scales`). Used to resize the different scales. */
std::pair<int, std::string> elementRendered; /**< Pair with the element key id POSE_BODY_PART_MAPPING on `pose/poseParameters.hpp` and its mapped value (e.g. 1 and "Neck"). */
......@@ -167,7 +176,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator <(const Datum& datum) const
inline bool operator<(const Datum& datum) const
{
return id < datum.id;
}
......@@ -176,7 +185,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator >(const Datum& datum) const
inline bool operator>(const Datum& datum) const
{
return id > datum.id;
}
......@@ -185,7 +194,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator <=(const Datum& datum) const
inline bool operator<=(const Datum& datum) const
{
return id <= datum.id;
}
......@@ -194,7 +203,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator >=(const Datum& datum) const
inline bool operator>=(const Datum& datum) const
{
return id >= datum.id;
}
......@@ -203,7 +212,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator ==(const Datum& datum) const
inline bool operator==(const Datum& datum) const
{
return id == datum.id;
}
......@@ -212,7 +221,7 @@ namespace op
* @param datum Datum to be compared.
* @result Whether the instance satisfies the condition with respect to datum.
*/
inline bool operator !=(const Datum& datum) const
inline bool operator!=(const Datum& datum) const
{
return id != datum.id;
}
......
......@@ -2,14 +2,17 @@
#define OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
#include <array>
#include <vector>
namespace op
{
template <typename T>
void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap = 0.f);
void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const std::vector<T>& scaleRatios = {1});
template <typename T>
void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap = 0.f);
void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const std::vector<T>& scaleRatios = {1});
}
#endif // OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
......@@ -3,12 +3,14 @@
#define OPENPOSE_CORE_RESIZE_AND_MERGE_CAFFE_HPP
#include <array>
#include <vector>
#include <caffe/blob.hpp>
#include <openpose/utilities/macros.hpp>
namespace op
{
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the compatibility with any generic Caffe version,
// It mostly follows the Caffe::layer implementation, so Caffe users can easily use it. However, in order to keep the
// compatibility with any generic Caffe version,
// we keep this 'layer' inside our library rather than in the Caffe code.
template <typename T>
class ResizeAndMergeCaffe
......@@ -18,22 +20,25 @@ namespace op
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const float factor, const bool mergeFirstDimension = true);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const float factor, const bool mergeFirstDimension = true);
virtual inline const char* type() const { return "ResizeAndMerge"; }
void setScaleGap(const T scaleGap);
void setScaleRatios(const std::vector<T>& scaleRatios);
virtual void Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom);
virtual void Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
private:
T mScaleGap;
std::vector<T> mScaleRatios;
std::array<int, 4> mBottomSize;
std::array<int, 4> mTopSize;
......
......@@ -60,7 +60,7 @@ namespace op
const auto profilerKey = Profiler::timerInit(__LINE__, __FUNCTION__, __FILE__);
// cv::Mat -> float*
for (auto& tDatum : *tDatums)
tDatum.inputNetData = spCvMatToOpInput->format(tDatum.cvInputData);
std::tie(tDatum.inputNetData, tDatum.scaleRatios) = spCvMatToOpInput->format(tDatum.cvInputData);
// Profiling speed
Profiler::timerEnd(profilerKey);
Profiler::printAveragedTimeMsOnIterationX(profilerKey, __LINE__, __FUNCTION__, __FILE__, Profiler::DEFAULT_X);
......
......@@ -22,7 +22,7 @@ namespace op
void initializationOnThread();
virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize) = 0;
virtual void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f}) = 0;
virtual const float* getHeatMapCpuConstPtr() const = 0;
......
......@@ -20,14 +20,14 @@ namespace op
{
public:
PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize, const Point<int>& outputSize, const int scaleNumber,
const float scaleGap, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes = {},
const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes = {},
const ScaleMode heatMapScale = ScaleMode::ZeroToOne);
virtual ~PoseExtractorCaffe();
void netInitializationOnThread();
void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize);
void forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios = {1.f});
const float* getHeatMapCpuConstPtr() const;
......@@ -36,6 +36,7 @@ namespace op
const float* getPoseGpuConstPtr() const;
private:
const float mResizeScale;
std::shared_ptr<Net> spNet;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<NmsCaffe<float>> spNmsCaffe;
......
......@@ -61,7 +61,7 @@ namespace op
// Extract people pose
for (auto& tDatum : *tDatums)
{
spPoseExtractor->forwardPass(tDatum.inputNetData, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
spPoseExtractor->forwardPass(tDatum.inputNetData, Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows}, tDatum.scaleRatios);
tDatum.poseHeatMaps = spPoseExtractor->getHeatMaps();
tDatum.poseKeypoints = spPoseExtractor->getPoseKeypoints();
tDatum.scaleNetToOutput = spPoseExtractor->getScaleNetToOutput();
......
......@@ -81,7 +81,8 @@ namespace op
// Cubic interpolation
template <typename T>
inline __device__ void cubicSequentialData(int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource, const int width, const int height)
inline __device__ void cubicSequentialData(int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource,
const int width, const int height)
{
xIntArray[1] = fastTruncate(int(xSource + 1e-5), 0, width - 1);
xIntArray[0] = fastMax(0, xIntArray[1] - 1);
......@@ -97,7 +98,7 @@ namespace op
}
template <typename T>
inline __device__ T cubicInterpolation(const T v0, const T v1, const T v2, const T v3, const T dx)
inline __device__ T cubicInterpolate(const T v0, const T v1, const T v2, const T v3, const T dx)
{
// http://www.paulinternet.nl/?page=bicubic
// const auto a = (-0.5f * v0 + 1.5f * v1 - 1.5f * v2 + 0.5f * v3);
......@@ -108,10 +109,12 @@ namespace op
+ (v0 - 2.5f * v1 + 2.f * v2 - 0.5f * v3) * dx * dx
- 0.5f * (v0 - v2) * dx // + (-0.5f * v0 + 0.5f * v2) * dx
+ v1;
// return v1 + 0.5f * dx * (v2 - v0 + dx * (2.f * v0 - 5.f * v1 + 4.f * v2 - v3 + dx * (3.f * (v1 - v2) + v3 - v0)));
}
template <typename T>
inline __device__ T cubicResize(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, const int widthSourcePtr)
inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
const int heightSource, const int widthSourcePtr)
{
int xIntArray[4];
int yIntArray[4];
......@@ -122,16 +125,17 @@ namespace op
T temp[4];
for (unsigned char i = 0; i < 4; i++)
{
const int offset = yIntArray[i]*widthSourcePtr;
temp[i] = cubicInterpolation(sourcePtr[offset + xIntArray[0]], sourcePtr[offset + xIntArray[1]], sourcePtr[offset + xIntArray[2]], sourcePtr[offset + xIntArray[3]], dx);
const auto offset = yIntArray[i]*widthSourcePtr;
temp[i] = cubicInterpolate(sourcePtr[offset + xIntArray[0]], sourcePtr[offset + xIntArray[1]],
sourcePtr[offset + xIntArray[2]], sourcePtr[offset + xIntArray[3]], dx);
}
return cubicInterpolation(temp[0], temp[1], temp[2], temp[3], dy);
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
}
template <typename T>
inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
{
return (1 - alphaValue2) * value1 + alphaValue2 * value2;
return (1.f - alphaValue2) * value1 + alphaValue2 * value2;
}
template <typename T>
......
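A quick sanity check on the renamed `cubicInterpolate` (the same Catmull-Rom polynomial as the commented nested form in the hunk above): at `dx = 0` the result must be `v1`, and at `dx = 1` it must be `v2`. Rewritten as a host-side function purely for checking (hypothetical standalone code, not the `__device__` version):

```cpp
#include <cassert>
#include <cmath>

// Same cubic polynomial as in the kernel header above, as a plain host function.
float cubicInterpolateHost(const float v0, const float v1, const float v2, const float v3, const float dx)
{
    return (-0.5f * v0 + 1.5f * v1 - 1.5f * v2 + 0.5f * v3) * dx * dx * dx
           + (v0 - 2.5f * v1 + 2.f * v2 - 0.5f * v3) * dx * dx
           - 0.5f * (v0 - v2) * dx
           + v1;
}

int main()
{
    // The interpolant passes through the two middle control points.
    assert(std::fabs(cubicInterpolateHost(0.f, 1.f, 2.f, 3.f, 0.f) - 1.f) < 1e-6f);  // dx = 0 -> v1
    assert(std::fabs(cubicInterpolateHost(0.f, 1.f, 2.f, 3.f, 1.f) - 2.f) < 1e-6f);  // dx = 1 -> v2
    return 0;
}
```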
......@@ -552,13 +552,13 @@ namespace op
wDatumProducer = nullptr;
// Pose estimators
const Point<int>& netOutputSize = wrapperStructPose.netInputSize;
const Point<int>& poseNetOutputSize = wrapperStructPose.netInputSize;
std::vector<std::shared_ptr<PoseExtractor>> poseExtractors;
for (auto gpuId = 0; gpuId < gpuNumber; gpuId++)
poseExtractors.emplace_back(std::make_shared<PoseExtractorCaffe>(
wrapperStructPose.netInputSize, netOutputSize, finalOutputSize, wrapperStructPose.scalesNumber,
wrapperStructPose.scaleGap, wrapperStructPose.poseModel, wrapperStructPose.modelFolder,
gpuId + gpuNumberStart, wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
wrapperStructPose.netInputSize, poseNetOutputSize, finalOutputSize, wrapperStructPose.scalesNumber,
wrapperStructPose.poseModel, wrapperStructPose.modelFolder, gpuId + gpuNumberStart,
wrapperStructPose.heatMapTypes, wrapperStructPose.heatMapScale
));
// Pose renderers
......@@ -572,7 +572,7 @@ namespace op
for (auto gpuId = 0; gpuId < poseExtractors.size(); gpuId++)
{
poseRenderers.emplace_back(std::make_shared<PoseRenderer>(
netOutputSize, finalOutputSize, wrapperStructPose.poseModel, poseExtractors[gpuId],
poseNetOutputSize, finalOutputSize, wrapperStructPose.poseModel, poseExtractors[gpuId],
wrapperStructPose.blendOriginalFrame, alphaKeypoint,
alphaHeatMap, wrapperStructPose.defaultPartToRender
));
......@@ -678,7 +678,7 @@ namespace op
// Re-scale pose if desired
if (wrapperStructPose.keypointScale != ScaleMode::OutputResolution
&& (wrapperStructPose.keypointScale != ScaleMode::InputResolution || (finalOutputSize != producerSize))
&& (wrapperStructPose.keypointScale != ScaleMode::NetOutputResolution || (finalOutputSize != netOutputSize)))
&& (wrapperStructPose.keypointScale != ScaleMode::NetOutputResolution || (finalOutputSize != poseNetOutputSize)))
{
auto keypointScaler = std::make_shared<KeypointScaler>(wrapperStructPose.keypointScale);
mPostProcessingWs.emplace_back(std::make_shared<WKeypointScaler<TDatumsPtr>>(keypointScaler));
......
......@@ -10,9 +10,19 @@ namespace op
mScaleGap{scaleGap},
mInputNetSize4D{{mScaleNumber, 3, netInputResolution.y, netInputResolution.x}}
{
try
{
// Security checks
if (netInputResolution.x % 16 != 0 || netInputResolution.y % 16 != 0)
error("Net input resolution must be multiples of 16.", __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
Array<float> CvMatToOpInput::format(const cv::Mat& cvInputData) const
std::pair<Array<float>, std::vector<float>> CvMatToOpInput::format(const cv::Mat& cvInputData) const
{
try
{
......@@ -22,29 +32,35 @@ namespace op
// inputNetData - Reescale keeping aspect ratio and transform to float the input deep net image
Array<float> inputNetData{mInputNetSize4D};
std::vector<float> scaleRatios(mScaleNumber, 1.f);
const auto inputNetDataOffset = inputNetData.getVolume(1, 3);
for (auto i = 0; i < mScaleNumber; i++)
{
const auto requestedScale = 1.f - i*mScaleGap;
if (requestedScale > 1.f)
error("All scales must be <= 1, i.e. 1-num_scales*scale_gap <= 1", __LINE__, __FUNCTION__, __FILE__);
const auto currentScale = 1.f - i*mScaleGap;
if (currentScale < 0.f || 1.f < currentScale)
error("All scales must be in the range [0, 1], i.e. 0 <= 1-num_scales*scale_gap <= 1", __LINE__, __FUNCTION__, __FILE__);
const auto netInputWidth = inputNetData.getSize(3);
const auto targetWidth = fastTruncate(16 * intRound(netInputWidth * requestedScale / 16.), 1, netInputWidth/16*16);
const auto targetWidth = fastTruncate(intRound(netInputWidth * currentScale) / 16 * 16, 1, netInputWidth);
const auto netInputHeight = inputNetData.getSize(2);
const auto targetHeight = fastTruncate(16 * intRound(netInputHeight * requestedScale / 16.), 1, netInputHeight/16*16);
const auto targetHeight = fastTruncate(intRound(netInputHeight * currentScale) / 16 * 16, 1, netInputHeight);
const Point<int> targetSize{targetWidth, targetHeight};
const auto scale = resizeGetScaleFactor(Point<int>{cvInputData.cols, cvInputData.rows}, targetSize);
const cv::Mat frameWithNetSize = resizeFixedAspectRatio(cvInputData, scale, Point<int>{netInputWidth, netInputHeight});
// Fill inputNetData
uCharCvMatToFloatPtr(inputNetData.getPtr() + i * inputNetDataOffset, frameWithNetSize, true);
// Fill scaleRatios
scaleRatios[i] = scale;
if (i > 0)
scaleRatios[i] /= scaleRatios[0];
}
return inputNetData;
scaleRatios.at(0) /= scaleRatios[0];
return std::make_pair(inputNetData, scaleRatios);
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
return Array<float>{};
return std::make_pair(Array<float>{}, std::vector<float>{});
}
}
}
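To see why a single `scale_gap` was not enough, here is a rough illustration (hypothetical standalone program; it skips the per-image `resizeGetScaleFactor` step and only shows the effect of snapping each scaled width to a multiple of 16, as in the hunk above):

```cpp
#include <cmath>
#include <cstdio>

int main()
{
    const int numScales = 3;
    const float scaleGap = 0.15f;
    const int netInputWidth = 656;   // e.g. from --net_resolution 656x368
    float firstWidth = 0.f;
    for (int i = 0; i < numScales; i++)
    {
        const float nominalScale = 1.f - i * scaleGap;
        // Width actually used by the net for this scale (multiple of 16)
        const float snappedWidth = float(int(std::round(netInputWidth * nominalScale)) / 16 * 16);
        if (i == 0)
            firstWidth = snappedWidth;
        std::printf("scale %d: nominal %.3f, actual relative ratio %.3f\n",
                    i, nominalScale, snappedWidth / firstWidth);
    }
    return 0;
}
```

The nominal scales (1.00, 0.85, 0.70) and the ratios actually produced after snapping (1.00, ~0.83, ~0.68) disagree; the old merge kernel used the nominal values, which is consistent with the offset described in the commit message.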
......@@ -22,10 +22,12 @@ namespace op
poseHeatMaps{datum.poseHeatMaps},
faceRectangles{datum.faceRectangles},
faceKeypoints{datum.faceKeypoints},
handRectangles{datum.handRectangles},
handKeypoints{datum.handKeypoints},
// Other parameters
scaleInputToOutput{datum.scaleInputToOutput},
scaleNetToOutput{datum.scaleNetToOutput},
scaleRatios{datum.scaleRatios},
elementRendered{datum.elementRendered}
{
}
......@@ -48,10 +50,12 @@ namespace op
poseHeatMaps = datum.poseHeatMaps,
faceRectangles = datum.faceRectangles,
faceKeypoints = datum.faceKeypoints,
handRectangles = datum.handRectangles,
handKeypoints = datum.handKeypoints,
// Other parameters
scaleInputToOutput = datum.scaleInputToOutput;
scaleNetToOutput = datum.scaleNetToOutput;
scaleRatios = datum.scaleRatios;
elementRendered = datum.elementRendered;
// Return
return *this;
......@@ -85,8 +89,10 @@ namespace op
std::swap(poseHeatMaps, datum.poseHeatMaps);
std::swap(faceRectangles, datum.faceRectangles);
std::swap(faceKeypoints, datum.faceKeypoints);
std::swap(handRectangles, datum.handRectangles);
std::swap(handKeypoints, datum.handKeypoints);
// Other parameters
std::swap(scaleRatios, datum.scaleRatios);
std::swap(elementRendered, datum.elementRendered);
}
catch (const std::exception& e)
......@@ -113,10 +119,12 @@ namespace op
std::swap(poseHeatMaps, datum.poseHeatMaps);
std::swap(faceRectangles, datum.faceRectangles);
std::swap(faceKeypoints, datum.faceKeypoints);
std::swap(handRectangles, datum.handRectangles);
std::swap(handKeypoints, datum.handKeypoints);
// Other parameters
scaleInputToOutput = datum.scaleInputToOutput;
scaleNetToOutput = datum.scaleNetToOutput;
std::swap(scaleRatios, datum.scaleRatios);
std::swap(elementRendered, datum.elementRendered);
// Return
return *this;
......@@ -151,10 +159,12 @@ namespace op
datum.poseHeatMaps = poseHeatMaps.clone();
datum.faceRectangles = faceRectangles;
datum.faceKeypoints = faceKeypoints.clone();
datum.handRectangles = handRectangles;
datum.handKeypoints = handKeypoints.clone();
// Other parameters
datum.scaleInputToOutput = scaleInputToOutput;
datum.scaleNetToOutput = scaleNetToOutput;
datum.scaleRatios = scaleRatios;
datum.elementRendered = elementRendered;
// Return
return std::move(datum);
......
......@@ -6,13 +6,14 @@
namespace op
{
template <typename T>
void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap)
void resizeAndMergeCpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
{
try
{
UNUSED(targetPtr);
UNUSED(sourcePtr);
UNUSED(scaleGap);
UNUSED(scaleRatios);
UNUSED(targetSize);
UNUSED(sourceSize);
error("CPU version not completely implemented.", __LINE__, __FUNCTION__, __FILE__);
......@@ -61,6 +62,8 @@ namespace op
}
}
template void resizeAndMergeCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const float scaleGap);
template void resizeAndMergeCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const double scaleGap);
template void resizeAndMergeCpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
template void resizeAndMergeCpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
}
......@@ -8,7 +8,8 @@ namespace op
const auto THREADS_PER_BLOCK_1D = 16u;
template <typename T>
__global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight)
__global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
const int targetHeight)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
......@@ -20,12 +21,12 @@ namespace op
const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
targetPtr[y*targetWidth+x] = cubicResize(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
}
}
template <typename T>
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T scaleGap,
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const int sourceNumOffset, const int num, const T* scaleRatios,
const int sourceWidth, const int sourceHeight, const int targetWidth, const int targetHeight)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
......@@ -38,17 +39,17 @@ namespace op
// targetPixel = -1000.f; // For fastMax
for (auto n = 0; n < num; n++)
{
const auto numberScale = 1 - n * scaleGap;
const auto widthPaddedSource = int(sourceWidth * numberScale);
const auto heightPaddedSource = int(sourceHeight * numberScale);
const auto currentWidth = sourceWidth * scaleRatios[n];
const auto currentHeight = sourceHeight * scaleRatios[n];
const auto scaleWidth = targetWidth / T(widthPaddedSource);
const auto scaleHeight = targetHeight / T(heightPaddedSource);
const auto scaleWidth = targetWidth / currentWidth;
const auto scaleHeight = targetHeight / currentHeight;
const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
const T* const sourcePtrN = sourcePtr + n * sourceNumOffset;
const auto interpolated = cubicResize(sourcePtrN, xSource, ySource, widthPaddedSource, heightPaddedSource, sourceWidth);
const auto interpolated = bicubicInterpolate(sourcePtrN, xSource, ySource, intRound(currentWidth),
intRound(currentHeight), sourceWidth);
targetPixel += interpolated;
// targetPixel = fastMax(targetPixel, interpolated);
}
......@@ -57,7 +58,8 @@ namespace op
}
template <typename T>
void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const T scaleGap)
void resizeAndMergeGpu(T* targetPtr, const T* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<T>& scaleRatios)
{
try
{
......@@ -73,21 +75,42 @@ namespace op
const auto sourceChannelOffset = sourceHeight * sourceWidth;
const auto targetChannelOffset = targetWidth * targetHeight;
// No multi-scale merging
if (targetSize[0] > 1)
{
for (auto n = 0; n < num; n++)
for (auto c = 0; c < channels; c++)
resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + (n*channels + c) * targetChannelOffset, sourcePtr + (n*channels + c) * sourceChannelOffset,
{
const auto offsetBase = n*channels;
for (auto c = 0 ; c < channels ; c++)
{
const auto offset = offsetBase + c;
resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + offset * targetChannelOffset,
sourcePtr + offset * sourceChannelOffset,
sourceWidth, sourceHeight, targetWidth, targetHeight);
}
}
}
// Multi-scale merging
else
{
if (scaleGap <= 0.f && num != targetSize[0])
error("The scale gap must be greater than 0.", __LINE__, __FUNCTION__, __FILE__);
// If num_scales > 1 --> scaleRatios must be set
if (scaleRatios.size() != num)
error("The scale ratios size must be equal to the number of scales.", __LINE__, __FUNCTION__, __FILE__);
const auto maxScales = 10;
if (scaleRatios.size() > maxScales)
error("The maximum number of scales is " + std::to_string(maxScales) + ".", __LINE__, __FUNCTION__, __FILE__);
// Copy scaleRatios
T* scaleRatiosGpuPtr;
cudaMalloc((void**)&scaleRatiosGpuPtr, maxScales * sizeof(T));
cudaMemcpy(scaleRatiosGpuPtr, scaleRatios.data(), scaleRatios.size() * sizeof(T), cudaMemcpyHostToDevice);
// Perform resize + merging
const auto sourceNumOffset = channels * sourceChannelOffset;
for (auto c = 0; c < channels; c++)
resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset, sourcePtr + c * sourceChannelOffset, sourceNumOffset,
num, scaleGap, sourceWidth, sourceHeight, targetWidth, targetHeight);
for (auto c = 0 ; c < channels ; c++)
resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset,
sourcePtr + c * sourceChannelOffset, sourceNumOffset,
num, scaleRatiosGpuPtr, sourceWidth, sourceHeight, targetWidth, targetHeight);
// Free memory
cudaFree(scaleRatiosGpuPtr);
}
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
......@@ -98,6 +121,8 @@ namespace op
}
}
template void resizeAndMergeGpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const float scaleGap);
template void resizeAndMergeGpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const double scaleGap);
template void resizeAndMergeGpu(float* targetPtr, const float* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<float>& scaleRatios);
template void resizeAndMergeGpu(double* targetPtr, const double* const sourcePtr, const std::array<int, 4>& targetSize,
const std::array<int, 4>& sourceSize, const std::vector<double>& scaleRatios);
}
......@@ -8,7 +8,8 @@
namespace op
{
template <typename T>
ResizeAndMergeCaffe<T>::ResizeAndMergeCaffe()
ResizeAndMergeCaffe<T>::ResizeAndMergeCaffe() :
mScaleRatios{1}
{
}
......@@ -29,7 +30,8 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top, const float factor, const bool mergeFirstDimension)
void ResizeAndMergeCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const float factor, const bool mergeFirstDimension)
{
try
{
......@@ -54,11 +56,11 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::setScaleGap(const T scaleGap)
void ResizeAndMergeCaffe<T>::setScaleRatios(const std::vector<T>& scaleRatios)
{
try
{
mScaleGap = {scaleGap};
mScaleRatios = {scaleRatios};
}
catch (const std::exception& e)
{
......@@ -71,7 +73,7 @@ namespace op
{
try
{
resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize, mScaleGap);
resizeAndMergeCpu(top.at(0)->mutable_cpu_data(), bottom.at(0)->cpu_data(), mTopSize, mBottomSize, mScaleRatios);
}
catch (const std::exception& e)
{
......@@ -84,7 +86,7 @@ namespace op
{
try
{
resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize, mScaleGap);
resizeAndMergeGpu(top.at(0)->mutable_gpu_data(), bottom.at(0)->gpu_data(), mTopSize, mBottomSize, mScaleRatios);
}
catch (const std::exception& e)
{
......@@ -93,7 +95,8 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
......@@ -109,7 +112,8 @@ namespace op
}
template <typename T>
void ResizeAndMergeCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down, const std::vector<caffe::Blob<T>*>& bottom)
void ResizeAndMergeCaffe<T>::Backward_gpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
{
try
{
......
......@@ -11,9 +11,10 @@
namespace op
{
PoseExtractorCaffe::PoseExtractorCaffe(const Point<int>& netInputSize, const Point<int>& netOutputSize, const Point<int>& outputSize, const int scaleNumber,
const float scaleGap, const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes,
const PoseModel poseModel, const std::string& modelFolder, const int gpuId, const std::vector<HeatMapType>& heatMapTypes,
const ScaleMode heatMapScale) :
PoseExtractor{netOutputSize, outputSize, poseModel, heatMapTypes, heatMapScale},
mResizeScale{mNetOutputSize.x / (float)netInputSize.x},
spNet{std::make_shared<NetCaffe>(std::array<int,4>{scaleNumber, 3, (int)netInputSize.y, (int)netInputSize.x},
modelFolder + POSE_PROTOTXT[(int)poseModel], modelFolder + POSE_TRAINED_MODEL[(int)poseModel], gpuId)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
......@@ -22,9 +23,10 @@ namespace op
{
try
{
checkE(netOutputSize.x, netInputSize.x, "Net input and output size must be equal.", __LINE__, __FUNCTION__, __FILE__);
checkE(netOutputSize.y, netInputSize.y, "Net input and output size must be equal.", __LINE__, __FUNCTION__, __FILE__);
spResizeAndMergeCaffe->setScaleGap(scaleGap);
const auto resizeScale = mNetOutputSize.x / (float)netInputSize.x;
const auto resizeScaleCheck = resizeScale / (mNetOutputSize.y/(float)netInputSize.y);
if (1+1e-6 < resizeScaleCheck || resizeScaleCheck < 1-1e-6)
error("Net input and output size must be proportional. resizeScaleCheck = " + std::to_string(resizeScaleCheck), __LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
{
......@@ -49,7 +51,7 @@ namespace op
// HeatMaps extractor blob and layer
spHeatMapsBlob = {std::make_shared<caffe::Blob<float>>(1,1,1,1)};
spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]);
spResizeAndMergeCaffe->Reshape({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}, mResizeScale * POSE_CCN_DECREASE_FACTOR[(int)mPoseModel]);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
// Pose extractor blob and layer
......@@ -71,7 +73,7 @@ namespace op
}
}
void PoseExtractorCaffe::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize)
void PoseExtractorCaffe::forwardPass(const Array<float>& inputNetData, const Point<int>& inputDataSize, const std::vector<float>& scaleRatios)
{
try
{
......@@ -83,6 +85,7 @@ namespace op
spNet->forwardPass(inputNetData.getConstPtr()); // ~79.3836ms
// 2. Resize heat maps + merge different scales
spResizeAndMergeCaffe->setScaleRatios(scaleRatios);
#ifndef CPU_ONLY
spResizeAndMergeCaffe->Forward_gpu({spCaffeNetOutputBlob.get()}, {spHeatMapsBlob.get()}); // ~5ms
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
......
......@@ -244,7 +244,7 @@ namespace op
const auto ySource = (y + 0.5f) / scaleToKeepRatio - 0.5f;
const auto heatMapOffset = part * widthHeatMap * heightHeatMap;
const auto* const heatMapPtrOffsetted = heatMapPtr + heatMapOffset;
const auto interpolatedValue = cubicResize(heatMapPtrOffsetted, xSource, ySource, widthHeatMap, heightHeatMap, widthHeatMap);
const auto interpolatedValue = bicubicInterpolate(heatMapPtrOffsetted, xSource, ySource, widthHeatMap, heightHeatMap, widthHeatMap);
float rgbColor[3];
getColorHeatMap(rgbColor, interpolatedValue, 0.f, 1.f);
......