Part candidates at same scale as keypoints

27903fb3 · gineshidalgo99 · 29ea7e24 · 27903fb3 · 27903fb3 · 27903fb3
16 changed files
--- a/doc/3d_reconstruction_demo.md
+++ b/doc/3d_reconstruction_demo.md
@@ -74,7 +74,7 @@ In order to verify that the camera parameters introduced by the user are sorted
 ## Installing the OpenPose 3-D Reconstruction Module
-Check the [doc/installation.md#3d-reconstruction-module](./quick_start.md#3d-reconstruction-module) for installation steps.
+Check the [doc/installation.md#3d-reconstruction-module](./installation.md#3d-reconstruction-module) for installation steps.

--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -218,6 +218,7 @@ OpenPose Library - Release Notes
    1. Removed scale parameter from hand and face rectangle extractor (causing wrong results if custom `--output_resolution`).
 3. Main bugs fixed:
    1. Hand and face work properly again with any `--output_resolution`.
+    2. Part candidates (`--part_candidates`) are saved with the same scale as the final keypoints themselves.

--- a/include/openpose/core/keypointScaler.hpp
+++ b/include/openpose/core/keypointScaler.hpp
@@ -17,6 +17,9 @@ namespace op
        void scale(std::vector<Array<float>>& arraysToScale, const double scaleInputToOutput,
                   const double scaleNetToOutput, const Point<int>& producerSize) const;
+        void scale(std::vector<std::vector<std::array<float,3>>>& poseCandidates, const double scaleInputToOutput,
+                   const double scaleNetToOutput, const Point<int>& producerSize) const;
    private:
        const ScaleMode mScaleMode;
    };

--- a/include/openpose/core/wKeypointScaler.hpp
+++ b/include/openpose/core/wKeypointScaler.hpp
@@ -59,6 +59,9 @@ namespace op
                                                            tDatum.handKeypoints[1], tDatum.faceKeypoints};
                    spKeypointScaler->scale(arraysToScale, tDatum.scaleInputToOutput, tDatum.scaleNetToOutput,
                                            Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
+                    // Rescale part candidates
+                    spKeypointScaler->scale(tDatum.poseCandidates, tDatum.scaleInputToOutput, tDatum.scaleNetToOutput,
+                                            Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
                }
                // Profiling speed
                Profiler::timerEnd(profilerKey);

--- a/include/openpose/net/nmsBase.hpp
+++ b/include/openpose/net/nmsBase.hpp
@@ -7,16 +7,18 @@ namespace op
 {
    template <typename T>
    OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                       const Point<T>& offset);
    template <typename T>
    OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                       const Point<T>& offset);
    template <typename T>
    OP_API void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
-                       const int gpuID = 0);
+                       const Point<T>& offset, const int gpuID = 0);
 }
 #endif // OPENPOSE_NET_NMS_BASE_HPP
--- a/include/openpose/net/nmsCaffe.hpp
+++ b/include/openpose/net/nmsCaffe.hpp
@@ -25,6 +25,9 @@ namespace op
        void setThreshold(const T threshold);
+        // Empirically gives better results (copied from Matlab original code)
+        void setOffset(const Point<T>& offset);
        virtual void Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
        virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
@@ -39,6 +42,7 @@ namespace op
    private:
        T mThreshold;
+        Point<T> mOffset;
        int mGpuID;
        // PIMPL idiom

--- a/include/openpose/utilities/openCv.hpp
+++ b/include/openpose/utilities/openCv.hpp
@@ -8,7 +8,7 @@
 namespace op
 {
    OP_API void putTextOnCvMat(cv::Mat& cvMat, const std::string& textToDisplay, const Point<int>& position,
-                   const cv::Scalar& color, const bool normalizeWidth, const int imageWidth);
+                               const cv::Scalar& color, const bool normalizeWidth, const int imageWidth);
    OP_API void floatPtrToUCharCvMat(cv::Mat& uCharCvMat, const float* const floatPtrImage,
                                     const std::array<int, 3> resolutionSize);

--- a/src/openpose/core/keypointScaler.cpp
+++ b/src/openpose/core/keypointScaler.cpp
@@ -3,6 +3,40 @@
 namespace op
 {
+    Rectangle<float> getScaleAndOffset(const ScaleMode scaleMode, const double scaleInputToOutput,
+                                       const double scaleNetToOutput, const Point<int>& producerSize)
+    {
+        try
+        {
+            // Result is read by the scale() callers as: (x, y) = additive offset, (width, height) = scale factor.
+            // OutputResolution: zero offset, both axes multiplied by scaleInputToOutput.
+            if (scaleMode == ScaleMode::OutputResolution)
+                return Rectangle<float>{0.f, 0.f, float(scaleInputToOutput), float(scaleInputToOutput)};
+            // NetOutputResolution: zero offset, both axes multiplied by the inverse of scaleNetToOutput.
+            else if (scaleMode == ScaleMode::NetOutputResolution)
+                return Rectangle<float>{0.f, 0.f, float(1./scaleNetToOutput),
+                                        float(1./scaleNetToOutput)};
+            // [0,1]: zero offset, per-axis scale 1/(producerSize - 1), i.e. a coordinate of producerSize-1 maps to 1.
+            else if (scaleMode == ScaleMode::ZeroToOne)
+                return Rectangle<float>{0.f, 0.f, 1.f / ((float)producerSize.x - 1.f),
+                                        1.f / ((float)producerSize.y - 1.f)};
+            // [-1,1]: offset of -1 on both axes, per-axis scale 2/(producerSize - 1).
+            else if (scaleMode == ScaleMode::PlusMinusOne)
+                return Rectangle<float>{-1.f, -1.f, 2.f / ((float)producerSize.x - 1.f),
+                                        2.f / ((float)producerSize.y - 1.f)};
+            // InputResolution: identity transform (zero offset, unit scale on both axes).
+            else if (scaleMode == ScaleMode::InputResolution)
+                return Rectangle<float>{0.f, 0.f, 1.f, 1.f};
+            // Unknown mode: report it through error() and fall back to a default-constructed Rectangle.
+            error("Unknown ScaleMode selected.", __LINE__, __FUNCTION__, __FILE__);
+            return Rectangle<float>{};
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            return Rectangle<float>{};
+        }
+    }
    KeypointScaler::KeypointScaler(const ScaleMode scaleMode) :
        mScaleMode{scaleMode}
    {
@@ -29,38 +63,61 @@ namespace op
        {
            if (mScaleMode != ScaleMode::InputResolution)
            {
-                // OutputResolution
+                // Get scale and offset
-                if (mScaleMode == ScaleMode::OutputResolution)
+                const auto scaleAndOffset = getScaleAndOffset(mScaleMode, scaleInputToOutput, scaleNetToOutput,
-                {
+                                                              producerSize);
+                // Only scaling
+                if (scaleAndOffset.x == 0 && scaleAndOffset.y == 0)
                    for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, float(scaleInputToOutput));
+                        scaleKeypoints(arrayToScale, scaleAndOffset.width, scaleAndOffset.height);
-                }
+                // Scaling + offset
-                // NetOutputResolution
+                else
-                else if (mScaleMode == ScaleMode::NetOutputResolution)
-                {
                    for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, float(1./scaleNetToOutput));
+                        scaleKeypoints(arrayToScale, scaleAndOffset.width, scaleAndOffset.height,
-                }
+                                       scaleAndOffset.x, scaleAndOffset.y);
-                // [0,1]
+            }
-                else if (mScaleMode == ScaleMode::ZeroToOne)
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+        }
+    }
+    void KeypointScaler::scale(std::vector<std::vector<std::array<float,3>>>& poseCandidates,
+                               const double scaleInputToOutput, const double scaleNetToOutput,
+                               const Point<int>& producerSize) const
+    {
+        try
+        {
+            if (mScaleMode != ScaleMode::InputResolution)
+            {
+                // Get scale and offset
+                const auto scaleAndOffset = getScaleAndOffset(mScaleMode, scaleInputToOutput, scaleNetToOutput,
+                                                              producerSize);
+                // Only scaling
+                if (scaleAndOffset.x == 0 && scaleAndOffset.y == 0)
                {
-                    const auto scaleX = 1.f / ((float)producerSize.x - 1.f);
+                    for (auto& partCandidates : poseCandidates)
-                    const auto scaleY = 1.f / ((float)producerSize.y - 1.f);
+                    {
-                    for (auto& arrayToScale : arrayToScalesToScale)
+                        for (auto& candidate : partCandidates)
-                        scaleKeypoints(arrayToScale, scaleX, scaleY);
+                        {
+                            candidate[0] *= scaleAndOffset.width;
+                            candidate[1] *= scaleAndOffset.height;
+                        }
+                    }
                }
-                // [-1,1]
+                // Scaling + offset
-                else if (mScaleMode == ScaleMode::PlusMinusOne)
+                else
                {
-                    const auto scaleX = (2.f / ((float)producerSize.x - 1.f));
+                    for (auto& partCandidates : poseCandidates)
-                    const auto scaleY = (2.f / ((float)producerSize.y - 1.f));
+                    {
-                    const auto offset = -1.f;
+                        for (auto& candidate : partCandidates)
-                    for (auto& arrayToScale : arrayToScalesToScale)
+                        {
-                        scaleKeypoints(arrayToScale, scaleX, scaleY, offset, offset);
+                            candidate[0] = candidate[0]*scaleAndOffset.width + scaleAndOffset.x;
+                            candidate[1] = candidate[1]*scaleAndOffset.height + scaleAndOffset.y;
+                        }
+                    }
                }
-                // Unknown
-                else
-                    error("Unknown ScaleMode selected.", __LINE__, __FUNCTION__, __FILE__);
            }
        }
        catch (const std::exception& e)

--- a/src/openpose/net/nmsBase.cpp
+++ b/src/openpose/net/nmsBase.cpp
@@ -68,8 +68,8 @@ namespace op
    }
    template <typename T>
-    void nmsAccuratePeakPosition(const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
+    void nmsAccuratePeakPosition(T* output, const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
-                                 const int& width, const int& height, T* output)
+                                 const int& width, const int& height, const Point<T>& offset)
    {
        T xAcc = 0.f;
        T yAcc = 0.f;
@@ -98,14 +98,18 @@ namespace op
            }
        }
-        output[0] = xAcc / scoreAcc;
+        // Offset to keep Matlab format (empirically higher acc)
-        output[1] = yAcc / scoreAcc;
+        // Best results for 1 scale: x + 0, y + 0.5
+        // +0.5 to both to keep Matlab format
+        output[0] = xAcc / scoreAcc + offset.x;
+        output[1] = yAcc / scoreAcc + offset.y;
        output[2] = sourcePtr[peakLocY*width + peakLocX];
    }
    template <typename T>
    void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                const Point<T>& offset)
    {
        try
        {
@@ -149,8 +153,8 @@ namespace op
                            if (currKernelPtr[index] == 1)
                            {
                                // Accurate Peak Position
-                                nmsAccuratePeakPosition(currSourcePtr, x, y, sourceWidth, sourceHeight,
+                                nmsAccuratePeakPosition(&currTargetPtr[currentPeakCount*3], currSourcePtr, x, y,
-                                                        &currTargetPtr[currentPeakCount*3]);
+                                                        sourceWidth, sourceHeight, offset);
                                currentPeakCount++;
                            }
                        }
@@ -167,7 +171,9 @@ namespace op
    }
    template void nmsCpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset);
    template void nmsCpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset);
 }
--- a/src/openpose/net/nmsBase.cu
+++ b/src/openpose/net/nmsBase.cu
@@ -48,7 +48,8 @@ namespace op
    template <typename T>
    __global__ void writeResultKernel(T* output, const int length, const int* const kernelPtr,
-                                      const T* const sourcePtr, const int width, const int height, const int maxPeaks)
+                                      const T* const sourcePtr, const int width, const int height, const int maxPeaks,
+                                      const T offsetX, const T offsetY)
    {
        __shared__ int local[THREADS_PER_BLOCK+1]; // one more
        const auto globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -101,9 +102,12 @@ namespace op
                            }
                        }
+                        // Offset to keep Matlab format (empirically higher acc)
+                        // Best results for 1 scale: x + 0, y + 0.5
+                        // +0.5 to both to keep Matlab format
                        const auto outputIndex = (peakIndex + 1) * 3;
-                        output[outputIndex] = xAcc / scoreAcc;
+                        output[outputIndex] = xAcc / scoreAcc + offsetX;
-                        output[outputIndex + 1] = yAcc / scoreAcc;
+                        output[outputIndex + 1] = yAcc / scoreAcc + offsetY;
                        output[outputIndex + 2] = sourcePtr[peakLocY*width + peakLocX];
                    }
                }
@@ -115,7 +119,7 @@ namespace op
    template <typename T>
    void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const Point<T>& offset)
    {
        try
        {
@@ -177,7 +181,7 @@ namespace op
                    // This returns targetPtrOffsetted, with the NMS applied over it
                    writeResultKernel<<<numBlocks1D, threadsPerBlock1D>>>(targetPtrOffsetted, imageOffset,
                                                                          kernelPtrOffsetted, sourcePtrOffsetted,
-                                                                          width, height, maxPeaks);
+                                                                          width, height, maxPeaks, offset.x, offset.y);
                }
            }
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -189,7 +193,9 @@ namespace op
    }
    template void nmsGpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset);
    template void nmsGpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset);
 }
--- a/src/openpose/net/nmsBaseCL.cpp
+++ b/src/openpose/net/nmsBaseCL.cpp
@@ -13,8 +13,9 @@ namespace op
 {
    #ifdef USE_OPENCL
        const std::string nmsOclCommonFunctions = MULTI_LINE_STRING(
-            void nmsAccuratePeakPosition(__global const Type* sourcePtr, const int peakLocX, const int peakLocY,
+            void nmsAccuratePeakPosition(__global const Type* sourcePtr, Type* fx, Type* fy, Type* fscore,
-                                         const int width, const int height, Type* fx, Type* fy, Type* fscore)
+                                         const int peakLocX, const int peakLocY, const int width, const int height,
+                                         const T offsetX, const T offsetY)
            {
                Type xAcc = 0.f;
                Type yAcc = 0.f;
@@ -43,8 +44,11 @@ namespace op
                    }
                }
-                *fx = xAcc / scoreAcc;
+                // Offset to keep Matlab format (empirically higher acc)
-                *fy = yAcc / scoreAcc;
+                // Best results for 1 scale: x + 0, y + 0.5
+                // +0.5 to both to keep Matlab format
+                *fx = xAcc / scoreAcc + offsetX;
+                *fy = yAcc / scoreAcc + offsetY;
                *fscore = sourcePtr[peakLocY*width + peakLocX];
            }
@@ -85,10 +89,7 @@ namespace op
                            && value > left && value > right
                            && value > bottomLeft && value > bottom && value > bottomRight)
                        {
-                            //Type fx = 0; Type fy = 0; Type fscore = 0;
-                            //nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
                            kernelPtr[index] = 1;
-                            //if(debug) printf("%d %d \n", x,y);
                        }
                        else
                            kernelPtr[index] = 0;
@@ -104,7 +105,8 @@ namespace op
        typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int, int> NMSWriteKernelFunctor;
        const std::string nmsWriteKernel = MULTI_LINE_STRING(
            __kernel void nmsWriteKernel(__global Type* targetPtr, __global int* kernelPtr, __global const Type* sourcePtr,
-                                         const int w, const int h, const int maxPeaks, const int debug)
+                                         const int w, const int h, const int maxPeaks, const int debug,
+                                         const T offsetX, const T offsetY)
            {
                int x = get_global_id(0);
                int y = get_global_id(1);
@@ -118,7 +120,7 @@ namespace op
                        if (prev - curr)
                        {
                            Type fx = 0; Type fy = 0; Type fscore = 0;
-                            nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
+                            nmsAccuratePeakPosition(sourcePtr, &fx, &fy, &fscore, x, y, w, h, offsetX, offsetY);
                            //if (debug) printf("C %d %d %d \n", x,y,kernelPtr[index]);
                            __global Type* output = &targetPtr[curr*3];
                            output[0] = fx; output[1] = fy; output[2] = fscore;
@@ -144,7 +146,8 @@ namespace op
    template <typename T>
    void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const int gpuID)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const Point<T>& offset,
+                const int gpuID)
    {
        try
        {
@@ -229,7 +232,8 @@ namespace op
                                                                                      sizeof(int) *  width * height, &kernelCPU[0]);
                        // Write Kernel
                        nmsWriteKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
-                                          targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug);
+                                          targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug,
+                                          offset.x, offset.y);
                    }
                }
            #else
@@ -239,6 +243,7 @@ namespace op
                UNUSED(threshold);
                UNUSED(targetSize);
                UNUSED(sourceSize);
+                UNUSED(offset);
                UNUSED(gpuID);
                error("OpenPose must be compiled with the `USE_OPENCL` macro definition in order to use this"
                      " functionality.", __LINE__, __FUNCTION__, __FILE__);
@@ -258,7 +263,9 @@ namespace op
    }
    template void nmsOcl(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset, const int gpuID);
    template void nmsOcl(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset, const int gpuID);
 }
--- a/src/openpose/net/nmsCaffe.cpp
+++ b/src/openpose/net/nmsCaffe.cpp
@@ -127,6 +127,19 @@ namespace op
        }
    }
+    template <typename T>
+    void NmsCaffe<T>::setOffset(const Point<T>& offset)
+    {
+        try
+        {
+            mOffset = {offset}; // Stored copy is what the nmsCpu/nmsGpu/nmsOcl calls in this file receive
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__); // Forward the message plus call-site info to error()
+        }
+    }
    template <typename T>
    void NmsCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
    {
@@ -134,7 +147,7 @@ namespace op
        {
            #ifdef USE_CAFFE
                nmsCpu(top.at(0)->mutable_cpu_data(), upImpl->mKernelBlob.mutable_cpu_data(), bottom.at(0)->cpu_data(),
-                       mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
+                       mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset);
            #else
                UNUSED(bottom);
                UNUSED(top);
@@ -153,7 +166,7 @@ namespace op
        {
            #if defined USE_CAFFE && defined USE_CUDA
                nmsGpu(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlob.mutable_gpu_data(),
-                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
+                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset);
            #else
                UNUSED(bottom);
                UNUSED(top);
@@ -174,7 +187,8 @@ namespace op
        {
            #if defined USE_CAFFE && defined USE_OPENCL
                nmsOcl(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlobT->mutable_gpu_data(),
-                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mGpuID);
+                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset,
+                       mGpuID);
            #else
                UNUSED(bottom);
                UNUSED(top);

--- a/src/openpose/pose/bodyPartConnectorBase.cpp
+++ b/src/openpose/pose/bodyPartConnectorBase.cpp
@@ -332,10 +332,8 @@ namespace op
                    const auto bodyPartIndex = subsetI[bodyPart];
                    if (bodyPartIndex > 0)
                    {
-                        // Best results for 1 scale: x + 0, y + 0.5
+                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor;
-                        // +0.5 to both to keep Matlab format
+                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor;
-                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor + 0.5f;
-                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor + 0.5f;
                        poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex];
                    }
                    else

--- a/src/openpose/pose/bodyPartConnectorBase.cu
+++ b/src/openpose/pose/bodyPartConnectorBase.cu
@@ -334,10 +334,8 @@ namespace op
                    const auto bodyPartIndex = subsetI[bodyPart];
                    if (bodyPartIndex > 0)
                    {
-                        // Best results for 1 scale: x + 0, y + 0.5
+                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor;
-                        // +0.5 to both to keep Matlab format
+                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor;
-                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor + 0.5f;
-                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor + 0.5f;
                        poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex];
                    }
                    else

--- a/src/openpose/pose/poseExtractor.cpp
+++ b/src/openpose/pose/poseExtractor.cpp
@@ -238,8 +238,8 @@ namespace op
                    candidates[part].resize(numberPartCandidates);
                    const auto* partCandidatesPtr = &candidatesCpuPtr[part*peaksArea+3];
                    for (auto candidate = 0 ; candidate < numberPartCandidates ; candidate++)
-                        candidates[part][candidate] = {partCandidatesPtr[3*candidate],
+                        candidates[part][candidate] = {partCandidatesPtr[3*candidate] * mScaleNetToOutput,
-                                                       partCandidatesPtr[3*candidate+1],
+                                                       partCandidatesPtr[3*candidate+1] * mScaleNetToOutput,
                                                       partCandidatesPtr[3*candidate+2]};
                }
            }

--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -261,8 +261,18 @@ namespace op
                    upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
                #endif
+                // Get scale net to output (i.e. image input)
+                // Note: In order to resize to input size, (un)comment the following lines
+                const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize);
+                const Point<int> netSize{intRound(scaleProducerToNetInput*inputDataSize.x),
+                                         intRound(scaleProducerToNetInput*inputDataSize.y)};
+                mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)};
+                // mScaleNetToOutput = 1.f;
                // 3. Get peaks by Non-Maximum Suppression
                upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold));
+                const auto nmsOffset = float(0.5/double(mScaleNetToOutput));
+                upImpl->spNmsCaffe->setOffset(Point<float>{nmsOffset, nmsOffset});
                #ifdef USE_CUDA
                    //upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
                    upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms
@@ -274,14 +284,6 @@ namespace op
                    upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
                #endif
-                // Get scale net to output (i.e. image input)
-                // Note: In order to resize to input size, (un)comment the following lines
-                const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize);
-                const Point<int> netSize{intRound(scaleProducerToNetInput*inputDataSize.x),
-                                         intRound(scaleProducerToNetInput*inputDataSize.y)};
-                mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)};
-                // mScaleNetToOutput = 1.f;
                // 4. Connecting body parts
                // Get scale net to output (i.e. image input)
                upImpl->spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput);