From 27903fb368add0fa48e1282cddd6968665609881 Mon Sep 17 00:00:00 2001
From: gineshidalgo99 <gineshidalgo99@gmail.com>
Date: Wed, 4 Apr 2018 14:41:24 -0400
Subject: [PATCH] Part candidates at same scale as keypoints

---
 doc/3d_reconstruction_demo.md               |   2 +-
 doc/release_notes.md                        |   1 +
 include/openpose/core/keypointScaler.hpp    |   3 +
 include/openpose/core/wKeypointScaler.hpp   |   3 +
 include/openpose/net/nmsBase.hpp            |   8 +-
 include/openpose/net/nmsCaffe.hpp           |   4 +
 include/openpose/utilities/openCv.hpp       |   2 +-
 src/openpose/core/keypointScaler.cpp        | 109 +++++++++++++++-----
 src/openpose/net/nmsBase.cpp                |  24 +++--
 src/openpose/net/nmsBase.cu                 |  20 ++--
 src/openpose/net/nmsBaseCL.cpp              |  33 +++---
 src/openpose/net/nmsCaffe.cpp               |  20 +++-
 src/openpose/pose/bodyPartConnectorBase.cpp |   6 +-
 src/openpose/pose/bodyPartConnectorBase.cu  |   6 +-
 src/openpose/pose/poseExtractor.cpp         |   4 +-
 src/openpose/pose/poseExtractorCaffe.cpp    |  18 ++--
 16 files changed, 182 insertions(+), 81 deletions(-)
diff --git a/doc/3d_reconstruction_demo.md b/doc/3d_reconstruction_demo.md
index 9131252d..34e8405e 100644
--- a/doc/3d_reconstruction_demo.md
+++ b/doc/3d_reconstruction_demo.md
@@ -74,7 +74,7 @@ In order to verify that the camera parameters introduced by the user are sorted
 
 
 ## Installing the OpenPose 3-D Reconstruction Module
-Check the [doc/installation.md#3d-reconstruction-module](./quick_start.md#3d-reconstruction-module) for installation steps.
+Check the [doc/installation.md#3d-reconstruction-module](./installation.md#3d-reconstruction-module) for installation steps.
 
 
 
diff --git a/doc/release_notes.md b/doc/release_notes.md
index 0dcea54e..491008e7 100644
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -218,6 +218,7 @@ OpenPose Library - Release Notes
     1. Removed scale parameter from hand and face rectangle extractor (causing wrong results if custom `--output_resolution`).
 3. Main bugs fixed:
     1. Hand and face work properly again with any `--output_resolution`.
+    2. Part candidates (`--part_candidates`) are saved with the same scale as the final keypoints themselves.
 
 
 
diff --git a/include/openpose/core/keypointScaler.hpp b/include/openpose/core/keypointScaler.hpp
index abc326d2..3bf1ecfc 100644
--- a/include/openpose/core/keypointScaler.hpp
+++ b/include/openpose/core/keypointScaler.hpp
@@ -17,6 +17,9 @@ namespace op
         void scale(std::vector<Array<float>>& arraysToScale, const double scaleInputToOutput,
                    const double scaleNetToOutput, const Point<int>& producerSize) const;
 
+        void scale(std::vector<std::vector<std::array<float,3>>>& poseCandidates, const double scaleInputToOutput,
+                   const double scaleNetToOutput, const Point<int>& producerSize) const;
+
     private:
         const ScaleMode mScaleMode;
     };
diff --git a/include/openpose/core/wKeypointScaler.hpp b/include/openpose/core/wKeypointScaler.hpp
index 1e73dfaa..31539d15 100644
--- a/include/openpose/core/wKeypointScaler.hpp
+++ b/include/openpose/core/wKeypointScaler.hpp
@@ -59,6 +59,9 @@ namespace op
                                                             tDatum.handKeypoints[1], tDatum.faceKeypoints};
                     spKeypointScaler->scale(arraysToScale, tDatum.scaleInputToOutput, tDatum.scaleNetToOutput,
                                             Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
+                    // Rescale part candidates
+                    spKeypointScaler->scale(tDatum.poseCandidates, tDatum.scaleInputToOutput, tDatum.scaleNetToOutput,
+                                            Point<int>{tDatum.cvInputData.cols, tDatum.cvInputData.rows});
                 }
                 // Profiling speed
                 Profiler::timerEnd(profilerKey);
diff --git a/include/openpose/net/nmsBase.hpp b/include/openpose/net/nmsBase.hpp
index cdf24e03..238fb667 100644
--- a/include/openpose/net/nmsBase.hpp
+++ b/include/openpose/net/nmsBase.hpp
@@ -7,16 +7,18 @@ namespace op
 {
     template <typename T>
     OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                       const Point<T>& offset);
 
     template <typename T>
     OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                       const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                       const Point<T>& offset);
 
     template <typename T>
     OP_API void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
                        const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
-                       const int gpuID = 0);
+                       const Point<T>& offset, const int gpuID = 0);
 }
 
 #endif // OPENPOSE_NET_NMS_BASE_HPP
diff --git a/include/openpose/net/nmsCaffe.hpp b/include/openpose/net/nmsCaffe.hpp
index 8561f7e9..6e8ff331 100644
--- a/include/openpose/net/nmsCaffe.hpp
+++ b/include/openpose/net/nmsCaffe.hpp
@@ -25,6 +25,9 @@ namespace op
 
         void setThreshold(const T threshold);
 
+        // Empirically gives better results (copied from Matlab original code)
+        void setOffset(const Point<T>& offset);
+
         virtual void Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
 
         virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
@@ -39,6 +42,7 @@ namespace op
 
     private:
         T mThreshold;
+        Point<T> mOffset;
         int mGpuID;
 
         // PIMPL idiom
diff --git a/include/openpose/utilities/openCv.hpp b/include/openpose/utilities/openCv.hpp
index 982f1fcf..bd3b31a7 100644
--- a/include/openpose/utilities/openCv.hpp
+++ b/include/openpose/utilities/openCv.hpp
@@ -8,7 +8,7 @@
 namespace op
 {
     OP_API void putTextOnCvMat(cv::Mat& cvMat, const std::string& textToDisplay, const Point<int>& position,
-                   const cv::Scalar& color, const bool normalizeWidth, const int imageWidth);
+                               const cv::Scalar& color, const bool normalizeWidth, const int imageWidth);
 
     OP_API void floatPtrToUCharCvMat(cv::Mat& uCharCvMat, const float* const floatPtrImage,
                                      const std::array<int, 3> resolutionSize);
diff --git a/src/openpose/core/keypointScaler.cpp b/src/openpose/core/keypointScaler.cpp
index e35e4e87..1a138455 100644
--- a/src/openpose/core/keypointScaler.cpp
+++ b/src/openpose/core/keypointScaler.cpp
@@ -3,6 +3,40 @@
 
 namespace op
 {
+    // Per-mode transform packed as Rectangle{offsetX, offsetY, scaleX, scaleY} (read back as x, y, width, height).
+    Rectangle<float> getScaleAndOffset(const ScaleMode scaleMode, const double scaleInputToOutput,
+                                       const double scaleNetToOutput, const Point<int>& producerSize)
+    {
+        try
+        {
+            // Output resolution: uniform input-to-output factor, no offset
+            if (scaleMode == ScaleMode::OutputResolution)
+                return Rectangle<float>{0.f, 0.f, float(scaleInputToOutput), float(scaleInputToOutput)};
+            // Net output resolution: inverse of the net-to-output factor, no offset
+            if (scaleMode == ScaleMode::NetOutputResolution)
+                return Rectangle<float>{0.f, 0.f, float(1./scaleNetToOutput), float(1./scaleNetToOutput)};
+            // [0,1]: divide by (producer size - 1), no offset
+            if (scaleMode == ScaleMode::ZeroToOne)
+                return Rectangle<float>{0.f, 0.f, 1.f / (static_cast<float>(producerSize.x) - 1.f),
+                                        1.f / (static_cast<float>(producerSize.y) - 1.f)};
+            // [-1,1]: twice the [0,1] scale, then shifted by -1
+            if (scaleMode == ScaleMode::PlusMinusOne)
+                return Rectangle<float>{-1.f, -1.f, 2.f / (static_cast<float>(producerSize.x) - 1.f),
+                                        2.f / (static_cast<float>(producerSize.y) - 1.f)};
+            // Input resolution: identity scale, no offset
+            if (scaleMode == ScaleMode::InputResolution)
+                return Rectangle<float>{0.f, 0.f, 1.f, 1.f};
+            // None of the known modes matched
+            error("Unknown ScaleMode selected.", __LINE__, __FUNCTION__, __FILE__);
+            return Rectangle<float>{};
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            return Rectangle<float>{};
+        }
+    }
+
     KeypointScaler::KeypointScaler(const ScaleMode scaleMode) :
         mScaleMode{scaleMode}
     {
@@ -29,38 +63,61 @@ namespace op
         {
             if (mScaleMode != ScaleMode::InputResolution)
             {
-                // OutputResolution
-                if (mScaleMode == ScaleMode::OutputResolution)
-                {
+                // Get scale and offset
+                const auto scaleAndOffset = getScaleAndOffset(mScaleMode, scaleInputToOutput, scaleNetToOutput,
+                                                              producerSize);
+                // Only scaling
+                if (scaleAndOffset.x == 0 && scaleAndOffset.y == 0)
                     for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, float(scaleInputToOutput));
-                }
-                // NetOutputResolution
-                else if (mScaleMode == ScaleMode::NetOutputResolution)
-                {
+                        scaleKeypoints(arrayToScale, scaleAndOffset.width, scaleAndOffset.height);
+                // Scaling + offset
+                else
                     for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, float(1./scaleNetToOutput));
-                }
-                // [0,1]
-                else if (mScaleMode == ScaleMode::ZeroToOne)
+                        scaleKeypoints(arrayToScale, scaleAndOffset.width, scaleAndOffset.height,
+                                       scaleAndOffset.x, scaleAndOffset.y);
+            }
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+        }
+    }
+
+    void KeypointScaler::scale(std::vector<std::vector<std::array<float,3>>>& poseCandidates,
+                               const double scaleInputToOutput, const double scaleNetToOutput,
+                               const Point<int>& producerSize) const
+    {
+        try
+        {
+            if (mScaleMode != ScaleMode::InputResolution)
+            {
+                // Get scale and offset
+                const auto scaleAndOffset = getScaleAndOffset(mScaleMode, scaleInputToOutput, scaleNetToOutput,
+                                                              producerSize);
+                // Only scaling
+                if (scaleAndOffset.x == 0 && scaleAndOffset.y == 0)
                 {
-                    const auto scaleX = 1.f / ((float)producerSize.x - 1.f);
-                    const auto scaleY = 1.f / ((float)producerSize.y - 1.f);
-                    for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, scaleX, scaleY);
+                    for (auto& partCandidates : poseCandidates)
+                    {
+                        for (auto& candidate : partCandidates)
+                        {
+                            candidate[0] *= scaleAndOffset.width;
+                            candidate[1] *= scaleAndOffset.height;
+                        }
+                    }
                 }
-                // [-1,1]
-                else if (mScaleMode == ScaleMode::PlusMinusOne)
+                // Scaling + offset
+                else
                 {
-                    const auto scaleX = (2.f / ((float)producerSize.x - 1.f));
-                    const auto scaleY = (2.f / ((float)producerSize.y - 1.f));
-                    const auto offset = -1.f;
-                    for (auto& arrayToScale : arrayToScalesToScale)
-                        scaleKeypoints(arrayToScale, scaleX, scaleY, offset, offset);
+                    for (auto& partCandidates : poseCandidates)
+                    {
+                        for (auto& candidate : partCandidates)
+                        {
+                            candidate[0] = candidate[0]*scaleAndOffset.width + scaleAndOffset.x;
+                            candidate[1] = candidate[1]*scaleAndOffset.height + scaleAndOffset.y;
+                        }
+                    }
                 }
-                // Unknown
-                else
-                    error("Unknown ScaleMode selected.", __LINE__, __FUNCTION__, __FILE__);
             }
         }
         catch (const std::exception& e)
diff --git a/src/openpose/net/nmsBase.cpp b/src/openpose/net/nmsBase.cpp
index 8fc59f89..69adadae 100644
--- a/src/openpose/net/nmsBase.cpp
+++ b/src/openpose/net/nmsBase.cpp
@@ -68,8 +68,8 @@ namespace op
     }
 
     template <typename T>
-    void nmsAccuratePeakPosition(const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
-                                 const int& width, const int& height, T* output)
+    void nmsAccuratePeakPosition(T* output, const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
+                                 const int& width, const int& height, const Point<T>& offset)
     {
         T xAcc = 0.f;
         T yAcc = 0.f;
@@ -98,14 +98,18 @@ namespace op
             }
         }
 
-        output[0] = xAcc / scoreAcc;
-        output[1] = yAcc / scoreAcc;
+        // Offset to keep Matlab format (empirically higher acc)
+        // Best results for 1 scale: x + 0, y + 0.5
+        // +0.5 to both to keep Matlab format
+        output[0] = xAcc / scoreAcc + offset.x;
+        output[1] = yAcc / scoreAcc + offset.y;
         output[2] = sourcePtr[peakLocY*width + peakLocX];
     }
 
     template <typename T>
     void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                const Point<T>& offset)
     {
         try
         {
@@ -149,8 +153,8 @@ namespace op
                             if (currKernelPtr[index] == 1)
                             {
                                 // Accurate Peak Position
-                                nmsAccuratePeakPosition(currSourcePtr, x, y, sourceWidth, sourceHeight,
-                                                        &currTargetPtr[currentPeakCount*3]);
+                                nmsAccuratePeakPosition(&currTargetPtr[currentPeakCount*3], currSourcePtr, x, y,
+                                                        sourceWidth, sourceHeight, offset);
                                 currentPeakCount++;
                             }
                         }
@@ -167,7 +171,9 @@ namespace op
     }
 
     template void nmsCpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset);
     template void nmsCpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset);
 }
diff --git a/src/openpose/net/nmsBase.cu b/src/openpose/net/nmsBase.cu
index 3133df60..498b131d 100644
--- a/src/openpose/net/nmsBase.cu
+++ b/src/openpose/net/nmsBase.cu
@@ -48,7 +48,8 @@ namespace op
 
     template <typename T>
     __global__ void writeResultKernel(T* output, const int length, const int* const kernelPtr,
-                                      const T* const sourcePtr, const int width, const int height, const int maxPeaks)
+                                      const T* const sourcePtr, const int width, const int height, const int maxPeaks,
+                                      const T offsetX, const T offsetY)
     {
         __shared__ int local[THREADS_PER_BLOCK+1]; // one more
         const auto globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -101,9 +102,12 @@ namespace op
                             }
                         }
 
+                        // Offset to keep Matlab format (empirically higher acc)
+                        // Best results for 1 scale: x + 0, y + 0.5
+                        // +0.5 to both to keep Matlab format
                         const auto outputIndex = (peakIndex + 1) * 3;
-                        output[outputIndex] = xAcc / scoreAcc;
-                        output[outputIndex + 1] = yAcc / scoreAcc;
+                        output[outputIndex] = xAcc / scoreAcc + offsetX;
+                        output[outputIndex + 1] = yAcc / scoreAcc + offsetY;
                         output[outputIndex + 2] = sourcePtr[peakLocY*width + peakLocX];
                     }
                 }
@@ -115,7 +119,7 @@ namespace op
 
     template <typename T>
     void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const Point<T>& offset)
     {
         try
         {
@@ -177,7 +181,7 @@ namespace op
                     // This returns targetPtrOffsetted, with the NMS applied over it
                     writeResultKernel<<<numBlocks1D, threadsPerBlock1D>>>(targetPtrOffsetted, imageOffset,
                                                                           kernelPtrOffsetted, sourcePtrOffsetted,
-                                                                          width, height, maxPeaks);
+                                                                          width, height, maxPeaks, offset.x, offset.y);
                 }
             }
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -189,7 +193,9 @@ namespace op
     }
 
     template void nmsGpu(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset);
     template void nmsGpu(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset);
 }
diff --git a/src/openpose/net/nmsBaseCL.cpp b/src/openpose/net/nmsBaseCL.cpp
index 0699e92e..a3c78e72 100644
--- a/src/openpose/net/nmsBaseCL.cpp
+++ b/src/openpose/net/nmsBaseCL.cpp
@@ -13,8 +13,9 @@ namespace op
 {
     #ifdef USE_OPENCL
         const std::string nmsOclCommonFunctions = MULTI_LINE_STRING(
-            void nmsAccuratePeakPosition(__global const Type* sourcePtr, const int peakLocX, const int peakLocY,
-                                         const int width, const int height, Type* fx, Type* fy, Type* fscore)
+            void nmsAccuratePeakPosition(__global const Type* sourcePtr, Type* fx, Type* fy, Type* fscore,
+                                         const int peakLocX, const int peakLocY, const int width, const int height,
+                                         const Type offsetX, const Type offsetY)
             {
                 Type xAcc = 0.f;
                 Type yAcc = 0.f;
@@ -43,8 +44,11 @@ namespace op
                     }
                 }
 
-                *fx = xAcc / scoreAcc;
-                *fy = yAcc / scoreAcc;
+                // Offset to keep Matlab format (empirically higher acc)
+                // Best results for 1 scale: x + 0, y + 0.5
+                // +0.5 to both to keep Matlab format
+                *fx = xAcc / scoreAcc + offsetX;
+                *fy = yAcc / scoreAcc + offsetY;
                 *fscore = sourcePtr[peakLocY*width + peakLocX];
             }
 
@@ -85,10 +89,7 @@ namespace op
                             && value > left && value > right
                             && value > bottomLeft && value > bottom && value > bottomRight)
                         {
-                            //Type fx = 0; Type fy = 0; Type fscore = 0;
-                            //nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
                             kernelPtr[index] = 1;
-                            //if(debug) printf("%d %d \n", x,y);
                         }
                         else
                             kernelPtr[index] = 0;
@@ -104,7 +105,8 @@ namespace op
         typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int, int> NMSWriteKernelFunctor;
         const std::string nmsWriteKernel = MULTI_LINE_STRING(
             __kernel void nmsWriteKernel(__global Type* targetPtr, __global int* kernelPtr, __global const Type* sourcePtr,
-                                         const int w, const int h, const int maxPeaks, const int debug)
+                                         const int w, const int h, const int maxPeaks, const int debug,
+                                         const T offsetX, const T offsetY)
             {
                 int x = get_global_id(0);
                 int y = get_global_id(1);
@@ -118,7 +120,7 @@ namespace op
                         if (prev - curr)
                         {
                             Type fx = 0; Type fy = 0; Type fscore = 0;
-                            nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
+                            nmsAccuratePeakPosition(sourcePtr, &fx, &fy, &fscore, x, y, w, h, offsetX, offsetY);
                             //if (debug) printf("C %d %d %d \n", x,y,kernelPtr[index]);
                             __global Type* output = &targetPtr[curr*3];
                             output[0] = fx; output[1] = fy; output[2] = fscore;
@@ -144,7 +146,8 @@ namespace op
 
     template <typename T>
     void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
-                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const int gpuID)
+                const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const Point<T>& offset,
+                const int gpuID)
     {
         try
         {
@@ -229,7 +232,8 @@ namespace op
                                                                                       sizeof(int) *  width * height, &kernelCPU[0]);
                         // Write Kernel
                         nmsWriteKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
-                                          targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug);
+                                          targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug,
+                                          offset.x, offset.y);
                     }
                 }
             #else
@@ -239,6 +243,7 @@ namespace op
                 UNUSED(threshold);
                 UNUSED(targetSize);
                 UNUSED(sourceSize);
+                UNUSED(offset);
                 UNUSED(gpuID);
                 error("OpenPose must be compiled with the `USE_OPENCL` macro definition in order to use this"
                       " functionality.", __LINE__, __FUNCTION__, __FILE__);
@@ -258,7 +263,9 @@ namespace op
     }
 
     template void nmsOcl(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<float>& offset, const int gpuID);
     template void nmsOcl(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
-                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
+                         const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
+                         const Point<double>& offset, const int gpuID);
 }
diff --git a/src/openpose/net/nmsCaffe.cpp b/src/openpose/net/nmsCaffe.cpp
index 674f35cf..eeb9b971 100644
--- a/src/openpose/net/nmsCaffe.cpp
+++ b/src/openpose/net/nmsCaffe.cpp
@@ -127,6 +127,19 @@ namespace op
         }
     }
 
+    template <typename T>
+    void NmsCaffe<T>::setOffset(const Point<T>& offset)
+    {
+        try
+        {
+            mOffset = offset; // Kept as a member; handed to nmsCpu/nmsGpu/nmsOcl by the Forward_* methods
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+        }
+    }
+
     template <typename T>
     void NmsCaffe<T>::Forward_cpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
     {
@@ -134,7 +147,7 @@ namespace op
         {
             #ifdef USE_CAFFE
                 nmsCpu(top.at(0)->mutable_cpu_data(), upImpl->mKernelBlob.mutable_cpu_data(), bottom.at(0)->cpu_data(),
-                       mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
+                       mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset);
             #else
                 UNUSED(bottom);
                 UNUSED(top);
@@ -153,7 +166,7 @@ namespace op
         {
             #if defined USE_CAFFE && defined USE_CUDA
                 nmsGpu(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlob.mutable_gpu_data(),
-                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize);
+                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset);
             #else
                 UNUSED(bottom);
                 UNUSED(top);
@@ -174,7 +187,8 @@ namespace op
         {
             #if defined USE_CAFFE && defined USE_OPENCL
                 nmsOcl(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlobT->mutable_gpu_data(),
-                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mGpuID);
+                       bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mOffset,
+                       mGpuID);
             #else
                 UNUSED(bottom);
                 UNUSED(top);
diff --git a/src/openpose/pose/bodyPartConnectorBase.cpp b/src/openpose/pose/bodyPartConnectorBase.cpp
index 3821c003..599a173a 100644
--- a/src/openpose/pose/bodyPartConnectorBase.cpp
+++ b/src/openpose/pose/bodyPartConnectorBase.cpp
@@ -332,10 +332,8 @@ namespace op
                     const auto bodyPartIndex = subsetI[bodyPart];
                     if (bodyPartIndex > 0)
                     {
-                        // Best results for 1 scale: x + 0, y + 0.5
-                        // +0.5 to both to keep Matlab format
-                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor + 0.5f;
-                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor + 0.5f;
+                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor;
+                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor;
                         poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex];
                     }
                     else
diff --git a/src/openpose/pose/bodyPartConnectorBase.cu b/src/openpose/pose/bodyPartConnectorBase.cu
index 4895d043..fffe8640 100644
--- a/src/openpose/pose/bodyPartConnectorBase.cu
+++ b/src/openpose/pose/bodyPartConnectorBase.cu
@@ -334,10 +334,8 @@ namespace op
                     const auto bodyPartIndex = subsetI[bodyPart];
                     if (bodyPartIndex > 0)
                     {
-                        // Best results for 1 scale: x + 0, y + 0.5
-                        // +0.5 to both to keep Matlab format
-                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor + 0.5f;
-                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor + 0.5f;
+                        poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor;
+                        poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor;
                         poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex];
                     }
                     else
diff --git a/src/openpose/pose/poseExtractor.cpp b/src/openpose/pose/poseExtractor.cpp
index 0fc670cb..f10ffd1d 100644
--- a/src/openpose/pose/poseExtractor.cpp
+++ b/src/openpose/pose/poseExtractor.cpp
@@ -238,8 +238,8 @@ namespace op
                     candidates[part].resize(numberPartCandidates);
                     const auto* partCandidatesPtr = &candidatesCpuPtr[part*peaksArea+3];
                     for (auto candidate = 0 ; candidate < numberPartCandidates ; candidate++)
-                        candidates[part][candidate] = {partCandidatesPtr[3*candidate],
-                                                       partCandidatesPtr[3*candidate+1],
+                        candidates[part][candidate] = {partCandidatesPtr[3*candidate] * mScaleNetToOutput,
+                                                       partCandidatesPtr[3*candidate+1] * mScaleNetToOutput,
                                                        partCandidatesPtr[3*candidate+2]};
                 }
             }
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index e59513b5..4308d5f5 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -261,8 +261,18 @@ namespace op
                     upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
                 #endif
 
+                // Get scale net to output (i.e. image input)
+                // Note: In order to resize to input size, (un)comment the following lines
+                const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize);
+                const Point<int> netSize{intRound(scaleProducerToNetInput*inputDataSize.x),
+                                         intRound(scaleProducerToNetInput*inputDataSize.y)};
+                mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)};
+                // mScaleNetToOutput = 1.f;
+
                 // 3. Get peaks by Non-Maximum Suppression
                 upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold));
+                const auto nmsOffset = float(0.5/double(mScaleNetToOutput));
+                upImpl->spNmsCaffe->setOffset(Point<float>{nmsOffset, nmsOffset});
                 #ifdef USE_CUDA
                     //upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
                     upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms
@@ -274,14 +284,6 @@ namespace op
                     upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
                 #endif
 
-                // Get scale net to output (i.e. image input)
-                // Note: In order to resize to input size, (un)comment the following lines
-                const auto scaleProducerToNetInput = resizeGetScaleFactor(inputDataSize, mNetOutputSize);
-                const Point<int> netSize{intRound(scaleProducerToNetInput*inputDataSize.x),
-                                         intRound(scaleProducerToNetInput*inputDataSize.y)};
-                mScaleNetToOutput = {(float)resizeGetScaleFactor(netSize, inputDataSize)};
-                // mScaleNetToOutput = 1.f;
-
                 // 4. Connecting body parts
                 // Get scale net to output (i.e. image input)
                 upImpl->spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput);
-- 
GitLab