From aeca5cadfc44b99edb1faf1aa19a973e1ce538be Mon Sep 17 00:00:00 2001
From: gineshidalgo99
Date: Tue, 28 Nov 2017 10:19:42 -0500
Subject: [PATCH] Added doc (#342, #343)

---
 README.md                                      |  16 +-
 doc/installation.md                            |   8 +-
 doc/installation_cmake.md                      |   2 +-
 doc/quick_start.md                             |  10 +-
 ...andalone_face_or_hand_keypoint_detector.md  |   2 +
 .../openpose/pose/bodyPartConnectorBase.hpp    |   5 +-
 .../openpose/pose/bodyPartConnectorCaffe.hpp   |   4 +-
 src/openpose/pose/bodyPartConnectorBase.cpp    |  76 ++--
 src/openpose/pose/bodyPartConnectorBase.cu     | 382 ++++++++++++++++--
 src/openpose/pose/bodyPartConnectorCaffe.cpp   |  19 +-
 src/openpose/pose/poseExtractorCaffe.cpp       |   4 +-
 11 files changed, 437 insertions(+), 91 deletions(-)

diff --git a/README.md b/README.md
index a4445490..41441ef5 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
- +
 -----------------
@@ -20,10 +20,10 @@ OpenPose represents the **first real-time multi-person system to jointly detect
 - 15 or **18-keypoint body estimation**. **Running time invariant to number of detected people**.
 - **2x21-keypoint hand** estimation. Currently, **running time depends** on **number of detected people**.
 - **70-keypoint face** estimation. Currently, **running time depends** on **number of detected people**.
-- Inputs: Image, video, webcam, and IP camera. Included C++ demos to add your custom input.
-- Outputs: Basic GUI display, keypoint saving (JSON, XML, YML, ...), and/or rendered image/video + keypoint saving (PNG, JPG, AVI, ...).
+- **Input**: Image, video, webcam, and IP camera. Included C++ demos to add your custom input.
+- **Output**: Basic image + keypoint display/saving (PNG, JPG, AVI, ...), keypoint saving (JSON, XML, YML, ...), and/or keypoints as an array class.
 - Available: command-line demo, C++ wrapper, and C++ API.
-- OS: Ubuntu, Windows, Nvidia TX2.
+- **OS**: Ubuntu (14, 16), Windows (8, 10), Nvidia TX2.
@@ -78,7 +78,13 @@ See [doc/installation.md](doc/installation.md) for instructions on how to build
 ## Quick Start
 Most users do not need the [OpenPose C++ API](#openpose-c-api), but they can simply use the basic [Demo](#demo) and/or [OpenPose Wrapper](#openpose-wrapper).
-- **Demo**: To easily process images/video/webcam and display/save the results. See [doc/demo_overview.md](doc/demo_overview.md).
+- **Demo**: To easily process images/video/webcam and display/save the results. See [doc/demo_overview.md](doc/demo_overview.md). E.g., run it on a video with:
+```
+# Ubuntu
+./build/examples/openpose/openpose.bin --video examples/media/video.avi
+:: Windows - Portable Demo
+bin\OpenPoseDemo.exe --video examples\media\video.avi
+```
 - **OpenPose Wrapper**: If you want to read a specific input, and/or add your custom post-processing function, and/or implement your own display/saving, check the `Wrapper` tutorial on [examples/tutorial_wrapper/](examples/tutorial_wrapper/). You can create your custom code on [examples/user_code/](examples/user_code/) and quickly compile it by using `make all` in the OpenPose folder (assuming Makefile installer).
diff --git a/doc/installation.md b/doc/installation.md
index 009604df..1d6c656d 100644
--- a/doc/installation.md
+++ b/doc/installation.md
@@ -138,9 +138,11 @@ You just need to remove the OpenPose folder, by default called `openpose/`. E.g.
 ### Installation - Library
 1. Install the pre-requisites:
-    1. Microsoft Visual Studio (VS) 2015 Enterprise Update 3. If Visual Studio 2017 Community is desired, we do not support it, but it might be compiled by firstly [enabling CUDA 8.0 in VS2017](https://stackoverflow.com/questions/43745099/using-cuda-with-visual-studio-2017?answertab=active#tab-top). VS Enterprise Update 1 will give some compiler errors and VS 2015 Community has not been tested.
-    2. [CUDA 8](https://developer.nvidia.com/cuda-80-ga2-download-archive): Install it on the default location, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`. Otherwise, modify the Visual Studio project solution accordingly. Install CUDA 8.0 after Visual Studio 2015 is installed to assure that the CUDA installation will generate all necessary files for VS. If CUDA was already installed, re-install it after installing VS!
-    3. [cuDNN 5.1](https://developer.nvidia.com/cudnn): Once you have downloaded it, just unzip it and copy (merge) the contents on the CUDA folder, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`.
+    1. **Microsoft Visual Studio (VS) 2015 Enterprise Update 3**.
+        - If **Visual Studio 2017 Community** is desired, we do not officially support it, but it might be compiled by first [enabling CUDA 8.0 in VS2017](https://stackoverflow.com/questions/43745099/using-cuda-with-visual-studio-2017?answertab=active#tab-top) or by using **VS2017 with CUDA 9**, checking the `.vcxproj` file and changing the necessary paths from CUDA 8 to 9.
+        - VS 2015 Enterprise Update 1 will give some compiler errors and VS 2015 Community has not been tested.
+    2. [**CUDA 8**](https://developer.nvidia.com/cuda-80-ga2-download-archive): Install it on the default location, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`. Otherwise, modify the Visual Studio project solution accordingly. Install CUDA 8.0 after Visual Studio 2015 is installed to ensure that the CUDA installation will generate all the necessary files for VS. If CUDA was already installed, re-install it after installing VS!
+    3. [**cuDNN 5.1**](https://developer.nvidia.com/cudnn): Once you have downloaded it, just unzip it and copy (merge) the contents into the CUDA folder, `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0`.
 2. Download the OpenPose dependencies and models (body, face and hand models) by double-clicking on `{openpose_path}\windows\download_3rdparty_and_models.bat`. Alternatively, you might prefer to download them manually:
     - Models:
         - [COCO model](http://posefs1.perception.cs.cmu.edu/OpenPose/models/pose/coco/pose_iter_440000.caffemodel): download in `models/pose/coco/`.
diff --git a/doc/installation_cmake.md b/doc/installation_cmake.md
index 42dfcd37..efcc209e 100644
--- a/doc/installation_cmake.md
+++ b/doc/installation_cmake.md
@@ -110,7 +110,7 @@ cd build/
 sudo make install
 ```
 
-Once the installation is completed, you can use OpenPose in your other project using the `find_package` cmake command. Below, is a small example `CMakeLists.txt`.
+Once the installation is completed, you can use OpenPose in your other project using the `find_package` cmake command. Below is a small example `CMakeLists.txt`. To use this script, you also need to copy `FindGFlags.cmake` and `FindGlog.cmake` into your `/cmake/Modules/` directory (create it if necessary).
 
 ```
 cmake_minimum_required(VERSION 2.8.7)
diff --git a/doc/quick_start.md b/doc/quick_start.md
index 90a72ac6..a8a0a9d9 100644
--- a/doc/quick_start.md
+++ b/doc/quick_start.md
@@ -22,7 +22,7 @@ Check that the library is working properly by running any of the following comma
 ./build/examples/openpose/openpose.bin --video examples/media/video.avi --face --hand
 ```
 ```
-:: Windows - Demo
+:: Windows - Portable Demo
 bin\OpenPoseDemo.exe --video examples\media\video.avi
 :: With face and hands
 bin\OpenPoseDemo.exe --video examples\media\video.avi --face --hand
@@ -44,7 +44,7 @@ windows\x64\Release\OpenPoseDemo.exe --video examples\media\video.avi --face --h
 ./build/examples/openpose/openpose.bin --face --hand
 ```
 ```
-:: Windows - Demo
+:: Windows - Portable Demo
 bin\OpenPoseDemo.exe
 :: With face and hands
 bin\OpenPoseDemo.exe --face --hand
@@ -66,7 +66,7 @@ windows\x64\Release\OpenPoseDemo.exe --face --hand
 ./build/examples/openpose/openpose.bin --image_dir examples/media/ --face --hand
 ```
 ```
-:: Windows - Demo
+:: Windows - Portable Demo
 bin\OpenPoseDemo.exe --image_dir examples\media\
 :: With face and hands
 bin\OpenPoseDemo.exe --image_dir examples\media\ --face --hand
@@ -89,9 +89,9 @@ This command provides the most accurate results we have been able to achieve for
 ./build/examples/openpose/openpose.bin --net_resolution "1312x736" --scale_number 4 --scale_gap 0.25 --hand --hand_scale_number 6 --hand_scale_range 0.4 --face
 ```
 ```
-:: Windows - Demo: Body
+:: Windows - Portable Demo: Body
 bin\OpenPoseDemo.exe --net_resolution "1312x736" --scale_number 4 --scale_gap 0.25
-:: Windows - Demo: Body + Hand + Face
+:: Windows - Portable Demo: Body + Hand + Face
 bin\OpenPoseDemo.exe --net_resolution "1312x736" --scale_number 4 --scale_gap 0.25 --hand --hand_scale_number 6 --hand_scale_range 0.4 --face
 ```
 ```
diff --git a/doc/standalone_face_or_hand_keypoint_detector.md b/doc/standalone_face_or_hand_keypoint_detector.md
index fe26547c..62cfb706 100644
--- a/doc/standalone_face_or_hand_keypoint_detector.md
+++ b/doc/standalone_face_or_hand_keypoint_detector.md
@@ -16,5 +16,7 @@ There are 2 ways to add the OpenPose face keypoint detector to your own code wit
 2. Elegant solution: If you want to use the whole OpenPose framework, simply copy `include/wrapper/wrapper.hpp` as e.g. `examples/userCode/wrapperFace.hpp`, and replace our `FaceDetector` or `FaceDetectorOpenCV` class with your custom face detector class inside your `WrapperFace` class. If you do not need the pose keypoint detection, you can skip it for a big speed-up by simply using the `body_disable` flag.
 
+Note: both the `FaceExtractor` and `HandExtractor` classes require **squared rectangles** as input. In addition, the function **`initializationOnThread()` must be called only once, and inside the same thread where `forwardPass` is going to be run**.
+
 ## Custom Standalone Hand Keypoint Detector
 The analogous steps apply to the hand keypoint detector, but modifying `include/openpose/hand/handExtractor.hpp`.
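The threading requirement in the note above is easy to get wrong, so here is a minimal sketch of the intended call order. It is illustrative only: the extractor type is left generic because the exact `forwardPass` argument list (squared rectangles plus input image) is model-specific and not spelled out in this patch; only the `initializationOnThread()` / `forwardPass` names are taken from the documentation text above.

```
#include <thread>
#include <utility>

// Minimal sketch of the constraint described above: initializationOnThread()
// must run exactly once, in the same thread that later calls forwardPass.
// TExtractor stands for a FaceExtractor/HandExtractor-like type (assumption);
// the forwardPass arguments (e.g. squared rectangles + input image) are
// simply forwarded without interpreting them.
template <typename TExtractor, typename... TArgs>
void runExtractorOnThread(TExtractor& extractor, TArgs&&... forwardPassArgs)
{
    extractor.initializationOnThread();                              // once, in this thread
    extractor.forwardPass(std::forward<TArgs>(forwardPassArgs)...);  // same thread
}

// Hypothetical usage: run initialization and the forward pass together in one worker thread.
// std::thread worker{[&]{ runExtractorOnThread(faceExtractor, squaredFaceRectangles, inputImage); }};
// worker.join();
```

If `forwardPass` is called repeatedly (e.g. once per frame), `initializationOnThread()` is still called only once, before the first forward pass, from that same thread.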
diff --git a/include/openpose/pose/bodyPartConnectorBase.hpp b/include/openpose/pose/bodyPartConnectorBase.hpp index cfc6de51..ab35ad24 100644 --- a/include/openpose/pose/bodyPartConnectorBase.hpp +++ b/include/openpose/pose/bodyPartConnectorBase.hpp @@ -13,10 +13,11 @@ namespace op const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f); template - OP_API void connectBodyPartsGpu(Array& poseKeypoints, T* posePtr, const T* const heatMapPtr, + OP_API void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const T interMinAboveThreshold, const T interThreshold, - const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f); + const int minSubsetCnt, const T minSubsetScore, const T scaleFactor = 1.f, + const T* const heatMapGpuPtr = nullptr, const T* const peaksGpuPtr = nullptr); } #endif // OPENPOSE_POSE_BODY_PARTS_CONNECTOR_HPP diff --git a/include/openpose/pose/bodyPartConnectorCaffe.hpp b/include/openpose/pose/bodyPartConnectorCaffe.hpp index 1b46e967..74cca492 100644 --- a/include/openpose/pose/bodyPartConnectorCaffe.hpp +++ b/include/openpose/pose/bodyPartConnectorCaffe.hpp @@ -43,8 +43,8 @@ namespace op virtual void Forward_cpu(const std::vector*>& bottom, Array& poseKeypoints, Array& poseScores); - virtual void Forward_gpu(const std::vector*>& bottom, const std::vector*>& top, - Array& poseKeypoints); + virtual void Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, + Array& poseScores); virtual void Backward_cpu(const std::vector*>& top, const std::vector& propagate_down, const std::vector*>& bottom); diff --git a/src/openpose/pose/bodyPartConnectorBase.cpp b/src/openpose/pose/bodyPartConnectorBase.cpp index 31ac09a5..89225692 100644 --- a/src/openpose/pose/bodyPartConnectorBase.cpp +++ b/src/openpose/pose/bodyPartConnectorBase.cpp @@ -31,21 +31,21 @@ namespace op { const auto bodyPartA = bodyPartPairs[2*pairIndex]; const auto bodyPartB = bodyPartPairs[2*pairIndex+1]; - const auto* candidateA = peaksPtr + bodyPartA*peaksOffset; - const auto* candidateB = peaksPtr + bodyPartB*peaksOffset; - const auto nA = intRound(candidateA[0]); - const auto nB = intRound(candidateB[0]); + const auto* candidateAPtr = peaksPtr + bodyPartA*peaksOffset; + const auto* candidateBPtr = peaksPtr + bodyPartB*peaksOffset; + const auto numberA = intRound(candidateAPtr[0]); + const auto numberB = intRound(candidateBPtr[0]); // Add parts into the subset in special case - if (nA == 0 || nB == 0) + if (numberA == 0 || numberB == 0) { // Change w.r.t. 
other - if (nA == 0) // nB == 0 or not + if (numberA == 0) // numberB == 0 or not { if (poseModel == PoseModel::COCO_18 || poseModel == PoseModel::BODY_18 || poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_23) { - for (auto i = 1; i <= nB; i++) + for (auto i = 1; i <= numberB; i++) { bool num = false; const auto indexB = bodyPartB; @@ -65,7 +65,7 @@ namespace op rowVector[ bodyPartB ] = bodyPartB*peaksOffset + i*3 + 2; // Last number in each row is the parts number of that person rowVector[subsetCounterIndex] = 1; - const auto subsetScore = candidateB[i*3+2]; + const auto subsetScore = candidateBPtr[i*3+2]; // Second last number in each row is the total score subset.emplace_back(std::make_pair(rowVector, subsetScore)); } @@ -73,7 +73,7 @@ namespace op } else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) { - for (auto i = 1; i <= nB; i++) + for (auto i = 1; i <= numberB; i++) { std::vector rowVector(subsetSize, 0); // Store the index @@ -81,7 +81,7 @@ namespace op // Last number in each row is the parts number of that person rowVector[subsetCounterIndex] = 1; // Second last number in each row is the total score - const auto subsetScore = candidateB[i*3+2]; + const auto subsetScore = candidateBPtr[i*3+2]; subset.emplace_back(std::make_pair(rowVector, subsetScore)); } } @@ -89,12 +89,12 @@ namespace op error("Unknown model, cast to int = " + std::to_string((int)poseModel), __LINE__, __FUNCTION__, __FILE__); } - else // if (nA != 0 && nB == 0) + else // if (numberA != 0 && numberB == 0) { if (poseModel == PoseModel::COCO_18 || poseModel == PoseModel::BODY_18 || poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_23) { - for (auto i = 1; i <= nA; i++) + for (auto i = 1; i <= numberA; i++) { bool num = false; const auto indexA = bodyPartA; @@ -115,14 +115,14 @@ namespace op // Last number in each row is the parts number of that person rowVector[subsetCounterIndex] = 1; // Second last number in each row is the total score - const auto subsetScore = candidateA[i*3+2]; + const auto subsetScore = candidateAPtr[i*3+2]; subset.emplace_back(std::make_pair(rowVector, subsetScore)); } } } else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) { - for (auto i = 1; i <= nA; i++) + for (auto i = 1; i <= numberA; i++) { std::vector rowVector(subsetSize, 0); // Store the index @@ -130,7 +130,7 @@ namespace op // Last number in each row is the parts number of that person rowVector[subsetCounterIndex] = 1; // Second last number in each row is the total score - const auto subsetScore = candidateA[i*3+2]; + const auto subsetScore = candidateAPtr[i*3+2]; subset.emplace_back(std::make_pair(rowVector, subsetScore)); } } @@ -139,40 +139,40 @@ namespace op __LINE__, __FUNCTION__, __FILE__); } } - else // if (nA != 0 && nB != 0) + else // if (numberA != 0 && numberB != 0) { std::vector> temp; - const auto* const mapX = heatMapPtr + mapIdx[2*pairIndex] * heatMapOffset; - const auto* const mapY = heatMapPtr + mapIdx[2*pairIndex+1] * heatMapOffset; - for (auto i = 1; i <= nA; i++) + const auto* mapX = heatMapPtr + mapIdx[2*pairIndex] * heatMapOffset; + const auto* mapY = heatMapPtr + mapIdx[2*pairIndex+1] * heatMapOffset; + for (auto i = 1; i <= numberA; i++) { - for (auto j = 1; j <= nB; j++) + for (auto j = 1; j <= numberB; j++) { - const auto dX = candidateB[j*3] - candidateA[i*3]; - const auto dY = candidateB[j*3+1] - candidateA[i*3+1]; - const auto dMax = fastMax(std::abs(dX), std::abs(dY)); - const auto numberPointsInLine = fastMax(5, 
fastMin(25, intRound(std::sqrt(5*dMax)))); - const auto normVec = T(std::sqrt( dX*dX + dY*dY )); + const auto vectorAToBX = candidateBPtr[j*3] - candidateAPtr[i*3]; + const auto vectorAToBY = candidateBPtr[j*3+1] - candidateAPtr[i*3+1]; + const auto vectorAToBMax = fastMax(std::abs(vectorAToBX), std::abs(vectorAToBY)); + const auto numberPointsInLine = fastMax(5, fastMin(25, intRound(std::sqrt(5*vectorAToBMax)))); + const auto vectorNorm = T(std::sqrt( vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY )); // If the peaksPtr are coincident. Don't connect them. - if (normVec > 1e-6) + if (vectorNorm > 1e-6) { - const auto sX = candidateA[i*3]; - const auto sY = candidateA[i*3+1]; - const auto vecX = dX/normVec; - const auto vecY = dY/normVec; + const auto sX = candidateAPtr[i*3]; + const auto sY = candidateAPtr[i*3+1]; + const auto vectorAToBNormX = vectorAToBX/vectorNorm; + const auto vectorAToBNormY = vectorAToBY/vectorNorm; auto sum = 0.; auto count = 0; - const auto dXInLine = dX/numberPointsInLine; - const auto dYInLine = dY/numberPointsInLine; + const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine; + const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine; for (auto lm = 0; lm < numberPointsInLine; lm++) { - const auto mX = fastMin(heatMapSize.x-1, intRound(sX + lm*dXInLine)); - const auto mY = fastMin(heatMapSize.y-1, intRound(sY + lm*dYInLine)); + const auto mX = fastMin(heatMapSize.x-1, intRound(sX + lm*vectorAToBXInLine)); + const auto mY = fastMin(heatMapSize.y-1, intRound(sY + lm*vectorAToBYInLine)); checkGE(mX, 0, "", __LINE__, __FUNCTION__, __FILE__); checkGE(mY, 0, "", __LINE__, __FUNCTION__, __FILE__); const auto idx = mY * heatMapSize.x + mX; - const auto score = (vecX*mapX[idx] + vecY*mapY[idx]); + const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]); if (score > interThreshold) { sum += score; @@ -193,9 +193,9 @@ namespace op std::sort(temp.begin(), temp.end(), std::greater>()); std::vector> connectionK; - const auto minAB = fastMin(nA, nB); - std::vector occurA(nA, 0); - std::vector occurB(nB, 0); + const auto minAB = fastMin(numberA, numberB); + std::vector occurA(numberA, 0); + std::vector occurB(numberB, 0); auto counter = 0; for (auto row = 0u; row < temp.size(); row++) { diff --git a/src/openpose/pose/bodyPartConnectorBase.cu b/src/openpose/pose/bodyPartConnectorBase.cu index a320362e..a49510c3 100644 --- a/src/openpose/pose/bodyPartConnectorBase.cu +++ b/src/openpose/pose/bodyPartConnectorBase.cu @@ -1,29 +1,359 @@ +#include #include +#include +#include #include namespace op { template - void connectBodyPartsGpu(Array& poseKeypoints, T* posePtr, const T* const heatMapPtr, const T* const peaksPtr, - const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, - const T interMinAboveThreshold, const T interThreshold, const int minSubsetCnt, - const T minSubsetScore, const T scaleFactor) + void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, const T* const heatMapPtr, + const T* const peaksPtr, const PoseModel poseModel, const Point& heatMapSize, + const int maxPeaks, const T interMinAboveThreshold, const T interThreshold, + const int minSubsetCnt, const T minSubsetScore, const T scaleFactor, + const T* const heatMapGpuPtr, const T* const peaksGpuPtr) { try { - UNUSED(poseKeypoints); - UNUSED(posePtr); - UNUSED(heatMapPtr); - UNUSED(peaksPtr); - UNUSED(poseModel); - UNUSED(heatMapSize); - UNUSED(maxPeaks); - UNUSED(interMinAboveThreshold); - UNUSED(interThreshold); - UNUSED(minSubsetCnt); - 
UNUSED(minSubsetScore); - UNUSED(scaleFactor); - error("GPU version not implemented yet.", __LINE__, __FUNCTION__, __FILE__); + // Parts Connection + const auto& bodyPartPairs = POSE_BODY_PART_PAIRS[(int)poseModel]; + const auto& mapIdx = POSE_MAP_IDX[(int)poseModel]; + const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel]; + const auto numberBodyPartPairs = bodyPartPairs.size() / 2; + + // Vector = Each body part + body parts counter; double = subsetScore + std::vector, double>> subset; + const auto subsetCounterIndex = numberBodyParts; + const auto subsetSize = numberBodyParts+1; + + const auto peaksOffset = 3*(maxPeaks+1); + const auto heatMapOffset = heatMapSize.area(); + + for (auto pairIndex = 0u; pairIndex < numberBodyPartPairs; pairIndex++) + { + const auto bodyPartA = bodyPartPairs[2*pairIndex]; + const auto bodyPartB = bodyPartPairs[2*pairIndex+1]; + const auto* candidateAPtr = peaksPtr + bodyPartA*peaksOffset; + const auto* candidateBPtr = peaksPtr + bodyPartB*peaksOffset; + const auto numberA = intRound(candidateAPtr[0]); + const auto numberB = intRound(candidateBPtr[0]); + + // Add parts into the subset in special case + if (numberA == 0 || numberB == 0) + { + // Change w.r.t. other + if (numberA == 0) // numberB == 0 or not + { + if (poseModel == PoseModel::COCO_18 || poseModel == PoseModel::BODY_18 + || poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_23) + { + for (auto i = 1; i <= numberB; i++) + { + bool num = false; + const auto indexB = bodyPartB; + for (auto j = 0u; j < subset.size(); j++) + { + const auto off = (int)bodyPartB*peaksOffset + i*3 + 2; + if (subset[j].first[indexB] == off) + { + num = true; + break; + } + } + if (!num) + { + std::vector rowVector(subsetSize, 0); + // Store the index + rowVector[ bodyPartB ] = bodyPartB*peaksOffset + i*3 + 2; + // Last number in each row is the parts number of that person + rowVector[subsetCounterIndex] = 1; + const auto subsetScore = candidateBPtr[i*3+2]; + // Second last number in each row is the total score + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + } + else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) + { + for (auto i = 1; i <= numberB; i++) + { + std::vector rowVector(subsetSize, 0); + // Store the index + rowVector[ bodyPartB ] = bodyPartB*peaksOffset + i*3 + 2; + // Last number in each row is the parts number of that person + rowVector[subsetCounterIndex] = 1; + // Second last number in each row is the total score + const auto subsetScore = candidateBPtr[i*3+2]; + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + else + error("Unknown model, cast to int = " + std::to_string((int)poseModel), + __LINE__, __FUNCTION__, __FILE__); + } + else // if (numberA != 0 && numberB == 0) + { + if (poseModel == PoseModel::COCO_18 || poseModel == PoseModel::BODY_18 + || poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_23) + { + for (auto i = 1; i <= numberA; i++) + { + bool num = false; + const auto indexA = bodyPartA; + for (auto j = 0u; j < subset.size(); j++) + { + const auto off = (int)bodyPartA*peaksOffset + i*3 + 2; + if (subset[j].first[indexA] == off) + { + num = true; + break; + } + } + if (!num) + { + std::vector rowVector(subsetSize, 0); + // Store the index + rowVector[ bodyPartA ] = bodyPartA*peaksOffset + i*3 + 2; + // Last number in each row is the parts number of that person + rowVector[subsetCounterIndex] = 1; + // Second last number in each row is the total score + const auto subsetScore = 
candidateAPtr[i*3+2]; + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + } + else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) + { + for (auto i = 1; i <= numberA; i++) + { + std::vector rowVector(subsetSize, 0); + // Store the index + rowVector[ bodyPartA ] = bodyPartA*peaksOffset + i*3 + 2; + // Last number in each row is the parts number of that person + rowVector[subsetCounterIndex] = 1; + // Second last number in each row is the total score + const auto subsetScore = candidateAPtr[i*3+2]; + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + else + error("Unknown model, cast to int = " + std::to_string((int)poseModel), + __LINE__, __FUNCTION__, __FILE__); + } + } + else // if (numberA != 0 && numberB != 0) + { + std::vector> temp; + const auto* mapX = heatMapPtr + mapIdx[2*pairIndex] * heatMapOffset; + const auto* mapY = heatMapPtr + mapIdx[2*pairIndex+1] * heatMapOffset; + for (auto i = 1; i <= numberA; i++) + { + for (auto j = 1; j <= numberB; j++) + { + const auto vectorAToBX = candidateBPtr[j*3] - candidateAPtr[i*3]; + const auto vectorAToBY = candidateBPtr[j*3+1] - candidateAPtr[i*3+1]; + const auto vectorAToBMax = fastMax(std::abs(vectorAToBX), std::abs(vectorAToBY)); + const auto numberPointsInLine = fastMax(5, fastMin(25, intRound(std::sqrt(5*vectorAToBMax)))); + const auto vectorNorm = T(std::sqrt( vectorAToBX*vectorAToBX + vectorAToBY*vectorAToBY )); + // If the peaksPtr are coincident. Don't connect them. + if (vectorNorm > 1e-6) + { + const auto sX = candidateAPtr[i*3]; + const auto sY = candidateAPtr[i*3+1]; + const auto vectorAToBNormX = vectorAToBX/vectorNorm; + const auto vectorAToBNormY = vectorAToBY/vectorNorm; + + auto sum = 0.; + auto count = 0; + const auto vectorAToBXInLine = vectorAToBX/numberPointsInLine; + const auto vectorAToBYInLine = vectorAToBY/numberPointsInLine; + for (auto lm = 0; lm < numberPointsInLine; lm++) + { + const auto mX = fastMin(heatMapSize.x-1, intRound(sX + lm*vectorAToBXInLine)); + const auto mY = fastMin(heatMapSize.y-1, intRound(sY + lm*vectorAToBYInLine)); + checkGE(mX, 0, "", __LINE__, __FUNCTION__, __FILE__); + checkGE(mY, 0, "", __LINE__, __FUNCTION__, __FILE__); + const auto idx = mY * heatMapSize.x + mX; + const auto score = (vectorAToBNormX*mapX[idx] + vectorAToBNormY*mapY[idx]); + if (score > interThreshold) + { + sum += score; + count++; + } + } + + // parts score + connection score + if (count/(float)numberPointsInLine > interMinAboveThreshold) + temp.emplace_back(std::make_tuple(sum/count, i, j)); + } + } + } + + // select the top minAB connection, assuming that each part occur only once + // sort rows in descending order based on parts + connection score + if (!temp.empty()) + std::sort(temp.begin(), temp.end(), std::greater>()); + + std::vector> connectionK; + const auto minAB = fastMin(numberA, numberB); + std::vector occurA(numberA, 0); + std::vector occurB(numberB, 0); + auto counter = 0; + for (auto row = 0u; row < temp.size(); row++) + { + const auto score = std::get<0>(temp[row]); + const auto x = std::get<1>(temp[row]); + const auto y = std::get<2>(temp[row]); + if (!occurA[x-1] && !occurB[y-1]) + { + connectionK.emplace_back(std::make_tuple(bodyPartA*peaksOffset + x*3 + 2, + bodyPartB*peaksOffset + y*3 + 2, + score)); + counter++; + if (counter==minAB) + break; + occurA[x-1] = 1; + occurB[y-1] = 1; + } + } + + // Cluster all the body part candidates into subset based on the part connection + if (!connectionK.empty()) + { + // initialize first body 
part connection 15&16 + if (pairIndex==0) + { + for (const auto connectionKI : connectionK) + { + std::vector rowVector(numberBodyParts+3, 0); + const auto indexA = std::get<0>(connectionKI); + const auto indexB = std::get<1>(connectionKI); + const auto score = std::get<2>(connectionKI); + rowVector[bodyPartPairs[0]] = indexA; + rowVector[bodyPartPairs[1]] = indexB; + rowVector[subsetCounterIndex] = 2; + // add the score of parts and the connection + const auto subsetScore = peaksPtr[indexA] + peaksPtr[indexB] + score; + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + // Add ears connections (in case person is looking to opposite direction to camera) + else if (((poseModel == PoseModel::COCO_18 + || poseModel == PoseModel::BODY_18) && (pairIndex==17 || pairIndex==18)) + || (poseModel == PoseModel::BODY_19 && (pairIndex==18 || pairIndex==19)) + || (poseModel == PoseModel::BODY_23 && (pairIndex==22 || pairIndex==23))) + { + for (const auto& connectionKI : connectionK) + { + const auto indexA = std::get<0>(connectionKI); + const auto indexB = std::get<1>(connectionKI); + for (auto& subsetJ : subset) + { + auto& subsetJFirst = subsetJ.first[bodyPartA]; + auto& subsetJFirstPlus1 = subsetJ.first[bodyPartB]; + if (subsetJFirst == indexA && subsetJFirstPlus1 == 0) + subsetJFirstPlus1 = indexB; + else if (subsetJFirstPlus1 == indexB && subsetJFirst == 0) + subsetJFirst = indexA; + } + } + } + else + { + // A is already in the subset, find its connection B + for (const auto& connectionKI : connectionK) + { + const auto indexA = std::get<0>(connectionKI); + const auto indexB = std::get<1>(connectionKI); + const auto score = std::get<2>(connectionKI); + auto num = 0; + for (auto& subsetJ : subset) + { + if (subsetJ.first[bodyPartA] == indexA) + { + subsetJ.first[bodyPartB] = indexB; + num++; + subsetJ.first[subsetCounterIndex] = subsetJ.first[subsetCounterIndex] + 1; + subsetJ.second += peaksPtr[indexB] + score; + } + } + // if can not find partA in the subset, create a new subset + if (num==0) + { + std::vector rowVector(subsetSize, 0); + rowVector[bodyPartA] = indexA; + rowVector[bodyPartB] = indexB; + rowVector[subsetCounterIndex] = 2; + const auto subsetScore = peaksPtr[indexA] + peaksPtr[indexB] + score; + subset.emplace_back(std::make_pair(rowVector, subsetScore)); + } + } + } + } + } + } + + // Delete people below the following thresholds: + // a) minSubsetCnt: removed if less than minSubsetCnt body parts + // b) minSubsetScore: removed if global score smaller than this + // c) POSE_MAX_PEOPLE: keep first POSE_MAX_PEOPLE people above thresholds + auto numberPeople = 0; + std::vector validSubsetIndexes; + validSubsetIndexes.reserve(fastMin((size_t)POSE_MAX_PEOPLE, subset.size())); + for (auto index = 0u ; index < subset.size() ; index++) + { + const auto subsetCounter = subset[index].first[subsetCounterIndex]; + const auto subsetScore = subset[index].second; + if (subsetCounter >= minSubsetCnt && (subsetScore/subsetCounter) >= minSubsetScore) + { + numberPeople++; + validSubsetIndexes.emplace_back(index); + if (numberPeople == POSE_MAX_PEOPLE) + break; + } + else if (subsetCounter < 1) + error("Bad subsetCounter. 
Bug in this function if this happens.", + __LINE__, __FUNCTION__, __FILE__); + } + + // Fill and return poseKeypoints + if (numberPeople > 0) + { + poseKeypoints.reset({numberPeople, (int)numberBodyParts, 3}); + poseScores.reset(numberPeople); + } + else + { + poseKeypoints.reset(); + poseScores.reset(); + } + const auto numberBodyPartsAndPAFs = numberBodyParts + numberBodyPartPairs; + for (auto person = 0u ; person < validSubsetIndexes.size() ; person++) + { + const auto& subsetPair = subset[validSubsetIndexes[person]]; + const auto& subsetI = subsetPair.first; + for (auto bodyPart = 0u; bodyPart < numberBodyParts; bodyPart++) + { + const auto baseOffset = (person*numberBodyParts + bodyPart) * 3; + const auto bodyPartIndex = subsetI[bodyPart]; + if (bodyPartIndex > 0) + { + // Best results for 1 scale: x + 0, y + 0.5 + // +0.5 to both to keep Matlab format + poseKeypoints[baseOffset] = peaksPtr[bodyPartIndex-2] * scaleFactor + 0.5f; + poseKeypoints[baseOffset + 1] = peaksPtr[bodyPartIndex-1] * scaleFactor + 0.5f; + poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex]; + } + else + { + poseKeypoints[baseOffset] = 0.f; + poseKeypoints[baseOffset + 1] = 0.f; + poseKeypoints[baseOffset + 2] = 0.f; + } + } + poseScores[person] = subsetPair.second / (float)(numberBodyPartsAndPAFs); + } + cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) @@ -32,14 +362,16 @@ namespace op } } - template void connectBodyPartsGpu(Array& poseKeypoints, float* posePtr, const float* const heatMapPtr, - const float* const peaksPtr, const PoseModel poseModel, - const Point& heatMapSize, const int maxPeaks, + template void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, + const float* const heatMapPtr, const float* const peaksPtr, + const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const float interMinAboveThreshold, const float interThreshold, - const int minSubsetCnt, const float minSubsetScore, const float scaleFactor); - template void connectBodyPartsGpu(Array& poseKeypoints, double* posePtr, const double* const heatMapPtr, - const double* const peaksPtr, const PoseModel poseModel, - const Point& heatMapSize, const int maxPeaks, + const int minSubsetCnt, const float minSubsetScore, const float scaleFactor, + const float* const heatMapGpuPtr, const float* const peaksGpuPtr); + template void connectBodyPartsGpu(Array& poseKeypoints, Array& poseScores, + const double* const heatMapPtr, const double* const peaksPtr, + const PoseModel poseModel, const Point& heatMapSize, const int maxPeaks, const double interMinAboveThreshold, const double interThreshold, - const int minSubsetCnt, const double minSubsetScore, const double scaleFactor); + const int minSubsetCnt, const double minSubsetScore, const double scaleFactor, + const double* const heatMapGpuPtr, const double* const peaksGpuPtr); } diff --git a/src/openpose/pose/bodyPartConnectorCaffe.cpp b/src/openpose/pose/bodyPartConnectorCaffe.cpp index 867442f0..535a899f 100644 --- a/src/openpose/pose/bodyPartConnectorCaffe.cpp +++ b/src/openpose/pose/bodyPartConnectorCaffe.cpp @@ -183,20 +183,23 @@ namespace op } template - void BodyPartConnectorCaffe::Forward_gpu(const std::vector*>& bottom, - const std::vector*>& top, Array& poseKeypoints) + void BodyPartConnectorCaffe::Forward_gpu(const std::vector*>& bottom, Array& poseKeypoints, + Array& poseScores) { try { #if defined USE_CAFFE && defined USE_CUDA const auto heatMapsBlob = bottom.at(0); - const auto* const heatMapsPtr = heatMapsBlob->gpu_data(); - const 
auto* const peaksPtr = bottom.at(1)->gpu_data(); + const auto* const heatMapsPtr = heatMapsBlob->cpu_data(); + const auto* const peaksPtr = bottom.at(1)->cpu_data(); + const auto* const heatMapsGpuPtr = heatMapsBlob->gpu_data(); + const auto* const peaksGpuPtr = bottom.at(1)->gpu_data(); const auto maxPeaks = mTopSize[1]; - connectBodyPartsGpu(poseKeypoints, top.at(0)->mutable_gpu_data(), heatMapsPtr, peaksPtr, mPoseModel, - Point{heatMapsBlob->shape(3), heatMapsBlob->shape(2)}, maxPeaks, - mInterMinAboveThreshold, mInterThreshold, mMinSubsetCnt, mMinSubsetScore, - mScaleNetToOutput); + connectBodyPartsGpu(poseKeypoints, poseScores, heatMapsPtr, peaksPtr, mPoseModel, + Point{heatMapsBlob->shape(3), heatMapsBlob->shape(2)}, + maxPeaks, mInterMinAboveThreshold, mInterThreshold, + mMinSubsetCnt, mMinSubsetScore, mScaleNetToOutput, + heatMapsGpuPtr, peaksGpuPtr); #else UNUSED(bottom); UNUSED(top); diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp index 02397ce9..4b145bfd 100644 --- a/src/openpose/pose/poseExtractorCaffe.cpp +++ b/src/openpose/pose/poseExtractorCaffe.cpp @@ -281,11 +281,11 @@ namespace op upImpl->spBodyPartConnectorCaffe->setMinSubsetCnt((int)get(PoseProperty::ConnectMinSubsetCnt)); upImpl->spBodyPartConnectorCaffe->setMinSubsetScore((float)get(PoseProperty::ConnectMinSubsetScore)); - // GPU version not implemented yet + // CUDA version not implemented yet // #ifdef USE_CUDA // upImpl->spBodyPartConnectorCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get(), // upImpl->spPeaksBlob.get()}, - // {upImpl->spPoseBlob.get()}, mPoseKeypoints); + // mPoseKeypoints, mPoseScores); // #else upImpl->spBodyPartConnectorCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get(), upImpl->spPeaksBlob.get()}, -- GitLab
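For readers following the new `bodyPartConnectorBase.cu` code above: its core pair-scoring step is the same PAF line integral already used on the CPU path. The sketch below restates that step as a compact standalone function; it is a simplified illustration, assuming float heat maps in row-major width x height layout and candidate coordinates that lie inside the map, with `interThreshold` and `interMinAboveThreshold` passed in by the caller as in the real code.

```
#include <algorithm>
#include <cmath>
#include <vector>

// Simplified restatement of the PAF line-integral test used by
// connectBodyParts{Cpu,Gpu} in this patch: sample up to 25 points between
// candidates A and B, accumulate the dot product of the A->B unit vector with
// the part affinity field, and accept the pair only if enough samples clear
// interThreshold. Returns the mean score, or -1 if the pair is rejected.
float pafScore(const std::vector<float>& pafX, const std::vector<float>& pafY,
               const int width, const int height,
               const float aX, const float aY, const float bX, const float bY,
               const float interThreshold, const float interMinAboveThreshold)
{
    const float dX = bX - aX;
    const float dY = bY - aY;
    const float norm = std::sqrt(dX*dX + dY*dY);
    if (norm < 1e-6f)                        // coincident peaks: do not connect them
        return -1.f;
    const float dMax = std::max(std::abs(dX), std::abs(dY));
    const int numberPointsInLine = std::max(5, std::min(25, (int)std::round(std::sqrt(5.f*dMax))));
    const float uX = dX/norm;
    const float uY = dY/norm;
    float sum = 0.f;
    int count = 0;
    for (int lm = 0; lm < numberPointsInLine; lm++)
    {
        const int mX = std::min(width-1,  (int)std::round(aX + lm*dX/numberPointsInLine));
        const int mY = std::min(height-1, (int)std::round(aY + lm*dY/numberPointsInLine));
        const int idx = mY*width + mX;       // row-major indexing into the PAF maps
        const float score = uX*pafX[idx] + uY*pafY[idx];
        if (score > interThreshold)
        {
            sum += score;
            count++;
        }
    }
    // Enough of the sampled points must lie on the limb
    if (count/(float)numberPointsInLine > interMinAboveThreshold)
        return sum/count;
    return -1.f;
}
```

In the patch itself, pairs that pass this test are sorted by score, greedily assigned so that each candidate is used at most once, and merged into per-person subsets; the new `poseScores` output is each person's accumulated subset score divided by `numberBodyParts + numberBodyPartPairs`.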