Example 09_keypoints_from_heatmaps working for CUDA again

c4369cba · Gines Hidalgo · 3c9441ae · c4369cba · c4369cba · c4369cba
12 changed files
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -370,13 +370,14 @@ OpenPose Library - Release Notes
    1. Highly improved 3D triangulation for >3 cameras by fixing some small bugs.
    2. Added community-based support for Nvidia NVCaffe.
    3. Increased accuracy very lightly for CUDA version (about 0.01%) by adapting the threshold in `process()` in `bodyPartConnectorBase.cu` to `defaultNmsThreshold`. This also removes any posibility of future bugs in that function for using a default NMS threshold higher than 0.15 (which was the hard-coded value used previously).
-    4. Increased mAP but reduced mAR (both about 0.01%) as well as reduction of false positives. Step 1: removed legs where only knee/ankle/feet are found. Step 2: If no people is found in an image, `removePeopleBelowThresholds` is re-run with `maximizePositives = true`.
+    4. Increased mAP but reduced mAR (both about 0.01%) as well as reduced false positives. Step 1: removed legs where only knee/ankle/feet are found. Step 2: If no people are found in an image, `removePeopleBelowThresholdsAndFillFaces` is re-run with `maximizePositives = true`.
    5. Number of maximum people is not limited by the maximum number of max peaks anymore. However, the number of body part candidates for a specific keypoint (e.g., nose) is still limited to the number of max peaks.
+    6. Added more checks during destructors of CUDA-related functions and safer CUDA frees.
 2. Functions or parameters renamed:
    1. `--3d_min_views` default value (-1) no longer means that all camera views are required. Instead, it will be equal to max(2, min(4, #cameras-1)). This should provide a good trade-off between recall and precission.
 3. Main bugs fixed:
    1. Windows: Added back support for OpenGL and Spinnaker, as well as DLLs for debug compilation.
-    2. `06_face_from_image.cpp` and `07_hand_from_image.cpp` working again, they stopped working in version 1.5.0 with the GPU image resize for the GUI.
+    2. `06_face_from_image.cpp`, `07_hand_from_image.cpp`, and `09_keypoints_from_heatmaps.cpp` working again; they stopped working in version 1.5.0 with the GPU image resize for the GUI.
 4. Changes/additions that affect the compatibility with the OpenPose Unity Plugin:



--- a/src/openpose/core/cvMatToOpInput.cpp
+++ b/src/openpose/core/cvMatToOpInput.cpp
@@ -34,9 +34,23 @@ namespace op
                if (mGpuResize)
                {
                    // Free temporary memory
-                    cudaFree(pInputImageCuda);
-                    cudaFree(pOutputImageCuda);
-                    cudaFree(pInputImageReorderedCuda);
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                    if (pInputImageCuda != nullptr)
+                    {
+                        cudaFree(pInputImageCuda);
+                        pInputImageCuda = nullptr;
+                    }
+                    if (pOutputImageCuda != nullptr)
+                    {
+                        cudaFree(pOutputImageCuda);
+                        pOutputImageCuda = nullptr;
+                    }
+                    if (pInputImageReorderedCuda != nullptr)
+                    {
+                        cudaFree(pInputImageReorderedCuda);
+                        pInputImageReorderedCuda = nullptr;
+                    }
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                }
            #endif
        }

--- a/src/openpose/core/cvMatToOpOutput.cpp
+++ b/src/openpose/core/cvMatToOpOutput.cpp
@@ -38,8 +38,18 @@ namespace op
                if (mGpuResize)
                {
                    // Free temporary memory
-                    cudaFree(pInputImageCuda);
-                    cudaFree(*spOutputImageCuda);
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                    if (pInputImageCuda != nullptr)
+                    {
+                        cudaFree(pInputImageCuda);
+                        pInputImageCuda = nullptr;
+                    }
+                    if (*spOutputImageCuda != nullptr)
+                    {
+                        cudaFree(*spOutputImageCuda);
+                        *spOutputImageCuda = nullptr;
+                    }
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                }
            #endif
        }

--- a/src/openpose/core/gpuRenderer.cpp
+++ b/src/openpose/core/gpuRenderer.cpp
@@ -46,7 +46,10 @@ namespace op
        {
            #ifdef USE_CUDA
                if (mIsLastRenderer && spGpuMemory != nullptr)
+                {
                    cudaFree(*spGpuMemory);
+                    *spGpuMemory = nullptr;
+                }
            #endif
        }
        catch (const std::exception& e)

--- a/src/openpose/core/opOutputToCvMat.cpp
+++ b/src/openpose/core/opOutputToCvMat.cpp
@@ -37,8 +37,18 @@ namespace op
                if (mGpuResize)
                {
                    // Free temporary memory
-                    cudaFree(*spOutputImageFloatCuda);
-                    cudaFree(pOutputImageUCharCuda);
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                    if (*spOutputImageFloatCuda != nullptr)
+                    {
+                        cudaFree(*spOutputImageFloatCuda);
+                        *spOutputImageFloatCuda = nullptr;
+                    }
+                    if (pOutputImageUCharCuda != nullptr)
+                    {
+                        cudaFree(pOutputImageUCharCuda);
+                        pOutputImageUCharCuda = nullptr;
+                    }
+                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                }
            #endif
        }

--- a/src/openpose/face/faceGpuRenderer.cpp
+++ b/src/openpose/face/faceGpuRenderer.cpp
@@ -24,14 +24,28 @@ namespace op
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
            #ifdef USE_CUDA
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                if (pGpuFace != nullptr)
+                {
                    cudaFree(pGpuFace);
+                    pGpuFace = nullptr;
+                }
                if (pMaxPtr != nullptr)
+                {
                    cudaFree(pMaxPtr);
+                    pMaxPtr = nullptr;
+                }
                if (pMinPtr != nullptr)
+                {
                    cudaFree(pMinPtr);
+                    pMinPtr = nullptr;
+                }
                if (pScalePtr != nullptr)
+                {
                    cudaFree(pScalePtr);
+                    pScalePtr = nullptr;
+                }
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            #endif
        }
        catch (const std::exception& e)

--- a/src/openpose/hand/handGpuRenderer.cpp
+++ b/src/openpose/hand/handGpuRenderer.cpp
@@ -24,14 +24,28 @@ namespace op
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
            #ifdef USE_CUDA
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                if (pGpuHand != nullptr)
+                {
                    cudaFree(pGpuHand);
+                    pGpuHand = nullptr;
+                }
                if (pMaxPtr != nullptr)
+                {
                    cudaFree(pMaxPtr);
+                    pMaxPtr = nullptr;
+                }
                if (pMinPtr != nullptr)
+                {
                    cudaFree(pMinPtr);
+                    pMinPtr = nullptr;
+                }
                if (pScalePtr != nullptr)
+                {
                    cudaFree(pScalePtr);
+                    pScalePtr = nullptr;
+                }
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            #endif
        }
        catch (const std::exception& e)

--- a/src/openpose/net/bodyPartConnectorBase.cpp
+++ b/src/openpose/net/bodyPartConnectorBase.cpp
@@ -61,14 +61,14 @@ namespace op
    template <typename T>
    void getKeypointCounter(
        int& personCounter, const std::vector<std::pair<std::vector<int>, T>>& peopleVector,
-        const unsigned int index, const int indexFirst, const int indexLast, const int minimum)
+        const unsigned int part, const int partFirst, const int partLast, const int minimum)
    {
        try
        {
            // Count keypoints
            auto keypointCounter = 0;
-            for (auto i = indexFirst ; i < indexLast ; i++)
-                keypointCounter += (peopleVector[index].first.at(i) > 0);
+            for (auto i = partFirst ; i < partLast ; i++)
+                keypointCounter += (peopleVector[part].first.at(i) > 0);
            // If enough keypoints --> subtract them and keep them at least as big as minimum
            if (keypointCounter > minimum)
                personCounter += minimum-keypointCounter; // personCounter = non-considered keypoints + minimum
@@ -79,6 +79,53 @@ namespace op
        }
    }

+    template <typename T>
+    void getRoiDiameterAndBounds(
+        Rectangle<int>& roi, int& diameter, int& partFirstNon0, int& partLastNon0,
+        const std::vector<int>& personVector, const T* const peaksPtr,
+        const int partInit, const int partEnd)
+    {
+        // Outputs the bounding box (roi) of the scored keypoints in [partInit, partEnd), its longest side
+        // (diameter), and the first/last scored part (partFirstNon0/partLastNon0, both -1 if none is found).
+        try
+        {
+            roi = Rectangle<int>{0,0,0,0};
+            partFirstNon0 = -1;
+            partLastNon0 = -1;
+            for (auto part = partInit ; part < partEnd ; part++)
+            {
+                const auto peakIndex = personVector[part]; // x, y, score are read at peakIndex-2, -1, and -0
+                if (peakIndex < 2 || !(peaksPtr[peakIndex] > 0)) // skip unreadable (e.g., 0) or unscored peaks
+                    continue;
+                const auto x = static_cast<int>(peaksPtr[peakIndex-2]);
+                const auto y = static_cast<int>(peaksPtr[peakIndex-1]);
+                // Seed roi with the first scored keypoint; while scanning it holds [xMin, yMin, xMax, yMax]
+                if (partFirstNon0 < 0)
+                {
+                    partFirstNon0 = part;
+                    roi = Rectangle<int>{x, y, x, y}; // a {0,0,0,0} seed would pin the minimum corner to 0
+                }
+                partLastNon0 = part;
+                if (roi.x > x)
+                    roi.x = x;
+                if (roi.y > y)
+                    roi.y = y;
+                if (roi.width < x)
+                    roi.width = x;
+                if (roi.height < y)
+                    roi.height = y;
+            }
+            // From [p1, p2] to [p1, width, height]
+            roi.width -= roi.x;
+            roi.height -= roi.y;
+            diameter = fastMax(roi.width, roi.height);
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+        }
+    }
+
    template <typename T>
    std::vector<std::pair<std::vector<int>, T>> createPeopleVector(
        const T* const heatMapPtr, const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
@@ -643,32 +690,6 @@ namespace op
        }
    }

-    template <typename T>
-    void getRoiDiameterAndBounds(
-        Rectangle<int>& roi, int& diameter, int& indexFirstNon0, int& indexLastNon0,
-        const std::vector<int>& personVector, const T* const peaksPtr,
-        const int indexInit, const int indexEnd)
-    {
-        try
-        {
-            roi = Rectangle<int>{0,0,0,0};
-            for (auto index = 0u ; index < personVector.size()-1 ; index++)
-            {
-                const auto x = peaksPtr[personVector[index]-2];
-                const auto y = peaksPtr[personVector[index]-1];
-                const auto score = peaksPtr[personVector[index]];
-                if (roi.x > x)
-                    roi.x = x;
-                if (roi.y > y)
-                    roi.y = y;
-            }
-        }
-        catch (const std::exception& e)
-        {
-            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
-        }
-    }
-
    template <typename T>
    void removePeopleBelowThresholdsAndFillFaces(
        std::vector<int>& validSubsetIndexes, int& numberPeople,
@@ -693,28 +714,28 @@ namespace op
            std::vector<int> faceInvalidSubsetIndexes;
            faceInvalidSubsetIndexes.reserve(peopleVector.size());
            // For each person candidate
-            for (auto index = 0u ; index < peopleVector.size() ; index++)
+            for (auto person = 0u ; person < peopleVector.size() ; person++)
            {
-                auto personCounter = peopleVector[index].first.back();
+                auto personCounter = peopleVector[person].first.back();
                // Analog for hand/face keypoints
                if (numberBodyParts >= 135)
                {
                    // No consider face keypoints for personCounter
                    const auto currentCounter = personCounter;
-                    getKeypointCounter(personCounter, peopleVector, index, 65, 135, 1);
+                    getKeypointCounter(personCounter, peopleVector, person, 65, 135, 1);
                    const auto newCounter = personCounter;
-                    if (personCounter == 0)
+                    if (personCounter == 1)
                    {
-                        faceInvalidSubsetIndexes.emplace_back(index);
+                        faceInvalidSubsetIndexes.emplace_back(person);
                        continue;
                    }
                    // If body is still valid and facial points were removed, then add to valid faces
                    else if (currentCounter != newCounter)
-                        faceValidSubsetIndexes.emplace_back(index);
+                        faceValidSubsetIndexes.emplace_back(person);
                    // No consider right hand keypoints for personCounter
-                    getKeypointCounter(personCounter, peopleVector, index, 45, 65, 1);
+                    getKeypointCounter(personCounter, peopleVector, person, 45, 65, 1);
                    // No consider left hand keypoints for personCounter
-                    getKeypointCounter(personCounter, peopleVector, index, 25, 45, 1);
+                    getKeypointCounter(personCounter, peopleVector, person, 25, 45, 1);
                }
                // Foot keypoints do not affect personCounter (too many false positives,
                // same foot usually appears as both left and right keypoints)
@@ -724,7 +745,7 @@ namespace op
                if (!maximizePositives && (numberBodyParts == 25 || numberBodyParts > 70))
                {
                    const auto currentCounter = personCounter;
-                    getKeypointCounter(personCounter, peopleVector, index, 19, 25, 0);
+                    getKeypointCounter(personCounter, peopleVector, person, 19, 25, 0);
                    const auto newCounter = personCounter;
                    // Problem: Same leg/foot keypoints are considered for both left and right keypoints.
                    // Solution: Remove legs that are duplicated and that do not have upper torso
@@ -733,11 +754,11 @@ namespace op
                        continue;
                }
                // Add only valid people
-                const auto personScore = peopleVector[index].second;
+                const auto personScore = peopleVector[person].second;
                if (personCounter >= minSubsetCnt && (personScore/personCounter) >= minSubsetScore)
                {
                    numberPeople++;
-                    validSubsetIndexes.emplace_back(index);
+                    validSubsetIndexes.emplace_back(person);
                    // // This is not required, it is OK if there are more people. No more GPU memory used.
                    // if (numberPeople == maxPeaks)
                    //     break;
@@ -747,25 +768,6 @@ namespace op
                    error("Bad personCounter (" + std::to_string(personCounter) + "). Bug in this"
                          " function if this happens.", __LINE__, __FUNCTION__, __FILE__);
            }
-//             // Random standalone facial keypoints --> Merge into a more complete face
-//             if (numberPeople > 0 && faceInvalidSubsetIndexes.size() > 0)
-//             {
-//                 for (auto faceId = 0u ; faceId < faceInvalidSubsetIndexes.size() ; faceId++)
-//                 {
-//                     // Get ROI
-//                     Rectangle<int> roi;
-//                     int diameter;
-//                     int indexFirstNon0;
-//                     int indexLastNon0;
-//                     const auto index = faceValidSubsetIndexes[faceId];
-//                     getRoiDiameterAndBounds(
-//                         roi, diameter, indexFirstNon0, indexLastNon0, peopleVector[index].first, peaksPtr, 65, 135);
-//                     // const auto personCounter = peopleVector[index].first.back();
-//                     // const auto x = peaksPtr[peopleVector[index].first[part]-2];
-//                     // const auto y = peaksPtr[peopleVector[index].first[part]-1];
-//                     // const auto score = peaksPtr[peopleVector[index].first[part]];
-//                 }
-//             }
            // If no people found --> Repeat with maximizePositives = true
            // Result: Increased COCO mAP because we catch more foot-only images
            if (numberPeople == 0 && !maximizePositives)

--- a/src/openpose/net/bodyPartConnectorCaffe.cpp
+++ b/src/openpose/net/bodyPartConnectorCaffe.cpp
@@ -3,6 +3,7 @@
 #endif
 #ifdef USE_CUDA
    #include <openpose/gpu/cuda.hpp>
+    #include <openpose/gpu/cuda.hu>
 #endif
 #ifdef USE_OPENCL
    #include <openpose/gpu/opencl.hcl>
@@ -41,14 +42,28 @@ namespace op
        try
        {
            #if defined USE_CAFFE && defined USE_CUDA
-                cudaFree(pBodyPartPairsGpuPtr);
-                cudaFree(pMapIdxGpuPtr);
-                cudaFree(pFinalOutputGpuPtr);
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
+                if (pBodyPartPairsGpuPtr != nullptr)
+                {
+                    cudaFree(pBodyPartPairsGpuPtr);
+                    pBodyPartPairsGpuPtr = nullptr;
+                }
+                if (pMapIdxGpuPtr != nullptr)
+                {
+                    cudaFree(pMapIdxGpuPtr);
+                    pMapIdxGpuPtr = nullptr;
+                }
+                if (pFinalOutputGpuPtr != nullptr)
+                {
+                    cudaFree(pFinalOutputGpuPtr);
+                    pFinalOutputGpuPtr = nullptr;
+                }
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            #endif
        }
        catch (const std::exception& e)
        {
-            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            errorDestructor(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }


--- a/src/openpose/net/nmsCaffe.cpp
+++ b/src/openpose/net/nmsCaffe.cpp
@@ -45,9 +45,9 @@ namespace op
            #if defined USE_CAFFE && defined USE_OPENCL
                try
                {
-                    if(mKernelGpuPtr != nullptr)
+                    if (mKernelGpuPtr != nullptr)
                        clReleaseMemObject((cl_mem)mKernelGpuPtr);
-                    if(mKernelCpuPtr != nullptr)
+                    if (mKernelCpuPtr != nullptr)
                        delete mKernelCpuPtr;
                }
                catch (const std::exception& e)

--- a/src/openpose/net/resizeAndMergeBase.cu
+++ b/src/openpose/net/resizeAndMergeBase.cu
@@ -4,8 +4,18 @@

 namespace op
 {
+    const auto THREADS_PER_BLOCK = 256u;
    const auto THREADS_PER_BLOCK_1D = 16u;

+    template <typename T>
+    __global__ void fillKernel( // Copies sourcePtr[i] into targetPtr[i] for every index i below N
+        T* targetPtr, const T* const sourcePtr, const int N)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; // flat index built from the x-dimension only
+        if (x < N) // threads whose index falls at or beyond N perform no access
+            targetPtr[x] = sourcePtr[x];
+    }
+
    // template <typename T>
    // __global__ void resizeKernelOld(
    //     T* targetPtr, const T* const sourcePtr, const int widthSource, const int heightSource, const int widthTarget,
@@ -329,18 +339,29 @@ namespace op

                    // Optimized function for 8x resize
                    // OP_CUDA_PROFILE_INIT(REPS);
-                    if (widthTarget / widthSource != 8 || heightTarget / heightSource != 8)
-                        error("Kernel only implemented for 8x resize. Notify us if this error appears.",
-                            __LINE__, __FUNCTION__, __FILE__);
-                    const auto rescaleFactor = (unsigned int) std::ceil(heightTarget / (float)(heightSource));
-                    const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
-                    const dim3 numBlocks{
-                        getNumberCudaBlocks(widthTarget, threadsPerBlock.x),
-                        getNumberCudaBlocks(heightTarget, threadsPerBlock.y),
-                        getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
-                    resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
-                        targetPtr, sourcePtrs.at(0), widthSource, heightSource, widthTarget, heightTarget,
-                        rescaleFactor);
+                    if (widthTarget / widthSource == 1 && heightTarget / heightSource == 1)
+                    {
+                        const auto N = widthTarget * heightTarget * num * channels;
+                        const dim3 threadsPerBlock{THREADS_PER_BLOCK};
+                        const dim3 numBlocks{getNumberCudaBlocks(N, threadsPerBlock.x)};
+                        fillKernel<<<numBlocks, threadsPerBlock>>>(
+                            targetPtr, sourcePtrs.at(0), N);
+                    }
+                    else
+                    {
+                        if (widthTarget / widthSource != 8 || heightTarget / heightSource != 8)
+                            error("Kernel only implemented for 8x resize. Notify us if this error appears.",
+                                __LINE__, __FUNCTION__, __FILE__);
+                        const auto rescaleFactor = (unsigned int) std::ceil(heightTarget / (float)(heightSource));
+                        const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
+                        const dim3 numBlocks{
+                            getNumberCudaBlocks(widthTarget, threadsPerBlock.x),
+                            getNumberCudaBlocks(heightTarget, threadsPerBlock.y),
+                            getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
+                        resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
+                            targetPtr, sourcePtrs.at(0), widthSource, heightSource, widthTarget, heightTarget,
+                            rescaleFactor);
+                    }
                    // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);

                    // // Profiling code
@@ -482,10 +503,14 @@ namespace op
                    widthTarget, heightTarget, sourcePtrs[0], sourcePtrs[1], sourcePtrs[2], sourcePtrs[3],
                    sourcePtrs[4], sourcePtrs[5], sourcePtrs[6], sourcePtrs[7]);
                // Free memory
-                cudaFree(widthSources);
-                cudaFree(heightSources);
-                cudaFree(scaleWidths);
-                cudaFree(scaleHeights);
+                if (widthSources != nullptr)
+                    cudaFree(widthSources);
+                if (heightSources != nullptr)
+                    cudaFree(heightSources);
+                if (scaleWidths != nullptr)
+                    cudaFree(scaleWidths);
+                if (scaleHeights != nullptr)
+                    cudaFree(scaleHeights);
                // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);

                // // Profiling code

--- a/src/openpose/pose/poseGpuRenderer.cpp
+++ b/src/openpose/pose/poseGpuRenderer.cpp
@@ -34,16 +34,32 @@ namespace op
        try
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
+            log("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
            #ifdef USE_CUDA
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                if (pGpuPose != nullptr)
+                {
                    cudaFree(pGpuPose);
+                    pGpuPose = nullptr;
+                }
                if (pMaxPtr != nullptr)
+                {
                    cudaFree(pMaxPtr);
+                    pMaxPtr = nullptr;
+                }
                if (pMinPtr != nullptr)
+                {
                    cudaFree(pMinPtr);
+                    pMinPtr = nullptr;
+                }
                if (pScalePtr != nullptr)
+                {
                    cudaFree(pScalePtr);
+                    pScalePtr = nullptr;
+                }
+                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            #endif
+            log("", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
        {