Commit 5c873aa9 authored by gineshidalgo99

ResizeAndMerge multi-scale more robust

Parent d1d483f3
@@ -113,7 +113,7 @@ OpenPose Library - Release Notes
## Current version (future OpenPose 1.2.0alpha)
## Current version (future OpenPose 1.2.0)
1. Main improvements:
1. Speed increase when processing images with different aspect ratios. E.g. ~20% increase over 3.7k COCO validation images on 1 scale.
2. Huge speed increase and memory reduction when processing multi-scale. E.g. over 3.7k COCO validation images on 4 scales: ~40% (~770 to ~450 sec) speed increase, ~25% memory reduction (from ~8.9 to ~6.7 GB / GPU).
@@ -13,8 +13,5 @@ OP_BIN=./build/examples/openpose/openpose.bin
# 1 scale
$OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_test.json --no_display --render_pose 0
# # 3 scales
# $OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_3.json --no_display --render_pose 0 --scale_number 3 --scale_gap 0.25
# 4 scales
$OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_4_test.json --no_display --render_pose 0 --scale_number 4 --scale_gap 0.25 --net_resolution "1312x736"
@@ -32,5 +32,5 @@ $OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1.json --no_di
# # 4 scales
# $OP_BIN --num_gpu 1 --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_4.json --no_display --render_pose 0 --scale_number 4 --scale_gap 0.25 --net_resolution "1312x736" --frame_last 3558
# Debugging - Rendered frames saved
# # Debugging - Rendered frames saved
# $OP_BIN --image_dir $IMAGE_FOLDER --write_images ${JSON_FOLDER}frameOutput --no_display
@@ -15,30 +15,46 @@ namespace op
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + 0.5f) * sourceWidth / T(targetWidth) - 0.5f;
const T ySource = (y + 0.5f) * sourceHeight / T(targetHeight) - 0.5f;
const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
}
}
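Note on the mapping above: (x + 0.5) * sourceWidth / targetWidth - 0.5 converts the centre of a target pixel into continuous source coordinates, so the resized maps stay aligned with the source instead of drifting by half a pixel; wrapping the 0.5f literals in T(...) presumably keeps the arithmetic in the template type for non-float instantiations. A minimal standalone sketch of the same mapping (the helper name pixelCentreCoordinate is illustrative, not part of the OpenPose sources):

template <typename T>
inline T pixelCentreCoordinate(const int target, const int targetSize, const int sourceSize)
{
    // Continuous source coordinate of the centre of target pixel `target`.
    // E.g. upscaling 4 px -> 8 px: target pixel 0 maps to source -0.25 and pixel 7 to 3.25,
    // so the sampled range stays centred on the source image.
    return (target + T(0.5)) * sourceSize / T(targetSize) - T(0.5);
}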
template <typename T>
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const int averageCounter)
__global__ void resizeKernelAndAdd(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
targetPtr[y*targetWidth+x] += bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
}
}
template <typename T>
__global__ void resizeKernelAndAverage(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const int counter)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
const auto interpolated = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
auto& targetPixel = targetPtr[y*targetWidth+x];
targetPixel = ((averageCounter * targetPixel) + interpolated) / T(averageCounter + 1);
// targetPixel = fastMax(targetPixel, interpolated);
targetPixel = (targetPixel + interpolated) / T(counter);
}
}
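The replaced resizeKernelAndMerge kernel kept a running average, re-reading and re-scaling the already accumulated target at every scale. The new pair splits the work: resizeKernelAndAdd accumulates plain sums for every scale but the last, and resizeKernelAndAverage adds the final sample and divides once by the number of scales, which yields the same arithmetic mean with a single division per pixel. A host-side sketch with made-up sample values, only to illustrate the equivalence:

#include <cstdio>

int main()
{
    // Hypothetical bicubic samples of one target pixel at 4 scales.
    const float samples[] = {0.20f, 0.35f, 0.50f, 0.65f};
    const int counter = 4;

    // Old scheme: incremental running average updated after every scale.
    float runningAverage = 0.f;
    for (int k = 0; k < counter; k++)
        runningAverage = (k * runningAverage + samples[k]) / float(k + 1);

    // New scheme: plain adds for scales 0..counter-2 (resizeKernelAndAdd),
    // then one add plus one divide on the last scale (resizeKernelAndAverage).
    float accumulated = 0.f;
    for (int k = 0; k < counter - 1; k++)
        accumulated += samples[k];
    accumulated = (accumulated + samples[counter - 1]) / float(counter);

    // Both print the same mean, 0.425.
    std::printf("running average: %f, add-then-average: %f\n", runningAverage, accumulated);
    return 0;
}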
@@ -91,14 +107,13 @@ namespace op
}
// Old inefficient multi-scale merging
else
error("It should never reaches this point. Notify us.", __LINE__, __FUNCTION__, __FILE__);
error("It should never reache this point. Notify us otherwise.", __LINE__, __FUNCTION__, __FILE__);
}
// Multi-scale merging
else
{
const auto targetChannelOffset = targetWidth * targetHeight;
cudaMemset(targetPtr, 0.f, channels*targetChannelOffset * sizeof(T));
auto averageCounter = -1;
const auto scaleToMainScaleWidth = targetWidth / T(sourceWidth);
const auto scaleToMainScaleHeight = targetHeight / T(sourceHeight);
@@ -111,14 +126,29 @@ namespace op
const auto scaleInputToNet = scaleInputToNetInputs[i] / scaleInputToNetInputs[0];
const auto scaleWidth = scaleToMainScaleWidth / scaleInputToNet;
const auto scaleHeight = scaleToMainScaleHeight / scaleInputToNet;
averageCounter++;
for (auto c = 0 ; c < channels ; c++)
// All but last image --> add
if (i < sourceSizes.size() - 1)
{
resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight, averageCounter
);
for (auto c = 0 ; c < channels ; c++)
{
resizeKernelAndAdd<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight
);
}
}
// Last image --> average all
else
{
for (auto c = 0 ; c < channels ; c++)
{
resizeKernelAndAverage<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight, sourceSizes.size()
);
}
}
}
}
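For context, numBlocks and threadsPerBlock in the launches above form a standard 2-D CUDA grid covering targetWidth x targetHeight; they are computed elsewhere in the library, so the concrete block size below is only an assumed, illustrative choice:

#include <cuda_runtime.h>

int main()
{
    // Hypothetical output size; in OpenPose it follows from the net output resolution.
    const unsigned int targetWidth = 656u;
    const unsigned int targetHeight = 368u;
    // 16x16 threads per block is a common default; the real helper may choose differently.
    const dim3 threadsPerBlock{16u, 16u, 1u};
    const dim3 numBlocks{(targetWidth + threadsPerBlock.x - 1u) / threadsPerBlock.x,
                         (targetHeight + threadsPerBlock.y - 1u) / threadsPerBlock.y,
                         1u};
    // Each kernel is then launched once per channel with these dimensions, e.g.
    // resizeKernelAndAdd<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset, ...);
    return 0;
}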