From 35f8c9d18f9d252acb3efcadec327661105c58b8 Mon Sep 17 00:00:00 2001 From: joker3212 Date: Fri, 3 May 2019 21:42:31 -0400 Subject: [PATCH] Render and resize speedup in CUDA (#1209) --- include/openpose/face/faceGpuRenderer.hpp | 3 + include/openpose/face/renderFace.hpp | 9 +- include/openpose/gpu/cuda.hu | 24 +- include/openpose/hand/handGpuRenderer.hpp | 3 + include/openpose/hand/renderHand.hpp | 11 +- include/openpose/net/resizeAndMergeBase.hpp | 10 +- include/openpose/pose/poseGpuRenderer.hpp | 3 + include/openpose/pose/renderPose.hpp | 66 ++-- include/openpose/utilities/render.hu | 223 ++++++++++- src/openpose/face/faceGpuRenderer.cpp | 23 +- src/openpose/face/renderFace.cu | 42 +- src/openpose/hand/handGpuRenderer.cpp | 21 +- src/openpose/hand/renderHand.cu | 42 +- src/openpose/net/resizeAndMergeBase.cu | 217 ++++++++++- src/openpose/pose/poseGpuRenderer.cpp | 33 +- src/openpose/pose/renderPose.cu | 406 +++++++++++--------- 16 files changed, 847 insertions(+), 289 deletions(-) diff --git a/include/openpose/face/faceGpuRenderer.hpp b/include/openpose/face/faceGpuRenderer.hpp index dba5ece0..f0d82a4b 100644 --- a/include/openpose/face/faceGpuRenderer.hpp +++ b/include/openpose/face/faceGpuRenderer.hpp @@ -23,6 +23,9 @@ namespace op private: float* pGpuFace; // GPU aux memory + float* pMaxPtr; // GPU aux memory + float* pMinPtr; // GPU aux memory + float* pScalePtr; // GPU aux memory DELETE_COPY(FaceGpuRenderer); }; diff --git a/include/openpose/face/renderFace.hpp b/include/openpose/face/renderFace.hpp index 3582d660..ed7a90e4 100644 --- a/include/openpose/face/renderFace.hpp +++ b/include/openpose/face/renderFace.hpp @@ -6,10 +6,13 @@ namespace op { - OP_API void renderFaceKeypointsCpu(Array& frameArray, const Array& faceKeypoints, const float renderThreshold); + OP_API void renderFaceKeypointsCpu( + Array& frameArray, const Array& faceKeypoints, const float renderThreshold); - void renderFaceKeypointsGpu(float* framePtr, const Point& frameSize, const float* const facePtr, const int numberPeople, - const float renderThreshold, const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT); + void renderFaceKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point& frameSize, + const float* const facePtr, const int numberPeople, const float renderThreshold, + const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT); } #endif // OPENPOSE_FACE_RENDER_FACE_HPP diff --git a/include/openpose/gpu/cuda.hu b/include/openpose/gpu/cuda.hu index f0d842ec..6e299c0a 100644 --- a/include/openpose/gpu/cuda.hu +++ b/include/openpose/gpu/cuda.hu @@ -121,8 +121,9 @@ namespace op } template - inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, - const int heightSource, const int widthSourcePtr) + inline __device__ T bicubicInterpolate( + const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, + const int widthSourcePtr) { int xIntArray[4]; int yIntArray[4]; @@ -140,6 +141,25 @@ namespace op return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy); } + template + inline __device__ T bicubicInterpolate8Times( + const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, + const int widthSourcePtr, const int threadIdxX, const int threadIdxY) + { + // Now we only need dx and dy + const T dx = xSource - fastTruncateCuda(int(xSource + 1e-5), 0, widthSource - 1); + const T dy = ySource - fastTruncateCuda(int(ySource + 1e-5), 0, heightSource - 1); + + T temp[4]; + for (unsigned char i = 0; i < 4; i++) + { + const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0); + temp[i] = cubicInterpolate(sourcePtr[offset + 0], sourcePtr[offset + 1], + sourcePtr[offset + 2], sourcePtr[offset + 3], dx); + } + return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy); + } + template inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2) { diff --git a/include/openpose/hand/handGpuRenderer.hpp b/include/openpose/hand/handGpuRenderer.hpp index a9eb77d4..47e59b08 100644 --- a/include/openpose/hand/handGpuRenderer.hpp +++ b/include/openpose/hand/handGpuRenderer.hpp @@ -23,6 +23,9 @@ namespace op private: float* pGpuHand; // GPU aux memory + float* pMaxPtr; // GPU aux memory + float* pMinPtr; // GPU aux memory + float* pScalePtr; // GPU aux memory DELETE_COPY(HandGpuRenderer); }; diff --git a/include/openpose/hand/renderHand.hpp b/include/openpose/hand/renderHand.hpp index dea0d024..7610a8d8 100644 --- a/include/openpose/hand/renderHand.hpp +++ b/include/openpose/hand/renderHand.hpp @@ -6,12 +6,13 @@ namespace op { - OP_API void renderHandKeypointsCpu(Array& frameArray, const std::array, 2>& handKeypoints, - const float renderThreshold); + OP_API void renderHandKeypointsCpu( + Array& frameArray, const std::array, 2>& handKeypoints, const float renderThreshold); - void renderHandKeypointsGpu(float* framePtr, const Point& frameSize, const float* const handsPtr, - const int numberHands, const float renderThreshold, - const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT); + void renderHandKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point& frameSize, + const float* const handsPtr, const int numberHands, const float renderThreshold, + const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT); } #endif // OPENPOSE_HAND_GPU_HAND_RENDER_HPP diff --git a/include/openpose/net/resizeAndMergeBase.hpp b/include/openpose/net/resizeAndMergeBase.hpp index 75c72406..ddfc487f 100644 --- a/include/openpose/net/resizeAndMergeBase.hpp +++ b/include/openpose/net/resizeAndMergeBase.hpp @@ -22,6 +22,14 @@ namespace op T* targetPtr, const std::vector& sourcePtrs, std::vector& sourceTempPtrs, const std::array& targetSize, const std::vector>& sourceSizes, const std::vector& scaleInputToNetInputs = {1.f}, const int gpuID = 0); -} + // Functions for the files cvMatToOpInput/Output + void resizeAndMergeRGBGPU( + float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight, + const int targetWidth, const int targetHeight, const float scaleFactor); + + void reorderAndCast( + float* targetPtr, const unsigned char* const srcPtr, const int width, const int height); + +} #endif // OPENPOSE_NET_RESIZE_AND_MERGE_BASE_HPP diff --git a/include/openpose/pose/poseGpuRenderer.hpp b/include/openpose/pose/poseGpuRenderer.hpp index 4553d074..1238d9d4 100644 --- a/include/openpose/pose/poseGpuRenderer.hpp +++ b/include/openpose/pose/poseGpuRenderer.hpp @@ -31,6 +31,9 @@ namespace op const std::shared_ptr spPoseExtractorNet; // Init with thread float* pGpuPose; // GPU aux memory + float* pMaxPtr; // GPU aux memory + float* pMinPtr; // GPU aux memory + float* pScalePtr; // GPU aux memory DELETE_COPY(PoseGpuRenderer); }; diff --git a/include/openpose/pose/renderPose.hpp b/include/openpose/pose/renderPose.hpp index c72dd7f4..196b3617 100644 --- a/include/openpose/pose/renderPose.hpp +++ b/include/openpose/pose/renderPose.hpp @@ -8,39 +8,39 @@ namespace op { - OP_API void renderPoseKeypointsCpu(Array& frameArray, const Array& poseKeypoints, - const PoseModel poseModel, const float renderThreshold, - const bool blendOriginalFrame = true); - - void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople, - const Point& frameSize, const float* const posePtr, - const float renderThreshold, const bool googlyEyes = false, - const bool blendOriginalFrame = true, - const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT); - - void renderPoseHeatMapGpu(float* frame, const Point& frameSize, const float* const heatMapPtr, - const Point& heatMapSize, const float scaleToKeepRatio, - const unsigned int part, - const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); - - void renderPoseHeatMapsGpu(float* frame, const PoseModel poseModel, const Point& frameSize, - const float* const heatMapPtr, const Point& heatMapSize, - const float scaleToKeepRatio, - const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); - - void renderPosePAFGpu(float* framePtr, const PoseModel poseModel, const Point& frameSize, - const float* const heatMapPtr, const Point& heatMapSize, - const float scaleToKeepRatio, const int part, - const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); - - void renderPosePAFsGpu(float* framePtr, const PoseModel poseModel, const Point& frameSize, - const float* const heatMapPtr, const Point& heatMapSize, - const float scaleToKeepRatio, - const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); - - void renderPoseDistanceGpu(float* framePtr, const Point& frameSize, const float* const heatMapPtr, - const Point& heatMapSize, const float scaleToKeepRatio, - const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); + OP_API void renderPoseKeypointsCpu( + Array& frameArray, const Array& poseKeypoints, const PoseModel poseModel, + const float renderThreshold, const bool blendOriginalFrame = true); + + void renderPoseKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel, + const int numberPeople, const Point& frameSize, const float* const posePtr, + const float renderThreshold, const bool googlyEyes = false, const bool blendOriginalFrame = true, + const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT); + + void renderPoseHeatMapGpu( + float* frame, const Point& frameSize, const float* const heatMapPtr, const Point& heatMapSize, + const float scaleToKeepRatio, const unsigned int part, + const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); + + void renderPoseHeatMapsGpu( + float* frame, const PoseModel poseModel, const Point& frameSize, const float* const heatMapPtr, + const Point& heatMapSize, const float scaleToKeepRatio, + const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); + + void renderPosePAFGpu( + float* framePtr, const PoseModel poseModel, const Point& frameSize, const float* const heatMapPtr, + const Point& heatMapSize, const float scaleToKeepRatio, const int part, + const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); + + void renderPosePAFsGpu( + float* framePtr, const PoseModel poseModel, const Point& frameSize, const float* const heatMapPtr, + const Point& heatMapSize, const float scaleToKeepRatio, + const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); + + void renderPoseDistanceGpu( + float* framePtr, const Point& frameSize, const float* const heatMapPtr, const Point& heatMapSize, + const float scaleToKeepRatio, const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP); } #endif // OPENPOSE_POSE_RENDER_POSE_HPP diff --git a/include/openpose/utilities/render.hu b/include/openpose/utilities/render.hu index 4948f373..0ff4791f 100644 --- a/include/openpose/utilities/render.hu +++ b/include/openpose/utilities/render.hu @@ -3,16 +3,217 @@ namespace op { - inline __device__ void renderKeypoints(float* targetPtr, float2* sharedMaxs, float2* sharedMins, - float* sharedScaleF, const int globalIdx, const int x, const int y, - const int targetWidth, const int targetHeight, - const float* const keypointsPtr, const unsigned int* const partPairsPtr, - const int numberPeople, const int numberParts, const int numberPartPairs, - const float* const rgbColorsPtr, const int numberColors, const float radius, - const float lineWidth, const float* const keypointScalePtr, - const int numberScales, const float threshold, - const float alphaColorToAdd, const bool blendOriginalFrame = true, - const int googlyEye1 = -1, const int googlyEye2 = -1) + __inline__ __device__ void getBoundingBoxPerPerson( + float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight, + const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold) + { + const auto globalIdx = threadIdx.x; + // const auto xIndex = 2*globalIdx; + // const auto yIndex = xIndex+1; + const auto xIndex = globalIdx; + const auto yIndex = numberPeople+globalIdx; + + // Fill shared parameters + // if (globalIdx < numberPeople) + { + auto minValueX = (float)targetWidth; + auto minValueY = (float)targetHeight; + auto maxValueX = 0.f; + auto maxValueY = 0.f; + for (auto part = 0 ; part < numberParts ; part++) + { + const auto index = 3 * (globalIdx*numberParts + part); + const auto x = keypointsPtr[index]; + const auto y = keypointsPtr[index+1]; + const auto score = keypointsPtr[index+2]; + if (score > threshold) + { + if (x < minValueX) + minValueX = x; + if (x > maxValueX) + maxValueX = x; + if (y < minValueY) + minValueY = y; + if (y > maxValueY) + maxValueY = y; + } + } + if (maxValueX != 0.f && maxValueY != 0.f) + { + const auto averageX = maxValueX - minValueX; + const auto averageY = maxValueY - minValueY; + // (averageX + averageY) / 2.f / 400.f + scalePtr[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f); + const auto constantToAdd = 50.f; + maxValueX += constantToAdd; + maxValueY += constantToAdd; + minValueX -= constantToAdd; + minValueY -= constantToAdd; + } + + minPtr[xIndex] = minValueX; + minPtr[yIndex] = minValueY; + maxPtr[xIndex] = maxValueX; + maxPtr[yIndex] = maxValueY; + } + } + + __inline__ __device__ void renderKeypoints( + float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr, + const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y, + const int targetWidth, const int targetHeight, const float* const keypointsPtr, + const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts, + const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius, + const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold, + const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1, + const int googlyEye2 = -1) + { + // Load shared memory + if (globalIdx < 2*numberPeople) + { + sharedMins[globalIdx] = minPtr[globalIdx]; + sharedMaxs[globalIdx] = maxPtr[globalIdx]; + if (globalIdx < numberPeople) + sharedScaleF[globalIdx] = scalePtr[globalIdx]; + } + __syncthreads(); + + // Fill each (x,y) target pixel + if (x < targetWidth && y < targetHeight) + { + const auto baseIndex = 3*(y * targetWidth + x); + auto b = targetPtr[baseIndex]; + auto g = targetPtr[baseIndex+1]; + auto r = targetPtr[baseIndex+2]; + if (!blendOriginalFrame) + { + b = 0.f; + g = 0.f; + r = 0.f; + } + + const auto lineWidthSquared = lineWidth * lineWidth; + const auto radiusSquared = radius * radius; + for (auto person = 0; person < numberPeople; person++) + { + // Make sure person x,y in the limits + // Make sure person is not empty. Assume all joints are below threshold. Then + // maxs = 0 and mins = width/height. So if statement would be false + // const auto xIndex = 2*person; + // const auto yIndex = xIndex+1; + const auto xIndex = person; + const auto yIndex = numberPeople+person; + if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex] + && y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex]) + { + // Part pair connections + for (auto partPair = 0; partPair < numberPartPairs; partPair++) + { + const auto partA = partPairsPtr[2*partPair]; + const auto partB = partPairsPtr[2*partPair+1]; + const auto indexA = person*numberParts*3 + partA*3; + const auto xA = keypointsPtr[indexA]; + const auto yA = keypointsPtr[indexA + 1]; + const auto scoreA = keypointsPtr[indexA + 2]; + const auto indexB = person*numberParts*3 + partB*3; + const auto xB = keypointsPtr[indexB]; + const auto yB = keypointsPtr[indexB + 1]; + const auto scoreB = keypointsPtr[indexB + 2]; + + if (scoreA > threshold && scoreB > threshold) + { + const auto keypointScale = keypointScalePtr[partB%numberScales] + * keypointScalePtr[partB%numberScales] + * keypointScalePtr[partB%numberScales]; + const auto lineWidthScaled = lineWidthSquared * keypointScale; + const auto bSqrt = sharedScaleF[person] * sharedScaleF[person] * lineWidthScaled; + + const auto xP = (xA + xB) / 2.f; + const auto yP = (yA + yB) / 2.f; + const auto aSqrt = (xA - xP) * (xA - xP) + (yA - yP) * (yA - yP); + + const auto angle = atan2f(yB - yA, xB - xA); + const auto sine = sinf(angle); + const auto cosine = cosf(angle); + const auto A = cosine * (x - xP) + sine * (y - yP); + const auto B = sine * (x - xP) - cosine * (y - yP); + + const auto judge = A * A / aSqrt + B * B / bSqrt; + const auto minV = 0.f; + const auto maxV = 1.f; + if (minV <= judge && judge <= maxV) + // Before used partPair vs partB + addColorWeighted(r, g, b, &rgbColorsPtr[(partB%numberColors)*3], alphaColorToAdd); + } + } + + // Part circles + for (auto part = 0u; part < numberParts; part++) + { + const auto index = 3 * (person*numberParts + part); + const auto localX = keypointsPtr[index]; + const auto localY = keypointsPtr[index + 1]; + const auto score = keypointsPtr[index + 2]; + + if (score > threshold) + { + const auto keypointScale = keypointScalePtr[part%numberScales] + * keypointScalePtr[part%numberScales] + * keypointScalePtr[part%numberScales]; + const auto radiusScaled = radiusSquared * keypointScale; + const auto dist2 = (x - localX) * (x - localX) + (y - localY) * (y - localY); + // Googly eyes + if (googlyEye1 == part || googlyEye2 == part) + { + const auto eyeRatio = 2.5f * sqrt(radiusScaled); + const auto minr2 = sharedScaleF[person] * sharedScaleF[person] + * (eyeRatio - 2) * (eyeRatio - 2); + const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * eyeRatio * eyeRatio; + if (dist2 <= maxr2) + { + float colorToAdd [3] = {0., 0., 0.}; + if (dist2 <= minr2) + for (auto& color : colorToAdd) + color = {255.f}; + if (dist2 <= minr2*0.6f) + { + const auto dist3 = (x-4 - localX) + * (x-4 - localX) + (y - localY+4) * (y - localY+4); + if (dist3 > 14.0625f) // 3.75f^2 + for (auto& color : colorToAdd) + color = {0.f}; + } + const auto alphaColorToAdd = 0.9f; + addColorWeighted(r, g, b, colorToAdd, alphaColorToAdd); + } + } + // Other parts + else + { + const auto minr2 = 0.f; + const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * radiusScaled; + if (minr2 <= dist2 && dist2 <= maxr2) + addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd); + } + + } + } + } + } + targetPtr[baseIndex] = b; + targetPtr[baseIndex+1] = g; + targetPtr[baseIndex+2] = r; + } + } + + __inline__ __device__ void renderKeypointsOld( + float* targetPtr, float2* sharedMaxs, float2* sharedMins, float* sharedScaleF, const int globalIdx, + const int x, const int y, const int targetWidth, const int targetHeight, const float* const keypointsPtr, + const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts, + const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius, + const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold, + const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1, + const int googlyEye2 = -1) { // Fill shared parameters if (globalIdx < numberPeople) @@ -52,7 +253,6 @@ namespace op sharedMins[globalIdx].y -= constantToAdd; } } - __syncthreads(); // Fill each (x,y) target pixel @@ -176,4 +376,5 @@ namespace op } } + #endif // OPENPOSE_UTILITIES_RENDER_HU diff --git a/src/openpose/face/faceGpuRenderer.cpp b/src/openpose/face/faceGpuRenderer.cpp index be129e58..e0bf4d38 100644 --- a/src/openpose/face/faceGpuRenderer.cpp +++ b/src/openpose/face/faceGpuRenderer.cpp @@ -10,7 +10,11 @@ namespace op { FaceGpuRenderer::FaceGpuRenderer(const float renderThreshold, const float alphaKeypoint, const float alphaHeatMap) : - GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap} + GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}, + pGpuFace{nullptr}, + pMaxPtr{nullptr}, + pMinPtr{nullptr}, + pScalePtr{nullptr} { } @@ -20,7 +24,14 @@ namespace op { // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed. #ifdef USE_CUDA - cudaFree(pGpuFace); + if (pGpuFace != nullptr) + cudaFree(pGpuFace); + if (pMaxPtr != nullptr) + cudaFree(pMaxPtr); + if (pMinPtr != nullptr) + cudaFree(pMinPtr); + if (pScalePtr != nullptr) + cudaFree(pScalePtr); #endif } catch (const std::exception& e) @@ -37,6 +48,9 @@ namespace op // GPU memory allocation for rendering #ifdef USE_CUDA cudaMalloc((void**)(&pGpuFace), POSE_MAX_PEOPLE * FACE_NUMBER_PARTS * 3 * sizeof(float)); + cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS); + cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS); + cudaMalloc((void**)&pScalePtr, sizeof(float) * FACE_NUMBER_PARTS); #endif log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); } @@ -63,8 +77,9 @@ namespace op cudaMemcpy(pGpuFace, faceKeypoints.getConstPtr(), faceKeypoints.getSize(0) * FACE_NUMBER_PARTS * 3 * sizeof(float), cudaMemcpyHostToDevice); - renderFaceKeypointsGpu(*spGpuMemory, frameSize, pGpuFace, faceKeypoints.getSize(0), - mRenderThreshold, getAlphaKeypoint()); + renderFaceKeypointsGpu( + *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuFace, faceKeypoints.getSize(0), + mRenderThreshold, getAlphaKeypoint()); // CUDA check cudaCheck(__LINE__, __FUNCTION__, __FILE__); } diff --git a/src/openpose/face/renderFace.cu b/src/openpose/face/renderFace.cu index 898a3392..34b71776 100644 --- a/src/openpose/face/renderFace.cu +++ b/src/openpose/face/renderFace.cu @@ -10,17 +10,25 @@ namespace op __constant__ const float SCALES[] = {FACE_SCALES_RENDER_GPU}; __constant__ const float COLORS[] = {FACE_COLORS_RENDER_GPU}; - __global__ void renderFaceParts(float* targetPtr, const int targetWidth, const int targetHeight, - const float* const facePtr, const int numberPeople, - const float threshold, const float alphaColorToAdd) + __global__ void getBoundingBoxPerPersonFace( + float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight, + const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold) + { + getBoundingBoxPerPerson( + maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold); + } + + __global__ void renderFaceParts( + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const facePtr, const int numberPeople, const float threshold, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[FACE_MAX_FACES]; - __shared__ float2 sharedMaxs[FACE_MAX_FACES]; + __shared__ float sharedMins[2*FACE_MAX_FACES]; + __shared__ float sharedMaxs[2*FACE_MAX_FACES]; __shared__ float sharedScaleF[FACE_MAX_FACES]; // Other parameters @@ -31,23 +39,33 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 250.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS, - numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS, + numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd); } - void renderFaceKeypointsGpu(float* framePtr, const Point& frameSize, const float* const facePtr, - const int numberPeople, const float renderThreshold, const float alphaColorToAdd) + void renderFaceKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point& frameSize, + const float* const facePtr, const int numberPeople, const float renderThreshold, const float alphaColorToAdd) { try { if (numberPeople > 0) { + // Get bouding boxes + const dim3 threadsPerBlockBoundBox = {1, 1, 1}; + const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)}; + getBoundingBoxPerPersonFace<<>>( + maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople, + FACE_NUMBER_PARTS, renderThreshold); + // Draw hands dim3 threadsPerBlock; dim3 numBlocks; getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize); - renderFaceParts<<>>(framePtr, frameSize.x, frameSize.y, facePtr, - numberPeople, renderThreshold, alphaColorToAdd); + renderFaceParts<<>>( + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople, + renderThreshold, alphaColorToAdd); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } } diff --git a/src/openpose/hand/handGpuRenderer.cpp b/src/openpose/hand/handGpuRenderer.cpp index 8d02f433..f76cd107 100644 --- a/src/openpose/hand/handGpuRenderer.cpp +++ b/src/openpose/hand/handGpuRenderer.cpp @@ -10,7 +10,11 @@ namespace op { HandGpuRenderer::HandGpuRenderer(const float renderThreshold, const float alphaKeypoint, const float alphaHeatMap) : - GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap} + GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}, + pGpuHand{nullptr}, + pMaxPtr{nullptr}, + pMinPtr{nullptr}, + pScalePtr{nullptr} { } @@ -20,7 +24,14 @@ namespace op { // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed. #ifdef USE_CUDA - cudaFree(pGpuHand); + if (pGpuHand != nullptr) + cudaFree(pGpuHand); + if (pMaxPtr != nullptr) + cudaFree(pMaxPtr); + if (pMinPtr != nullptr) + cudaFree(pMinPtr); + if (pScalePtr != nullptr) + cudaFree(pScalePtr); #endif } catch (const std::exception& e) @@ -37,6 +48,9 @@ namespace op // GPU memory allocation for rendering #ifdef USE_CUDA cudaMalloc((void**)(&pGpuHand), HAND_MAX_HANDS * HAND_NUMBER_PARTS * 3 * sizeof(float)); + cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * HAND_MAX_HANDS); + cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * HAND_MAX_HANDS); + cudaMalloc((void**)&pScalePtr, sizeof(float) * HAND_MAX_HANDS); #endif log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); } @@ -69,7 +83,8 @@ namespace op cudaMemcpy(pGpuHand + handVolume, handKeypoints[1].getConstPtr(), handVolume * sizeof(float), cudaMemcpyHostToDevice); renderHandKeypointsGpu( - *spGpuMemory, frameSize, pGpuHand, 2 * numberPeople, mRenderThreshold, getAlphaKeypoint()); + *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuHand, 2 * numberPeople, + mRenderThreshold, getAlphaKeypoint()); // CUDA check cudaCheck(__LINE__, __FUNCTION__, __FILE__); } diff --git a/src/openpose/hand/renderHand.cu b/src/openpose/hand/renderHand.cu index 8564ee16..d3f478db 100644 --- a/src/openpose/hand/renderHand.cu +++ b/src/openpose/hand/renderHand.cu @@ -10,17 +10,25 @@ namespace op __constant__ const float SCALES[] = {HAND_SCALES_RENDER_GPU}; __constant__ const float COLORS[] = {HAND_COLORS_RENDER_GPU}; - __global__ void renderHandsParts(float* targetPtr, const int targetWidth, const int targetHeight, - const float* const handsPtr, const int numberHands, - const float threshold, const float alphaColorToAdd) + __global__ void getBoundingBoxPerPersonHand( + float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight, + const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold) + { + getBoundingBoxPerPerson( + maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold); + } + + __global__ void renderHandsParts( + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const handsPtr, const int numberHands, const float threshold, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[HAND_MAX_HANDS]; - __shared__ float2 sharedMaxs[HAND_MAX_HANDS]; + __shared__ float sharedMins[2*HAND_MAX_HANDS]; + __shared__ float sharedMaxs[2*HAND_MAX_HANDS]; __shared__ float sharedScaleF[HAND_MAX_HANDS]; // Other parameters @@ -31,23 +39,33 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 80.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS, - numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS, + numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd); } - void renderHandKeypointsGpu(float* framePtr, const Point& frameSize, const float* const handsPtr, - const int numberHands, const float renderThreshold, const float alphaColorToAdd) + void renderHandKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point& frameSize, + const float* const handsPtr, const int numberHands, const float renderThreshold, const float alphaColorToAdd) { try { if (numberHands > 0) { + // Get bouding boxes + const dim3 threadsPerBlockBoundBox = {1, 1, 1}; + const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)}; + getBoundingBoxPerPersonHand<<>>( + maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands, + HAND_NUMBER_PARTS, renderThreshold); + // Draw hands dim3 threadsPerBlock; dim3 numBlocks; getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize); - renderHandsParts<<>>(framePtr, frameSize.x, frameSize.y, handsPtr, - numberHands, renderThreshold, alphaColorToAdd); + renderHandsParts<<>>( + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands, + renderThreshold, alphaColorToAdd); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } } diff --git a/src/openpose/net/resizeAndMergeBase.cu b/src/openpose/net/resizeAndMergeBase.cu index 38dd2ba3..a52b8969 100644 --- a/src/openpose/net/resizeAndMergeBase.cu +++ b/src/openpose/net/resizeAndMergeBase.cu @@ -6,19 +6,111 @@ namespace op { const auto THREADS_PER_BLOCK_1D = 16u; + // template + // __global__ void resizeKernelOld( + // T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + // const int targetHeight) + // { + // const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + // const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + + // if (x < targetWidth && y < targetHeight) + // { + // const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f); + // const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f); + // targetPtr[y*targetWidth+x] = bicubicInterpolate( + // sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); + // } + // } + + template + __global__ void resizeKernel( + T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + const int targetHeight, const int channels) + { + const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z; + + const auto sourceArea = sourceWidth * sourceHeight; + const auto targetArea = targetWidth * targetHeight; + + if (x < targetWidth && y < targetHeight && channel < channels) + { + const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f); + const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f); + const T* sourcePtrChannel = sourcePtr + channel * sourceArea; + targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate( + sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); + } + } + template - __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, - const int targetWidth, const int targetHeight) + __global__ void resizeAndPadKernel( + T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + const int targetHeight, const float rescaleFactor, const int channels) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z; + + const auto sourceArea = sourceWidth * sourceHeight; + const auto targetArea = targetWidth * targetHeight; + + if (x < targetWidth && y < targetHeight && channel < channels) + { + const T xSource = (x + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f); + const T ySource = (y + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f); + const T* sourcePtrChannel = sourcePtr + channel * sourceArea; + if (x < sourceWidth * rescaleFactor && y < sourceHeight * rescaleFactor) + targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate( + sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth); + else + targetPtr[channel * targetArea + y*targetWidth+x] = 0; + } + } + + + template + __global__ void resize8TimesKernel( + T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth, + const int targetHeight, const unsigned int rescaleFactor) + { + const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z; + + // Load shared memory + // If resize >= 5, then #threads per block >= # elements of shared memory + __shared__ T sourcePtrShared[25]; + const auto sharedLoadId = threadIdx.x + rescaleFactor*threadIdx.y; + if (sharedLoadId < 25) + { + const auto minTargetX = blockIdx.x * rescaleFactor; + const auto minSourceX = (minTargetX + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f); + const auto minSourceXInt_1 = int(minSourceX+ 1e-5); + const auto minSourceXInt = minSourceXInt_1 - 1; + const auto minTargetY = blockIdx.y * rescaleFactor; + const auto minSourceY = (minTargetY + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f); + const auto minSourceYInt_1 = int(minSourceY + 1e-5); + const auto minSourceYInt = minSourceYInt_1 - 1; + + const auto yClean = fastTruncateCuda(int(minSourceYInt+sharedLoadId/5 + 1e-5), 0, sourceHeight - 1); + const auto xClean = fastTruncateCuda(int(minSourceXInt+sharedLoadId%5 + 1e-5), 0, sourceWidth - 1); + const auto sourceArea = sourceWidth * sourceHeight; + const T* sourcePtrChannel = sourcePtr + channel * sourceArea; + sourcePtrShared[sharedLoadId] = sourcePtrChannel[yClean * sourceWidth + xClean]; + } + // Wait here until shared memory has been loaded + __syncthreads(); if (x < targetWidth && y < targetHeight) { + const auto targetArea = targetWidth * targetHeight; const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f); const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f); - targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight, - sourceWidth); + targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate8Times( + sourcePtrShared, xSource, ySource, sourceWidth, sourceHeight, sourceWidth, threadIdx.x, threadIdx.y); } } @@ -58,10 +150,53 @@ namespace op } } + __global__ void reorderAndCastKernel( + float* targetPtr, const unsigned char* const srcPtr, const int width, const int height) + { + const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + const auto c = (blockIdx.z * blockDim.z) + threadIdx.z; + if (x < width && y < height) + { + const auto channels = 3; + const auto originFramePtrOffsetY = y * width; + const auto channelOffset = c * width * height; + const auto targetIndex = channelOffset + y * width + x; + const auto srcIndex = (originFramePtrOffsetY + x) * channels + c; + targetPtr[targetIndex] = float(srcPtr[srcIndex]) * (1/256.f) - 0.5f; + } + } + + void reorderAndCast(float* targetPtr, const unsigned char* const srcPtr, const int width, const int height) + { + const dim3 threadsPerBlock{32, 1, 1}; + const dim3 numBlocks{ + getNumberCudaBlocks(width, threadsPerBlock.x), + getNumberCudaBlocks(height, threadsPerBlock.y), + getNumberCudaBlocks(3, threadsPerBlock.z)}; + reorderAndCastKernel<<>>(targetPtr, srcPtr, width, height); + } + + void resizeAndMergeRGBGPU( + float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight, + const int targetWidth, const int targetHeight, const float scaleFactor) + + { + const auto channels = 3; + const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1}; + const dim3 numBlocks{ + getNumberCudaBlocks(targetWidth, threadsPerBlock.x), + getNumberCudaBlocks(targetHeight, threadsPerBlock.y), + getNumberCudaBlocks(channels, threadsPerBlock.z)}; + + resizeAndPadKernel<<>>( + targetPtr, srcPtr, sourceWidth, sourceHeight, targetWidth, targetHeight, scaleFactor, channels); + } + template - void resizeAndMergeGpu(T* targetPtr, const std::vector& sourcePtrs, const std::array& targetSize, - const std::vector>& sourceSizes, - const std::vector& scaleInputToNetInputs) + void resizeAndMergeGpu( + T* targetPtr, const std::vector& sourcePtrs, const std::array& targetSize, + const std::vector>& sourceSizes, const std::vector& scaleInputToNetInputs) { try { @@ -90,20 +225,60 @@ namespace op const auto num = sourceSize[0]; if (targetSize[0] > 1 || num == 1) { - const auto sourceChannelOffset = sourceHeight * sourceWidth; - const auto targetChannelOffset = targetWidth * targetHeight; - for (auto n = 0; n < num; n++) - { - const auto offsetBase = n*channels; - for (auto c = 0 ; c < channels ; c++) - { - const auto offset = offsetBase + c; - resizeKernel<<>>(targetPtr + offset * targetChannelOffset, - sourcePtrs.at(0) + offset * sourceChannelOffset, - sourceWidth, sourceHeight, targetWidth, - targetHeight); - } - } + // // Profiling code + // const auto REPS = 250; + // double timeNormalize0 = 0.; + // double timeNormalize1 = 0.; + // double timeNormalize2 = 0.; + // double timeNormalize3 = 0.; + // // Non-optimized function + // OP_CUDA_PROFILE_INIT(REPS); + // const auto sourceChannelOffset = sourceHeight * sourceWidth; + // const auto targetChannelOffset = targetWidth * targetHeight; + // for (auto n = 0; n < num; n++) + // { + // const auto offsetBase = n*channels; + // for (auto c = 0 ; c < channels ; c++) + // { + // const auto offset = offsetBase + c; + // resizeKernelOld<<>>( + // targetPtr + offset * targetChannelOffset, + // sourcePtrs.at(0) + offset * sourceChannelOffset, + // sourceWidth, sourceHeight, targetWidth, targetHeight); + // } + // } + // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS); + + // Optimized function for any resize size (suboptimal for 8x resize) + // OP_CUDA_PROFILE_INIT(REPS); + const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1}; + const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), + getNumberCudaBlocks(targetHeight, threadsPerBlock.y), + getNumberCudaBlocks(num * channels, threadsPerBlock.z)}; + resizeKernel<<>>( + targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, + num * channels); + // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS); + + // // Optimized function for 8x resize + // OP_CUDA_PROFILE_INIT(REPS); + // if (targetWidth / sourceWidth != 8 || targetHeight / sourceHeight != 8) + // error("Kernel only implemented for 8x resize. Notify us if this error appears.", + // __LINE__, __FUNCTION__, __FILE__); + // const auto rescaleFactor = (unsigned int) std::ceil((float)(targetHeight) / (float)(sourceHeight)); + + // const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1}; + // const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x), + // getNumberCudaBlocks(targetHeight, threadsPerBlock.y), + // getNumberCudaBlocks(num * channels, threadsPerBlock.z)}; + // resize8TimesKernel<<>>( + // targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, rescaleFactor); + // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS); + + // Profiling code + // log(" Res(ori)=" + std::to_string(timeNormalize1) + "ms"); + // log(" Res(new)=" + std::to_string(timeNormalize2) + "ms"); + // log(" Res(new8x)=" + std::to_string(timeNormalize3) + "ms"); } // Old inefficient multi-scale merging else diff --git a/src/openpose/pose/poseGpuRenderer.cpp b/src/openpose/pose/poseGpuRenderer.cpp index ab1fe86e..030d2cf4 100644 --- a/src/openpose/pose/poseGpuRenderer.cpp +++ b/src/openpose/pose/poseGpuRenderer.cpp @@ -22,7 +22,10 @@ namespace op getNumberElementsToRender(poseModel)}, // mNumberElementsToRender PoseRenderer{poseModel}, spPoseExtractorNet{poseExtractorNet}, - pGpuPose{nullptr} + pGpuPose{nullptr}, + pMaxPtr{nullptr}, + pMinPtr{nullptr}, + pScalePtr{nullptr} { } @@ -32,7 +35,14 @@ namespace op { // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed. #ifdef USE_CUDA - cudaFree(pGpuPose); + if (pGpuPose != nullptr) + cudaFree(pGpuPose); + if (pMaxPtr != nullptr) + cudaFree(pMaxPtr); + if (pMinPtr != nullptr) + cudaFree(pMinPtr); + if (pScalePtr != nullptr) + cudaFree(pScalePtr); #endif } catch (const std::exception& e) @@ -49,7 +59,10 @@ namespace op // GPU memory allocation for rendering #ifdef USE_CUDA cudaMalloc((void**)(&pGpuPose), - POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float)); + POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float)); + cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE); + cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE); + cudaMalloc((void**)&pScalePtr, sizeof(float) * POSE_MAX_PEOPLE); cudaCheck(__LINE__, __FUNCTION__, __FILE__); #endif log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__); @@ -60,10 +73,9 @@ namespace op } } - std::pair PoseGpuRenderer::renderPose(Array& outputData, - const Array& poseKeypoints, - const float scaleInputToOutput, - const float scaleNetToOutput) + std::pair PoseGpuRenderer::renderPose( + Array& outputData, const Array& poseKeypoints, const float scaleInputToOutput, + const float scaleNetToOutput) { try { @@ -78,7 +90,6 @@ namespace op if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame) { cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume()); - cudaCheck(__LINE__, __FUNCTION__, __FILE__); const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel); const auto hasBkg = addBkgChannel(mPoseModel); const auto numberBodyPartsPlusBkg = numberBodyParts + (hasBkg ? 1 : 0); @@ -96,9 +107,9 @@ namespace op poseKeypointsRescaled.getConstPtr(), numberPeople * numberBodyParts * 3 * sizeof(float), cudaMemcpyHostToDevice); - renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose, - mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, - getAlphaKeypoint()); + renderPoseKeypointsGpu( + *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, mPoseModel, numberPeople, frameSize, pGpuPose, + mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint()); } else { diff --git a/src/openpose/pose/renderPose.cu b/src/openpose/pose/renderPose.cu index 4afc1ff4..cd1cdb57 100644 --- a/src/openpose/pose/renderPose.cu +++ b/src/openpose/pose/renderPose.cu @@ -14,8 +14,6 @@ namespace op __constant__ const unsigned int BODY_19_PAIRS_GPU[] = {POSE_BODY_19_PAIRS_RENDER_GPU}; __constant__ const unsigned int BODY_23_PAIRS_GPU[] = {POSE_BODY_23_PAIRS_RENDER_GPU}; __constant__ const unsigned int BODY_25B_PAIRS_GPU[] = {POSE_BODY_25B_PAIRS_RENDER_GPU}; - __constant__ const unsigned int BODY_65_PAIRS_GPU[] = {POSE_BODY_65_PAIRS_RENDER_GPU}; - __constant__ const unsigned int BODY_95_PAIRS_GPU[] = {POSE_BODY_95_PAIRS_RENDER_GPU}; __constant__ const unsigned int BODY_135_PAIRS_GPU[] = {POSE_BODY_135_PAIRS_RENDER_GPU}; __constant__ const unsigned int MPI_PAIRS_GPU[] = {POSE_MPI_PAIRS_RENDER_GPU}; __constant__ const unsigned int CAR_12_PAIRS_GPU[] = {POSE_CAR_12_PAIRS_RENDER_GPU}; @@ -26,8 +24,6 @@ namespace op __constant__ const float BODY_19_SCALES[] = {POSE_BODY_19_SCALES_RENDER_GPU}; __constant__ const float BODY_23_SCALES[] = {POSE_BODY_23_SCALES_RENDER_GPU}; __constant__ const float BODY_25B_SCALES[] = {POSE_BODY_25B_SCALES_RENDER_GPU}; - __constant__ const float BODY_65_SCALES[] = {POSE_BODY_65_SCALES_RENDER_GPU}; - __constant__ const float BODY_95_SCALES[] = {POSE_BODY_95_SCALES_RENDER_GPU}; __constant__ const float BODY_135_SCALES[] = {POSE_BODY_135_SCALES_RENDER_GPU}; __constant__ const float MPI_SCALES[] = {POSE_MPI_SCALES_RENDER_GPU}; __constant__ const float CAR_12_SCALES[] = {POSE_CAR_12_SCALES_RENDER_GPU}; @@ -38,8 +34,6 @@ namespace op __constant__ const float BODY_19_COLORS[] = {POSE_BODY_19_COLORS_RENDER_GPU}; __constant__ const float BODY_23_COLORS[] = {POSE_BODY_23_COLORS_RENDER_GPU}; __constant__ const float BODY_25B_COLORS[] = {POSE_BODY_25B_COLORS_RENDER_GPU}; - __constant__ const float BODY_65_COLORS[] = {POSE_BODY_65_COLORS_RENDER_GPU}; - __constant__ const float BODY_95_COLORS[] = {POSE_BODY_95_COLORS_RENDER_GPU}; __constant__ const float BODY_135_COLORS[] = {POSE_BODY_135_COLORS_RENDER_GPU}; __constant__ const float MPI_COLORS[] = {POSE_MPI_COLORS_RENDER_GPU}; __constant__ const float CAR_12_COLORS[] = {POSE_CAR_12_COLORS_RENDER_GPU}; @@ -124,18 +118,26 @@ namespace op colorPtr.z *= rad; } + __global__ void getBoundingBoxPerPersonPose( + float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight, + const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold) + { + getBoundingBoxPerPerson( + maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold); + } + __global__ void renderPoseCoco( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -146,24 +148,25 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, COCO_PAIRS_GPU, numberPeople, 18, numberPartPairs, COCO_COLORS, - numberColors, radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 14 : -1), (googlyEyes ? 15 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, + globalIdx, x, y, targetWidth, targetHeight, posePtr, COCO_PAIRS_GPU, numberPeople, 18, numberPartPairs, + COCO_COLORS, numberColors, radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 14 : -1), (googlyEyes ? 15 : -1)); } __global__ void renderPoseBody19( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -174,24 +177,26 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_19_PAIRS_GPU, numberPeople, 19, numberPartPairs, BODY_19_COLORS, numberColors, - radius, lineWidth, BODY_19_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, + globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_19_PAIRS_GPU, numberPeople, 19, numberPartPairs, + BODY_19_COLORS, numberColors, radius, lineWidth, BODY_19_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 15 : -1), + (googlyEyes ? 16 : -1)); } __global__ void renderPoseBody23( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -202,24 +207,54 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_23_PAIRS_GPU, numberPeople, 23, numberPartPairs, BODY_23_COLORS, numberColors, - radius, lineWidth, BODY_23_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 13 : -1), (googlyEyes ? 14 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, + globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_23_PAIRS_GPU, numberPeople, 23, numberPartPairs, + BODY_23_COLORS, numberColors, radius, lineWidth, BODY_23_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 13 : -1), (googlyEyes ? 14 : -1)); } + // __global__ void renderPoseBody25Old( + // float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, + // const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, + // const float alphaColorToAdd) + // { + // const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + // const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + // const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; + + // // Shared parameters + // __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; + // __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + // __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; + + // // Other parameters + // const auto numberPartPairs = sizeof(BODY_25_PAIRS_GPU) / (2*sizeof(BODY_25_PAIRS_GPU[0])); + // const auto numberScales = sizeof(BODY_25_SCALES) / sizeof(BODY_25_SCALES[0]); + // const auto numberColors = sizeof(BODY_25_COLORS) / (3*sizeof(BODY_25_COLORS[0])); + // const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f; + // const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; + + // // Render key points + // renderKeypointsOld( + // targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, posePtr, + // BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors, radius, lineWidth, + // BODY_25_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, (googlyEyes ? 15 : -1), + // (googlyEyes ? 16 : -1)); + // } + __global__ void renderPoseBody25( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, + const int targetHeight, const float* const posePtr, const int numberPeople, const float threshold, + const bool googlyEyes, const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -230,24 +265,26 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors, - radius, lineWidth, BODY_25_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, + globalIdx, x, y, targetWidth, targetHeight, + posePtr, BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors, + radius, lineWidth, BODY_25_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1)); } __global__ void renderPoseBody25b( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -258,80 +295,54 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_25B_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25B_COLORS, numberColors, - radius, lineWidth, BODY_25B_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1)); - } - - __global__ void renderPoseBody65( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) - { - const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; - const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; - const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; - - // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; - __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; - - // Other parameters - const auto numberPartPairs = sizeof(BODY_65_PAIRS_GPU) / (2*sizeof(BODY_65_PAIRS_GPU[0])); - const auto numberScales = sizeof(BODY_65_SCALES) / sizeof(BODY_65_SCALES[0]); - const auto numberColors = sizeof(BODY_65_COLORS) / (3*sizeof(BODY_65_COLORS[0])); - const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f; - const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; - - // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_65_PAIRS_GPU, numberPeople, 65, numberPartPairs, BODY_65_COLORS, numberColors, - radius, lineWidth, BODY_65_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, + globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_25B_PAIRS_GPU, numberPeople, 25, numberPartPairs, + BODY_25B_COLORS, numberColors, radius, lineWidth, BODY_25B_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1)); } - __global__ void renderPoseBody95( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) - { - const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; - const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; - const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; - - // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; - __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; - - // Other parameters - const auto numberPartPairs = sizeof(BODY_95_PAIRS_GPU) / (2*sizeof(BODY_95_PAIRS_GPU[0])); - const auto numberScales = sizeof(BODY_95_SCALES) / sizeof(BODY_95_SCALES[0]); - const auto numberColors = sizeof(BODY_95_COLORS) / (3*sizeof(BODY_95_COLORS[0])); - const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f; - const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; - - // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_95_PAIRS_GPU, numberPeople, 95, numberPartPairs, BODY_95_COLORS, numberColors, - radius, lineWidth, BODY_95_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1)); - } + // __global__ void renderPoseBody135Old( + // float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, + // const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, + // const float alphaColorToAdd) + // { + // const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; + // const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; + // const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; + + // // Shared parameters + // __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; + // __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + // __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; + + // // Other parameters + // const auto numberPartPairs = sizeof(BODY_135_PAIRS_GPU) / (2*sizeof(BODY_135_PAIRS_GPU[0])); + // const auto numberScales = sizeof(BODY_135_SCALES) / sizeof(BODY_135_SCALES[0]); + // const auto numberColors = sizeof(BODY_135_COLORS) / (3*sizeof(BODY_135_COLORS[0])); + // const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f; + // const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; + + // // Render key points + // renderKeypointsOld( + // targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, posePtr, + // BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS, numberColors, radius, lineWidth, + // BODY_135_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, (googlyEyes ? 1 : -1), + // (googlyEyes ? 2 : -1)); + // } __global__ void renderPoseBody135( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -342,23 +353,25 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS, numberColors, - radius, lineWidth, BODY_135_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, posePtr, BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS, + numberColors, radius, lineWidth, BODY_135_SCALES, numberScales, threshold, alphaColorToAdd, + blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1)); } __global__ void renderPoseMpi29Parts( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool blendOriginalFrame, const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool blendOriginalFrame, + const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -369,23 +382,24 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, MPI_PAIRS_GPU, numberPeople, 15, numberPartPairs, MPI_COLORS, numberColors, - radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, posePtr, MPI_PAIRS_GPU, numberPeople, 15, numberPartPairs, MPI_COLORS, numberColors, + radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame); } __global__ void renderPoseCar12( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -396,24 +410,25 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, CAR_12_PAIRS_GPU, numberPeople, 12, numberPartPairs, CAR_12_COLORS, numberColors, - radius, lineWidth, CAR_12_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 4 : -1), (googlyEyes ? 5 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, posePtr, CAR_12_PAIRS_GPU, numberPeople, 12, numberPartPairs, CAR_12_COLORS, numberColors, + radius, lineWidth, CAR_12_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, + (googlyEyes ? 4 : -1), (googlyEyes ? 5 : -1)); } __global__ void renderPoseCar22( - float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr, - const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame, - const float alphaColorToAdd) + float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight, + const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes, + const bool blendOriginalFrame, const float alphaColorToAdd) { const auto x = (blockIdx.x * blockDim.x) + threadIdx.x; const auto y = (blockIdx.y * blockDim.y) + threadIdx.y; const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x; // Shared parameters - __shared__ float2 sharedMins[POSE_MAX_PEOPLE]; - __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE]; + __shared__ float sharedMins[2*POSE_MAX_PEOPLE]; + __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE]; __shared__ float sharedScaleF[POSE_MAX_PEOPLE]; // Other parameters @@ -424,10 +439,11 @@ namespace op const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f; // Render key points - renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, - posePtr, CAR_22_PAIRS_GPU, numberPeople, 22, numberPartPairs, CAR_22_COLORS, numberColors, - radius, lineWidth, CAR_22_SCALES, numberScales, threshold, alphaColorToAdd, - blendOriginalFrame, (googlyEyes ? 6 : -1), (googlyEyes ? 7 : -1)); + renderKeypoints( + targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth, + targetHeight, posePtr, CAR_22_PAIRS_GPU, numberPeople, 22, numberPartPairs, CAR_22_COLORS, numberColors, + radius, lineWidth, CAR_22_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, + (googlyEyes ? 6 : -1), (googlyEyes ? 7 : -1)); } __global__ void renderBodyPartHeatMaps(float* targetPtr, const int targetWidth, const int targetHeight, @@ -619,9 +635,10 @@ namespace op } } - void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople, - const Point& frameSize, const float* const posePtr, const float renderThreshold, - const bool googlyEyes, const bool blendOriginalFrame, const float alphaBlending) + void renderPoseKeypointsGpu( + float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel, + const int numberPeople, const Point& frameSize, const float* const posePtr, + const float renderThreshold, const bool googlyEyes, const bool blendOriginalFrame, const float alphaBlending) { try { @@ -642,63 +659,110 @@ namespace op getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize); // Body pose + const dim3 threadsPerBlockBoundBox = {1, 1, 1}; + const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)}; + getBoundingBoxPerPersonPose<<>>( + maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + getPoseNumberBodyParts(poseModel), renderThreshold); if (poseModel == PoseModel::BODY_25 || poseModel == PoseModel::BODY_25D || poseModel == PoseModel::BODY_25E) + { + // const auto REPS = 1000; + // double timeNormalize0 = 0.; + // double timeNormalize1 = 0.; + + // // Non-optimized code + // OP_CUDA_PROFILE_INIT(REPS); + // renderPoseBody25Old<<>>( + // framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, + // blendOriginalFrame, alphaBlending + // ); + // OP_CUDA_PROFILE_END(timeNormalize0, 1e3, REPS); + + // Optimized code + // OP_CUDA_PROFILE_INIT(REPS); + // const dim3 threadsPerBlockBoundBox = {1, 1, 1}; + // const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)}; + // getBoundingBoxPerPersonPose<<>>( + // maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, 25, + // renderThreshold); renderPoseBody25<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); + // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS); + + // // Profiling code + // log(" renderOld=" + std::to_string(timeNormalize0) + "ms"); + // log(" renderNew=" + std::to_string(timeNormalize1) + "ms"); + } else if (poseModel == PoseModel::COCO_18) renderPoseCoco<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); else if (poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_19E || poseModel == PoseModel::BODY_19N || poseModel == PoseModel::BODY_19_X2) renderPoseBody19<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); else if (poseModel == PoseModel::BODY_23) renderPoseBody23<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); else if (poseModel == PoseModel::BODY_25B) renderPoseBody25b<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending - ); - else if (poseModel == PoseModel::BODY_65) - renderPoseBody65<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending - ); - else if (poseModel == PoseModel::BODY_95) - renderPoseBody95<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); else if (poseModel == PoseModel::BODY_135) + { + // const auto REPS = 500; + // double timeNormalize1 = 0.; + // double timeNormalize2 = 0.; + + // // Non-optimized code + // OP_CUDA_PROFILE_INIT(REPS); + // renderPoseBody135Old<<>>( + // framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, + // blendOriginalFrame, alphaBlending + // ); + // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS); + + // Optimized code + // OP_CUDA_PROFILE_INIT(REPS); + // const dim3 threadsPerBlockBoundBox = {1, 1, 1}; + // const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)}; + // getBoundingBoxPerPersonPose<<>>( + // maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, 135, + // renderThreshold); renderPoseBody135<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); + // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS); + + // // Profiling code + // log(" renderOld=" + std::to_string(timeNormalize1) + "ms"); + // log(" renderNew=" + std::to_string(timeNormalize2) + "ms"); + } else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) renderPoseMpi29Parts<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, blendOriginalFrame, alphaBlending ); // Car pose else if (poseModel == PoseModel::CAR_12) renderPoseCar12<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); else if (poseModel == PoseModel::CAR_22) renderPoseCar22<<>>( - framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes, - blendOriginalFrame, alphaBlending + framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, + renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending ); // Unknown else -- GitLab