提交 35f8c9d1 编写于 作者: J joker3212 提交者: Gines

Render and resize speedup in CUDA (#1209)

上级 5f4cf6bf
......@@ -23,6 +23,9 @@ namespace op
private:
float* pGpuFace; // GPU aux memory
float* pMaxPtr; // GPU aux memory
float* pMinPtr; // GPU aux memory
float* pScalePtr; // GPU aux memory
DELETE_COPY(FaceGpuRenderer);
};
......
......@@ -6,10 +6,13 @@
namespace op
{
OP_API void renderFaceKeypointsCpu(Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);
OP_API void renderFaceKeypointsCpu(
Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);
void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr, const int numberPeople,
const float renderThreshold, const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
void renderFaceKeypointsGpu(
float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
const float* const facePtr, const int numberPeople, const float renderThreshold,
const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
}
#endif // OPENPOSE_FACE_RENDER_FACE_HPP
......@@ -121,8 +121,9 @@ namespace op
}
template <typename T>
inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
const int heightSource, const int widthSourcePtr)
inline __device__ T bicubicInterpolate(
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
const int widthSourcePtr)
{
int xIntArray[4];
int yIntArray[4];
......@@ -140,6 +141,25 @@ namespace op
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
}
template <typename T>
inline __device__ T bicubicInterpolate8Times(
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
const int widthSourcePtr, const int threadIdxX, const int threadIdxY)
{
// Now we only need dx and dy
const T dx = xSource - fastTruncateCuda(int(xSource + 1e-5), 0, widthSource - 1);
const T dy = ySource - fastTruncateCuda(int(ySource + 1e-5), 0, heightSource - 1);
T temp[4];
for (unsigned char i = 0; i < 4; i++)
{
const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0);
temp[i] = cubicInterpolate(sourcePtr[offset + 0], sourcePtr[offset + 1],
sourcePtr[offset + 2], sourcePtr[offset + 3], dx);
}
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
}
template <typename T>
inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
{
......
......@@ -23,6 +23,9 @@ namespace op
private:
float* pGpuHand; // GPU aux memory
float* pMaxPtr; // GPU aux memory
float* pMinPtr; // GPU aux memory
float* pScalePtr; // GPU aux memory
DELETE_COPY(HandGpuRenderer);
};
......
......@@ -6,12 +6,13 @@
namespace op
{
OP_API void renderHandKeypointsCpu(Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints,
const float renderThreshold);
OP_API void renderHandKeypointsCpu(
Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints, const float renderThreshold);
void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
const int numberHands, const float renderThreshold,
const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
void renderHandKeypointsGpu(
float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
const float* const handsPtr, const int numberHands, const float renderThreshold,
const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
}
#endif // OPENPOSE_HAND_GPU_HAND_RENDER_HPP
......@@ -22,6 +22,14 @@ namespace op
T* targetPtr, const std::vector<const T*>& sourcePtrs, std::vector<T*>& sourceTempPtrs,
const std::array<int, 4>& targetSize, const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs = {1.f}, const int gpuID = 0);
}
// Functions for the files cvMatToOpInput/Output
void resizeAndMergeRGBGPU(
float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const float scaleFactor);
void reorderAndCast(
float* targetPtr, const unsigned char* const srcPtr, const int width, const int height);
}
#endif // OPENPOSE_NET_RESIZE_AND_MERGE_BASE_HPP
......@@ -31,6 +31,9 @@ namespace op
const std::shared_ptr<PoseExtractorNet> spPoseExtractorNet;
// Init with thread
float* pGpuPose; // GPU aux memory
float* pMaxPtr; // GPU aux memory
float* pMinPtr; // GPU aux memory
float* pScalePtr; // GPU aux memory
DELETE_COPY(PoseGpuRenderer);
};
......
......@@ -8,39 +8,39 @@
namespace op
{
OP_API void renderPoseKeypointsCpu(Array<float>& frameArray, const Array<float>& poseKeypoints,
const PoseModel poseModel, const float renderThreshold,
const bool blendOriginalFrame = true);
void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople,
const Point<int>& frameSize, const float* const posePtr,
const float renderThreshold, const bool googlyEyes = false,
const bool blendOriginalFrame = true,
const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
void renderPoseHeatMapGpu(float* frame, const Point<int>& frameSize, const float* const heatMapPtr,
const Point<int>& heatMapSize, const float scaleToKeepRatio,
const unsigned int part,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPoseHeatMapsGpu(float* frame, const PoseModel poseModel, const Point<int>& frameSize,
const float* const heatMapPtr, const Point<int>& heatMapSize,
const float scaleToKeepRatio,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPosePAFGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
const float* const heatMapPtr, const Point<int>& heatMapSize,
const float scaleToKeepRatio, const int part,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPosePAFsGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
const float* const heatMapPtr, const Point<int>& heatMapSize,
const float scaleToKeepRatio,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPoseDistanceGpu(float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr,
const Point<int>& heatMapSize, const float scaleToKeepRatio,
const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
OP_API void renderPoseKeypointsCpu(
Array<float>& frameArray, const Array<float>& poseKeypoints, const PoseModel poseModel,
const float renderThreshold, const bool blendOriginalFrame = true);
void renderPoseKeypointsGpu(
float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel,
const int numberPeople, const Point<int>& frameSize, const float* const posePtr,
const float renderThreshold, const bool googlyEyes = false, const bool blendOriginalFrame = true,
const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
void renderPoseHeatMapGpu(
float* frame, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
const float scaleToKeepRatio, const unsigned int part,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPoseHeatMapsGpu(
float* frame, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
const Point<int>& heatMapSize, const float scaleToKeepRatio,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPosePAFGpu(
float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
const Point<int>& heatMapSize, const float scaleToKeepRatio, const int part,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPosePAFsGpu(
float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
const Point<int>& heatMapSize, const float scaleToKeepRatio,
const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
void renderPoseDistanceGpu(
float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
const float scaleToKeepRatio, const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
}
#endif // OPENPOSE_POSE_RENDER_POSE_HPP
......@@ -3,16 +3,217 @@
namespace op
{
inline __device__ void renderKeypoints(float* targetPtr, float2* sharedMaxs, float2* sharedMins,
float* sharedScaleF, const int globalIdx, const int x, const int y,
const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const unsigned int* const partPairsPtr,
const int numberPeople, const int numberParts, const int numberPartPairs,
const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr,
const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true,
const int googlyEye1 = -1, const int googlyEye2 = -1)
__inline__ __device__ void getBoundingBoxPerPerson(
float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
{
const auto globalIdx = threadIdx.x;
// const auto xIndex = 2*globalIdx;
// const auto yIndex = xIndex+1;
const auto xIndex = globalIdx;
const auto yIndex = numberPeople+globalIdx;
// Fill shared parameters
// if (globalIdx < numberPeople)
{
auto minValueX = (float)targetWidth;
auto minValueY = (float)targetHeight;
auto maxValueX = 0.f;
auto maxValueY = 0.f;
for (auto part = 0 ; part < numberParts ; part++)
{
const auto index = 3 * (globalIdx*numberParts + part);
const auto x = keypointsPtr[index];
const auto y = keypointsPtr[index+1];
const auto score = keypointsPtr[index+2];
if (score > threshold)
{
if (x < minValueX)
minValueX = x;
if (x > maxValueX)
maxValueX = x;
if (y < minValueY)
minValueY = y;
if (y > maxValueY)
maxValueY = y;
}
}
if (maxValueX != 0.f && maxValueY != 0.f)
{
const auto averageX = maxValueX - minValueX;
const auto averageY = maxValueY - minValueY;
// (averageX + averageY) / 2.f / 400.f
scalePtr[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f);
const auto constantToAdd = 50.f;
maxValueX += constantToAdd;
maxValueY += constantToAdd;
minValueX -= constantToAdd;
minValueY -= constantToAdd;
}
minPtr[xIndex] = minValueX;
minPtr[yIndex] = minValueY;
maxPtr[xIndex] = maxValueX;
maxPtr[yIndex] = maxValueY;
}
}
__inline__ __device__ void renderKeypoints(
float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr,
const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y,
const int targetWidth, const int targetHeight, const float* const keypointsPtr,
const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
const int googlyEye2 = -1)
{
// Load shared memory
if (globalIdx < 2*numberPeople)
{
sharedMins[globalIdx] = minPtr[globalIdx];
sharedMaxs[globalIdx] = maxPtr[globalIdx];
if (globalIdx < numberPeople)
sharedScaleF[globalIdx] = scalePtr[globalIdx];
}
__syncthreads();
// Fill each (x,y) target pixel
if (x < targetWidth && y < targetHeight)
{
const auto baseIndex = 3*(y * targetWidth + x);
auto b = targetPtr[baseIndex];
auto g = targetPtr[baseIndex+1];
auto r = targetPtr[baseIndex+2];
if (!blendOriginalFrame)
{
b = 0.f;
g = 0.f;
r = 0.f;
}
const auto lineWidthSquared = lineWidth * lineWidth;
const auto radiusSquared = radius * radius;
for (auto person = 0; person < numberPeople; person++)
{
// Make sure person x,y in the limits
// Make sure person is not empty. Assume all joints are below threshold. Then
// maxs = 0 and mins = width/height. So if statement would be false
// const auto xIndex = 2*person;
// const auto yIndex = xIndex+1;
const auto xIndex = person;
const auto yIndex = numberPeople+person;
if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex]
&& y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
{
// Part pair connections
for (auto partPair = 0; partPair < numberPartPairs; partPair++)
{
const auto partA = partPairsPtr[2*partPair];
const auto partB = partPairsPtr[2*partPair+1];
const auto indexA = person*numberParts*3 + partA*3;
const auto xA = keypointsPtr[indexA];
const auto yA = keypointsPtr[indexA + 1];
const auto scoreA = keypointsPtr[indexA + 2];
const auto indexB = person*numberParts*3 + partB*3;
const auto xB = keypointsPtr[indexB];
const auto yB = keypointsPtr[indexB + 1];
const auto scoreB = keypointsPtr[indexB + 2];
if (scoreA > threshold && scoreB > threshold)
{
const auto keypointScale = keypointScalePtr[partB%numberScales]
* keypointScalePtr[partB%numberScales]
* keypointScalePtr[partB%numberScales];
const auto lineWidthScaled = lineWidthSquared * keypointScale;
const auto bSqrt = sharedScaleF[person] * sharedScaleF[person] * lineWidthScaled;
const auto xP = (xA + xB) / 2.f;
const auto yP = (yA + yB) / 2.f;
const auto aSqrt = (xA - xP) * (xA - xP) + (yA - yP) * (yA - yP);
const auto angle = atan2f(yB - yA, xB - xA);
const auto sine = sinf(angle);
const auto cosine = cosf(angle);
const auto A = cosine * (x - xP) + sine * (y - yP);
const auto B = sine * (x - xP) - cosine * (y - yP);
const auto judge = A * A / aSqrt + B * B / bSqrt;
const auto minV = 0.f;
const auto maxV = 1.f;
if (minV <= judge && judge <= maxV)
// Before used partPair vs partB
addColorWeighted(r, g, b, &rgbColorsPtr[(partB%numberColors)*3], alphaColorToAdd);
}
}
// Part circles
for (auto part = 0u; part < numberParts; part++)
{
const auto index = 3 * (person*numberParts + part);
const auto localX = keypointsPtr[index];
const auto localY = keypointsPtr[index + 1];
const auto score = keypointsPtr[index + 2];
if (score > threshold)
{
const auto keypointScale = keypointScalePtr[part%numberScales]
* keypointScalePtr[part%numberScales]
* keypointScalePtr[part%numberScales];
const auto radiusScaled = radiusSquared * keypointScale;
const auto dist2 = (x - localX) * (x - localX) + (y - localY) * (y - localY);
// Googly eyes
if (googlyEye1 == part || googlyEye2 == part)
{
const auto eyeRatio = 2.5f * sqrt(radiusScaled);
const auto minr2 = sharedScaleF[person] * sharedScaleF[person]
* (eyeRatio - 2) * (eyeRatio - 2);
const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * eyeRatio * eyeRatio;
if (dist2 <= maxr2)
{
float colorToAdd [3] = {0., 0., 0.};
if (dist2 <= minr2)
for (auto& color : colorToAdd)
color = {255.f};
if (dist2 <= minr2*0.6f)
{
const auto dist3 = (x-4 - localX)
* (x-4 - localX) + (y - localY+4) * (y - localY+4);
if (dist3 > 14.0625f) // 3.75f^2
for (auto& color : colorToAdd)
color = {0.f};
}
const auto alphaColorToAdd = 0.9f;
addColorWeighted(r, g, b, colorToAdd, alphaColorToAdd);
}
}
// Other parts
else
{
const auto minr2 = 0.f;
const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * radiusScaled;
if (minr2 <= dist2 && dist2 <= maxr2)
addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd);
}
}
}
}
}
targetPtr[baseIndex] = b;
targetPtr[baseIndex+1] = g;
targetPtr[baseIndex+2] = r;
}
}
__inline__ __device__ void renderKeypointsOld(
float* targetPtr, float2* sharedMaxs, float2* sharedMins, float* sharedScaleF, const int globalIdx,
const int x, const int y, const int targetWidth, const int targetHeight, const float* const keypointsPtr,
const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
const int googlyEye2 = -1)
{
// Fill shared parameters
if (globalIdx < numberPeople)
......@@ -52,7 +253,6 @@ namespace op
sharedMins[globalIdx].y -= constantToAdd;
}
}
__syncthreads();
// Fill each (x,y) target pixel
......@@ -176,4 +376,5 @@ namespace op
}
}
#endif // OPENPOSE_UTILITIES_RENDER_HU
......@@ -10,7 +10,11 @@ namespace op
{
FaceGpuRenderer::FaceGpuRenderer(const float renderThreshold, const float alphaKeypoint,
const float alphaHeatMap) :
GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
pGpuFace{nullptr},
pMaxPtr{nullptr},
pMinPtr{nullptr},
pScalePtr{nullptr}
{
}
......@@ -20,7 +24,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree(pGpuFace);
if (pGpuFace != nullptr)
cudaFree(pGpuFace);
if (pMaxPtr != nullptr)
cudaFree(pMaxPtr);
if (pMinPtr != nullptr)
cudaFree(pMinPtr);
if (pScalePtr != nullptr)
cudaFree(pScalePtr);
#endif
}
catch (const std::exception& e)
......@@ -37,6 +48,9 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc((void**)(&pGpuFace), POSE_MAX_PEOPLE * FACE_NUMBER_PARTS * 3 * sizeof(float));
cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
cudaMalloc((void**)&pScalePtr, sizeof(float) * FACE_NUMBER_PARTS);
#endif
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......@@ -63,8 +77,9 @@ namespace op
cudaMemcpy(pGpuFace, faceKeypoints.getConstPtr(),
faceKeypoints.getSize(0) * FACE_NUMBER_PARTS * 3 * sizeof(float),
cudaMemcpyHostToDevice);
renderFaceKeypointsGpu(*spGpuMemory, frameSize, pGpuFace, faceKeypoints.getSize(0),
mRenderThreshold, getAlphaKeypoint());
renderFaceKeypointsGpu(
*spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuFace, faceKeypoints.getSize(0),
mRenderThreshold, getAlphaKeypoint());
// CUDA check
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -10,17 +10,25 @@ namespace op
__constant__ const float SCALES[] = {FACE_SCALES_RENDER_GPU};
__constant__ const float COLORS[] = {FACE_COLORS_RENDER_GPU};
__global__ void renderFaceParts(float* targetPtr, const int targetWidth, const int targetHeight,
const float* const facePtr, const int numberPeople,
const float threshold, const float alphaColorToAdd)
__global__ void getBoundingBoxPerPersonFace(
float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
{
getBoundingBoxPerPerson(
maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
}
__global__ void renderFaceParts(
float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
const float* const facePtr, const int numberPeople, const float threshold, const float alphaColorToAdd)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
// Shared parameters
__shared__ float2 sharedMins[FACE_MAX_FACES];
__shared__ float2 sharedMaxs[FACE_MAX_FACES];
__shared__ float sharedMins[2*FACE_MAX_FACES];
__shared__ float sharedMaxs[2*FACE_MAX_FACES];
__shared__ float sharedScaleF[FACE_MAX_FACES];
// Other parameters
......@@ -31,23 +39,33 @@ namespace op
const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 250.f;
// Render key points
renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
renderKeypoints(
targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
targetHeight, facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
}
void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr,
const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
void renderFaceKeypointsGpu(
float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
const float* const facePtr, const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
{
try
{
if (numberPeople > 0)
{
// Get bouding boxes
const dim3 threadsPerBlockBoundBox = {1, 1, 1};
const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
getBoundingBoxPerPersonFace<<<threadsPerBlockBoundBox, numBlocksBox>>>(
maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
FACE_NUMBER_PARTS, renderThreshold);
// Draw hands
dim3 threadsPerBlock;
dim3 numBlocks;
getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, facePtr,
numberPeople, renderThreshold, alphaColorToAdd);
renderFaceParts<<<threadsPerBlock, numBlocks>>>(
framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
renderThreshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
}
......
......@@ -10,7 +10,11 @@ namespace op
{
HandGpuRenderer::HandGpuRenderer(const float renderThreshold, const float alphaKeypoint,
const float alphaHeatMap) :
GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
pGpuHand{nullptr},
pMaxPtr{nullptr},
pMinPtr{nullptr},
pScalePtr{nullptr}
{
}
......@@ -20,7 +24,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree(pGpuHand);
if (pGpuHand != nullptr)
cudaFree(pGpuHand);
if (pMaxPtr != nullptr)
cudaFree(pMaxPtr);
if (pMinPtr != nullptr)
cudaFree(pMinPtr);
if (pScalePtr != nullptr)
cudaFree(pScalePtr);
#endif
}
catch (const std::exception& e)
......@@ -37,6 +48,9 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc((void**)(&pGpuHand), HAND_MAX_HANDS * HAND_NUMBER_PARTS * 3 * sizeof(float));
cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
cudaMalloc((void**)&pScalePtr, sizeof(float) * HAND_MAX_HANDS);
#endif
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
}
......@@ -69,7 +83,8 @@ namespace op
cudaMemcpy(pGpuHand + handVolume, handKeypoints[1].getConstPtr(),
handVolume * sizeof(float), cudaMemcpyHostToDevice);
renderHandKeypointsGpu(
*spGpuMemory, frameSize, pGpuHand, 2 * numberPeople, mRenderThreshold, getAlphaKeypoint());
*spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuHand, 2 * numberPeople,
mRenderThreshold, getAlphaKeypoint());
// CUDA check
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
......
......@@ -10,17 +10,25 @@ namespace op
__constant__ const float SCALES[] = {HAND_SCALES_RENDER_GPU};
__constant__ const float COLORS[] = {HAND_COLORS_RENDER_GPU};
__global__ void renderHandsParts(float* targetPtr, const int targetWidth, const int targetHeight,
const float* const handsPtr, const int numberHands,
const float threshold, const float alphaColorToAdd)
__global__ void getBoundingBoxPerPersonHand(
float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
{
getBoundingBoxPerPerson(
maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
}
__global__ void renderHandsParts(
float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
const float* const handsPtr, const int numberHands, const float threshold, const float alphaColorToAdd)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
// Shared parameters
__shared__ float2 sharedMins[HAND_MAX_HANDS];
__shared__ float2 sharedMaxs[HAND_MAX_HANDS];
__shared__ float sharedMins[2*HAND_MAX_HANDS];
__shared__ float sharedMaxs[2*HAND_MAX_HANDS];
__shared__ float sharedScaleF[HAND_MAX_HANDS];
// Other parameters
......@@ -31,23 +39,33 @@ namespace op
const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 80.f;
// Render key points
renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
renderKeypoints(
targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
targetHeight, handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
}
void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
const int numberHands, const float renderThreshold, const float alphaColorToAdd)
void renderHandKeypointsGpu(
float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
const float* const handsPtr, const int numberHands, const float renderThreshold, const float alphaColorToAdd)
{
try
{
if (numberHands > 0)
{
// Get bouding boxes
const dim3 threadsPerBlockBoundBox = {1, 1, 1};
const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
getBoundingBoxPerPersonHand<<<threadsPerBlockBoundBox, numBlocksBox>>>(
maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
HAND_NUMBER_PARTS, renderThreshold);
// Draw hands
dim3 threadsPerBlock;
dim3 numBlocks;
getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, handsPtr,
numberHands, renderThreshold, alphaColorToAdd);
renderHandsParts<<<threadsPerBlock, numBlocks>>>(
framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
renderThreshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
}
......
......@@ -6,19 +6,111 @@ namespace op
{
const auto THREADS_PER_BLOCK_1D = 16u;
// template <typename T>
// __global__ void resizeKernelOld(
// T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
// const int targetHeight)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// if (x < targetWidth && y < targetHeight)
// {
// const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
// const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
// targetPtr[y*targetWidth+x] = bicubicInterpolate(
// sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
// }
// }
template <typename T>
__global__ void resizeKernel(
T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
const int targetHeight, const int channels)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
const auto sourceArea = sourceWidth * sourceHeight;
const auto targetArea = targetWidth * targetHeight;
if (x < targetWidth && y < targetHeight && channel < channels)
{
const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
}
}
template <typename T>
__global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight)
__global__ void resizeAndPadKernel(
T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
const int targetHeight, const float rescaleFactor, const int channels)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
const auto sourceArea = sourceWidth * sourceHeight;
const auto targetArea = targetWidth * targetHeight;
if (x < targetWidth && y < targetHeight && channel < channels)
{
const T xSource = (x + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
const T ySource = (y + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
if (x < sourceWidth * rescaleFactor && y < sourceHeight * rescaleFactor)
targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
else
targetPtr[channel * targetArea + y*targetWidth+x] = 0;
}
}
template <typename T>
__global__ void resize8TimesKernel(
T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
const int targetHeight, const unsigned int rescaleFactor)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
// Load shared memory
// If resize >= 5, then #threads per block >= # elements of shared memory
__shared__ T sourcePtrShared[25];
const auto sharedLoadId = threadIdx.x + rescaleFactor*threadIdx.y;
if (sharedLoadId < 25)
{
const auto minTargetX = blockIdx.x * rescaleFactor;
const auto minSourceX = (minTargetX + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
const auto minSourceXInt_1 = int(minSourceX+ 1e-5);
const auto minSourceXInt = minSourceXInt_1 - 1;
const auto minTargetY = blockIdx.y * rescaleFactor;
const auto minSourceY = (minTargetY + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
const auto minSourceYInt_1 = int(minSourceY + 1e-5);
const auto minSourceYInt = minSourceYInt_1 - 1;
const auto yClean = fastTruncateCuda(int(minSourceYInt+sharedLoadId/5 + 1e-5), 0, sourceHeight - 1);
const auto xClean = fastTruncateCuda(int(minSourceXInt+sharedLoadId%5 + 1e-5), 0, sourceWidth - 1);
const auto sourceArea = sourceWidth * sourceHeight;
const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
sourcePtrShared[sharedLoadId] = sourcePtrChannel[yClean * sourceWidth + xClean];
}
// Wait here until shared memory has been loaded
__syncthreads();
if (x < targetWidth && y < targetHeight)
{
const auto targetArea = targetWidth * targetHeight;
const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate8Times(
sourcePtrShared, xSource, ySource, sourceWidth, sourceHeight, sourceWidth, threadIdx.x, threadIdx.y);
}
}
......@@ -58,10 +150,53 @@ namespace op
}
}
__global__ void reorderAndCastKernel(
float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
const auto c = (blockIdx.z * blockDim.z) + threadIdx.z;
if (x < width && y < height)
{
const auto channels = 3;
const auto originFramePtrOffsetY = y * width;
const auto channelOffset = c * width * height;
const auto targetIndex = channelOffset + y * width + x;
const auto srcIndex = (originFramePtrOffsetY + x) * channels + c;
targetPtr[targetIndex] = float(srcPtr[srcIndex]) * (1/256.f) - 0.5f;
}
}
void reorderAndCast(float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
{
const dim3 threadsPerBlock{32, 1, 1};
const dim3 numBlocks{
getNumberCudaBlocks(width, threadsPerBlock.x),
getNumberCudaBlocks(height, threadsPerBlock.y),
getNumberCudaBlocks(3, threadsPerBlock.z)};
reorderAndCastKernel<<<numBlocks, threadsPerBlock>>>(targetPtr, srcPtr, width, height);
}
void resizeAndMergeRGBGPU(
float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const float scaleFactor)
{
const auto channels = 3;
const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
const dim3 numBlocks{
getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
getNumberCudaBlocks(channels, threadsPerBlock.z)};
resizeAndPadKernel<<<numBlocks, threadsPerBlock>>>(
targetPtr, srcPtr, sourceWidth, sourceHeight, targetWidth, targetHeight, scaleFactor, channels);
}
template <typename T>
void resizeAndMergeGpu(T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs)
void resizeAndMergeGpu(
T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<T>& scaleInputToNetInputs)
{
try
{
......@@ -90,20 +225,60 @@ namespace op
const auto num = sourceSize[0];
if (targetSize[0] > 1 || num == 1)
{
const auto sourceChannelOffset = sourceHeight * sourceWidth;
const auto targetChannelOffset = targetWidth * targetHeight;
for (auto n = 0; n < num; n++)
{
const auto offsetBase = n*channels;
for (auto c = 0 ; c < channels ; c++)
{
const auto offset = offsetBase + c;
resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + offset * targetChannelOffset,
sourcePtrs.at(0) + offset * sourceChannelOffset,
sourceWidth, sourceHeight, targetWidth,
targetHeight);
}
}
// // Profiling code
// const auto REPS = 250;
// double timeNormalize0 = 0.;
// double timeNormalize1 = 0.;
// double timeNormalize2 = 0.;
// double timeNormalize3 = 0.;
// // Non-optimized function
// OP_CUDA_PROFILE_INIT(REPS);
// const auto sourceChannelOffset = sourceHeight * sourceWidth;
// const auto targetChannelOffset = targetWidth * targetHeight;
// for (auto n = 0; n < num; n++)
// {
// const auto offsetBase = n*channels;
// for (auto c = 0 ; c < channels ; c++)
// {
// const auto offset = offsetBase + c;
// resizeKernelOld<<<numBlocks, threadsPerBlock>>>(
// targetPtr + offset * targetChannelOffset,
// sourcePtrs.at(0) + offset * sourceChannelOffset,
// sourceWidth, sourceHeight, targetWidth, targetHeight);
// }
// }
// OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
// Optimized function for any resize size (suboptimal for 8x resize)
// OP_CUDA_PROFILE_INIT(REPS);
const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
resizeKernel<<<numBlocks, threadsPerBlock>>>(
targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight,
num * channels);
// OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
// // Optimized function for 8x resize
// OP_CUDA_PROFILE_INIT(REPS);
// if (targetWidth / sourceWidth != 8 || targetHeight / sourceHeight != 8)
// error("Kernel only implemented for 8x resize. Notify us if this error appears.",
// __LINE__, __FUNCTION__, __FILE__);
// const auto rescaleFactor = (unsigned int) std::ceil((float)(targetHeight) / (float)(sourceHeight));
// const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
// const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
// getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
// getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
// resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
// targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, rescaleFactor);
// OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);
// Profiling code
// log(" Res(ori)=" + std::to_string(timeNormalize1) + "ms");
// log(" Res(new)=" + std::to_string(timeNormalize2) + "ms");
// log(" Res(new8x)=" + std::to_string(timeNormalize3) + "ms");
}
// Old inefficient multi-scale merging
else
......
......@@ -22,7 +22,10 @@ namespace op
getNumberElementsToRender(poseModel)}, // mNumberElementsToRender
PoseRenderer{poseModel},
spPoseExtractorNet{poseExtractorNet},
pGpuPose{nullptr}
pGpuPose{nullptr},
pMaxPtr{nullptr},
pMinPtr{nullptr},
pScalePtr{nullptr}
{
}
......@@ -32,7 +35,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree(pGpuPose);
if (pGpuPose != nullptr)
cudaFree(pGpuPose);
if (pMaxPtr != nullptr)
cudaFree(pMaxPtr);
if (pMinPtr != nullptr)
cudaFree(pMinPtr);
if (pScalePtr != nullptr)
cudaFree(pScalePtr);
#endif
}
catch (const std::exception& e)
......@@ -49,7 +59,10 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc((void**)(&pGpuPose),
POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
cudaMalloc((void**)&pScalePtr, sizeof(float) * POSE_MAX_PEOPLE);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
......@@ -60,10 +73,9 @@ namespace op
}
}
std::pair<int, std::string> PoseGpuRenderer::renderPose(Array<float>& outputData,
const Array<float>& poseKeypoints,
const float scaleInputToOutput,
const float scaleNetToOutput)
std::pair<int, std::string> PoseGpuRenderer::renderPose(
Array<float>& outputData, const Array<float>& poseKeypoints, const float scaleInputToOutput,
const float scaleNetToOutput)
{
try
{
......@@ -78,7 +90,6 @@ namespace op
if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame)
{
cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume());
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel);
const auto hasBkg = addBkgChannel(mPoseModel);
const auto numberBodyPartsPlusBkg = numberBodyParts + (hasBkg ? 1 : 0);
......@@ -96,9 +107,9 @@ namespace op
poseKeypointsRescaled.getConstPtr(),
numberPeople * numberBodyParts * 3 * sizeof(float),
cudaMemcpyHostToDevice);
renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose,
mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame,
getAlphaKeypoint());
renderPoseKeypointsGpu(
*spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, mPoseModel, numberPeople, frameSize, pGpuPose,
mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint());
}
else
{
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册