提交 19b667b5 编写于 作者: G gineshidalgo99

Max render resolution from 720p to >32k images

上级 34d77981
......@@ -2,15 +2,30 @@
#define OPENPOSE__UTILITIES_CUDA_HPP
#include <string>
#include <utility> // std::pair
#include <cuda.h>
#include <cuda_runtime.h>
#include <opencv2/core/core.hpp>
namespace op
{
const auto CUDA_NUM_THREADS = 512u;
void cudaCheck(const int line = -1, const std::string& function = "", const std::string& file = "");
// Ceiling division: returns how many groups of `numberCudaThreads` are needed to cover `totalRequired` items.
// NOTE(review): the result is obtained by dividing by `numberCudaThreads`, so a value of 0 is not handled here.
// NOTE(review): `totalRequired + numberCudaThreads - 1` is evaluated in unsigned int and could wrap for values
// near the type's maximum -- confirm callers stay well below that.
// NOTE(review): two signature lines follow; they differ only in the `= CUDA_NUM_THREADS` default and look like
// the before/after lines of a diff -- confirm which one is live.
inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads)
inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads = CUDA_NUM_THREADS)
{
return (totalRequired + numberCudaThreads - 1) / numberCudaThreads;
}
// 2-D overload: applies the scalar getNumberCudaBlocks() ceiling division to the frame width (against
// numberCudaThreads.x) and to the frame height (against numberCudaThreads.y). The z component of the result is
// copied unchanged from numberCudaThreads.z.
inline dim3 getNumberCudaBlocks(const cv::Size& frameSize, const dim3 numberCudaThreads = dim3{CUDA_NUM_THREADS, CUDA_NUM_THREADS, 1})
{
    const auto blocksAlongWidth = getNumberCudaBlocks(frameSize.width, numberCudaThreads.x);
    const auto blocksAlongHeight = getNumberCudaBlocks(frameSize.height, numberCudaThreads.y);
    return dim3{blocksAlongWidth, blocksAlongHeight, numberCudaThreads.z};
}
std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize);
}
#endif // OPENPOSE__UTILITIES_CUDA_HPP
......@@ -35,8 +35,8 @@ namespace op
sharedMaxs[globalIdx].y = y;
}
}
const auto averageX = sharedMaxs[globalIdx].x-sharedMins[globalIdx].x;
const auto averageY = sharedMaxs[globalIdx].y-sharedMins[globalIdx].y;
const auto averageX = sharedMaxs[globalIdx].x - sharedMins[globalIdx].x;
const auto averageY = sharedMaxs[globalIdx].y - sharedMins[globalIdx].y;
sharedScaleF[globalIdx] = fastTruncate((averageX + averageY) / 400.f, 0.33f, 1.f); // (averageX + averageY) / 2.f / 400.f
const auto constantToAdd = 50.f;
sharedMaxs[globalIdx].x += constantToAdd;
......
......@@ -7,8 +7,7 @@
namespace op
{
const auto THREADS_PER_BLOCK_1D = 32;
const dim3 THREADS_PER_BLOCK{128, 128, 1};
__constant__ const unsigned char PART_PAIRS_GPU[] = FACE_PAIRS_TO_RENDER;
__constant__ const float RGB_COLORS[] = {
255.f, 255.f, 255.f,
......@@ -48,9 +47,8 @@ namespace op
if (numberFaces > 0)
{
const auto threshold = 0.5f;
dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
const auto numBlocks = getNumberCudaBlocks(frameSize, THREADS_PER_BLOCK);
renderFaceParts<<<THREADS_PER_BLOCK, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
}
......
......@@ -7,8 +7,6 @@
namespace op
{
const auto THREADS_PER_BLOCK_1D = 32;
__constant__ const unsigned char PART_PAIRS_GPU[] = HAND_PAIRS_TO_RENDER;
__constant__ const float RGB_COLORS[] = {
179.f, 0.f, 0.f,
......@@ -71,8 +69,9 @@ namespace op
if (numberHands > 0)
{
const auto threshold = 0.05f;
dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, handsPtr, numberHands, threshold, alphaColorToAdd);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
......
#include <utility> /// std::pair
#include <utility> // std::pair
#include "openpose/pose/poseParameters.hpp"
#include "openpose/utilities/errorAndLog.hpp"
#include "openpose/utilities/cuda.hpp"
......@@ -8,7 +8,6 @@
namespace op
{
const auto THREADS_PER_BLOCK_1D = 32u;
__constant__ const unsigned char COCO_PAIRS_GPU[] = POSE_COCO_PAIRS_TO_RENDER;
__constant__ const unsigned char MPI_PAIRS_GPU[] = POSE_MPI_PAIRS_TO_RENDER;
__constant__ const float COCO_RGB_COLORS[] = {
......@@ -388,24 +387,9 @@ namespace op
}
}
// Builds the (first, second) dim3 pair used by the render launches in this file:
//   first  = THREADS_PER_BLOCK_1D x THREADS_PER_BLOCK_1D
//   second = per-axis getNumberCudaBlocks() of the frame width/height against `first`
// If a std::exception is caught it is reported through error() and an all-zero pair is returned.
// NOTE(review): the call sites visible further down use getNumberCudaThreadsAndBlocks() instead; this helper
// looks superseded -- confirm before relying on it.
inline std::pair<dim3, dim3> getThreadsAndBlocks(const cv::Size& frameSize)
{
    try
    {
        const dim3 threadShape{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
        const dim3 blockShape{getNumberCudaBlocks(frameSize.width, threadShape.x),
                              getNumberCudaBlocks(frameSize.height, threadShape.y)};
        return std::make_pair(threadShape, blockShape);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        return std::make_pair(dim3{0,0,0}, dim3{0,0,0});
    }
}
inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize, const float* const heatMapPtr,
const cv::Size& heatMapSize, const float scaleToKeepRatio, const int part, const int partsToRender, const float alphaBlending)
inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize,
const float* const heatMapPtr, const cv::Size& heatMapSize, const float scaleToKeepRatio,
const int part, const int partsToRender, const float alphaBlending)
{
try
{
......@@ -415,9 +399,9 @@ namespace op
const auto heatMapOffset = POSE_NUMBER_BODY_PARTS[(int)poseModel] * heatMapSize.area();
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height,
scaleToKeepRatio, partsToRender, part, alphaBlending);
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width,
heatMapSize.height, scaleToKeepRatio, partsToRender, part, alphaBlending);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
......@@ -441,13 +425,15 @@ namespace op
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
const auto threshold = getThresholdForPose(poseModel);
if (poseModel == PoseModel::COCO_18)
renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, googlyEyes, blendOriginalFrame, alphaBlending);
renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople,
threshold, googlyEyes, blendOriginalFrame, alphaBlending);
else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4)
renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, blendOriginalFrame, alphaBlending);
renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr,
numberPeople, threshold, blendOriginalFrame, alphaBlending);
else
error("Unvalid Model.", __LINE__, __FUNCTION__, __FILE__);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
......@@ -469,7 +455,7 @@ namespace op
checkAlpha(alphaBlending);
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
const auto heatMapOffset = numberBodyParts * heatMapSize.area();
......@@ -493,12 +479,12 @@ namespace op
checkAlpha(alphaBlending);
dim3 threadsPerBlock;
dim3 numBlocks;
std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
const auto heatMapOffset = numberBodyParts * heatMapSize.area();
renderBodyPartHeatMaps<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height,
scaleToKeepRatio, numberBodyParts, alphaBlending);
scaleToKeepRatio, numberBodyParts, alphaBlending);
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
}
catch (const std::exception& e)
......
#include <cuda.h>
#include <cuda_runtime.h>
#include "openpose/utilities/errorAndLog.hpp"
#include "openpose/utilities/fastMath.hpp"
#include "openpose/utilities/cuda.hpp"
namespace op
{
// Candidate 2-D launch shapes; getNumberCudaThreadsAndBlocks() below picks one based on the longest image side.
// NOTE(review): CUDA caps a thread block at 1024 threads in total, so 128x128 and 256x256 would be rejected if
// used as the block dimension of a launch. The render launches visible in this change pass the value named
// `threadsPerBlock` as the FIRST <<<...>>> argument, which CUDA interprets as the grid size -- confirm that the
// naming/ordering is intended.
const dim3 THREADS_PER_BLOCK_SMALL{32, 32, 1};
const dim3 THREADS_PER_BLOCK_MEDIUM{128, 128, 1};
const dim3 THREADS_PER_BLOCK_BIG{256, 256, 1};
// Reports a pending CUDA runtime error, if any, through error(), tagged with the caller-supplied
// line / function / file. The message contains the numeric error code, the cudaSuccess code and the text from
// cudaGetErrorString(). cudaPeekAtLastError() reads the last error without clearing it.
// NOTE(review): two error(...) lines follow that differ only by a space after "):"; they look like the
// before/after lines of a diff. As written, only the first one is governed by the `if` -- confirm which is live.
void cudaCheck(const int line, const std::string& function, const std::string& file)
{
const auto errorCode = cudaPeekAtLastError();
if(errorCode != cudaSuccess)
error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "):" + cudaGetErrorString(errorCode), line, function, file);
error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): " + cudaGetErrorString(errorCode), line, function, file);
}
// Chooses one of the THREADS_PER_BLOCK_* shapes from the frame's longest side and pairs it with the matching
// getNumberCudaBlocks(frameSize, shape) result:
//     longest side <  1281   --> THREADS_PER_BLOCK_SMALL
//     longest side <  16384  --> THREADS_PER_BLOCK_MEDIUM
//     otherwise              --> THREADS_PER_BLOCK_BIG
// If a std::exception is caught it is reported through error() and a pair of 1x1x1 dim3 values is returned.
std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize)
{
    try
    {
        const auto longestSide = fastMax(frameSize.width, frameSize.height);
        // Start from the largest shape and step down for smaller frames
        dim3 launchShape = THREADS_PER_BLOCK_BIG;
        if (longestSide < 1281)
            launchShape = THREADS_PER_BLOCK_SMALL;
        else if (longestSide < 16384)
            launchShape = THREADS_PER_BLOCK_MEDIUM;
        const auto blockCount = getNumberCudaBlocks(frameSize, launchShape);
        return std::make_pair(launchShape, blockCount);
    }
    catch (const std::exception& e)
    {
        error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        return std::make_pair(dim3{1,1,1}, dim3{1,1,1});
    }
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册