Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
shy_201992
openpose
提交
35f8c9d1
O
openpose
项目概览
shy_201992
/
openpose
与 Fork 源项目一致
从无法访问的项目Fork
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
openpose
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
35f8c9d1
编写于
5月 03, 2019
作者:
J
joker3212
提交者:
Gines
5月 03, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Render and resize speedup in CUDA (#1209)
上级
5f4cf6bf
变更
16
展开全部
隐藏空白更改
内联
并排
Showing
16 changed file
with
847 addition
and
289 deletion
+847
-289
include/openpose/face/faceGpuRenderer.hpp
include/openpose/face/faceGpuRenderer.hpp
+3
-0
include/openpose/face/renderFace.hpp
include/openpose/face/renderFace.hpp
+6
-3
include/openpose/gpu/cuda.hu
include/openpose/gpu/cuda.hu
+22
-2
include/openpose/hand/handGpuRenderer.hpp
include/openpose/hand/handGpuRenderer.hpp
+3
-0
include/openpose/hand/renderHand.hpp
include/openpose/hand/renderHand.hpp
+6
-5
include/openpose/net/resizeAndMergeBase.hpp
include/openpose/net/resizeAndMergeBase.hpp
+9
-1
include/openpose/pose/poseGpuRenderer.hpp
include/openpose/pose/poseGpuRenderer.hpp
+3
-0
include/openpose/pose/renderPose.hpp
include/openpose/pose/renderPose.hpp
+33
-33
include/openpose/utilities/render.hu
include/openpose/utilities/render.hu
+212
-11
src/openpose/face/faceGpuRenderer.cpp
src/openpose/face/faceGpuRenderer.cpp
+19
-4
src/openpose/face/renderFace.cu
src/openpose/face/renderFace.cu
+30
-12
src/openpose/hand/handGpuRenderer.cpp
src/openpose/hand/handGpuRenderer.cpp
+18
-3
src/openpose/hand/renderHand.cu
src/openpose/hand/renderHand.cu
+30
-12
src/openpose/net/resizeAndMergeBase.cu
src/openpose/net/resizeAndMergeBase.cu
+196
-21
src/openpose/pose/poseGpuRenderer.cpp
src/openpose/pose/poseGpuRenderer.cpp
+22
-11
src/openpose/pose/renderPose.cu
src/openpose/pose/renderPose.cu
+235
-171
未找到文件。
include/openpose/face/faceGpuRenderer.hpp
浏览文件 @
35f8c9d1
...
...
@@ -23,6 +23,9 @@ namespace op
private:
float
*
pGpuFace
;
// GPU aux memory
float
*
pMaxPtr
;
// GPU aux memory
float
*
pMinPtr
;
// GPU aux memory
float
*
pScalePtr
;
// GPU aux memory
DELETE_COPY
(
FaceGpuRenderer
);
};
...
...
include/openpose/face/renderFace.hpp
浏览文件 @
35f8c9d1
...
...
@@ -6,10 +6,13 @@
namespace
op
{
OP_API
void
renderFaceKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
Array
<
float
>&
faceKeypoints
,
const
float
renderThreshold
);
OP_API
void
renderFaceKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
Array
<
float
>&
faceKeypoints
,
const
float
renderThreshold
);
void
renderFaceKeypointsGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
=
FACE_DEFAULT_ALPHA_KEYPOINT
);
void
renderFaceKeypointsGpu
(
float
*
framePtr
,
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
=
FACE_DEFAULT_ALPHA_KEYPOINT
);
}
#endif // OPENPOSE_FACE_RENDER_FACE_HPP
include/openpose/gpu/cuda.hu
浏览文件 @
35f8c9d1
...
...
@@ -121,8 +121,9 @@ namespace op
}
template <typename T>
inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
const int heightSource, const int widthSourcePtr)
inline __device__ T bicubicInterpolate(
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
const int widthSourcePtr)
{
int xIntArray[4];
int yIntArray[4];
...
...
@@ -140,6 +141,25 @@ namespace op
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
}
template <typename T>
inline __device__ T bicubicInterpolate8Times(
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
const int widthSourcePtr, const int threadIdxX, const int threadIdxY)
{
// Now we only need dx and dy
const T dx = xSource - fastTruncateCuda(int(xSource + 1e-5), 0, widthSource - 1);
const T dy = ySource - fastTruncateCuda(int(ySource + 1e-5), 0, heightSource - 1);
T temp[4];
for (unsigned char i = 0; i < 4; i++)
{
const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0);
temp[i] = cubicInterpolate(sourcePtr[offset + 0], sourcePtr[offset + 1],
sourcePtr[offset + 2], sourcePtr[offset + 3], dx);
}
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
}
template <typename T>
inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
{
...
...
include/openpose/hand/handGpuRenderer.hpp
浏览文件 @
35f8c9d1
...
...
@@ -23,6 +23,9 @@ namespace op
private:
float
*
pGpuHand
;
// GPU aux memory
float
*
pMaxPtr
;
// GPU aux memory
float
*
pMinPtr
;
// GPU aux memory
float
*
pScalePtr
;
// GPU aux memory
DELETE_COPY
(
HandGpuRenderer
);
};
...
...
include/openpose/hand/renderHand.hpp
浏览文件 @
35f8c9d1
...
...
@@ -6,12 +6,13 @@
namespace
op
{
OP_API
void
renderHandKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
std
::
array
<
Array
<
float
>
,
2
>&
handKeypoints
,
const
float
renderThreshold
);
OP_API
void
renderHandKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
std
::
array
<
Array
<
float
>
,
2
>&
handKeypoints
,
const
float
renderThreshold
);
void
renderHandKeypointsGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
=
HAND_DEFAULT_ALPHA_KEYPOINT
);
void
renderHandKeypointsGpu
(
float
*
framePtr
,
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
=
HAND_DEFAULT_ALPHA_KEYPOINT
);
}
#endif // OPENPOSE_HAND_GPU_HAND_RENDER_HPP
include/openpose/net/resizeAndMergeBase.hpp
浏览文件 @
35f8c9d1
...
...
@@ -22,6 +22,14 @@ namespace op
T
*
targetPtr
,
const
std
::
vector
<
const
T
*>&
sourcePtrs
,
std
::
vector
<
T
*>&
sourceTempPtrs
,
const
std
::
array
<
int
,
4
>&
targetSize
,
const
std
::
vector
<
std
::
array
<
int
,
4
>>&
sourceSizes
,
const
std
::
vector
<
T
>&
scaleInputToNetInputs
=
{
1.
f
},
const
int
gpuID
=
0
);
}
// Functions for the files cvMatToOpInput/Output
void
resizeAndMergeRGBGPU
(
float
*
targetPtr
,
const
float
*
const
srcPtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
scaleFactor
);
void
reorderAndCast
(
float
*
targetPtr
,
const
unsigned
char
*
const
srcPtr
,
const
int
width
,
const
int
height
);
}
#endif // OPENPOSE_NET_RESIZE_AND_MERGE_BASE_HPP
include/openpose/pose/poseGpuRenderer.hpp
浏览文件 @
35f8c9d1
...
...
@@ -31,6 +31,9 @@ namespace op
const
std
::
shared_ptr
<
PoseExtractorNet
>
spPoseExtractorNet
;
// Init with thread
float
*
pGpuPose
;
// GPU aux memory
float
*
pMaxPtr
;
// GPU aux memory
float
*
pMinPtr
;
// GPU aux memory
float
*
pScalePtr
;
// GPU aux memory
DELETE_COPY
(
PoseGpuRenderer
);
};
...
...
include/openpose/pose/renderPose.hpp
浏览文件 @
35f8c9d1
...
...
@@ -8,39 +8,39 @@
namespace
op
{
OP_API
void
renderPoseKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
Array
<
float
>&
poseKeypoints
,
const
PoseModel
poseModel
,
const
float
renderThreshold
,
const
bool
blendOriginalFrame
=
true
);
void
renderPoseKeypointsGpu
(
float
*
framePtr
,
const
PoseModel
poseModel
,
const
int
numberPeople
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
posePtr
,
const
float
renderThreshold
,
const
bool
googlyEyes
=
false
,
const
bool
blendOriginalFrame
=
true
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_KEYPOINT
);
void
renderPoseHeatMapGpu
(
float
*
frame
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
unsigned
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPoseHeatMapsGpu
(
float
*
frame
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPosePAFGpu
(
float
*
framePtr
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPosePAFsGpu
(
float
*
framePtr
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPoseDistanceGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
unsigned
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
OP_API
void
renderPoseKeypointsCpu
(
Array
<
float
>&
frameArray
,
const
Array
<
float
>&
poseKeypoints
,
const
PoseModel
poseModel
,
const
float
renderThreshold
,
const
bool
blendOriginalFrame
=
true
);
void
renderPoseKeypointsGpu
(
float
*
framePtr
,
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
PoseModel
poseModel
,
const
int
numberPeople
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
posePtr
,
const
float
renderThreshold
,
const
bool
googlyEyes
=
false
,
const
bool
blendOriginalFrame
=
true
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_KEYPOINT
);
void
renderPoseHeatMapGpu
(
float
*
frame
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
unsigned
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPoseHeatMapsGpu
(
float
*
frame
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPosePAFGpu
(
float
*
framePtr
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPosePAFsGpu
(
float
*
framePtr
,
const
PoseModel
poseModel
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
void
renderPoseDistanceGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
heatMapPtr
,
const
Point
<
int
>&
heatMapSize
,
const
float
scaleToKeepRatio
,
const
unsigned
int
part
,
const
float
alphaBlending
=
POSE_DEFAULT_ALPHA_HEAT_MAP
);
}
#endif // OPENPOSE_POSE_RENDER_POSE_HPP
include/openpose/utilities/render.hu
浏览文件 @
35f8c9d1
...
...
@@ -3,16 +3,217 @@
namespace op
{
inline __device__ void renderKeypoints(float* targetPtr, float2* sharedMaxs, float2* sharedMins,
float* sharedScaleF, const int globalIdx, const int x, const int y,
const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const unsigned int* const partPairsPtr,
const int numberPeople, const int numberParts, const int numberPartPairs,
const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr,
const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true,
const int googlyEye1 = -1, const int googlyEye2 = -1)
__inline__ __device__ void getBoundingBoxPerPerson(
float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
{
const auto globalIdx = threadIdx.x;
// const auto xIndex = 2*globalIdx;
// const auto yIndex = xIndex+1;
const auto xIndex = globalIdx;
const auto yIndex = numberPeople+globalIdx;
// Fill shared parameters
// if (globalIdx < numberPeople)
{
auto minValueX = (float)targetWidth;
auto minValueY = (float)targetHeight;
auto maxValueX = 0.f;
auto maxValueY = 0.f;
for (auto part = 0 ; part < numberParts ; part++)
{
const auto index = 3 * (globalIdx*numberParts + part);
const auto x = keypointsPtr[index];
const auto y = keypointsPtr[index+1];
const auto score = keypointsPtr[index+2];
if (score > threshold)
{
if (x < minValueX)
minValueX = x;
if (x > maxValueX)
maxValueX = x;
if (y < minValueY)
minValueY = y;
if (y > maxValueY)
maxValueY = y;
}
}
if (maxValueX != 0.f && maxValueY != 0.f)
{
const auto averageX = maxValueX - minValueX;
const auto averageY = maxValueY - minValueY;
// (averageX + averageY) / 2.f / 400.f
scalePtr[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f);
const auto constantToAdd = 50.f;
maxValueX += constantToAdd;
maxValueY += constantToAdd;
minValueX -= constantToAdd;
minValueY -= constantToAdd;
}
minPtr[xIndex] = minValueX;
minPtr[yIndex] = minValueY;
maxPtr[xIndex] = maxValueX;
maxPtr[yIndex] = maxValueY;
}
}
__inline__ __device__ void renderKeypoints(
float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr,
const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y,
const int targetWidth, const int targetHeight, const float* const keypointsPtr,
const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
const int googlyEye2 = -1)
{
// Load shared memory
if (globalIdx < 2*numberPeople)
{
sharedMins[globalIdx] = minPtr[globalIdx];
sharedMaxs[globalIdx] = maxPtr[globalIdx];
if (globalIdx < numberPeople)
sharedScaleF[globalIdx] = scalePtr[globalIdx];
}
__syncthreads();
// Fill each (x,y) target pixel
if (x < targetWidth && y < targetHeight)
{
const auto baseIndex = 3*(y * targetWidth + x);
auto b = targetPtr[baseIndex];
auto g = targetPtr[baseIndex+1];
auto r = targetPtr[baseIndex+2];
if (!blendOriginalFrame)
{
b = 0.f;
g = 0.f;
r = 0.f;
}
const auto lineWidthSquared = lineWidth * lineWidth;
const auto radiusSquared = radius * radius;
for (auto person = 0; person < numberPeople; person++)
{
// Make sure person x,y in the limits
// Make sure person is not empty. Assume all joints are below threshold. Then
// maxs = 0 and mins = width/height. So if statement would be false
// const auto xIndex = 2*person;
// const auto yIndex = xIndex+1;
const auto xIndex = person;
const auto yIndex = numberPeople+person;
if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex]
&& y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
{
// Part pair connections
for (auto partPair = 0; partPair < numberPartPairs; partPair++)
{
const auto partA = partPairsPtr[2*partPair];
const auto partB = partPairsPtr[2*partPair+1];
const auto indexA = person*numberParts*3 + partA*3;
const auto xA = keypointsPtr[indexA];
const auto yA = keypointsPtr[indexA + 1];
const auto scoreA = keypointsPtr[indexA + 2];
const auto indexB = person*numberParts*3 + partB*3;
const auto xB = keypointsPtr[indexB];
const auto yB = keypointsPtr[indexB + 1];
const auto scoreB = keypointsPtr[indexB + 2];
if (scoreA > threshold && scoreB > threshold)
{
const auto keypointScale = keypointScalePtr[partB%numberScales]
* keypointScalePtr[partB%numberScales]
* keypointScalePtr[partB%numberScales];
const auto lineWidthScaled = lineWidthSquared * keypointScale;
const auto bSqrt = sharedScaleF[person] * sharedScaleF[person] * lineWidthScaled;
const auto xP = (xA + xB) / 2.f;
const auto yP = (yA + yB) / 2.f;
const auto aSqrt = (xA - xP) * (xA - xP) + (yA - yP) * (yA - yP);
const auto angle = atan2f(yB - yA, xB - xA);
const auto sine = sinf(angle);
const auto cosine = cosf(angle);
const auto A = cosine * (x - xP) + sine * (y - yP);
const auto B = sine * (x - xP) - cosine * (y - yP);
const auto judge = A * A / aSqrt + B * B / bSqrt;
const auto minV = 0.f;
const auto maxV = 1.f;
if (minV <= judge && judge <= maxV)
// Before used partPair vs partB
addColorWeighted(r, g, b, &rgbColorsPtr[(partB%numberColors)*3], alphaColorToAdd);
}
}
// Part circles
for (auto part = 0u; part < numberParts; part++)
{
const auto index = 3 * (person*numberParts + part);
const auto localX = keypointsPtr[index];
const auto localY = keypointsPtr[index + 1];
const auto score = keypointsPtr[index + 2];
if (score > threshold)
{
const auto keypointScale = keypointScalePtr[part%numberScales]
* keypointScalePtr[part%numberScales]
* keypointScalePtr[part%numberScales];
const auto radiusScaled = radiusSquared * keypointScale;
const auto dist2 = (x - localX) * (x - localX) + (y - localY) * (y - localY);
// Googly eyes
if (googlyEye1 == part || googlyEye2 == part)
{
const auto eyeRatio = 2.5f * sqrt(radiusScaled);
const auto minr2 = sharedScaleF[person] * sharedScaleF[person]
* (eyeRatio - 2) * (eyeRatio - 2);
const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * eyeRatio * eyeRatio;
if (dist2 <= maxr2)
{
float colorToAdd [3] = {0., 0., 0.};
if (dist2 <= minr2)
for (auto& color : colorToAdd)
color = {255.f};
if (dist2 <= minr2*0.6f)
{
const auto dist3 = (x-4 - localX)
* (x-4 - localX) + (y - localY+4) * (y - localY+4);
if (dist3 > 14.0625f) // 3.75f^2
for (auto& color : colorToAdd)
color = {0.f};
}
const auto alphaColorToAdd = 0.9f;
addColorWeighted(r, g, b, colorToAdd, alphaColorToAdd);
}
}
// Other parts
else
{
const auto minr2 = 0.f;
const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * radiusScaled;
if (minr2 <= dist2 && dist2 <= maxr2)
addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd);
}
}
}
}
}
targetPtr[baseIndex] = b;
targetPtr[baseIndex+1] = g;
targetPtr[baseIndex+2] = r;
}
}
__inline__ __device__ void renderKeypointsOld(
float* targetPtr, float2* sharedMaxs, float2* sharedMins, float* sharedScaleF, const int globalIdx,
const int x, const int y, const int targetWidth, const int targetHeight, const float* const keypointsPtr,
const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
const int googlyEye2 = -1)
{
// Fill shared parameters
if (globalIdx < numberPeople)
...
...
@@ -52,7 +253,6 @@ namespace op
sharedMins[globalIdx].y -= constantToAdd;
}
}
__syncthreads();
// Fill each (x,y) target pixel
...
...
@@ -176,4 +376,5 @@ namespace op
}
}
#endif // OPENPOSE_UTILITIES_RENDER_HU
src/openpose/face/faceGpuRenderer.cpp
浏览文件 @
35f8c9d1
...
...
@@ -10,7 +10,11 @@ namespace op
{
FaceGpuRenderer
::
FaceGpuRenderer
(
const
float
renderThreshold
,
const
float
alphaKeypoint
,
const
float
alphaHeatMap
)
:
GpuRenderer
{
renderThreshold
,
alphaKeypoint
,
alphaHeatMap
}
GpuRenderer
{
renderThreshold
,
alphaKeypoint
,
alphaHeatMap
},
pGpuFace
{
nullptr
},
pMaxPtr
{
nullptr
},
pMinPtr
{
nullptr
},
pScalePtr
{
nullptr
}
{
}
...
...
@@ -20,7 +24,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree
(
pGpuFace
);
if
(
pGpuFace
!=
nullptr
)
cudaFree
(
pGpuFace
);
if
(
pMaxPtr
!=
nullptr
)
cudaFree
(
pMaxPtr
);
if
(
pMinPtr
!=
nullptr
)
cudaFree
(
pMinPtr
);
if
(
pScalePtr
!=
nullptr
)
cudaFree
(
pScalePtr
);
#endif
}
catch
(
const
std
::
exception
&
e
)
...
...
@@ -37,6 +48,9 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc
((
void
**
)(
&
pGpuFace
),
POSE_MAX_PEOPLE
*
FACE_NUMBER_PARTS
*
3
*
sizeof
(
float
));
cudaMalloc
((
void
**
)
&
pMaxPtr
,
sizeof
(
float
)
*
2
*
FACE_NUMBER_PARTS
);
cudaMalloc
((
void
**
)
&
pMinPtr
,
sizeof
(
float
)
*
2
*
FACE_NUMBER_PARTS
);
cudaMalloc
((
void
**
)
&
pScalePtr
,
sizeof
(
float
)
*
FACE_NUMBER_PARTS
);
#endif
log
(
"Finished initialization on thread."
,
Priority
::
Low
,
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
...
...
@@ -63,8 +77,9 @@ namespace op
cudaMemcpy
(
pGpuFace
,
faceKeypoints
.
getConstPtr
(),
faceKeypoints
.
getSize
(
0
)
*
FACE_NUMBER_PARTS
*
3
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
renderFaceKeypointsGpu
(
*
spGpuMemory
,
frameSize
,
pGpuFace
,
faceKeypoints
.
getSize
(
0
),
mRenderThreshold
,
getAlphaKeypoint
());
renderFaceKeypointsGpu
(
*
spGpuMemory
,
pMaxPtr
,
pMinPtr
,
pScalePtr
,
frameSize
,
pGpuFace
,
faceKeypoints
.
getSize
(
0
),
mRenderThreshold
,
getAlphaKeypoint
());
// CUDA check
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
...
...
src/openpose/face/renderFace.cu
浏览文件 @
35f8c9d1
...
...
@@ -10,17 +10,25 @@ namespace op
__constant__
const
float
SCALES
[]
=
{
FACE_SCALES_RENDER_GPU
};
__constant__
const
float
COLORS
[]
=
{
FACE_COLORS_RENDER_GPU
};
__global__
void
renderFaceParts
(
float
*
targetPtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
threshold
,
const
float
alphaColorToAdd
)
__global__
void
getBoundingBoxPerPersonFace
(
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
keypointsPtr
,
const
int
numberPeople
,
const
int
numberParts
,
const
float
threshold
)
{
getBoundingBoxPerPerson
(
maxPtr
,
minPtr
,
scalePtr
,
targetWidth
,
targetHeight
,
keypointsPtr
,
numberPeople
,
numberParts
,
threshold
);
}
__global__
void
renderFaceParts
(
float
*
targetPtr
,
float
*
minPtr
,
float
*
maxPtr
,
float
*
scalePtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
threshold
,
const
float
alphaColorToAdd
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
globalIdx
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
// Shared parameters
__shared__
float
2
sharedMins
[
FACE_MAX_FACES
];
__shared__
float
2
sharedMaxs
[
FACE_MAX_FACES
];
__shared__
float
sharedMins
[
2
*
FACE_MAX_FACES
];
__shared__
float
sharedMaxs
[
2
*
FACE_MAX_FACES
];
__shared__
float
sharedScaleF
[
FACE_MAX_FACES
];
// Other parameters
...
...
@@ -31,23 +39,33 @@ namespace op
const
auto
lineWidth
=
fastMinCuda
(
targetWidth
,
targetHeight
)
/
250.
f
;
// Render key points
renderKeypoints
(
targetPtr
,
sharedMaxs
,
sharedMins
,
sharedScaleF
,
globalIdx
,
x
,
y
,
targetWidth
,
targetHeight
,
facePtr
,
PART_PAIRS_GPU
,
numberPeople
,
FACE_NUMBER_PARTS
,
numberPartPairs
,
COLORS
,
numberColors
,
radius
,
lineWidth
,
SCALES
,
numberScales
,
threshold
,
alphaColorToAdd
);
renderKeypoints
(
targetPtr
,
sharedMaxs
,
sharedMins
,
sharedScaleF
,
maxPtr
,
minPtr
,
scalePtr
,
globalIdx
,
x
,
y
,
targetWidth
,
targetHeight
,
facePtr
,
PART_PAIRS_GPU
,
numberPeople
,
FACE_NUMBER_PARTS
,
numberPartPairs
,
COLORS
,
numberColors
,
radius
,
lineWidth
,
SCALES
,
numberScales
,
threshold
,
alphaColorToAdd
);
}
void
renderFaceKeypointsGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
)
void
renderFaceKeypointsGpu
(
float
*
framePtr
,
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
facePtr
,
const
int
numberPeople
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
)
{
try
{
if
(
numberPeople
>
0
)
{
// Get bouding boxes
const
dim3
threadsPerBlockBoundBox
=
{
1
,
1
,
1
};
const
dim3
numBlocksBox
{
getNumberCudaBlocks
(
POSE_MAX_PEOPLE
,
threadsPerBlockBoundBox
.
x
)};
getBoundingBoxPerPersonFace
<<<
threadsPerBlockBoundBox
,
numBlocksBox
>>>
(
maxPtr
,
minPtr
,
scalePtr
,
frameSize
.
x
,
frameSize
.
y
,
facePtr
,
numberPeople
,
FACE_NUMBER_PARTS
,
renderThreshold
);
// Draw hands
dim3
threadsPerBlock
;
dim3
numBlocks
;
getNumberCudaThreadsAndBlocks
(
threadsPerBlock
,
numBlocks
,
frameSize
);
renderFaceParts
<<<
threadsPerBlock
,
numBlocks
>>>
(
framePtr
,
frameSize
.
x
,
frameSize
.
y
,
facePtr
,
numberPeople
,
renderThreshold
,
alphaColorToAdd
);
renderFaceParts
<<<
threadsPerBlock
,
numBlocks
>>>
(
framePtr
,
minPtr
,
maxPtr
,
scalePtr
,
frameSize
.
x
,
frameSize
.
y
,
facePtr
,
numberPeople
,
renderThreshold
,
alphaColorToAdd
);
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
}
...
...
src/openpose/hand/handGpuRenderer.cpp
浏览文件 @
35f8c9d1
...
...
@@ -10,7 +10,11 @@ namespace op
{
HandGpuRenderer
::
HandGpuRenderer
(
const
float
renderThreshold
,
const
float
alphaKeypoint
,
const
float
alphaHeatMap
)
:
GpuRenderer
{
renderThreshold
,
alphaKeypoint
,
alphaHeatMap
}
GpuRenderer
{
renderThreshold
,
alphaKeypoint
,
alphaHeatMap
},
pGpuHand
{
nullptr
},
pMaxPtr
{
nullptr
},
pMinPtr
{
nullptr
},
pScalePtr
{
nullptr
}
{
}
...
...
@@ -20,7 +24,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree
(
pGpuHand
);
if
(
pGpuHand
!=
nullptr
)
cudaFree
(
pGpuHand
);
if
(
pMaxPtr
!=
nullptr
)
cudaFree
(
pMaxPtr
);
if
(
pMinPtr
!=
nullptr
)
cudaFree
(
pMinPtr
);
if
(
pScalePtr
!=
nullptr
)
cudaFree
(
pScalePtr
);
#endif
}
catch
(
const
std
::
exception
&
e
)
...
...
@@ -37,6 +48,9 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc
((
void
**
)(
&
pGpuHand
),
HAND_MAX_HANDS
*
HAND_NUMBER_PARTS
*
3
*
sizeof
(
float
));
cudaMalloc
((
void
**
)
&
pMaxPtr
,
sizeof
(
float
)
*
2
*
HAND_MAX_HANDS
);
cudaMalloc
((
void
**
)
&
pMinPtr
,
sizeof
(
float
)
*
2
*
HAND_MAX_HANDS
);
cudaMalloc
((
void
**
)
&
pScalePtr
,
sizeof
(
float
)
*
HAND_MAX_HANDS
);
#endif
log
(
"Finished initialization on thread."
,
Priority
::
Low
,
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
...
...
@@ -69,7 +83,8 @@ namespace op
cudaMemcpy
(
pGpuHand
+
handVolume
,
handKeypoints
[
1
].
getConstPtr
(),
handVolume
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
renderHandKeypointsGpu
(
*
spGpuMemory
,
frameSize
,
pGpuHand
,
2
*
numberPeople
,
mRenderThreshold
,
getAlphaKeypoint
());
*
spGpuMemory
,
pMaxPtr
,
pMinPtr
,
pScalePtr
,
frameSize
,
pGpuHand
,
2
*
numberPeople
,
mRenderThreshold
,
getAlphaKeypoint
());
// CUDA check
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
...
...
src/openpose/hand/renderHand.cu
浏览文件 @
35f8c9d1
...
...
@@ -10,17 +10,25 @@ namespace op
__constant__
const
float
SCALES
[]
=
{
HAND_SCALES_RENDER_GPU
};
__constant__
const
float
COLORS
[]
=
{
HAND_COLORS_RENDER_GPU
};
__global__
void
renderHandsParts
(
float
*
targetPtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
threshold
,
const
float
alphaColorToAdd
)
__global__
void
getBoundingBoxPerPersonHand
(
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
keypointsPtr
,
const
int
numberPeople
,
const
int
numberParts
,
const
float
threshold
)
{
getBoundingBoxPerPerson
(
maxPtr
,
minPtr
,
scalePtr
,
targetWidth
,
targetHeight
,
keypointsPtr
,
numberPeople
,
numberParts
,
threshold
);
}
__global__
void
renderHandsParts
(
float
*
targetPtr
,
float
*
minPtr
,
float
*
maxPtr
,
float
*
scalePtr
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
threshold
,
const
float
alphaColorToAdd
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
globalIdx
=
threadIdx
.
y
*
blockDim
.
x
+
threadIdx
.
x
;
// Shared parameters
__shared__
float
2
sharedMins
[
HAND_MAX_HANDS
];
__shared__
float
2
sharedMaxs
[
HAND_MAX_HANDS
];
__shared__
float
sharedMins
[
2
*
HAND_MAX_HANDS
];
__shared__
float
sharedMaxs
[
2
*
HAND_MAX_HANDS
];
__shared__
float
sharedScaleF
[
HAND_MAX_HANDS
];
// Other parameters
...
...
@@ -31,23 +39,33 @@ namespace op
const
auto
lineWidth
=
fastMinCuda
(
targetWidth
,
targetHeight
)
/
80.
f
;
// Render key points
renderKeypoints
(
targetPtr
,
sharedMaxs
,
sharedMins
,
sharedScaleF
,
globalIdx
,
x
,
y
,
targetWidth
,
targetHeight
,
handsPtr
,
PART_PAIRS_GPU
,
numberHands
,
HAND_NUMBER_PARTS
,
numberPartPairs
,
COLORS
,
numberColors
,
radius
,
lineWidth
,
SCALES
,
numberScales
,
threshold
,
alphaColorToAdd
);
renderKeypoints
(
targetPtr
,
sharedMaxs
,
sharedMins
,
sharedScaleF
,
maxPtr
,
minPtr
,
scalePtr
,
globalIdx
,
x
,
y
,
targetWidth
,
targetHeight
,
handsPtr
,
PART_PAIRS_GPU
,
numberHands
,
HAND_NUMBER_PARTS
,
numberPartPairs
,
COLORS
,
numberColors
,
radius
,
lineWidth
,
SCALES
,
numberScales
,
threshold
,
alphaColorToAdd
);
}
void
renderHandKeypointsGpu
(
float
*
framePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
)
void
renderHandKeypointsGpu
(
float
*
framePtr
,
float
*
maxPtr
,
float
*
minPtr
,
float
*
scalePtr
,
const
Point
<
int
>&
frameSize
,
const
float
*
const
handsPtr
,
const
int
numberHands
,
const
float
renderThreshold
,
const
float
alphaColorToAdd
)
{
try
{
if
(
numberHands
>
0
)
{
// Get bouding boxes
const
dim3
threadsPerBlockBoundBox
=
{
1
,
1
,
1
};
const
dim3
numBlocksBox
{
getNumberCudaBlocks
(
POSE_MAX_PEOPLE
,
threadsPerBlockBoundBox
.
x
)};
getBoundingBoxPerPersonHand
<<<
threadsPerBlockBoundBox
,
numBlocksBox
>>>
(
maxPtr
,
minPtr
,
scalePtr
,
frameSize
.
x
,
frameSize
.
y
,
handsPtr
,
numberHands
,
HAND_NUMBER_PARTS
,
renderThreshold
);
// Draw hands
dim3
threadsPerBlock
;
dim3
numBlocks
;
getNumberCudaThreadsAndBlocks
(
threadsPerBlock
,
numBlocks
,
frameSize
);
renderHandsParts
<<<
threadsPerBlock
,
numBlocks
>>>
(
framePtr
,
frameSize
.
x
,
frameSize
.
y
,
handsPtr
,
numberHands
,
renderThreshold
,
alphaColorToAdd
);
renderHandsParts
<<<
threadsPerBlock
,
numBlocks
>>>
(
framePtr
,
minPtr
,
maxPtr
,
scalePtr
,
frameSize
.
x
,
frameSize
.
y
,
handsPtr
,
numberHands
,
renderThreshold
,
alphaColorToAdd
);
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
}
}
...
...
src/openpose/net/resizeAndMergeBase.cu
浏览文件 @
35f8c9d1
...
...
@@ -6,19 +6,111 @@ namespace op
{
const
auto
THREADS_PER_BLOCK_1D
=
16u
;
// template <typename T>
// __global__ void resizeKernelOld(
// T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
// const int targetHeight)
// {
// const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
// const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
// if (x < targetWidth && y < targetHeight)
// {
// const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
// const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
// targetPtr[y*targetWidth+x] = bicubicInterpolate(
// sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
// }
// }
template
<
typename
T
>
__global__
void
resizeKernel
(
T
*
targetPtr
,
const
T
*
const
sourcePtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
,
const
int
channels
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
channel
=
(
blockIdx
.
z
*
blockDim
.
z
)
+
threadIdx
.
z
;
const
auto
sourceArea
=
sourceWidth
*
sourceHeight
;
const
auto
targetArea
=
targetWidth
*
targetHeight
;
if
(
x
<
targetWidth
&&
y
<
targetHeight
&&
channel
<
channels
)
{
const
T
xSource
=
(
x
+
T
(
0.5
f
))
*
sourceWidth
/
T
(
targetWidth
)
-
T
(
0.5
f
);
const
T
ySource
=
(
y
+
T
(
0.5
f
))
*
sourceHeight
/
T
(
targetHeight
)
-
T
(
0.5
f
);
const
T
*
sourcePtrChannel
=
sourcePtr
+
channel
*
sourceArea
;
targetPtr
[
channel
*
targetArea
+
y
*
targetWidth
+
x
]
=
bicubicInterpolate
(
sourcePtrChannel
,
xSource
,
ySource
,
sourceWidth
,
sourceHeight
,
sourceWidth
);
}
}
template
<
typename
T
>
__global__
void
resizeKernel
(
T
*
targetPtr
,
const
T
*
const
sourcePtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
)
__global__
void
resizeAndPadKernel
(
T
*
targetPtr
,
const
T
*
const
sourcePtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
rescaleFactor
,
const
int
channels
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
channel
=
(
blockIdx
.
z
*
blockDim
.
z
)
+
threadIdx
.
z
;
const
auto
sourceArea
=
sourceWidth
*
sourceHeight
;
const
auto
targetArea
=
targetWidth
*
targetHeight
;
if
(
x
<
targetWidth
&&
y
<
targetHeight
&&
channel
<
channels
)
{
const
T
xSource
=
(
x
+
T
(
0.5
f
))
*
1.0
/
T
(
rescaleFactor
)
-
T
(
0.5
f
);
const
T
ySource
=
(
y
+
T
(
0.5
f
))
*
1.0
/
T
(
rescaleFactor
)
-
T
(
0.5
f
);
const
T
*
sourcePtrChannel
=
sourcePtr
+
channel
*
sourceArea
;
if
(
x
<
sourceWidth
*
rescaleFactor
&&
y
<
sourceHeight
*
rescaleFactor
)
targetPtr
[
channel
*
targetArea
+
y
*
targetWidth
+
x
]
=
bicubicInterpolate
(
sourcePtrChannel
,
xSource
,
ySource
,
sourceWidth
,
sourceHeight
,
sourceWidth
);
else
targetPtr
[
channel
*
targetArea
+
y
*
targetWidth
+
x
]
=
0
;
}
}
template
<
typename
T
>
__global__
void
resize8TimesKernel
(
T
*
targetPtr
,
const
T
*
const
sourcePtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
,
const
unsigned
int
rescaleFactor
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
channel
=
(
blockIdx
.
z
*
blockDim
.
z
)
+
threadIdx
.
z
;
// Load shared memory
// If resize >= 5, then #threads per block >= # elements of shared memory
__shared__
T
sourcePtrShared
[
25
];
const
auto
sharedLoadId
=
threadIdx
.
x
+
rescaleFactor
*
threadIdx
.
y
;
if
(
sharedLoadId
<
25
)
{
const
auto
minTargetX
=
blockIdx
.
x
*
rescaleFactor
;
const
auto
minSourceX
=
(
minTargetX
+
T
(
0.5
f
))
*
sourceWidth
/
T
(
targetWidth
)
-
T
(
0.5
f
);
const
auto
minSourceXInt_1
=
int
(
minSourceX
+
1e-5
);
const
auto
minSourceXInt
=
minSourceXInt_1
-
1
;
const
auto
minTargetY
=
blockIdx
.
y
*
rescaleFactor
;
const
auto
minSourceY
=
(
minTargetY
+
T
(
0.5
f
))
*
sourceHeight
/
T
(
targetHeight
)
-
T
(
0.5
f
);
const
auto
minSourceYInt_1
=
int
(
minSourceY
+
1e-5
);
const
auto
minSourceYInt
=
minSourceYInt_1
-
1
;
const
auto
yClean
=
fastTruncateCuda
(
int
(
minSourceYInt
+
sharedLoadId
/
5
+
1e-5
),
0
,
sourceHeight
-
1
);
const
auto
xClean
=
fastTruncateCuda
(
int
(
minSourceXInt
+
sharedLoadId
%
5
+
1e-5
),
0
,
sourceWidth
-
1
);
const
auto
sourceArea
=
sourceWidth
*
sourceHeight
;
const
T
*
sourcePtrChannel
=
sourcePtr
+
channel
*
sourceArea
;
sourcePtrShared
[
sharedLoadId
]
=
sourcePtrChannel
[
yClean
*
sourceWidth
+
xClean
];
}
// Wait here until shared memory has been loaded
__syncthreads
();
if
(
x
<
targetWidth
&&
y
<
targetHeight
)
{
const
auto
targetArea
=
targetWidth
*
targetHeight
;
const
T
xSource
=
(
x
+
T
(
0.5
f
))
*
sourceWidth
/
T
(
targetWidth
)
-
T
(
0.5
f
);
const
T
ySource
=
(
y
+
T
(
0.5
f
))
*
sourceHeight
/
T
(
targetHeight
)
-
T
(
0.5
f
);
targetPtr
[
y
*
targetWidth
+
x
]
=
bicubicInterpolate
(
sourcePtr
,
xSource
,
ySource
,
sourceWidth
,
sourceHeight
,
sourceWidth
);
targetPtr
[
channel
*
targetArea
+
y
*
targetWidth
+
x
]
=
bicubicInterpolate8Times
(
sourcePtrShared
,
xSource
,
ySource
,
sourceWidth
,
sourceHeight
,
sourceWidth
,
threadIdx
.
x
,
threadIdx
.
y
);
}
}
...
...
@@ -58,10 +150,53 @@ namespace op
}
}
__global__
void
reorderAndCastKernel
(
float
*
targetPtr
,
const
unsigned
char
*
const
srcPtr
,
const
int
width
,
const
int
height
)
{
const
auto
x
=
(
blockIdx
.
x
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
y
=
(
blockIdx
.
y
*
blockDim
.
y
)
+
threadIdx
.
y
;
const
auto
c
=
(
blockIdx
.
z
*
blockDim
.
z
)
+
threadIdx
.
z
;
if
(
x
<
width
&&
y
<
height
)
{
const
auto
channels
=
3
;
const
auto
originFramePtrOffsetY
=
y
*
width
;
const
auto
channelOffset
=
c
*
width
*
height
;
const
auto
targetIndex
=
channelOffset
+
y
*
width
+
x
;
const
auto
srcIndex
=
(
originFramePtrOffsetY
+
x
)
*
channels
+
c
;
targetPtr
[
targetIndex
]
=
float
(
srcPtr
[
srcIndex
])
*
(
1
/
256.
f
)
-
0.5
f
;
}
}
void
reorderAndCast
(
float
*
targetPtr
,
const
unsigned
char
*
const
srcPtr
,
const
int
width
,
const
int
height
)
{
const
dim3
threadsPerBlock
{
32
,
1
,
1
};
const
dim3
numBlocks
{
getNumberCudaBlocks
(
width
,
threadsPerBlock
.
x
),
getNumberCudaBlocks
(
height
,
threadsPerBlock
.
y
),
getNumberCudaBlocks
(
3
,
threadsPerBlock
.
z
)};
reorderAndCastKernel
<<<
numBlocks
,
threadsPerBlock
>>>
(
targetPtr
,
srcPtr
,
width
,
height
);
}
void
resizeAndMergeRGBGPU
(
float
*
targetPtr
,
const
float
*
const
srcPtr
,
const
int
sourceWidth
,
const
int
sourceHeight
,
const
int
targetWidth
,
const
int
targetHeight
,
const
float
scaleFactor
)
{
const
auto
channels
=
3
;
const
dim3
threadsPerBlock
{
THREADS_PER_BLOCK_1D
,
THREADS_PER_BLOCK_1D
,
1
};
const
dim3
numBlocks
{
getNumberCudaBlocks
(
targetWidth
,
threadsPerBlock
.
x
),
getNumberCudaBlocks
(
targetHeight
,
threadsPerBlock
.
y
),
getNumberCudaBlocks
(
channels
,
threadsPerBlock
.
z
)};
resizeAndPadKernel
<<<
numBlocks
,
threadsPerBlock
>>>
(
targetPtr
,
srcPtr
,
sourceWidth
,
sourceHeight
,
targetWidth
,
targetHeight
,
scaleFactor
,
channels
);
}
template
<
typename
T
>
void
resizeAndMergeGpu
(
T
*
targetPtr
,
const
std
::
vector
<
const
T
*>&
sourcePtrs
,
const
std
::
array
<
int
,
4
>&
targetSize
,
const
std
::
vector
<
std
::
array
<
int
,
4
>>&
sourceSizes
,
const
std
::
vector
<
T
>&
scaleInputToNetInputs
)
void
resizeAndMergeGpu
(
T
*
targetPtr
,
const
std
::
vector
<
const
T
*>&
sourcePtrs
,
const
std
::
array
<
int
,
4
>&
targetSize
,
const
std
::
vector
<
std
::
array
<
int
,
4
>>&
sourceSizes
,
const
std
::
vector
<
T
>&
scaleInputToNetInputs
)
{
try
{
...
...
@@ -90,20 +225,60 @@ namespace op
const
auto
num
=
sourceSize
[
0
];
if
(
targetSize
[
0
]
>
1
||
num
==
1
)
{
const
auto
sourceChannelOffset
=
sourceHeight
*
sourceWidth
;
const
auto
targetChannelOffset
=
targetWidth
*
targetHeight
;
for
(
auto
n
=
0
;
n
<
num
;
n
++
)
{
const
auto
offsetBase
=
n
*
channels
;
for
(
auto
c
=
0
;
c
<
channels
;
c
++
)
{
const
auto
offset
=
offsetBase
+
c
;
resizeKernel
<<<
numBlocks
,
threadsPerBlock
>>>
(
targetPtr
+
offset
*
targetChannelOffset
,
sourcePtrs
.
at
(
0
)
+
offset
*
sourceChannelOffset
,
sourceWidth
,
sourceHeight
,
targetWidth
,
targetHeight
);
}
}
// // Profiling code
// const auto REPS = 250;
// double timeNormalize0 = 0.;
// double timeNormalize1 = 0.;
// double timeNormalize2 = 0.;
// double timeNormalize3 = 0.;
// // Non-optimized function
// OP_CUDA_PROFILE_INIT(REPS);
// const auto sourceChannelOffset = sourceHeight * sourceWidth;
// const auto targetChannelOffset = targetWidth * targetHeight;
// for (auto n = 0; n < num; n++)
// {
// const auto offsetBase = n*channels;
// for (auto c = 0 ; c < channels ; c++)
// {
// const auto offset = offsetBase + c;
// resizeKernelOld<<<numBlocks, threadsPerBlock>>>(
// targetPtr + offset * targetChannelOffset,
// sourcePtrs.at(0) + offset * sourceChannelOffset,
// sourceWidth, sourceHeight, targetWidth, targetHeight);
// }
// }
// OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
// Optimized function for any resize size (suboptimal for 8x resize)
// OP_CUDA_PROFILE_INIT(REPS);
const
dim3
threadsPerBlock
{
THREADS_PER_BLOCK_1D
,
THREADS_PER_BLOCK_1D
,
1
};
const
dim3
numBlocks
{
getNumberCudaBlocks
(
targetWidth
,
threadsPerBlock
.
x
),
getNumberCudaBlocks
(
targetHeight
,
threadsPerBlock
.
y
),
getNumberCudaBlocks
(
num
*
channels
,
threadsPerBlock
.
z
)};
resizeKernel
<<<
numBlocks
,
threadsPerBlock
>>>
(
targetPtr
,
sourcePtrs
.
at
(
0
),
sourceWidth
,
sourceHeight
,
targetWidth
,
targetHeight
,
num
*
channels
);
// OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
// // Optimized function for 8x resize
// OP_CUDA_PROFILE_INIT(REPS);
// if (targetWidth / sourceWidth != 8 || targetHeight / sourceHeight != 8)
// error("Kernel only implemented for 8x resize. Notify us if this error appears.",
// __LINE__, __FUNCTION__, __FILE__);
// const auto rescaleFactor = (unsigned int) std::ceil((float)(targetHeight) / (float)(sourceHeight));
// const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
// const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
// getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
// getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
// resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
// targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, rescaleFactor);
// OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);
// Profiling code
// log(" Res(ori)=" + std::to_string(timeNormalize1) + "ms");
// log(" Res(new)=" + std::to_string(timeNormalize2) + "ms");
// log(" Res(new8x)=" + std::to_string(timeNormalize3) + "ms");
}
// Old inefficient multi-scale merging
else
...
...
src/openpose/pose/poseGpuRenderer.cpp
浏览文件 @
35f8c9d1
...
...
@@ -22,7 +22,10 @@ namespace op
getNumberElementsToRender
(
poseModel
)},
// mNumberElementsToRender
PoseRenderer
{
poseModel
},
spPoseExtractorNet
{
poseExtractorNet
},
pGpuPose
{
nullptr
}
pGpuPose
{
nullptr
},
pMaxPtr
{
nullptr
},
pMinPtr
{
nullptr
},
pScalePtr
{
nullptr
}
{
}
...
...
@@ -32,7 +35,14 @@ namespace op
{
// Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
#ifdef USE_CUDA
cudaFree
(
pGpuPose
);
if
(
pGpuPose
!=
nullptr
)
cudaFree
(
pGpuPose
);
if
(
pMaxPtr
!=
nullptr
)
cudaFree
(
pMaxPtr
);
if
(
pMinPtr
!=
nullptr
)
cudaFree
(
pMinPtr
);
if
(
pScalePtr
!=
nullptr
)
cudaFree
(
pScalePtr
);
#endif
}
catch
(
const
std
::
exception
&
e
)
...
...
@@ -49,7 +59,10 @@ namespace op
// GPU memory allocation for rendering
#ifdef USE_CUDA
cudaMalloc
((
void
**
)(
&
pGpuPose
),
POSE_MAX_PEOPLE
*
getPoseNumberBodyParts
(
mPoseModel
)
*
3
*
sizeof
(
float
));
POSE_MAX_PEOPLE
*
getPoseNumberBodyParts
(
mPoseModel
)
*
3
*
sizeof
(
float
));
cudaMalloc
((
void
**
)
&
pMaxPtr
,
sizeof
(
float
)
*
2
*
POSE_MAX_PEOPLE
);
cudaMalloc
((
void
**
)
&
pMinPtr
,
sizeof
(
float
)
*
2
*
POSE_MAX_PEOPLE
);
cudaMalloc
((
void
**
)
&
pScalePtr
,
sizeof
(
float
)
*
POSE_MAX_PEOPLE
);
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
#endif
log
(
"Finished initialization on thread."
,
Priority
::
Low
,
__LINE__
,
__FUNCTION__
,
__FILE__
);
...
...
@@ -60,10 +73,9 @@ namespace op
}
}
std
::
pair
<
int
,
std
::
string
>
PoseGpuRenderer
::
renderPose
(
Array
<
float
>&
outputData
,
const
Array
<
float
>&
poseKeypoints
,
const
float
scaleInputToOutput
,
const
float
scaleNetToOutput
)
std
::
pair
<
int
,
std
::
string
>
PoseGpuRenderer
::
renderPose
(
Array
<
float
>&
outputData
,
const
Array
<
float
>&
poseKeypoints
,
const
float
scaleInputToOutput
,
const
float
scaleNetToOutput
)
{
try
{
...
...
@@ -78,7 +90,6 @@ namespace op
if
(
numberPeople
>
0
||
elementRendered
!=
0
||
!
mBlendOriginalFrame
)
{
cpuToGpuMemoryIfNotCopiedYet
(
outputData
.
getPtr
(),
outputData
.
getVolume
());
cudaCheck
(
__LINE__
,
__FUNCTION__
,
__FILE__
);
const
auto
numberBodyParts
=
getPoseNumberBodyParts
(
mPoseModel
);
const
auto
hasBkg
=
addBkgChannel
(
mPoseModel
);
const
auto
numberBodyPartsPlusBkg
=
numberBodyParts
+
(
hasBkg
?
1
:
0
);
...
...
@@ -96,9 +107,9 @@ namespace op
poseKeypointsRescaled
.
getConstPtr
(),
numberPeople
*
numberBodyParts
*
3
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
renderPoseKeypointsGpu
(
*
spGpuMemory
,
mPoseModel
,
numberPeople
,
frameSize
,
pGpuPose
,
mRenderThreshold
,
mShowGooglyEyes
,
mBlendOriginalFram
e
,
getAlphaKeypoint
());
renderPoseKeypointsGpu
(
*
spGpuMemory
,
pMaxPtr
,
pMinPtr
,
pScalePtr
,
mPoseModel
,
numberPeople
,
frameSize
,
pGpuPos
e
,
mRenderThreshold
,
mShowGooglyEyes
,
mBlendOriginalFrame
,
getAlphaKeypoint
());
}
else
{
...
...
src/openpose/pose/renderPose.cu
浏览文件 @
35f8c9d1
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录