Commit 4a87e704 authored by Matt Pharr

Update from book source. MultiWorkQueue simplifications.

Parent c7c74f1f
@@ -39,8 +39,8 @@ inline int GetBlockSize(const char *description, F kernel) {
         return iter->second;
 
     int minGridSize, blockSize;
-    CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,
-                                                  kernel, 0, 0));
+    CUDA_CHECK(
+        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, 0, 0));
 
     kernelBlockSizes[index] = blockSize;
     LOG_VERBOSE("[%s]: block size %d", description, blockSize);
@@ -56,11 +56,16 @@ __global__ void Kernel(F func, int nItems) {
     func(tid);
 }
 
-#ifdef PBRT_IS_WINDOWS
-#define PBRT_GPU_LAMBDA(...) [=,*this] PBRT_GPU(__VA_ARGS__) mutable
-#else
-#define PBRT_GPU_LAMBDA(...) [=] PBRT_GPU(__VA_ARGS__)
-#endif
+// GPU Launch Function Declarations
+template <typename F>
+void GPUParallelFor(const char *description, int nItems, F func);
+
+template <typename F>
+void GPUDo(const char *description, F func) {
+    GPUParallelFor(description, 1, [=] PBRT_GPU(int) mutable { func(); });
+}
+
+void GPUWait();
 
 template <typename F>
 void GPUParallelFor(const char *description, int nItems, F func) {
@@ -89,11 +94,6 @@ void GPUParallelFor(const char *description, int nItems, F func) {
 #endif
 }
 
-template <typename F>
-void GPUDo(const char *description, F func) {
-    GPUParallelFor(description, 1, [=] PBRT_GPU(int) mutable { func(); });
-}
-
 void ReportKernelStats();
 
 }  // namespace pbrt
......
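(Aside, not part of the diff: a minimal usage sketch of the launch interface whose declarations were hoisted above, assuming a pbrt-v4-style GPU build; the function and buffer names here are hypothetical.)

    // Hypothetical example: scale an image buffer on the GPU, then block the
    // CPU until the asynchronously launched kernel has completed.
    void ScalePixels(float *pixels, int nPixels, float scale) {
        pbrt::GPUParallelFor("Scale pixels", nPixels,
                             [=] PBRT_GPU(int i) { pixels[i] *= scale; });
        pbrt::GPUWait();
    }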
@@ -209,7 +209,7 @@ void GPUPathIntegrator::SampleMediumInteraction(int depth) {
     auto enqueue = [=](auto ptr) {
         using Material = typename std::remove_reference_t<decltype(*ptr)>;
-        q->Push<Material>(MaterialEvalWorkItem<Material>{
+        q->Push<MaterialEvalWorkItem<Material>>(MaterialEvalWorkItem<Material>{
             ptr, lambda, beta, uniPathPDF, ms.pi, ms.n, ms.ns, ms.dpdus, ms.dpdvs,
             ms.dndus, ms.dndvs, -ray.d, ms.uv, ray.time, ms.anyNonSpecularBounces,
             ms.etaScale, ms.mediumInterface, ms.pixelIndex});
......
@@ -229,7 +229,7 @@ static __forceinline__ __device__ void ProcessClosestIntersection(
     auto enqueue = [=](auto ptr) {
         using Material = typename std::remove_reference_t<decltype(*ptr)>;
-        q->Push<Material>(MaterialEvalWorkItem<Material>{
+        q->Push<MaterialEvalWorkItem<Material>>(MaterialEvalWorkItem<Material>{
             ptr, r.lambda, r.beta, r.uniPathPDF, intr.pi, intr.n, intr.shading.n,
             intr.shading.dpdu, intr.shading.dpdv, intr.shading.dndu, intr.shading.dndv,
             intr.wo, intr.uv, intr.time, r.anyNonSpecularBounces, r.etaScale,
......
@@ -50,7 +50,7 @@ void GPUPathIntegrator::EvaluateMaterialAndBSDF(TextureEvaluator texEval,
     RayQueue *nextRayQueue = NextRayQueue(depth);
 
     ForAllQueued(
-        name.c_str(), evalQueue->Get<Material>(), maxQueueSize,
+        name.c_str(), evalQueue->Get<MaterialEvalWorkItem<Material>>(), maxQueueSize,
         PBRT_GPU_LAMBDA(const MaterialEvalWorkItem<Material> me, int index) {
             const Material *material = me.material;
......
@@ -439,10 +439,10 @@ class MediumSampleQueue : public WorkQueue<MediumSampleWorkItem> {
 using MediumScatterQueue = WorkQueue<MediumScatterWorkItem>;
 
 using MaterialEvalQueue =
-    MultiWorkQueue<MaterialEvalWorkItem, CoatedDiffuseMaterial, CoatedConductorMaterial,
-                   ConductorMaterial, DielectricMaterial, DiffuseMaterial,
-                   DiffuseTransmissionMaterial, HairMaterial, MeasuredMaterial,
-                   SubsurfaceMaterial, ThinDielectricMaterial, MixMaterial>;
+    MultiWorkQueue<MaterialEvalWorkItem<CoatedDiffuseMaterial>, MaterialEvalWorkItem<CoatedConductorMaterial>,
+                   MaterialEvalWorkItem<ConductorMaterial>, MaterialEvalWorkItem<DielectricMaterial>, MaterialEvalWorkItem<DiffuseMaterial>,
+                   MaterialEvalWorkItem<DiffuseTransmissionMaterial>, MaterialEvalWorkItem<HairMaterial>, MaterialEvalWorkItem<MeasuredMaterial>,
+                   MaterialEvalWorkItem<SubsurfaceMaterial>, MaterialEvalWorkItem<ThinDielectricMaterial>, MaterialEvalWorkItem<MixMaterial>>;
 
 }  // namespace pbrt
......
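(Aside, not part of the diff: after this change, MultiWorkQueue's sub-queues are selected by the full work-item type rather than by a template/material pair, which is why the call sites above now spell out MaterialEvalWorkItem<Material>. A hedged sketch of the resulting call pattern; EnqueueForEval is a hypothetical helper, not pbrt code.)

    // The work-item type itself names the target sub-queue.
    template <typename Material>
    void EnqueueForEval(MaterialEvalQueue *q,
                        const MaterialEvalWorkItem<Material> &item) {
        q->Push<MaterialEvalWorkItem<Material>>(item);
    }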
@@ -20,32 +20,35 @@
 #if (__CUDA_ARCH__ >= 600)
 #define PBRT_HAVE_CUDA_ATOMICS
 #endif
-#endif // PBRT_IS_WINDOWS
+#endif  // PBRT_IS_WINDOWS
 
 #ifdef PBRT_HAVE_CUDA_ATOMICS
 #include <cuda/atomic>
-#endif // PBRT_HAVE_CUDA_ATOMICS
+#endif  // PBRT_HAVE_CUDA_ATOMICS
 
 namespace pbrt {
 
+// WorkQueue Definition
 template <typename WorkItem>
 class WorkQueue : public SOA<WorkItem> {
   public:
+    // WorkQueue Public Methods
     WorkQueue(int n, Allocator alloc) : SOA<WorkItem>(n, alloc) {}
 
     PBRT_CPU_GPU
     int Size() const {
 #ifdef PBRT_HAVE_CUDA_ATOMICS
-        return size.load(cuda::std::memory_order_relaxed);
+        namespace std = cuda::std;
+        return size.load(std::memory_order_relaxed);
 #else
         return size;
 #endif
     }
 
     PBRT_CPU_GPU
     void Reset() {
 #ifdef PBRT_HAVE_CUDA_ATOMICS
-        size.store(0, cuda::std::memory_order_relaxed);
+        namespace std = cuda::std;
+        size.store(0, std::memory_order_relaxed);
 #else
         size = 0;
 #endif
@@ -59,10 +62,12 @@ class WorkQueue : public SOA<WorkItem> {
     }
 
   protected:
+    // WorkQueue Protected Methods
     PBRT_CPU_GPU
     int AllocateEntry() {
 #ifdef PBRT_HAVE_CUDA_ATOMICS
-        return size.fetch_add(1, cuda::std::memory_order_relaxed);
+        namespace std = cuda::std;
+        return size.fetch_add(1, std::memory_order_relaxed);
 #else
 #ifdef PBRT_IS_GPU_CODE
         return atomicAdd(&size, 1);
@@ -74,13 +79,16 @@ class WorkQueue : public SOA<WorkItem> {
     }
 
   private:
+    // WorkQueue Private Members
 #ifdef PBRT_HAVE_CUDA_ATOMICS
-    cuda::atomic<int, cuda::thread_scope_device> size{0};
+    using GPUAtomicInt = cuda::atomic<int, cuda::thread_scope_device>;
+    GPUAtomicInt size{0};
 #else
     int size = 0;
 #endif
 };
 
+// WorkQueue Inline Functions
 template <typename F, typename WorkItem>
 void ForAllQueued(const char *desc, WorkQueue<WorkItem> *q, int maxQueued, F func) {
     GPUParallelFor(desc, maxQueued, [=] PBRT_GPU(int index) mutable {
@@ -90,22 +98,16 @@ void ForAllQueued(const char *desc, WorkQueue<WorkItem> *q, int maxQueued, F func) {
     });
 }
 
-template <template <typename> class Work, typename... Ts>
-class MultiWorkQueueHelper;
-
-template <template <typename> class Work>
-class MultiWorkQueueHelper<Work> {
-  public:
-    MultiWorkQueueHelper(int n, Allocator alloc, pstd::span<const bool>) {}
-};
-
-template <template <typename> class WorkItem, typename T, typename... Ts>
-class MultiWorkQueueHelper<WorkItem, T, Ts...>
-    : public MultiWorkQueueHelper<WorkItem, Ts...> {
-  public:
-    MultiWorkQueueHelper(int n, Allocator alloc, pstd::span<const bool> haveType)
-        : MultiWorkQueueHelper<WorkItem, Ts...>(n, alloc,
-                                                haveType.subspan(1, haveType.size())),
+// MultiWorkQueue Definition
+template <typename... Ts>
+class MultiWorkQueue;
+
+template <typename T, typename... Ts>
+class MultiWorkQueue<T, Ts...> : public MultiWorkQueue<Ts...> {
+  public:
+    // MultiWorkQueue Public Methods
+    MultiWorkQueue(int n, Allocator alloc, pstd::span<const bool> haveType)
+        : MultiWorkQueue<Ts...>(n, alloc, haveType.subspan(1, haveType.size())),
           q(haveType.front() ? n : 1, alloc) {}
 
     template <typename Tsz>
@@ -113,22 +115,22 @@ class MultiWorkQueueHelper<WorkItem, T, Ts...>
         if constexpr (std::is_same_v<Tsz, T>)
             return q.Size();
         else
-            return MultiWorkQueueHelper<WorkItem, Ts...>::template Size<Tsz>();
+            return MultiWorkQueue<Ts...>::template Size<Tsz>();
     }
 
     PBRT_CPU_GPU
     void Reset() {
         q.Reset();
         if constexpr (sizeof...(Ts) > 0)
-            MultiWorkQueueHelper<WorkItem, Ts...>::Reset();
+            MultiWorkQueue<Ts...>::Reset();
     }
 
     template <typename Tg>
-    PBRT_CPU_GPU WorkQueue<WorkItem<Tg>> *Get() {
+    PBRT_CPU_GPU WorkQueue<Tg> *Get() {
         if constexpr (std::is_same_v<Tg, T>)
             return &q;
        else
-            return MultiWorkQueueHelper<WorkItem, Ts...>::template Get<Tg>();
+            return MultiWorkQueue<Ts...>::template Get<Tg>();
     }
 
     template <typename Tq, typename... Args>
@@ -136,40 +138,18 @@ class MultiWorkQueueHelper<WorkItem, T, Ts...>
         if constexpr (std::is_same_v<Tq, T>)
             return q.Push(std::forward<Args>(args)...);
         else
-            return MultiWorkQueueHelper<WorkItem, Ts...>::template Push<Tq>(
-                std::forward<Args>(args)...);
+            return MultiWorkQueue<Ts...>::template Push<Tq>(std::forward<Args>(args)...);
     }
 
   private:
-    WorkQueue<WorkItem<T>> q;
+    // MultiWorkQueue Private Members
+    WorkQueue<T> q;
 };
 
-template <template <typename> class WorkItem, typename... Ts>
-class MultiWorkQueue {
-  public:
-    MultiWorkQueue(int n, Allocator alloc, pstd::span<const bool> haveType)
-        : helper(n, alloc, haveType) {}
-
-    template <typename T>
-    PBRT_CPU_GPU int Size() const {
-        return helper.template Size<T>();
-    }
-
-    PBRT_CPU_GPU
-    void Reset() { helper.Reset(); }
-
-    template <typename T>
-    PBRT_CPU_GPU WorkQueue<WorkItem<T>> *Get() {
-        return helper.template Get<T>();
-    }
-
-    template <typename T, typename... Args>
-    PBRT_CPU_GPU int Push(Args &&... args) {
-        return helper.template Push<T>(std::forward<Args>(args)...);
-    }
-
-  private:
-    MultiWorkQueueHelper<WorkItem, Ts...> helper;
+template <>
+class MultiWorkQueue<> {
+  public:
+    MultiWorkQueue(int n, Allocator alloc, pstd::span<const bool> haveType) {}
 };
 
 }  // namespace pbrt
......
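(Aside, not part of the diff: the simplification replaces the MultiWorkQueueHelper indirection with direct recursive inheritance over the work-item types, with the empty specialization terminating the recursion. A self-contained miniature of the same pattern in standard C++17, independent of pbrt's SOA/Allocator machinery; all names here are illustrative only.)

    #include <cstddef>
    #include <cstdio>
    #include <type_traits>
    #include <vector>

    template <typename... Ts>
    class MultiQueue;

    // Empty specialization terminates the inheritance recursion.
    template <>
    class MultiQueue<> {};

    template <typename T, typename... Ts>
    class MultiQueue<T, Ts...> : public MultiQueue<Ts...> {
      public:
        template <typename U>
        void Push(const U &item) {
            if constexpr (std::is_same_v<U, T>)
                q.push_back(item);  // this level of the hierarchy owns U
            else
                MultiQueue<Ts...>::template Push<U>(item);  // defer to base
        }
        template <typename U>
        std::size_t Size() const {
            if constexpr (std::is_same_v<U, T>)
                return q.size();
            else
                return MultiQueue<Ts...>::template Size<U>();
        }

      private:
        std::vector<T> q;  // one queue per type in the parameter pack
    };

    int main() {
        MultiQueue<int, float> mq;
        mq.Push(1);
        mq.Push(2.5f);
        std::printf("%zu %zu\n", mq.Size<int>(), mq.Size<float>());  // "1 1"
    }

As in the pbrt version, compile-time recursion with if constexpr resolves each Push/Size/Get call to exactly one sub-queue, so no runtime dispatch is involved.)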
@@ -8,6 +8,7 @@
 #include <stdint.h>
 #include <cstddef>
 
+// GPU Macro Definitions
 #if defined(__CUDA_ARCH__)
 #define PBRT_IS_GPU_CODE
 #endif
@@ -29,6 +30,12 @@
 #define PBRT_GPU
 #endif
 
+#ifdef PBRT_IS_WINDOWS
+#define PBRT_GPU_LAMBDA(...) [ =, *this ] PBRT_GPU(__VA_ARGS__) mutable
+#else
+#define PBRT_GPU_LAMBDA(...) [=] PBRT_GPU(__VA_ARGS__)
+#endif
+
 #ifdef PBRT_BUILD_GPU_RENDERER
 #define PBRT_L1_CACHE_LINE_SIZE 128
 #else
......
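(Aside, not part of the diff: on Windows the macro captures *this by value, presumably to work around extended-lambda capture limitations in the MSVC/nvcc toolchain; the device lambda then carries its own copy of the enclosing object rather than a host-side this pointer. A hedged sketch of how such a macro is typically used inside a member function; Integrator and Launch are hypothetical names, not pbrt code.)

    class Integrator {
      public:
        int maxDepth = 5;
        void Launch(int nItems) {
            // On Windows this expands to [ =, *this ] ... mutable, so the
            // member read below goes through the lambda's captured copy.
            GPUParallelFor("trace", nItems, PBRT_GPU_LAMBDA(int i) {
                int depth = maxDepth;
                (void)depth;
                (void)i;
            });
        }
    };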