Commit 57bc3657 authored by Megvii Engine Team

style(dnn/cuda): format cuda elemwise code

GitOrigin-RevId: 246755ce20d708b5b35b48452996deeb63491513
Parent: 09eaa398
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "src/cuda/elemwise_helper.cuh"
@@ -21,7 +22,7 @@
#define _cb_check_ndim(n) megdnn::TensorShape::MAX_NDIM == n ||
static_assert(MEGDNN_FOREACH_TENSOR_NDIM(_cb_check_ndim) false,
              "bad foreach ndim");
#undef _cb_check_ndim

namespace megdnn {
@@ -32,28 +33,30 @@ namespace elemwise_intl {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
template <int ndim, typename ctype>
void ParamElemVisitor<ndim, ctype, BCAST_OTHER>::host_init(const TensorND& rv,
                                                           int /*grid_size*/,
                                                           int /*block_size*/) {
    megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim);
    m_ptr = rv.ptr<ctype>();
    for (size_t i = 0; i < rv.layout.ndim; ++i) {
        m_stride[i] = rv.layout.stride[i];
        if (i + 1 < rv.layout.ndim)
            m_shape_highdim[i] = rv.layout.shape[i + 1];
    }
    for (int i = rv.layout.ndim - 1; i < ndim - 1; ++i) {
        m_shape_highdim[i] = 1;
    }
    for (int i = rv.layout.ndim; i < ndim; ++i) {
        m_stride[i] = 0;
    }
}
#pragma GCC diagnostic pop

template <typename ctype>
void ParamElemVisitor<3, ctype, BCAST_101>::host_init(const TensorND& rv,
                                                      int grid_size,
                                                      int block_size) {
    uint32_t shape2, shape1;
    int stride1;
    if (rv.layout.ndim == 3) {
@@ -74,8 +77,8 @@ void ParamElemVisitor<3, ctype, BCAST_101>::host_init(
template <typename ctype>
void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv,
                                                     int grid_size,
                                                     int block_size) {
    megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]);
    m_ptr = rv.ptr<ctype>();
    m_stride1 = rv.layout.stride[1];
@@ -85,8 +88,8 @@ void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv,
template <typename ctype>
void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv,
                                                     int grid_size,
                                                     int block_size) {
    megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[1]);
    m_ptr = rv.ptr<ctype>();
    m_stride0 = rv.layout.stride[0];
@@ -94,9 +97,10 @@ void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv,
                  rv.layout.shape[1]);
}

template <typename ctype>
void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init(const TensorND& rv,
                                                       int /*grid_size*/,
                                                       int /*block_size*/) {
    megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]);
    m_ptr = rv.ptr<ctype>();
}
@@ -119,14 +123,13 @@ void ParamVectVisitor<4, ctype, BCAST_1010>::host_init(const TensorND& rv,
}

#define INST(ndim, ctype, brd) template class ParamElemVisitor<ndim, ctype, brd>
#define INST_FOR_CTYPE                  \
    MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) \
    INST(3, ct, BCAST_101);             \
    INST(2, ct, BCAST_10);              \
    INST(2, ct, BCAST_01);              \
    INST(1, ct, BCAST_FULL);

#define ndim_cb(_ndim) INST(_ndim, ct, BCAST_OTHER);

#define ct dt_byte
@@ -175,11 +178,10 @@ INST(dt_qint8);
INST(dt_quint8);
#undef dt_ibyte

} // namespace elemwise_intl

void elemwise_intl::get_launch_spec(const void* kern, size_t size,
                                    int* grid_size, int* block_size) {
    safe_size_in_kern(size);
    auto config = query_launch_config_for_kernel(kern);
    *block_size = config.block_size;
@@ -202,11 +204,8 @@ void elemwise_intl::get_launch_spec(
void elemwise_intl::on_bad_ndim(int ndim) {
    megdnn_throw(ssprintf("invalid ndim: %d", ndim));
    MEGDNN_MARK_USED_VAR(ndim);
}
} // namespace cuda
} // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
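Note: the host_init routines above encode broadcasting by zeroing the stride of any missing or broadcast axis and by recording the sizes of the higher dimensions in m_shape_highdim. A minimal sketch of the index-to-offset computation this enables on the device side is given below; the function name and signature are hypothetical and only illustrate the scheme, they are not the actual ParamElemVisitor member.

#include <cstdint>

// Sketch: map a flat element index to a memory offset using per-axis strides.
// A broadcast axis carries stride 0, so it contributes nothing to the offset.
// `shape_highdim[i]` is assumed to hold the size of axis i + 1, matching the
// initialization in host_init above. Plain C++ here for illustration only;
// the real code runs an equivalent loop as a device-side helper.
inline int offset_from_index(uint32_t idx, int ndim, const int* stride,
                             const uint32_t* shape_highdim) {
    int offset = 0;
    for (int i = ndim - 1; i >= 1; --i) {
        uint32_t axis_size = shape_highdim[i - 1];  // size of axis i
        offset += int(idx % axis_size) * stride[i];
        idx /= axis_size;
    }
    return offset + int(idx) * stride[0];
}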
(The diff for one file has been collapsed.)
@@ -6,7 +6,8 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "test/common/elemwise.h"
@@ -26,66 +27,61 @@ using namespace test;
#define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS)

namespace {

__attribute__((unused)) cudnnTensorDescriptor_t make_cudnn_tensor_desc(
        const TensorLayout& ly) {
    megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous());
    int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1};
    for (size_t i = 0; i < ly.ndim; ++i) {
        dim[i] = ly.shape[i];
        stride[i] = ly.stride[i];
    }
    cudnnTensorDescriptor_t ret;
    cudnn_check(cudnnCreateTensorDescriptor(&ret));
    // cudnn requires tensors to be at-least 4D
    cudnn_check(cudnnSetTensor4dDescriptorEx(ret, CUDNN_DATA_FLOAT, dim[0],
                                             dim[1], dim[2], dim[3], stride[0],
                                             stride[1], stride[2], stride[3]));
    return ret;
}

void run_tensor_add(Handle* handle_cuda, const TensorND& a, const TensorND& b,
                    const TensorND& c) {
#if 1
    cudnnHandle_t cudnn_handle;
    cudnn_check(cudnnCreate(&cudnn_handle));
    cuda_check(cudaDeviceSynchronize());
    cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(),
                          cudaMemcpyDeviceToDevice));

    auto bdesc = make_cudnn_tensor_desc(b.layout),
         cdesc = make_cudnn_tensor_desc(c.layout);
    float alpha = 1, beta = 1;
    cudaProfilerStart();
    cudnn_check(cudnnAddTensor(cudnn_handle, &alpha, bdesc, b.raw_ptr, &beta,
                               cdesc, c.raw_ptr));
    cudaProfilerStop();

    cudnn_check(cudnnDestroyTensorDescriptor(cdesc));
    cudnn_check(cudnnDestroyTensorDescriptor(bdesc));
    cudnn_check(cudnnDestroy(cudnn_handle));

    cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte()));
    cuda_check(cudaDeviceSynchronize());
#endif

    auto opr = handle_cuda->create_operator<ElemwiseForward>();
    opr->param().mode = ElemwiseForward::Mode::ADD;
    cudaProfilerStart();
    opr->exec({a, b}, c);
    cudaProfilerStop();
}

} // anonymous namespace

template <typename tag>
class CUDA_ELEMWISE : public CUDA {};
TYPED_TEST_CASE(CUDA_ELEMWISE, elemwise::test_types);
TYPED_TEST(CUDA_ELEMWISE, run) {
    elemwise::run_test<TypeParam>(this->handle_cuda());
@@ -275,18 +271,17 @@ TEST_F(CUDA, ELEMWISE_BFLOAT16) {
//! the memory of this test case is too large, sometimes will fail on tx1
TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) {
    constexpr size_t A = 256 * 1024 * 64, S0 = 16, S1 = 256, S2 = 64, S3 = 64;
    static_assert(A == S0 * S1 * S2 * S3, "bad value");
    SyncedTensor<> t0(handle_cuda(),
                      {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    run_tensor_add(handle_cuda(), t0.tensornd_dev(), t0.tensornd_dev(),
                   t1.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
    }
}
@@ -294,19 +289,19 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) {
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) {
    constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
    SyncedTensor<> t0(handle_cuda(),
                      {TensorShape{A, B, C0, C1}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(),
                   t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k) {
                auto off = i * B * C + j * C + k;
                ASSERT_EQ(p0[off] + p1[j], p2[off]);
            }
@@ -317,16 +312,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) {
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(),
                   t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[j], p2[off]);
        }
@@ -336,16 +331,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) {
TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}),
            t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}),
            t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(),
                   t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[i], p2[off]);
        }
@@ -361,8 +356,9 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) {
            .set_param(Mode::FUSE_ADD_RELU)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8());
    auto time =
            bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) /
            nr_times;
    printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
           (3.0 * (N * C * H * W + 1)) / (time * 1e6));
    time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) /
@@ -395,7 +391,6 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) {
           nr_times;
    printf("time = %.2fms, bandwidth = %.2fGB/s\n", time,
           (C + 2.0 * N * C * H * W) / (time * 1e6));
    };
    run_bench(256, 256, 56, 56);
}
@@ -428,4 +423,3 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) {
#endif

// vim: syntax=cpp.doxygen
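Note: the bandwidth figures printed by BENCHMARK_ELEMWISE_IBYTE above follow a simple traffic model: a two-input int8 elementwise op reads two bytes and writes one byte per element, and the measured time is in milliseconds. A hedged restatement of that arithmetic follows; the helper name is made up and matches only the 3.0-factor printf in the test.

// Assumed traffic model behind the printf in BENCHMARK_ELEMWISE_IBYTE:
// bytes moved = 3 * nr_elems (two int8 reads + one int8 write);
// time_ms is in milliseconds, so GB/s = bytes / (time_ms * 1e6).
inline double bandwidth_gbps(double time_ms, size_t nr_elems) {
    return 3.0 * nr_elems / (time_ms * 1e6);
}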