diff --git a/src/gopt/impl/reformat_emitter.cpp b/src/gopt/impl/reformat_emitter.cpp
index 7f3d41169718455f357575755ddc96fbb673f6ee..db3a3d0c453b965c35a551ebd7d26f76627e4716 100644
--- a/src/gopt/impl/reformat_emitter.cpp
+++ b/src/gopt/impl/reformat_emitter.cpp
@@ -13,6 +13,8 @@
 #include "megbrain/gopt/reformat_emitter.h"
 #include <numeric>
 #include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/io.h"
+#include <cstring>
 
 using namespace mgb;
 using namespace gopt;
@@ -243,4 +245,69 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
     }
     return builders;
 }
+
+/* ============== PaddingEmitter ================= */
+PaddingEmitter::EmitResult PaddingEmitter::emit() const {
+    auto&& const_extent = m_const_extent;
+    auto&& axis = m_axis;
+    auto builder = [const_extent, axis](const VarNodeArray& vars) {
+        auto i = vars[0];
+        auto padding_shp_var = vars[1];
+        // build a zero-filled constant whose shape is 1 on every axis
+        // except `axis`, where it is `const_extent`
+        TensorShape shape;
+        shape.ndim = i->shape().ndim;
+        for (size_t ax = 0; ax < shape.ndim; ++ax)
+            shape[ax] = 1;
+        shape[axis] = const_extent;
+        auto host_val =
+                std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
+        host_val->resize(shape);
+        auto ptr = host_val->raw_ptr();
+        size_t size_bytes = TensorLayout{shape, i->dtype()}.span().dist_byte();
+        std::memset(ptr, 0, size_bytes);
+        auto padding =
+                opr::ImmutableTensor::make(*i->owner_graph(), *host_val);
+        // broadcast the zeros to the runtime padding shape and append them
+        // to the input along `axis`
+        padding = opr::Broadcast::make(padding, padding_shp_var);
+        auto o = opr::Concat::make({i, padding}, axis);
+        return o.node();
+    };
+    auto checker = [axis](const VarNodeArray& vars) {
+        mgb_assert(vars.size() == 2);
+        return vars[0]->shape().ndim > axis;
+    };
+    return std::make_tuple(builder, checker);
+}
+
+/* ============== SubtensorEmitter ================= */
+SubtensorEmitter::EmitResult SubtensorEmitter::emit() const {
+    auto&& const_extent = m_const_extent;
+    auto&& axis = m_axis;
+    auto builder = [const_extent, axis](const VarNodeArray& vars) {
+        auto i = vars[0];
+        auto x = SymbolVar(i);
+        auto cv = [&x](int v) { return x.make_scalar(v); };
+        using AIdx = opr::Subtensor::AxisIndexer;
+        // take the interval [0, const_extent) on `axis`; keep other axes whole
+        std::vector<AIdx> index(i->shape().ndim);
+        for (size_t ax = 0; ax < index.size(); ++ax) {
+            if (ax == axis)
+                index[ax] =
+                        AIdx::make_interval(ax, None, cv(const_extent), None);
+            else
+                index[ax] = AIdx::make_interval(ax, None, None, cv(1));
+        }
+        auto o = opr::Subtensor::make(x, index);
+        return o.node();
+    };
+    auto checker = [axis](const VarNodeArray& vars) {
+        // the builder consumes a single input var
+        mgb_assert(vars.size() == 1);
+        return vars[0]->shape().ndim > axis;
+    };
+    return std::make_tuple(builder, checker);
+}
+
 // vim: syntax=cpp.doxygen
diff --git a/src/gopt/impl/reformat_manager.cpp b/src/gopt/impl/reformat_manager.cpp
index 4df3bfc1940a522e33be9c66bdae8d259c93b783..decf4521cc3b0fe06d22ccb3e4b893d951c772f1 100644
--- a/src/gopt/impl/reformat_manager.cpp
+++ b/src/gopt/impl/reformat_manager.cpp
@@ -12,12 +12,28 @@
 
 #include "megbrain/gopt/reformat_manager.h"
 #include "megbrain/opr/tensor_manip.h"
+#include "megbrain/utils/arith_helper.h"
 
 using namespace mgb;
 using namespace gopt;
 using NamedTensorShape = megdnn::NamedTensorShape;
+using Dimension = megdnn::Dimension;
 
 namespace {
+// plain Euclidean gcd, used to compute the lcm of two channel alignments
+int gcd(const int& p, const int& q) {
+    int x = p, y = q;
+    while (y != 0) {
+        if (x < y) {
+            y = (y % x);
+        } else {
+            x = (x % y);
+            std::swap(x, y);
+        }
+    }
+    return x;
+}
+
 NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
     switch (format) {
         case TensorFormats::NCHW:
@@ -371,6 +387,180 @@ ReformatManager::ReformatImpl ReformatManager::get(
     })
 }
 
+ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_feature(
+        const VarNode* orig_var, TensorFormats orig_format,
+        const ReformatKey& key) const {
+    NamedTensorShape input_shape =
+            tensor_formats_to_named_tensor_shape(key.input_format);
+    NamedTensorShape output_shape =
+            tensor_formats_to_named_tensor_shape(key.output_format);
+    size_t input_alignment = 0, output_alignment = 0;
+    size_t input_channel_idx = 0, output_channel_idx = 0;
+    for (size_t i = 0; i < input_shape.ndim; ++i) {
+        if (input_shape[i].name() == Dimension::Name::C &&
+            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
+            input_channel_idx = i;
+            input_alignment = input_shape[i].stride();
+            break;
+        }
+    }
+    for (size_t i = 0; i < output_shape.ndim; ++i) {
+        if (output_shape[i].name() == Dimension::Name::C &&
+            output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
+            output_channel_idx = i;
+            output_alignment = output_shape[i].stride();
+            break;
+        }
+    }
+    mgb_assert(input_alignment > 0 && output_alignment > 0,
+               "invalid alignment(input:%zu,output:%zu)", input_alignment,
+               output_alignment);
+    NamedTensorShape orig_shape =
+            tensor_formats_to_named_tensor_shape(orig_format);
+    size_t orig_channel = 0;
+    for (size_t i = 0; i < orig_shape.ndim; ++i) {
+        if (orig_shape[i].name() == Dimension::Name::C &&
+            orig_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
+            orig_channel = orig_var->shape()[i] * orig_shape[i].stride();
+            break;
+        }
+    }
+    mgb_assert(orig_channel > 0,
+               "incompatible NamedTensorShape for feature(got:%s)",
+               orig_shape.to_string().c_str());
+    size_t aligned_in_channel =
+            divup(orig_channel, input_alignment) * input_alignment;
+    size_t aligned_out_channel =
+            divup(orig_channel, output_alignment) * output_alignment;
+    // pad once to the lcm of both alignments so that the padded channel
+    // count satisfies the input and the output format simultaneously
+    size_t common_alignment = input_alignment * output_alignment /
+                              gcd(input_alignment, output_alignment);
+    size_t aligned_channel =
+            divup(orig_channel, common_alignment) * common_alignment;
+    auto builder = [key, aligned_channel, aligned_in_channel,
+                    aligned_out_channel, input_shape, input_channel_idx,
+                    output_shape,
+                    output_channel_idx](const VarNodeArray& vars) {
+        VarNode *x, *cur;
+        x = cur = vars[0];
+        // pad the channel dimension with zeros up to the common alignment
+        if (aligned_channel > aligned_in_channel) {
+            auto padding_shape = input_shape;
+            auto&& dim = padding_shape[input_channel_idx];
+            size_t const_extent =
+                    (aligned_channel - aligned_in_channel) / dim.stride();
+            padding_shape[input_channel_idx] =
+                    Dimension(dim.name(), dim.stride(), const_extent);
+            auto make_shape = std::get<0>(
+                    MakeShapeEmitter{input_shape, padding_shape}.emit());
+            auto padding_shp_var = make_shape({x});
+            auto padding = std::get<0>(
+                    PaddingEmitter{const_extent, input_channel_idx}.emit());
+            cur = padding({cur, padding_shp_var});
+        }
+        cur = ReformatManager::instance().get(key)({cur});
+        // slice off the channels the output format does not need
+        if (aligned_channel > aligned_out_channel) {
+            auto&& dim = output_shape[output_channel_idx];
+            size_t const_extent = aligned_out_channel / dim.stride();
+            auto sub = std::get<0>(
+                    SubtensorEmitter{const_extent, output_channel_idx}.emit());
+            cur = sub({cur});
+        }
+        return cur;
+    };
+    return builder;
+}
+
+ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
+        const VarNode* orig_var, const ReformatKey& key,
+        const AlignmentDesc& extra_alignment) const {
+    size_t in_channels = 0, out_channels = 0;
+    size_t input_channel_idx = 0, output_channel_idx = 0;
+    Dimension::Name out_channel_name = Dimension::Name::N;
+    auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
+    for (size_t i = 0; i < input_shape.ndim; ++i) {
+        if (input_shape[i].name() == Dimension::Name::C &&
+            input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
+            in_channels = orig_var->shape()[i];
+            input_channel_idx = i;
+            mgb_assert(input_shape[i].stride() == 1,
format(got:%s)", + input_shape.to_string().c_str()); + } else if ((input_shape[i].name() == Dimension::Name::K || + input_shape[i].name() == Dimension::Name::N) && + input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { + out_channels = orig_var->shape()[i]; + out_channel_name = input_shape[i].name(); + output_channel_idx = i; + mgb_assert(input_shape[i].stride() == 1, + "unsupport weight format(got:%s)", + input_shape.to_string().c_str()); + } + } + size_t in_channel_alignment, out_channel_alignment = 1; + auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); + for (size_t i = 0; i < output_shape.ndim; ++i) { + if (output_shape[i].name() == Dimension::Name::C && + output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { + in_channel_alignment = output_shape[i].stride(); + } else if (output_shape[i].name() == out_channel_name && + output_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { + out_channel_alignment = output_shape[i].stride(); + } + } + size_t aligned_in_channel = + divup(in_channels, in_channel_alignment) * in_channel_alignment; + if (extra_alignment.name == out_channel_name) { + out_channel_alignment = + extra_alignment.alignment * out_channel_alignment / + gcd(extra_alignment.alignment, out_channel_alignment); + } + size_t aligned_out_channel = + divup(out_channels, out_channel_alignment) * out_channel_alignment; + auto builder = [key, input_shape, in_channels, input_channel_idx, + aligned_in_channel, out_channels, output_channel_idx, + aligned_out_channel](const VarNodeArray& vars) { + VarNode *x, *cur; + x = cur = vars[0]; + if (aligned_in_channel > in_channels) { + auto padding_shape = input_shape; + auto&& dim = padding_shape[input_channel_idx]; + size_t const_extent = + (aligned_in_channel - in_channels) / dim.stride(); + padding_shape[input_channel_idx] = + Dimension(dim.name(), dim.stride(), const_extent); + auto make_shape = std::get<0>( + MakeShapeEmitter{input_shape, padding_shape}.emit()); + auto padding_shp_var = make_shape({x}); + auto padding = std::get<0>( + PaddingEmitter{const_extent, input_channel_idx}.emit()); + cur = padding({cur, padding_shp_var}); + } + if (aligned_out_channel > out_channels) { + auto padding_shape = input_shape; + auto&& dim = padding_shape[output_channel_idx]; + size_t const_extent = + (aligned_out_channel - out_channels) / dim.stride(); + padding_shape[output_channel_idx] = + Dimension(dim.name(), dim.stride(), const_extent); + auto make_shape = std::get<0>( + MakeShapeEmitter{input_shape, padding_shape}.emit()); + auto padding_shp_var = make_shape({cur}); + auto padding = std::get<0>( + PaddingEmitter{const_extent, output_channel_idx}.emit()); + cur = padding({cur, padding_shp_var}); + } + cur = ReformatManager::instance().get(key)({cur}); + return cur; + }; + return builder; +} + const ReformatManager& ReformatManager::instance() { static ReformatManager inst; return inst; diff --git a/src/gopt/include/megbrain/gopt/reformat_emitter.h b/src/gopt/include/megbrain/gopt/reformat_emitter.h index bd62c2153402b02c7da5e53c94cf7626d9f361a2..9a1a2af18b407fd5b3591f425afcf5f6afa5ee37 100644 --- a/src/gopt/include/megbrain/gopt/reformat_emitter.h +++ b/src/gopt/include/megbrain/gopt/reformat_emitter.h @@ -77,6 +77,26 @@ private: }; UnderlyingBuilders analyze() const; }; + +class PaddingEmitter final : public Emitter { +public: + PaddingEmitter(size_t const_extent, size_t axis) + : m_const_extent{const_extent}, m_axis{axis} {} + EmitResult emit() const override; + +private: + size_t m_const_extent, m_axis; 
+};
+
+class SubtensorEmitter final : public Emitter {
+public:
+    SubtensorEmitter(size_t const_extent, size_t axis)
+            : m_const_extent{const_extent}, m_axis{axis} {}
+    EmitResult emit() const override;
+
+private:
+    size_t m_const_extent, m_axis;
+};
 } // namespace gopt
 } // namespace mgb
 
diff --git a/src/gopt/include/megbrain/gopt/reformat_manager.h b/src/gopt/include/megbrain/gopt/reformat_manager.h
index b6180ad60ffbc484831274a2064795553b203933..58a01bc58081f097e37e506826dc118575e2ff01 100644
--- a/src/gopt/include/megbrain/gopt/reformat_manager.h
+++ b/src/gopt/include/megbrain/gopt/reformat_manager.h
@@ -101,12 +101,21 @@
                                ReformatKey::Equal>;
     ReformatImpl get(const ReformatKey& key) const;
     ReformatImpl get(ReformatKey&& key) const { return get(key); }
+    ReformatImpl auto_aligned_reformat_feature(const VarNode* orig_var,
+                                               TensorFormats orig_format,
+                                               const ReformatKey& key) const;
+    struct AlignmentDesc {
+        megdnn::Dimension::Name name;
+        size_t alignment;
+    };
+    ReformatImpl auto_aligned_reformat_weight(
+            const VarNode* orig_var, const ReformatKey& key,
+            const AlignmentDesc& extra_alignment = {}) const;
     static const ReformatManager& instance();
 
 private:
     ReformatCache m_cache;
 };
-
 } // namespace gopt
 } // namespace mgb
 
diff --git a/src/gopt/test/reformat_manager.cpp b/src/gopt/test/reformat_manager.cpp
index 0470695388cd15f6484d94cfedb10907c896ed09..378dc8d187d02c7af461d08b70986d2965b5bbdd 100644
--- a/src/gopt/test/reformat_manager.cpp
+++ b/src/gopt/test/reformat_manager.cpp
@@ -13,7 +13,10 @@
 
 #include "./helper.h"
 #include "megbrain/gopt/reformat_manager.h"
+#include "megbrain/graph/event.h"
 #include "megbrain/opr/tensor_manip.h"
+#include "megbrain/plugin/base.h"
+#include "megbrain/plugin/profiler.h"
 
 using namespace mgb;
 using namespace gopt;
@@ -168,4 +171,288 @@ TEST(TestReformatManager, InputChannelSmall) {
     MGB_ASSERT_TENSOR_EQ(t1, t2);
 }
 
+TEST(TestReformatManager, AutoAlignedFeature) {
+    constexpr size_t N = 16, C = 22, H = 55, W = 55;
+    HostTensorGenerator<> gen;
+    using ReformatKey = ReformatManager::ReformatKey;
+    auto src_format = TensorFormats::NCHWc4,
+         dst_format = TensorFormats::NCHWc32;
+    ReformatKey key{src_format, dst_format};
+
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+
+    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
+    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
+    auto mkvar = [&](const char* name,
+                     const std::shared_ptr<HostTensorND>& host_val) {
+        return opr::Host2DeviceCopy::make(*graph, host_val).rename(name);
+    };
+    auto orig_x = mkvar("orig_x", host_orig_x);
+    auto x = mkvar("x", host_x);
+    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
+            orig_x.node(), TensorFormats::NCHW, key);
+    auto y = builder({x.node()});
+    HostTensorND t;
+    auto func = graph->compile({make_callback_copy(y, t)});
+    func->execute();
+    // re-execute with changed batch sizes to exercise shape re-deduction
+    *host_x = *gen({(N + 5), (C + 3) / 4, H, W, 4});
+    func->execute();
+    *host_x = *gen({(N - 5), (C + 3) / 4, H, W, 4});
+    func->execute();
+    auto shp = TensorShape{(N - 5), (C + 31) / 32, H, W, 32};
+    ASSERT_TRUE(shp.eq_shape(t.shape()));
+}
+
+TEST(TestReformatManager, AutoAlignedFeatureB4) {
+    constexpr size_t N = 16, C = 94, H = 55, W = 55;
+    HostTensorGenerator<> gen;
+    using ReformatKey = ReformatManager::ReformatKey;
+    auto src_format = TensorFormats::NCHWc4,
+         dst_format = TensorFormats::NCHWc64;
+    ReformatKey key{src_format, dst_format};
+
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+
+    std::shared_ptr<HostTensorND> host_orig_x = gen({N, C, H, W});
+    std::shared_ptr<HostTensorND> host_x = gen({N, (C + 3) / 4, H, W, 4});
+    auto mkvar = [&](const char* name,
+                     const std::shared_ptr<HostTensorND>& host_val,
+                     const DType& dtype) {
+        return opr::TypeCvt::make(
+                opr::Host2DeviceCopy::make(*graph, host_val).rename(name),
+                dtype);
+    };
+    auto orig_x = mkvar("orig_x", host_orig_x,
+                        dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(8)));
+    auto x = mkvar("x", host_x,
+                   dtype::Quantized4Asymm(25.f, static_cast<uint8_t>(4)));
+    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
+            orig_x.node(), TensorFormats::NCHW, key);
+    auto y = builder({x.node()});
+    HostTensorND t;
+    auto func = graph->compile({make_callback_copy(y, t)});
+    func->execute();
+}
+
+TEST(TestReformatManager, AutoAlignedWeight) {
+    constexpr size_t K = 32, C = 32, R = 3, S = 3;
+    HostTensorGenerator<> gen;
+    using ReformatKey = ReformatManager::ReformatKey;
+    auto src_format = TensorFormats::NCHW, dst_format = TensorFormats::NCHWc64;
+    ReformatKey key{src_format, dst_format};
+
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+
+    auto mkvar = [&](const char* name, const TensorShape& shp) {
+        return opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name);
+    };
+    auto w = mkvar("w", {K, C, R, S});
+    auto builder = ReformatManager::instance().auto_aligned_reformat_weight(
+            w.node(), key,
+            ReformatManager::AlignmentDesc{megdnn::Dimension::Name::N, 64});
+    auto y = builder({w.node()});
+    HostTensorND t;
+    auto func = graph->compile({make_callback_copy(y, t)});
+    func->execute();
+}
+
+#if MGB_CUDA
+#include "megbrain/comp_node_env.h"
+namespace {
+class ReformatProfiler : public PluginBase {
+    using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;
+
+public:
+    class MarkInputContiguous;
+    ReformatProfiler(cg::ComputingGraph* graph,
+                     cg::OperatorNodeBase* opr_start,
+                     cg::OperatorNodeBase* opr_end);
+    ~ReformatProfiler() noexcept;
+    double duration() const;
+
+private:
+    CompNodeEventPtr m_start, m_end;
+    cg::OperatorNodeBase *m_opr_start, *m_opr_end;
+};
+
+ReformatProfiler::ReformatProfiler(cg::ComputingGraph* graph,
+                                   cg::OperatorNodeBase* opr_start,
+                                   cg::OperatorNodeBase* opr_end)
+        : PluginBase(graph), m_opr_start(opr_start), m_opr_end(opr_end) {
+    using namespace cg::event;
+    auto on_reformat_start = [this](BeforeKernel const& event) {
+        auto opr = event.opr;
+        if (opr != m_opr_start)
+            return;
+        if (m_start == nullptr) {
+            m_start = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
+        }
+        m_start->record();
+    };
+    auto on_reformat_end = [this](AfterKernel const& event) {
+        auto opr = event.opr;
+        if (opr != m_opr_end)
+            return;
+        if (m_end == nullptr) {
+            m_end = event.comp_node.create_event(CompNode::Event::NEED_TIMER);
+        }
+        m_end->record();
+    };
+    auto&& ev = graph->event();
+    add_event_handler(ev.register_receiver<BeforeKernel>(on_reformat_start));
+    add_event_handler(ev.register_receiver<AfterKernel>(on_reformat_end));
+}
+
+ReformatProfiler::~ReformatProfiler() noexcept {
+    if (m_start)
+        m_start->host_wait();
+    if (m_end)
+        m_end->host_wait();
+}
+
+double ReformatProfiler::duration() const {
+    mgb_assert(m_end);
+    m_end->host_wait();
+    return m_start->elapsed_time_until(*m_end) -
+           m_start->elapsed_time_until(*m_start);
+}
+
+MGB_DEFINE_OPR_CLASS(ReformatProfiler::MarkInputContiguous,
+                     cg::SingleCNOperatorNodeBase) // {
+    void scn_do_execute() override {}
+    void init_output_static_infer_desc() override;
+    void add_input_layout_constraint() override;
+
+public:
+    MarkInputContiguous(VarNode* node, const OperatorNodeConfig& config);
+
+    static SymbolVar make(SymbolVar node,
+                          const OperatorNodeConfig& config = {});
+};
+
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(ReformatProfiler::MarkInputContiguous);
+
+ReformatProfiler::MarkInputContiguous::MarkInputContiguous(
+        VarNode* node, const OperatorNodeConfig& config)
+        : Super(node->owner_graph(), config, "mark_contiguous", {node}) {
+    add_input({node});
+    add_output(None);
+}
+
+SymbolVar ReformatProfiler::MarkInputContiguous::make(
+        SymbolVar node, const OperatorNodeConfig& config) {
+    return node.insert_single_output_opr<MarkInputContiguous>(node.node(),
+                                                              config);
+}
+
+void ReformatProfiler::MarkInputContiguous::init_output_static_infer_desc() {
+    using namespace cg::static_infer;
+    auto&& mgr = owner_graph()->static_infer_manager();
+    mgr.register_shape_infer(output(0),
+                             ShapeInferDesc::make_identity(input(0)));
+}
+
+void ReformatProfiler::MarkInputContiguous::add_input_layout_constraint() {
+    input(0)->add_layout_constraint_contiguous();
+}
+
+class CUTimer {
+public:
+    CUTimer(cudaStream_t& stream, cudaEvent_t& evt0, cudaEvent_t& evt1)
+            : m_stream{stream}, m_evt0{evt0}, m_evt1{evt1} {
+        reset();
+    }
+
+    void reset() {
+        m_started = false;
+        m_stopped = false;
+    }
+    void start() {
+        mgb_assert(!m_started);
+        mgb_assert(!m_stopped);
+        m_started = true;
+        cudaEventRecord(m_evt0, m_stream);
+    }
+    void stop() {
+        mgb_assert(m_started);
+        mgb_assert(!m_stopped);
+        m_stopped = true;
+        cudaEventRecord(m_evt1, m_stream);
+    }
+    size_t get_time_in_us() const {
+        cudaStreamSynchronize(m_stream);
+        float t = -1;
+        cudaEventElapsedTime(&t, m_evt0, m_evt1);
+        return static_cast<size_t>(t * 1e3);
+    }
+
+private:
+    bool m_started, m_stopped;
+    cudaStream_t& m_stream;
+    cudaEvent_t &m_evt0, &m_evt1;
+};
+
+} // namespace
+
+TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
+    REQUIRE_GPU(1);
+    auto cn = CompNode::load("gpux");
+    using ReformatKey = ReformatManager::ReformatKey;
+    auto dtype = dtype::Quantized4Asymm(20.f, static_cast<uint8_t>(4));
+    HostTensorND hval(cn, dtype);
+    constexpr size_t N = 16, C = 18, H = 55, W = 55;
+    hval.resize({N, (C + 63) / 64, H, W, 64});
+    std::shared_ptr<DeviceTensorND> dval =
+            std::make_shared<DeviceTensorND>(cn, dtype);
+    dval->copy_from(hval).sync();
+    std::shared_ptr<DeviceTensorND> dprime =
+            std::make_shared<DeviceTensorND>(cn, dtype);
+    dprime->resize({N, C, H, W});
+
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    graph->options().var_sanity_check_first_run = false;
+
+    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dval);
+    auto xprime = opr::VolatileSharedDeviceTensor::make(*graph, dprime);
+    ReformatKey key{TensorFormats::NCHWc64, TensorFormats::NCHW};
+    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
+            xprime.node(), TensorFormats::NCHW, key);
+    auto y = builder({x.node()});
+    auto mark = ReformatProfiler::MarkInputContiguous::make(SymbolVar(y));
+    auto cb = [](DeviceTensorND& d) { MGB_MARK_USED_VAR(d); };
+    auto output_spec = std::make_pair(mark, cb);
+    auto func = graph->compile({output_spec});
+    static constexpr size_t RUNS = 100;
+    cn.activate();
+    auto stream = CompNodeEnv::from_comp_node(cn).cuda_env().stream;
+    cudaEvent_t evt0;
+    cudaEvent_t evt1;
+    MGB_CUDA_CHECK(cudaEventCreate(&evt0));
+    MGB_CUDA_CHECK(cudaEventCreate(&evt1));
+    CUTimer timer(stream, evt0, evt1);
+    timer.start();
+    for (size_t i = 0; i < RUNS; ++i)
+        func->execute();
+    timer.stop();
+    double time_cuda_evt = timer.get_time_in_us() / static_cast<double>(RUNS);
+
+    OperatorNodeBase* start = x.node()->owner_opr();
+    OperatorNodeBase* end = y->owner_opr();
+    std::unique_ptr<ReformatProfiler> profiler =
+            std::make_unique<ReformatProfiler>(graph.get(), start, end);
+    ASSERT_TRUE(y->shape().eq_shape(TensorShape{N, C, H, W}));
+    for (size_t i = 0; i < RUNS; ++i)
+        func->execute();
+    double time_profiler = profiler->duration() * 1e6;
+    printf("%f, %f\n", time_profiler, time_cuda_evt);
+    // timings come from different mechanisms and runs; allow 10% tolerance
+    ASSERT_NEAR(time_cuda_evt, time_profiler, time_cuda_evt * 0.1);
+    MGB_CUDA_CHECK(cudaEventDestroy(evt0));
+    MGB_CUDA_CHECK(cudaEventDestroy(evt1));
+}
+#endif
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
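
For reference, a minimal sketch of how the new builder API is driven, distilled from TestReformatManager.AutoAlignedFeature above. Variable names and shapes are borrowed from that test; this is an illustration of the intended call pattern, not part of the patch.

    // Reformat an NCHWc4 feature map to NCHWc32. With C = 22, the manager
    // zero-pads the channels up to 32 (the lcm of the 4- and 32-channel
    // alignments) before reformatting; a trailing Subtensor is emitted only
    // when the lcm exceeds the alignment the output format itself requires.
    using ReformatKey = ReformatManager::ReformatKey;
    ReformatKey key{TensorFormats::NCHWc4, TensorFormats::NCHWc32};
    // orig_x: the original NCHW var; x: the same data laid out as NCHWc4.
    auto builder = ReformatManager::instance().auto_aligned_reformat_feature(
            orig_x.node(), TensorFormats::NCHW, key);
    VarNode* y = builder({x.node()});  // shape: (N, (C + 31) / 32, H, W, 32)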