/** * \file dnn/test/common/benchmarker.h * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. */ #pragma once #include #include #include #include #include "megdnn/basic_types.h" #include "megdnn/tensor_format.h" #include "test/common/opr_algo_proxy.h" #include "test/common/opr_proxy.h" #include "test/common/rng.h" #include "test/common/timer.h" namespace megdnn { namespace test { template class BenchmarkerBase { public: using Param = typename Opr::Param; using TensorValueArray = TensorNDArray; using BeforeExecCallback = std::function; using TensorsConstriant = std::function; BenchmarkerBase(Handle* handle, T timer) : m_timer(timer), m_handle_naive(create_cpu_handle(2, false)), m_handle(handle), m_default_rng(new NormalRNG()), m_param(Param()), m_proxy{new OprProxy()} {} const Handle* handle() const { return m_handle; } /*! * \brief benchmark opr on current param/dtype/rng config * \returns elapsed time in ms * * Benchmarker would construct TensorLayout vectors from shapes and * dtypes and call exec(TensorLayoutArray &). */ float exec(const TensorShapeArray& shapes) { return exec(make_layouts(shapes)); } float exec(TensorLayoutArray layouts); float exect(const TensorValueArray& testcase_in); //! disabiguate overloaded exec float execs(const TensorShapeArray& shapes) { return exec(shapes); } float execl(const TensorLayoutArray& layouts) { return exec(layouts); } BenchmarkerBase& set_param(Param param) { m_param = param; return *this; } BenchmarkerBase& set_dtype(size_t idx, DType dtype) { m_dtype[idx] = dtype; return *this; } BenchmarkerBase& set_rng(size_t idx, RNG* rng) { m_rng[idx] = rng; return *this; } BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) { m_fmt[idx] = fmt; return *this; } BenchmarkerBase& set_tensors_constraint( const TensorsConstriant& tensor_constraint) { m_tensor_constraint = tensor_constraint; return *this; } TensorLayoutArray make_layouts(const TensorShapeArray& shapes) { TensorLayoutArray layouts(shapes.size()); for (size_t i = 0; i < shapes.size(); ++i) { DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32()); if (m_fmt.find(i) == m_fmt.end()) { layouts[i] = TensorLayout(shapes[i], dt); layouts[i].init_contiguous_stride(); } else layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]); } return layouts; } BenchmarkerBase& set_proxy(std::unique_ptr>& proxy) { m_proxy.reset(nullptr); m_proxy = std::move(proxy); return *this; } std::unique_ptr>& proxy() { return m_proxy; } BenchmarkerBase& set_times(size_t times) { m_times = times; return *this; } BenchmarkerBase& set_display(bool display) { m_display = display; return *this; } //! set a callback to be invoked before executing the operator BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) { m_before_exec_callback = cb; return *this; } /*! * \brief set adaptive benchmarking: ignore set_times() and find * suitable times to run for given duration; * * Note: the value returned by exec() would be average time per run, * rather than total elapsed time, if this is enabled. */ BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) { m_adaptive_secs = tot_time_in_secs; return *this; } //! get the opr impl so setting other than param() can be modified Opr* opr() { if (!m_opr) { m_opr = m_handle->create_operator(); } return m_opr.get(); } const Param& param() const { return m_param; } private: T m_timer; bool m_display = true; size_t m_times = 1; float m_adaptive_secs = 0; std::unique_ptr m_handle_naive; Handle* m_handle; std::unique_ptr m_default_rng; std::map m_rng; std::map m_dtype; std::map m_fmt; Param m_param; std::unique_ptr> m_proxy; BeforeExecCallback m_before_exec_callback; std::unique_ptr m_opr; TensorsConstriant m_tensor_constraint; }; template float BenchmarkerBase::exec(TensorLayoutArray layouts) { auto opr = this->opr(); opr->param() = m_param; auto user_layouts = layouts; m_proxy->deduce_layout(opr, layouts); for (size_t i = 0; i < layouts.size(); ++i) { if (user_layouts[i].ndim > 0) { auto run = [&]() { ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i])) << "User provided shape is " << user_layouts[i].TensorShape::to_string() << "\nExpected shape is " << layouts[i].TensorShape::to_string(); }; run(); } } auto allocate = [&layouts](Handle* handle) { TensorNDArray tensors(layouts.size()); auto trans_func = [handle](const TensorLayout& layout) { auto span = layout.span(); TensorND res; res.reset_ptr( static_cast(megdnn_malloc(handle, span.dist_byte())) - span.low_byte); res.layout = layout; return res; }; std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func); return tensors; }; auto tensors_cur = allocate(m_handle); auto tensors_cur_host = allocate(m_handle_naive.get()); // init for (size_t i = 0; i < tensors_cur_host.size(); ++i) { TensorND& tensor = tensors_cur_host[i]; auto rng = m_rng[i]; if (!rng) rng = m_default_rng.get(); rng->gen(tensor); } if (m_tensor_constraint) { m_tensor_constraint(tensors_cur_host); } for (size_t i = 0; i < tensors_cur_host.size(); ++i) { TensorND& tensor = tensors_cur_host[i]; if (tensor.layout.ndim == 0) continue; auto size = tensor.layout.span().high_byte; megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size); } if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); } // run // warm up m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); if (m_adaptive_secs) { // find m_times for adaptive benchmarking m_times = 0; int cur_times = 1; auto remain_time = m_adaptive_secs * 1e6; while (remain_time > 0) { m_timer.reset(); m_timer.start(); for (int i = 0; i < cur_times; ++i) m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); m_timer.stop(); m_times += cur_times; auto this_run_time = m_timer.get_time_in_us(); remain_time -= this_run_time; cur_times = std::min( cur_times * 2, std::max(1, remain_time / this_run_time * cur_times)); } } m_timer.reset(); m_timer.start(); for (size_t t = 0; t < m_times; ++t) m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); m_timer.stop(); auto time_in_ms = m_timer.get_time_in_us() / 1e3; if (m_display) { std::cout << "Total time is " << time_in_ms << "ms " << "for " << m_times << " run(s)." << std::endl; } auto free = [](Handle* handle, TensorNDArray& tensors) { std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) { megdnn_free( handle, static_cast(tensor.raw_ptr()) + tensor.layout.span().low_byte); }); }; free(m_handle, tensors_cur); free(m_handle_naive.get(), tensors_cur_host); if (m_adaptive_secs) time_in_ms /= m_times; return time_in_ms; } template float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { auto opr = this->opr(); opr->param() = m_param; TensorLayoutArray layouts; TensorNDArray tensors_cur_host; for (auto& inp : testcase_in) { layouts.push_back(inp.layout); tensors_cur_host.emplace_back(inp); } auto user_layouts = layouts; m_proxy->deduce_layout(opr, layouts); for (size_t i = 0; i < layouts.size(); ++i) if (user_layouts[i].ndim > 0) { auto run = [&]() { ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i])) << "User provided shape is " << user_layouts[i].TensorShape::to_string() << "\nExpected shape is " << layouts[i].TensorShape::to_string(); }; run(); } auto allocate = [&layouts](Handle* handle) { TensorNDArray tensors(layouts.size()); auto trans_func = [handle](const TensorLayout& layout) { auto span = layout.span(); TensorND res; res.reset_ptr( static_cast(megdnn_malloc(handle, span.dist_byte())) - span.low_byte); res.layout = layout; return res; }; std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func); return tensors; }; auto tensors_cur = allocate(m_handle); //! init for (size_t i = 0; i < tensors_cur_host.size(); ++i) { TensorND& tensor = tensors_cur_host[i]; auto size = tensor.layout.span().high_byte; if (tensor.layout.ndim == 0) continue; megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size); } if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); } //! run //! warm up m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); if (m_adaptive_secs) { //! find m_times for adaptive benchmarking m_times = 0; int cur_times = 1; auto remain_time = m_adaptive_secs * 1e6; while (remain_time > 0) { m_timer.reset(); m_timer.start(); for (int i = 0; i < cur_times; ++i) m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); m_timer.stop(); m_times += cur_times; auto this_run_time = m_timer.get_time_in_us(); remain_time -= this_run_time; cur_times = std::min( cur_times * 2, std::max(1, remain_time / this_run_time * cur_times)); } } m_timer.reset(); m_timer.start(); for (size_t t = 0; t < m_times; ++t) m_proxy->exec(opr, tensors_cur); megcoreSynchronize(m_handle->megcore_computing_handle()); m_timer.stop(); auto time_in_ms = m_timer.get_time_in_us() / 1e3; if (m_display) { std::cout << "Total time is " << time_in_ms << "ms " << "for " << m_times << " run(s)." << std::endl; } auto free = [](Handle* handle, TensorNDArray& tensors) { std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) { megdnn_free( handle, static_cast(tensor.raw_ptr()) + tensor.layout.span().low_byte); }); }; free(m_handle, tensors_cur); if (m_adaptive_secs) time_in_ms /= m_times; return time_in_ms; } template class Benchmarker; template class Benchmarker : public BenchmarkerBase { public: Benchmarker(Handle* handle) : BenchmarkerBase{handle, Timer{}} {} }; ////////////////// Algo Benchmark //////////////////////// template , typename T = Timer> float algo_benchmark( Benchmarker& benchmark, TensorLayoutArray layouts, const std::string& algo_base) { Proxy proxy; auto opr = benchmark.opr(); opr->param() = benchmark.param(); proxy.deduce_layout(opr, layouts); auto algos = OprAlgoProxy::get_all_algorithms_info_safe(opr, layouts); float min_used = std::numeric_limits::max(); bool execed = false; for (auto i : algos) { if (std::regex_match(i.desc.name, std::regex("(" + algo_base + ")(.*)"))) { opr->execution_policy().algo = i.desc; auto used = benchmark.exec(layouts); min_used = std::min(min_used, used); printf("run algo: %s used: %f ms min_used: %f ms\n", i.desc.name.c_str(), used, min_used); execed = true; } } megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str()); return min_used; } template , typename T = Timer> float algo_benchmark( Benchmarker& benchmark, TensorShapeArray shapes, const std::string& algo_base) { return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); } } // namespace test } // namespace megdnn // vim: syntax=cpp.doxygen