// Copyright (c) 2021 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include #include "paddle/cinn/frontend/decomposer/use_decomposer.h" #include "paddle/cinn/frontend/decomposer_registry.h" #include "paddle/cinn/frontend/net_builder.h" #include "paddle/cinn/frontend/optimize.h" #include "paddle/cinn/frontend/pass/use_program_pass.h" #include "paddle/cinn/frontend/program_pass.h" #include "paddle/cinn/hlir/framework/graph.h" #include "paddle/cinn/hlir/framework/graph_compiler.h" #include "paddle/cinn/hlir/framework/pass.h" #include "paddle/cinn/hlir/framework/tensor.h" #include "paddle/cinn/hlir/op/use_ops.h" #include "paddle/cinn/hlir/pass/use_pass.h" namespace cinn::frontend { using CPUKernelFunc = std::function& lengths, const std::vector& ptrs)>; template > std::ostream& operator<<(std::ostream& os, const std::vector& vec) { os << "{"; bool is_first = true; for (auto e : vec) { if (is_first) { is_first = false; } else { os << ", "; } os << e; } os << "}\n"; return os; } template void InitRandomVector( std::vector* vec, size_t numel, T low = static_cast(0), T high = static_cast(1), float precision = 1e-5) { std::random_device seed; std::default_random_engine engine(seed()); std::uniform_real_distribution dist(low, high); vec->resize(numel); for (size_t i = 0; i < numel; ++i) { T value = static_cast(dist(engine)); int coeff = static_cast(value / precision); vec->at(i) = precision * static_cast(coeff); } } template <> void InitRandomVector(std::vector* vec, size_t numel, int low, int high, float precision); template void CopyFromVector(const std::vector& vec, hlir::framework::Tensor tensor, Target target) { auto* data = tensor->mutable_data(target); size_t numel = tensor->shape().numel(); CHECK_EQ(vec.size(), numel); if (target == common::DefaultNVGPUTarget()) { #ifdef CINN_WITH_CUDA cudaMemcpy(data, vec.data(), numel * sizeof(T), cudaMemcpyHostToDevice); #else LOG(FATAL) << "NVGPU Target only support on flag CINN_WITH_CUDA ON! Please check."; #endif } else { std::copy(vec.begin(), vec.end(), data); } } template <> void CopyFromVector(const std::vector& vec, hlir::framework::Tensor tensor, Target target); template void CopyToVector(const hlir::framework::Tensor tensor, std::vector* vec) { auto* data = tensor->data(); size_t numel = tensor->shape().numel(); vec->resize(numel); #ifdef CINN_WITH_CUDA cudaMemcpy(vec->data(), data, numel * sizeof(T), cudaMemcpyDeviceToHost); #else for (size_t i = 0; i < numel; ++i) { vec->at(i) = data[i]; } #endif } template <> void CopyToVector(const hlir::framework::Tensor tensor, std::vector* vec); template void CheckOutput(const std::vector& actual, const std::vector& expect, float atol = 1e-8, float rtol = 1e-5) { CHECK_EQ(actual.size(), expect.size()); auto allclose = [](T a, T e, float atol, float rtol) { return abs(a - e) <= (atol + rtol * abs(e)); }; float max_diff = 0.0f; int offset = 0; int num_diffs = 0; size_t numel = actual.size(); for (size_t i = 0; i < numel; ++i) { if (!allclose(actual[i], expect[i], atol, rtol)) { float absolute_diff = abs((actual[i] - expect[i])); float relative_diff = abs(absolute_diff / expect[i]); if (relative_diff > max_diff) { max_diff = relative_diff; offset = i; } num_diffs += 1; VLOG(4) << "- i=" << i << ", " << std::setprecision(8) << actual[i] << " (actual) vs " << std::setprecision(8) << expect[i] << " (expect), relative_diff=" << relative_diff << ", absolute_diff=" << absolute_diff; } } LOG(INFO) << "- Total " << num_diffs << " different results, offset=" << offset << ", " << actual[offset] << " (actual) vs " << expect[offset] << " (expect), maximum_relative_diff=" << max_diff << " (absolute_diff=" << abs((actual[offset] - expect[offset])) << ")"; CHECK_EQ(num_diffs, 0); } template void ComputeReferenceCpu(const std::vector>& input_vecs, const std::vector>& output_vecs, std::vector>* output_refs, CPUKernelFunc cpu_kernel_func) { output_refs->resize(output_vecs.size()); for (size_t i = 0; i < output_vecs.size(); ++i) { output_refs->at(i).resize(output_vecs[i].size()); } // Prepare the arguments for reference. // For different operations, the needed parameters maybe different. size_t n = input_vecs[0].size(); std::vector lengths; lengths.push_back(n); std::vector ptrs(input_vecs.size() + output_refs->size()); for (size_t i = 0; i < input_vecs.size(); ++i) { ptrs[i] = const_cast(static_cast(input_vecs[i].data())); } for (size_t i = 0; i < output_refs->size(); ++i) { ptrs[input_vecs.size() + i] = output_refs->at(i).data(); } cpu_kernel_func(lengths, ptrs); } void RunDecomposer(Program* prog, const Target& target, const std::vector& passes = {"Decomposer"}, const std::vector& fetch_ids = {}); template void RunAndCheckShape(NetBuilder& builder, const std::vector& input_names, const std::vector& output_names, const std::vector>& output_shapes, std::vector>* input_vecs = nullptr, std::vector>* output_vecs = nullptr, T low = 0, T high = 1, const std::vector& passes = {"Decomposer"}) { auto prog = builder.Build(); Target target = common::DefaultTarget(); RunDecomposer(&prog, target, passes, output_names); auto graph = std::make_shared(prog, target); hlir::framework::ApplyPasses(graph.get(), DefaultOpFusionPasses()); auto scope = BuildScope(target, graph); hlir::framework::GraphCompiler gc(target, scope, graph); auto runtime_program = gc.Build(); std::vector> input_vecs_internal; std::vector>* input_vecs_ptr = input_vecs ? input_vecs : &input_vecs_internal; for (size_t i = 0; i < input_names.size(); ++i) { scope->Var(input_names[i]); auto tensor = scope->GetTensor(input_names[i]); std::vector vec; InitRandomVector(&vec, tensor->shape().numel(), low, high); CopyFromVector(vec, tensor, target); input_vecs_ptr->push_back(vec); } runtime_program->Execute(); for (size_t i = 0; i < output_names.size(); ++i) { auto tensor = scope->GetTensor(output_names[i]); CHECK_EQ(tensor->shape().data() == output_shapes[i], true) << "The " << i << "-th shape is expected to be " << output_shapes[i]; if (output_vecs) { std::vector vec; CopyToVector(tensor, &vec); output_vecs->push_back(vec); } } } template void RunAndCheck(NetBuilder& builder, const std::vector& input_names, const std::vector& output_names, const std::vector>& output_shapes, CPUKernelFunc cpu_kernel_func, T low = 0, T high = 1, float atol = 1e-8, float rtol = 1e-5, const std::vector& passes = {"Decomposer"}) { std::vector> input_vecs; std::vector> output_vecs; RunAndCheckShape(builder, input_names, output_names, output_shapes, &input_vecs, &output_vecs, low, high, passes); std::vector> output_refs; ComputeReferenceCpu(input_vecs, output_vecs, &output_refs, cpu_kernel_func); for (size_t i = 0; i < output_vecs.size(); ++i) { LOG(INFO) << "Check the " << i << "-th output, name=" << output_names[i] << ", shape=" << output_shapes[i]; CheckOutput(output_vecs[i], output_refs[i], atol, rtol); } } } // namespace cinn::frontend