// Copyright (c) 2021 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/cinn/backends/compiler.h" #include #include #include "paddle/cinn/cinn.h" #include "paddle/cinn/common/test_helper.h" #include "paddle/cinn/hlir/pe/elementwise.h" #include "paddle/cinn/hlir/pe/nn.h" #include "paddle/cinn/runtime/use_extern_funcs.h" #include "paddle/cinn/utils/timer.h" namespace cinn { namespace backends { TEST(Compiler, x86) { Expr M(1024), N(1024); auto create_module = [&]() { Placeholder A("A", {M, N}); Placeholder B("B", {M, N}); auto C = Compute( {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); return std::make_tuple(A, B, C); }; { // test x86 auto _A_B_C_ = create_module(); // NOLINT auto& A = std::get<0>(_A_B_C_); auto& B = std::get<1>(_A_B_C_); auto& C = std::get<2>(_A_B_C_); auto stages = CreateStages({C}); auto fn = Lower("fn", stages, {A, B, C}); ir::Module::Builder builder("some_module", common::DefaultHostTarget()); builder.AddFunction(fn); auto compiler = Compiler::Create(common::DefaultHostTarget()); compiler->Build(builder.Build()); auto* fnp = compiler->Lookup("fn"); ASSERT_TRUE(fnp); auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_zero().Build(); auto args = common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build(); reinterpret_cast(fnp)(args.data(), args.size()); // test result auto* Ad = reinterpret_cast(Ab->memory); auto* Bd = reinterpret_cast(Bb->memory); auto* Cd = reinterpret_cast(Cb->memory); for (int i = 0; i < Ab->num_elements(); i++) { ASSERT_NEAR(Ad[i] + Bd[i], Cd[i], 1e-5); } } } #ifdef CINN_WITH_CUDA TEST(Compiler, cuda) { Expr M(1024), N(1024); auto create_module = [&]() { Placeholder A("A", {M, N}); Placeholder B("B", {M, N}); auto C = Compute( {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C"); return std::make_tuple(A, B, C); }; { // cuda auto _A_B_C_ = create_module(); // NOLINT auto& A = std::get<0>(_A_B_C_); auto& B = std::get<1>(_A_B_C_); auto& C = std::get<2>(_A_B_C_); auto stages = CreateStages({C}); stages[C]->Bind(0, "blockIdx.x"); stages[C]->Bind(1, "threadIdx.x"); auto fn = Lower("fn", stages, {A, B, C}); ir::Module::Builder builder("some_module", common::DefaultHostTarget()); builder.AddFunction(fn); auto compiler = Compiler::Create(common::DefaultNVGPUTarget()); compiler->Build(builder.Build()); auto* fnp = compiler->Lookup("fn"); ASSERT_TRUE(fnp); auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_random().Build(); auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()}).set_zero().Build(); // allocate CUDA buffer void *Ag, *Bg, *Cg; const int num_bytes = Ab->num_elements() * sizeof(float); cudaMalloc(&Ag, num_bytes); cudaMalloc(&Bg, num_bytes); cudaMalloc(&Cg, num_bytes); CUDA_CALL(cudaMemcpy(Ag, Ab->memory, num_bytes, cudaMemcpyHostToDevice)); CUDA_CALL(cudaMemcpy(Bg, Bb->memory, num_bytes, cudaMemcpyHostToDevice)); CUDA_CALL(cudaMemcpy(Cg, Cb->memory, num_bytes, cudaMemcpyHostToDevice)); cinn_buffer_t Abb; Abb.memory = reinterpret_cast(Ag); cinn_buffer_t Bbb; Bbb.memory = reinterpret_cast(Bg); cinn_buffer_t Cbb; Cbb.memory = reinterpret_cast(Cg); auto args = common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build(); utils::Timer timer; timer.Start(); void* stream = nullptr; for (int i = 0; i < 1000; i++) { reinterpret_cast(fnp)(args.data(), args.size(), stream); } CUDA_CALL(cudaDeviceSynchronize()); float latency = timer.Stop(); LOG(INFO) << "latency: " << latency / 1000; std::vector ch(M.as_int32() * N.as_int32(), 0.f); CUDA_CALL(cudaMemcpy(ch.data(), Cg, ch.size() * sizeof(float), cudaMemcpyDeviceToHost)); auto* Ad = reinterpret_cast(Ab->memory); auto* Bd = reinterpret_cast(Bb->memory); for (int i = 0; i < Ab->num_elements(); i++) { ASSERT_NEAR(Ad[i] + Bd[i], ch[i], 1e-5); } } } #endif TEST(Compiler, sqrt) { Expr N(100); Expr C(10); Expr H(10); Expr W(10); Placeholder input("input", {N, C, H, W}); Placeholder mean("mean", {C}); Placeholder scale("scale", {C}); Placeholder variance("variance", {C}); Placeholder bias("bias", {C}); float epsilon = 0.1f; auto A = Compute( {N, C, H, W}, [=](Expr n, Expr c, Expr h, Expr w) { return (input(n, c, h, w) - mean(c)) * scale(c) / lang::Sqrt(variance(c) + Expr(epsilon)) + bias(c); }, "A"); auto B = hlir::pe::Pool2d(input, {3, 3}, {1, 1}, {1, 1, 1, 1}, "max", false, false); auto BB = hlir::pe::BatchNorm_NCHW(input, scale, bias, mean, variance, epsilon, "batchnorm"); auto stages = CreateStages({input, mean, scale, variance, A, bias, B[0], BB}); auto fn = Lower("fn", stages, {input, mean, scale, bias, variance, A, B[0], BB}); Module::Builder builder("some", common::DefaultHostTarget()); builder.AddFunction(fn); auto compiler = Compiler::Create(common::DefaultHostTarget()); compiler->Build(builder.Build()); } } // namespace backends } // namespace cinn