// Copyright (c) 2021 CINN Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/cinn/frontend/computation.h" #include #include "paddle/cinn/common/target.h" #include "paddle/cinn/frontend/decomposer/use_decomposer.h" #include "paddle/cinn/frontend/decomposer_registry.h" #include "paddle/cinn/frontend/net_builder.h" #include "paddle/cinn/frontend/pass/use_program_pass.h" #include "paddle/cinn/frontend/program_pass.h" DEFINE_string(model_dir, "", ""); namespace cinn { namespace frontend { Program CreateTestProgram() { constexpr int B = 8; constexpr int M = 32; constexpr int N = 24; NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N / 2}, "A"); auto b = builder.CreateInput(Float(32), {M, N / 2}, "B"); auto t = builder.Transpose(b, {1, 0}); auto r = builder.Reshape(t, {M, N / 2}); auto c = builder.Add(a, r); auto x = builder.Divide(a, b); auto d = builder.Concat({c, x}, 1); auto e = builder.BroadcastTo(d, {B, M, N}, {1, 2}); auto f = builder.Concat({a, b}, 1); auto g = builder.BroadcastTo(f, {B, M, N}, {1, 2}); auto h = builder.Subtract(e, g); auto i = builder.Max(e, h); auto j = builder.Min(e, h); auto k = builder.Multiply(i, j); auto l = builder.Constant(1, "condition"); auto m = builder.BroadcastTo(l, {B, M, N}, {0}); auto n = builder.Select(m, j, k); auto o = builder.ReduceSum(n, {0, 1, 2}); auto program = builder.Build(); return program; } Program CreateAddProgram() { constexpr int M = 32; constexpr int N = 24; NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); auto c = builder.Relu(a); auto d = builder.Add(b, c); auto program = builder.Build(); return program; } TEST(cinn_computation, basic_cpu) { NetBuilder builder("basic"); constexpr int M = 32; constexpr int N = 24; auto a = builder.CreateInput(Float(32), {M, N}, "A"); auto b = builder.CreateInput(Float(32), {M, N}, "B"); auto c = builder.Add(a, b); auto d = builder.Add(a, c); auto target = common::DefaultHostTarget(); auto comp = CinnComputation::BuildAndCompile(target, builder); std::vector hostA(M * N); std::vector hostB(M * N); std::vector hostD(M * N); std::vector hostD_expected(M * N); for (int i = 0; i < M * N; i++) { hostA[i] = static_cast(rand()) / INT_MAX; hostB[i] = static_cast(rand()) / INT_MAX; hostD_expected[i] = hostA[i] * 2 + hostB[i]; } comp->SetTensorData("A", reinterpret_cast(hostA.data()), hostA.size() * sizeof(float)); comp->SetTensorData("B", reinterpret_cast(hostB.data()), hostB.size() * sizeof(float)); comp->Execute(); comp->GetTensorData(d->id, reinterpret_cast(hostD.data()), hostD.size() * sizeof(float)); for (int i = 0; i < hostD.size(); i++) { ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5); } } #ifdef CINN_WITH_CUDA TEST(cinn_computation, basic_gpu) { NetBuilder builder("basic"); constexpr int M = 32; constexpr int N = 24; auto a = builder.CreateInput(Float(32), {M, N}, "A"); auto b = builder.CreateInput(Float(32), {M, N}, "B"); auto c = builder.Add(a, b); auto d = builder.Add(a, c); auto target = common::DefaultNVGPUTarget(); auto comp = CinnComputation::BuildAndCompile(target, builder); std::vector hostA(M * N); std::vector hostB(M * N); std::vector hostD(M * N); std::vector hostD_expected(M * N); for (int i = 0; i < M * N; i++) { hostA[i] = static_cast(rand()) / INT_MAX; hostB[i] = static_cast(rand()) / INT_MAX; hostD_expected[i] = hostA[i] * 2 + hostB[i]; } comp->SetTensorData("A", reinterpret_cast(hostA.data()), hostA.size() * sizeof(float)); comp->SetTensorData("B", reinterpret_cast(hostB.data()), hostB.size() * sizeof(float)); comp->Execute(); comp->GetTensorData(d->id, reinterpret_cast(hostD.data()), hostD.size() * sizeof(float)); for (int i = 0; i < hostD.size(); i++) { ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5); } } #endif TEST(cinn_computation, net_builder_cpu) { auto program = CreateTestProgram(); auto target = common::DefaultHostTarget(); auto compute = CinnComputation::Compile(target, program); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 2); auto tensorA = inputs[0]; auto tensorB = inputs[1]; ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2); ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2); auto outputs = compute->GetOutputTensors(); ASSERT_EQ(outputs.size(), 1); auto tensorOut = outputs[0]; auto load_input = [=](hlir::framework::Tensor t) { float *ptr = t->mutable_data(target); for (int i = 0; i < t->shape().numel(); i++) { ptr[i] = static_cast(rand()) / INT_MAX; } }; // run inference for 10 times for (int i = 0; i < 10; i++) { // load data directly to tensor's host memory load_input(tensorA); load_input(tensorB); // execute engine compute->Execute(); // get outputs (ignored) } } #ifdef CINN_WITH_CUDA TEST(cinn_computation, net_builder_gpu) { auto program = CreateTestProgram(); auto target = common::DefaultNVGPUTarget(); auto compute = CinnComputation::Compile(target, program); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 2); auto tensorA = inputs[0]; auto tensorB = inputs[1]; ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2); ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2); auto outputs = compute->GetOutputTensors(); ASSERT_EQ(outputs.size(), 1); auto tensorOut = outputs[0]; // run inference for 10 times for (int i = 0; i < 10; i++) { // load data directly to tensor's host memory // assume tensorA is generated in GPU directly float *device_ptrA = tensorOut->mutable_data(target); // ... generated data directly in device memory via gpu kernels // ... or async copy to device memory // ... not showed here // assume tensorB is generated in host memory, needs copy to GPU memory (sync.) std::vector hostB(32 * 24 / 2); compute->SetTensorData(tensorB, reinterpret_cast(hostB.data()), hostB.size() * sizeof(float)); // execute engine compute->Execute(); // get outputs std::vector hostOut(tensorOut->shape().numel()); compute->GetTensorData(tensorOut, reinterpret_cast(hostOut.data()), hostOut.size() * sizeof(float)); } } #endif TEST(cinn_computation, fc_execute_cpu) { auto target = common::DefaultHostTarget(); ASSERT_NE(FLAGS_model_dir, ""); auto compute = CinnComputation::CompilePaddleModel(target, FLAGS_model_dir, {"A"}, {{1, 30}}, false); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 1); auto A = inputs[0]; ASSERT_EQ(A->shape().numel(), 1 * 30); float *ptrA = A->mutable_data(target); for (int i = 0; i < 30; i++) ptrA[i] = static_cast(rand()) / INT_MAX; for (int i = 0; i < 30; i++) ptrA[i] = static_cast(0); compute->Execute(); } #ifdef CINN_WITH_CUDA TEST(cinn_computation, fc_execute_gpu) { auto target = common::DefaultNVGPUTarget(); ASSERT_NE(FLAGS_model_dir, ""); auto compute = CinnComputation::CompilePaddleModel(target, FLAGS_model_dir, {"A"}, {{1, 30}}, false); auto inputs = compute->GetInputTensors(); ASSERT_EQ(inputs.size(), 1); auto A = inputs[0]; ASSERT_EQ(A->shape().numel(), 1 * 30); auto outputs = compute->GetOutputTensors(); ASSERT_EQ(outputs.size(), 1); auto out = outputs[0]; std::vector hostA(30); for (float &v : hostA) v = static_cast(rand()) / INT_MAX; compute->SetTensorData(A, reinterpret_cast(hostA.data()), hostA.size() * sizeof(float)); compute->Execute(); std::vector hostOut(30); compute->GetTensorData(out, reinterpret_cast(hostOut.data()), hostOut.size() * sizeof(float)); } #endif TEST(cinn_computation, decomposer_cpu) { // this test only shows the API usage ASSERT_NE(cinn::frontend::ProgramPassRegistry::Global()->Find("Decomposer"), nullptr); // without decomposer { auto prog = CreateAddProgram(); auto target = common::DefaultHostTarget(); auto options = CinnComputation::DefaultCompileOptions(); options.use_decomposer = false; auto compute = CinnComputation::Compile(target, prog, options); auto names = compute->GetAllTensorNames(); ASSERT_EQ(names.size(), 3); } // with decomposer { auto prog = CreateAddProgram(); auto target = common::DefaultHostTarget(); auto options = CinnComputation::DefaultCompileOptions(); options.use_decomposer = true; auto compute = CinnComputation::Compile(target, prog, options); auto names = compute->GetAllTensorNames(); } } #ifdef CINN_WITH_CUDA TEST(cinn_computation, gpu_stream) { // this test only shows the API usage auto target = common::DefaultNVGPUTarget(); auto prog = CreateAddProgram(); auto options = CinnComputation::DefaultCompileOptions(); cudaStream_t streams[1]; cudaStreamCreate(&streams[0]); auto compute = CinnComputation::Compile(target, prog, options, {}, static_cast(streams[0])); compute->Execute(); } #endif TEST(cinn_computation, without_instantiate_variables) { // this test only shows the API usage auto target = common::DefaultHostTarget(); auto prog = CreateAddProgram(); auto options = CinnComputation::DefaultCompileOptions(); options.with_instantiate_variables = false; auto compute = CinnComputation::Compile(target, prog, options); auto names = compute->GetAllTensorNames(); std::map pod2args; // compute->Execute(&pod2args); } } // namespace frontend } // namespace cinn