// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/standalone_executor.h"

#include <gtest/gtest.h>

#include <chrono>
#include <iostream>

#include "paddle/phi/core/kernel_registry.h"

USE_OP_ITSELF(fill_constant);
USE_OP_ITSELF(uniform_random);
USE_OP(lookup_table);
USE_OP_ITSELF(transpose2);
USE_OP_ITSELF(reshape2);
USE_OP_ITSELF(split);
USE_OP_ITSELF(slice);
USE_OP_ITSELF(concat);
USE_OP_ITSELF(matmul);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP_ITSELF(elementwise_mul);
USE_OP_ITSELF(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(reduce_mean_grad);
USE_OP_ITSELF(reshape2_grad);
USE_OP_ITSELF(softmax_with_cross_entropy_grad);
USE_OP_ITSELF(elementwise_add_grad);
USE_OP_ITSELF(matmul_grad);
USE_OP_ITSELF(square);
USE_OP_ITSELF(transpose2_grad);
USE_OP_ITSELF(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP_ITSELF(sigmoid_grad);
USE_OP_ITSELF(tanh_grad);
USE_OP_ITSELF(sum);
USE_OP_ITSELF(slice_grad);
USE_OP_ITSELF(lookup_table_grad);
USE_OP_ITSELF(sqrt);
USE_OP_ITSELF(elementwise_max);
USE_OP_ITSELF(elementwise_div);
USE_OP_ITSELF(sgd);
USE_OP_ITSELF(squared_l2_norm);
USE_OP_ITSELF(memcpy_h2d);
USE_OP_ITSELF(memcpy_d2h);
USE_OP_ITSELF(fetch_v2);

PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(uniform_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(uniform, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT);
#ifdef PADDLE_WITH_XPU_KP
PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
#else
PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT);
#endif
PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(squared_l2_norm, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_n, GPU, ALL_LAYOUT);

namespace paddle {
namespace framework {

// Loads a serialized ProgramDesc from a binary file.
ProgramDesc load_from_file(const std::string& file_name) {
  std::ifstream fin(file_name, std::ios::in | std::ios::binary);
  fin.seekg(0, std::ios::end);
  std::string buffer(fin.tellg(), ' ');
  fin.seekg(0, std::ios::beg);
  fin.read(&buffer[0], buffer.size());
  fin.close();

  ProgramDesc program_desc(buffer);
  return program_desc;
}

ProgramDesc GetLmMainProgram() {
  ProgramDesc main_prog = load_from_file("lm_main_program");

  auto& global_block = main_prog.Block(0);
  int64_t batch_size = 20;

  // Rewrite the shape attributes of ops 1-3 so the loaded language-model
  // program runs with the desired batch size.
  auto& op1 = global_block.AllOps()[1];
  auto shape1 = PADDLE_GET_CONST(std::vector<int64_t>, op1->GetAttr("shape"));
  shape1[0] = batch_size * 20;
  op1->SetAttr("shape", shape1);

  auto& op2 = global_block.AllOps()[2];
  auto shape2 = PADDLE_GET_CONST(std::vector<int64_t>, op2->GetAttr("shape"));
  shape2[0] = batch_size;
  op2->SetAttr("shape", shape2);

  auto& op3 = global_block.AllOps()[3];
  auto shape3 = PADDLE_GET_CONST(std::vector<int64_t>, op3->GetAttr("shape"));
  shape3[0] = batch_size;
  op3->SetAttr("shape", shape3);

  return main_prog;
}

TEST(StandaloneExecutor, run) {
  auto place = platform::CUDAPlace(0);
  ProgramDesc startup_prog = load_from_file("lm_startup_program");
  ProgramDesc main_prog = GetLmMainProgram();

  Scope scope;
  StandaloneExecutor startup_exec(place,
                                  std::vector<ProgramDesc>{startup_prog});
  startup_exec.Run(&scope, {}, {});

  StandaloneExecutor exec(place, std::vector<ProgramDesc>{main_prog});
  // Warm-up run, then time the repeated runs below.
  exec.Run(&scope, {}, {});

  auto start = std::chrono::steady_clock::now();
  for (size_t i = 0; i < 10; ++i) {
    if (i % 200 == 0) {
      std::cout << i << std::endl;
    }
    exec.Run(&scope, {}, {});
  }
  auto end = std::chrono::steady_clock::now();
  std::chrono::duration<double> diff = end - start;
  std::cout << "time cost " << diff.count() << std::endl;
}

TEST(InterpreterCore, skip_gc_vars) {
  auto place = platform::CUDAPlace(0);
  ProgramDesc startup_prog = load_from_file("lm_startup_program");
  ProgramDesc main_prog = GetLmMainProgram();

  Scope scope;

  std::shared_ptr<InterpreterCore> startup_core =
      std::make_shared<InterpreterCore>(place,
                                        startup_prog.Block(0),
                                        &scope,
                                        interpreter::ExecutionConfig());
  startup_core->Run({}, {});

  std::set<std::string> skip_gc_vars = {"uniform_0.tmp_0",
                                        "transpose_0.tmp_0",
                                        "embedding_0.tmp_0",
                                        "slice_0.tmp_0",
                                        "split_1.tmp_2"};
  std::set<std::string> gc_vars = {"uniform_1.tmp_0",
                                   "matmul_0.tmp_0",
                                   "split_0.tmp_0",
                                   "elementwise_add_0.tmp_0",
                                   "tmp_0"};

  interpreter::ExecutionConfig execution_config;
  execution_config.skip_gc_vars = skip_gc_vars;

  std::shared_ptr<InterpreterCore> main_core =
      std::make_shared<InterpreterCore>(
          place, main_prog.Block(0), &scope, execution_config);

  auto check_gc_result =
      [](Scope& scope, std::set<std::string>& vars, bool is_skip_gc) {
        // the first local scope is created in startup_core
        // the second local scope is created in main_core
        ASSERT_EQ(scope.kids().size(), 2UL);
        auto* local_scope = scope.kids().back();
        for (const std::string& var_name : vars) {
          ASSERT_EQ(local_scope->FindVar(var_name)
                        ->GetMutable<phi::DenseTensor>()
                        ->IsInitialized(),
                    is_skip_gc);
        }
      };

  main_core->Run({}, {});
  check_gc_result(scope, skip_gc_vars, true);
  check_gc_result(scope, gc_vars, false);

  main_core->Run({}, {});
  check_gc_result(scope, skip_gc_vars, true);
  check_gc_result(scope, gc_vars, false);
}

void TestShareWorkQueue(const ProgramDesc& prog,
                        const std::vector<std::string>& feed_names,
                        const std::vector<phi::DenseTensor>& feed_tensors,
                        const std::vector<std::string>& fetch_names,
                        const std::vector<float>& fetch_results) {
  const platform::CPUPlace place = platform::CPUPlace();

  Scope scope;
  std::shared_ptr<InterpreterCore> core1 = std::make_shared<InterpreterCore>(
      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
  std::shared_ptr<InterpreterCore> core2 = std::make_shared<InterpreterCore>(
      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
  // core2 reuses core1's work queue; both cores should produce the same
  // fetch results.
  core2->ShareWorkQueueFrom(core1);

  auto run_and_check = [&feed_names, &feed_tensors, &fetch_results](
                           std::shared_ptr<InterpreterCore> core) {
    FetchList fetch_list = core->Run(feed_names, feed_tensors);
    for (size_t i = 0; i < fetch_list.size(); ++i) {
      const float* fetch_data =
          PADDLE_GET_CONST(phi::DenseTensor, fetch_list[i]).data<float>();
      ASSERT_FLOAT_EQ(*fetch_data, fetch_results.at(i));
    }
  };

  run_and_check(core1);
  run_and_check(core2);
  run_and_check(core1);
  run_and_check(core2);
}

TEST(InterpreterCore, workqueue_multiplexing) {
  ProgramDesc program;
  BlockDesc* main_block = program.MutableBlock(0);

  VarDesc* var_a = main_block->Var("a");
  VarDesc* var_b = main_block->Var("b");
  VarDesc* var_c = main_block->Var("c");
  var_a->SetType(proto::VarType::LOD_TENSOR);
  var_b->SetType(proto::VarType::LOD_TENSOR);
  var_c->SetType(proto::VarType::LOD_TENSOR);

  // Build a one-op program: c = a + b.
  OpDesc* add = main_block->AppendOp();
  add->SetType("elementwise_add");
  add->SetInput("X", {"a"});
  add->SetInput("Y", {"b"});
  add->SetOutput("Out", {"c"});

  float data_a[] = {0, 1, 2, 3};
  float data_b[] = {0.0, 0.1, 0.2, 0.3};

  phi::DDim dims = phi::make_ddim({2, 2});
  const platform::CPUPlace place = platform::CPUPlace();

  phi::DenseTensor tensor_a = phi::DenseTensor();
  phi::DenseTensor tensor_b = phi::DenseTensor();

  std::copy_n(data_a, 4, tensor_a.mutable_data<float>(dims, place));
  std::copy_n(data_b, 4, tensor_b.mutable_data<float>(dims, place));

  TestShareWorkQueue(
      program, {"a", "b"}, {tensor_a, tensor_b}, {"c"}, {0.0, 1.1, 2.2, 3.3});
}

}  // namespace framework
}  // namespace paddle