// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/standalone_executor.h"

#include <gtest/gtest.h>

#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "paddle/phi/core/kernel_registry.h"

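// Pull in the operator registrations needed by the test programs below.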
USE_OP_ITSELF(fill_constant);
USE_OP_ITSELF(uniform_random);
USE_OP(lookup_table);
USE_OP_ITSELF(transpose2);
USE_OP_ITSELF(reshape2);
USE_OP_ITSELF(split);
USE_OP_ITSELF(slice);
USE_OP_ITSELF(concat);
USE_OP_ITSELF(matmul);
USE_OP_ITSELF(elementwise_add);
USE_OP_ITSELF(sigmoid);
USE_OP_ITSELF(tanh);
USE_OP_ITSELF(elementwise_mul);
USE_OP_ITSELF(softmax_with_cross_entropy);
USE_OP_ITSELF(reduce_mean);
USE_OP_ITSELF(reduce_sum);
USE_OP_ITSELF(reduce_sum_grad);
USE_OP_ITSELF(reduce_mean_grad);
USE_OP_ITSELF(reshape2_grad);
USE_OP_ITSELF(softmax_with_cross_entropy_grad);
USE_OP_ITSELF(elementwise_add_grad);
USE_OP_ITSELF(matmul_grad);
USE_OP_ITSELF(square);
USE_OP_ITSELF(transpose2_grad);
USE_OP_ITSELF(concat_grad);
USE_OP_ITSELF(elementwise_mul_grad);
USE_OP_ITSELF(sigmoid_grad);
USE_OP_ITSELF(tanh_grad);
USE_OP_ITSELF(sum);
USE_OP_ITSELF(slice_grad);
USE_OP_ITSELF(lookup_table_grad);
USE_OP_ITSELF(sqrt);
USE_OP_ITSELF(elementwise_max);
USE_OP_ITSELF(elementwise_div);
USE_OP_ITSELF(sgd);
USE_OP_ITSELF(squared_l2_norm);
USE_OP_ITSELF(memcpy_h2d);
USE_OP_ITSELF(memcpy_d2h);
USE_OP_ITSELF(fetch_v2);

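// Declare the PHI kernels that must be linked in for these operators.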
PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(uniform_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(uniform, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT);
#ifdef PADDLE_WITH_XPU_KP
PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT);
#else
PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT);
PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT);
#endif
PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sigmoid_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(squared_l2_norm, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(add_n, GPU, ALL_LAYOUT);

namespace paddle {
namespace framework {

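// Deserializes a ProgramDesc from a binary proto file on disk.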
ProgramDesc load_from_file(const std::string& file_name) {
  std::ifstream fin(file_name, std::ios::in | std::ios::binary);
  fin.seekg(0, std::ios::end);
  std::string buffer(fin.tellg(), ' ');
  fin.seekg(0, std::ios::beg);
  fin.read(&buffer[0], buffer.size());
  fin.close();
  ProgramDesc program_desc(buffer);
  return program_desc;
}

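// Loads "lm_main_program" and patches the shape attributes of its first few
// ops so the leading dimension matches the test batch size.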
ProgramDesc GetLmMainProgram() {
  ProgramDesc main_prog = load_from_file("lm_main_program");

  auto& global_block = main_prog.Block(0);
  int64_t batch_size = 20;

  auto& op1 = global_block.AllOps()[1];
  auto shape1 = PADDLE_GET_CONST(std::vector<int64_t>, op1->GetAttr("shape"));
  shape1[0] = batch_size * 20;
  op1->SetAttr("shape", shape1);

  auto& op2 = global_block.AllOps()[2];
  auto shape2 = PADDLE_GET_CONST(std::vector<int64_t>, op2->GetAttr("shape"));
  shape2[0] = batch_size;
  op2->SetAttr("shape", shape2);

  auto& op3 = global_block.AllOps()[3];
  auto shape3 = PADDLE_GET_CONST(std::vector<int64_t>, op3->GetAttr("shape"));
  shape3[0] = batch_size;
  op3->SetAttr("shape", shape3);
  return main_prog;
}

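// Runs the LM startup and main programs once to warm up, then times repeated
// runs of the main program with StandaloneExecutor.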
TEST(StandaloneExecutor, run) {
  auto place = platform::CUDAPlace(0);
  ProgramDesc startup_prog = load_from_file("lm_startup_program");
  ProgramDesc main_prog = GetLmMainProgram();

  Scope scope;
  StandaloneExecutor startup_exec(place, startup_prog);
  startup_exec.Run(&scope, {}, {});
  StandaloneExecutor exec(place, main_prog);
  exec.Run(&scope, {}, {});
  auto start = std::chrono::steady_clock::now();

  for (size_t i = 0; i < 10; ++i) {
    if (i % 200 == 0) {
      std::cout << i << std::endl;
    }

    exec.Run(&scope, {}, {});
  }

  auto end = std::chrono::steady_clock::now();
  std::chrono::duration<double> diff = end - start;

  std::cout << "time cost " << diff.count() << std::endl;
}

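// Verifies that variables listed in skip_gc_vars stay initialized across runs,
// while other intermediate variables are garbage collected.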
TEST(InterpreterCore, skip_gc_vars) {
  auto place = platform::CUDAPlace(0);
  ProgramDesc startup_prog = load_from_file("lm_startup_program");
  ProgramDesc main_prog = GetLmMainProgram();

  Scope scope;

  std::shared_ptr<InterpreterCore> startup_core =
      std::make_shared<InterpreterCore>(
          place, startup_prog.Block(0), &scope, interpreter::ExecutionConfig());

  startup_core->Run({}, {});

  std::set<std::string> skip_gc_vars = {"uniform_0.tmp_0",
                                        "transpose_0.tmp_0",
                                        "embedding_0.tmp_0",
                                        "slice_0.tmp_0",
                                        "split_1.tmp_2"};
  std::set<std::string> gc_vars = {"uniform_1.tmp_0",
                                   "matmul_0.tmp_0",
                                   "split_0.tmp_0",
                                   "elementwise_add_0.tmp_0",
                                   "tmp_0"};

  interpreter::ExecutionConfig execution_config;
  execution_config.skip_gc_vars = skip_gc_vars;

  std::shared_ptr<InterpreterCore> main_core =
      std::make_shared<InterpreterCore>(
          place, main_prog.Block(0), &scope, execution_config);

  auto check_gc_result =
      [](Scope& scope, std::set<std::string>& vars, bool is_skip_gc) {
        // the first local scope is created in startup_core
        // the second local scope is created in main_core
        ASSERT_EQ(scope.kids().size(), 2UL);
        auto* local_scope = scope.kids().back();
        for (const std::string& var_name : vars) {
          ASSERT_EQ(local_scope->FindVar(var_name)
                        ->GetMutable<phi::DenseTensor>()
                        ->IsInitialized(),
                    is_skip_gc);
        }
      };

  main_core->Run({}, {});
  check_gc_result(scope, skip_gc_vars, true);
  check_gc_result(scope, gc_vars, false);

  main_core->Run({}, {});
  check_gc_result(scope, skip_gc_vars, true);
  check_gc_result(scope, gc_vars, false);
}

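// Builds two InterpreterCores over the same program and scope, shares one work
// queue between them, and checks that both return the expected fetch results.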
void TestShareWorkQueue(const ProgramDesc& prog,
                        const std::vector<std::string>& feed_names,
                        const std::vector<phi::DenseTensor>& feed_tensors,
                        const std::vector<std::string>& fetch_names,
                        const std::vector<float>& fetch_results) {
  const platform::CPUPlace place = platform::CPUPlace();

  Scope scope;
  std::shared_ptr<InterpreterCore> core1 = std::make_shared<InterpreterCore>(
      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
  std::shared_ptr<InterpreterCore> core2 = std::make_shared<InterpreterCore>(
      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
  core2->ShareWorkQueueFrom(core1);

  auto run_and_check = [&feed_names, &feed_tensors, &fetch_results](
                           std::shared_ptr<InterpreterCore> core) {
    FetchList fetch_list = core->Run(feed_names, feed_tensors);
    for (size_t i = 0; i < fetch_list.size(); ++i) {
      const float* fetch_data =
          PADDLE_GET_CONST(phi::DenseTensor, fetch_list[i]).data<float>();
      ASSERT_FLOAT_EQ(*fetch_data, fetch_results.at(i));
    }
  };

  run_and_check(core1);
  run_and_check(core2);
  run_and_check(core1);
  run_and_check(core2);
}

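// Feeds a tiny elementwise_add program (c = a + b) through both cores to
// exercise the shared work queue on CPU.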
TEST(InterpreterCore, workqueue_multiplexing) {
  ProgramDesc program;
  BlockDesc* main_block = program.MutableBlock(0);
  VarDesc* var_a = main_block->Var("a");
  VarDesc* var_b = main_block->Var("b");
  VarDesc* var_c = main_block->Var("c");
  var_a->SetType(proto::VarType::LOD_TENSOR);
  var_b->SetType(proto::VarType::LOD_TENSOR);
  var_c->SetType(proto::VarType::LOD_TENSOR);

  OpDesc* add = main_block->AppendOp();
  add->SetType("elementwise_add");
  add->SetInput("X", {"a"});
  add->SetInput("Y", {"b"});
  add->SetOutput("Out", {"c"});

  float data_a[] = {0, 1, 2, 3};
  float data_b[] = {0.0, 0.1, 0.2, 0.3};

  phi::DDim dims = phi::make_ddim({2, 2});
  const platform::CPUPlace place = platform::CPUPlace();

  phi::DenseTensor tensor_a = phi::DenseTensor();
  phi::DenseTensor tensor_b = phi::DenseTensor();

  std::copy_n(data_a, 4, tensor_a.mutable_data<float>(dims, place));
  std::copy_n(data_b, 4, tensor_b.mutable_data<float>(dims, place));

  TestShareWorkQueue(
      program, {"a", "b"}, {tensor_a, tensor_b}, {"c"}, {0.0, 1.1, 2.2, 3.3});
}

}  // namespace framework
}  // namespace paddle