diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index e22bc552f85b85c75f06b4158f2abac2d3843256..13682b78f0eccf049daa315f3a26aafd22e42a41 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
+  framework::AsyncIO([var_name_val, s, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, s, this] {
+                      s, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py
index e3db316698398ff693157d583ad1410d10dcf81d..3ec79f8ef6e6f70f1365eaa32352c284d294a1ea 100644
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -92,7 +92,7 @@ class TrainTaskConfig(object):
     src_vocab_fpath = data_path + "vocab.bpe.32000"
     trg_vocab_fpath = data_path + "vocab.bpe.32000"
     train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
-    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
+    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
     pool_size = 2000
     sort_type = None
     local = True
@@ -624,11 +624,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                 init = True
 
             # Validate and save the model for inference.
-            if TrainTaskConfig.val_file_pattern is not None:
-                val_avg_cost, val_ppl = test()
-                print("[%f]" % val_avg_cost)
-            else:
-                assert (False)
+            if batch_id == 0 or batch_id == 4:
+                if TrainTaskConfig.val_file_pattern is not None:
+                    val_avg_cost, val_ppl = test()
+                    print("[%f]" % val_avg_cost)
+                else:
+                    assert (False)
 
 
 #import transformer_reader as reader
@@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, args):
-
+    def run_trainer(self, use_cuda, args):
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        TrainTaskConfig.use_gpu = use_cuda
         sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
             args.is_dist, not args.sync_mode)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index c0f5da5a1ae43847dff6348ea5f3e3bfd5e89ab9..37cad73019c529f64868b6ad3c6e2fffe59cc0d8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -61,9 +61,10 @@ class TestDistRunnerBase(object):
         exe.run(startup_prog)
         exe.run(pserver_prog)
 
-    def run_trainer(self, place, args):
+    def run_trainer(self, use_cuda, args):
         import paddle
         import paddle.fluid as fluid
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
             self.get_model(batch_size=2)
         if args.mem_opt:
@@ -91,7 +92,7 @@ class TestDistRunnerBase(object):
             build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
 
         exe = fluid.ParallelExecutor(
-            True,
+            use_cuda,
            loss_name=avg_cost.name,
             exec_strategy=strategy,
             build_strategy=build_stra)
@@ -142,9 +143,8 @@ def runtime_main(test_class):
     if args.role == "pserver" and args.is_dist:
         model.run_pserver(args)
     else:
-        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        model.run_trainer(p, args)
+        use_cuda = True if core.is_compiled_with_cuda() else False
+        model.run_trainer(use_cuda, args)
 
 
 import paddle.compat as cpt
@@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase):
     def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
         # TODO(typhoonzero): should auto adapt GPU count on the machine.
         required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1"
+            "FLAGS_cudnn_deterministic": "1",
+            "CPU_NUM": "1"
         }
 
         if check_error_log:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index a8e6ce4cfe18384e405f1602429628914d2c2e00..e55f8707a9a8ac2b0d69c65b15e6593025511999 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import paddle
 from test_dist_base import TestDistBase
@@ -44,6 +45,14 @@ def download_files():
     test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
     test_md5 = '9dd74a266dbdb25314183899f269b4a2'
     paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
+    # cut test data for faster CI
+    orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de")
+    head_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de.cut")
+    os.system("head -n10 %s > %s" % (orig_path, head_path))
 
 
 class TestDistTransformer2x2Sync(TestDistBase):