Fix mac ci dist (#13393)

437debf4 · Wu Yi · gongweibao · 3c5c6e74 · 437debf4 · 437debf4
4 changed files
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
  VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
  s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
+  framework::AsyncIO([var_name_val, s, this] {
    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
@@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
  s->Prepare(h, time_out);

  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, s, this] {
+                      s, this] {
    auto* var = p_scope->FindVar(in_var_name_val);

    ::grpc::ByteBuffer req;

--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -92,7 +92,7 @@ class TrainTaskConfig(object):
    src_vocab_fpath = data_path + "vocab.bpe.32000"
    trg_vocab_fpath = data_path + "vocab.bpe.32000"
    train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
-    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
+    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
    pool_size = 2000
    sort_type = None
    local = True
@@ -624,11 +624,12 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
            init = True

            # Validate and save the model for inference.
-            if TrainTaskConfig.val_file_pattern is not None:
-                val_avg_cost, val_ppl = test()
-                print("[%f]" % val_avg_cost)
-            else:
-                assert (False)
+            if batch_id == 0 or batch_id == 4:
+                if TrainTaskConfig.val_file_pattern is not None:
+                    val_avg_cost, val_ppl = test()
+                    print("[%f]" % val_avg_cost)
+                else:
+                    assert (False)


 #import transformer_reader as reader
@@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, place, args):
-
+    def run_trainer(self, use_cuda, args):
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        TrainTaskConfig.use_gpu = use_cuda
        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
            args.is_dist, not args.sync_mode)


--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -61,9 +61,10 @@ class TestDistRunnerBase(object):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, place, args):
+    def run_trainer(self, use_cuda, args):
        import paddle
        import paddle.fluid as fluid
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if args.mem_opt:
@@ -91,7 +92,7 @@ class TestDistRunnerBase(object):
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
-            True,
+            use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)
@@ -142,9 +143,8 @@ def runtime_main(test_class):
    if args.role == "pserver" and args.is_dist:
        model.run_pserver(args)
    else:
-        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        model.run_trainer(p, args)
+        use_cuda = True if core.is_compiled_with_cuda() else False
+        model.run_trainer(use_cuda, args)


 import paddle.compat as cpt
@@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase):
    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1"
+            "FLAGS_cudnn_deterministic": "1",
+            "CPU_NUM": "1"
        }

        if check_error_log:

--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -14,6 +14,7 @@

 from __future__ import print_function

+import os
 import unittest
 import paddle
 from test_dist_base import TestDistBase
@@ -44,6 +45,14 @@ def download_files():
    test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
    test_md5 = '9dd74a266dbdb25314183899f269b4a2'
    paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
+    # cut test data for faster CI
+    orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de")
+    head_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de.cut")
+    os.system("head -n10 %s > %s" % (orig_path, head_path))


 class TestDistTransformer2x2Sync(TestDistBase):