diff --git a/fluid/neural_machine_translation/transformer/local_dist.sh b/fluid/neural_machine_translation/transformer/local_dist.sh
new file mode 100755
index 0000000000000000000000000000000000000000..331845f06fcf0f96332eefed33426eb0596d2b31
--- /dev/null
+++ b/fluid/neural_machine_translation/transformer/local_dist.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+set -x
+
+unset http_proxy
+unset https_proxy
+
+#pserver
+export TRAINING_ROLE=PSERVER
+export PADDLE_PORT=30134
+export PADDLE_PSERVERS=127.0.0.1
+export PADDLE_IS_LOCAL=0
+export PADDLE_INIT_TRAINER_COUNT=1
+export POD_IP=127.0.0.1
+export PADDLE_TRAINER_ID=0
+export PADDLE_TRAINERS_NUM=1
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/:/workspace/brpc
+export PYTHONPATH=$PYTHONPATH:/paddle/build/build_reader_RelWithDebInfo_gpu/python
+
+#GLOG_v=7 GLOG_logtostderr=1
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -u train.py \
+  --src_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr' \
+  --trg_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr' \
+  --special_token '<s>' '<e>' '<unk>' \
+  --token_delimiter '\x01' \
+  --train_file_pattern 'cluster_test_data_en_fr/train/train.wordpiece.en-fr.0' \
+  --val_file_pattern 'cluster_test_data_en_fr/thirdparty/newstest2014.wordpiece.en-fr' \
+  --use_token_batch True \
+  --batch_size 3200 \
+  --sort_type pool \
+  --pool_size 200000 \
+  --local False > pserver.log 2>&1 &
+
+pserver_pid=$(echo $!)
+echo $pserver_pid
+
+sleep 30s
+
+#trainer
+export TRAINING_ROLE=TRAINER
+export PADDLE_PORT=30134
+export PADDLE_PSERVERS=127.0.0.1
+export PADDLE_IS_LOCAL=0
+export PADDLE_INIT_TRAINER_COUNT=1
+export POD_IP=127.0.0.1
+export PADDLE_TRAINER_ID=0
+export PADDLE_TRAINERS_NUM=1
+
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -u train.py \
+  --src_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr' \
+  --trg_vocab_fpath 'cluster_test_data_en_fr/thirdparty/vocab.wordpiece.en-fr' \
+  --special_token '<s>' '<e>' '<unk>' \
+  --token_delimiter '\x01' \
+  --train_file_pattern 'cluster_test_data_en_fr/train/train.wordpiece.en-fr.0' \
+  --val_file_pattern 'cluster_test_data_en_fr/thirdparty/newstest2014.wordpiece.en-fr' \
+  --use_token_batch True \
+  --batch_size 3200 \
+  --sort_type pool \
+  --pool_size 200000 \
+  --local False > trainer.log 2>&1 &
+
+#sleep 80
+#kill -9 $pserver_pid
diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py
index 324780852fa954b6e3ef15c4116cce60898d3821..f1b1acdc4362ca02661a7a2ecbf5d626c859f1e6 100644
--- a/fluid/neural_machine_translation/transformer/train.py
+++ b/fluid/neural_machine_translation/transformer/train.py
@@ -643,7 +643,7 @@ def train(args):
     if args.sync:
         lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
             ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
-        print("before adam")
+        logging.info("before adam")
         with fluid.default_main_program()._lr_schedule_guard():
             learning_rate = lr_decay * TrainTaskConfig.learning_rate

@@ -661,7 +661,7 @@
         fluid.memory_optimize(train_prog)

     if args.local:
-        print("local start_up:")
+        logging.info("local start_up:")
         train_loop(exe, train_prog, startup_prog, dev_count, sum_cost, avg_cost,
                    token_num, predict, pyreader)
     else:
@@ -677,9 +677,9 @@
             if trainer_id == 0:
                 logging.info("train_id == 0, sleep 60s")
                 time.sleep(60)
-            print("trainers_num:", trainers_num)
-            print("worker_endpoints:", worker_endpoints)
-            print("current_endpoint:", current_endpoint)
+            logging.info("trainers_num:{}".format(trainers_num))
+            logging.info("worker_endpoints:{}".format(worker_endpoints))
+            logging.info("current_endpoint:{}".format(current_endpoint))
             append_nccl2_prepare(trainer_id, worker_endpoints, current_endpoint)

             train_loop(exe, fluid.default_main_program(), dev_count, sum_cost,
@@ -696,11 +696,11 @@
             current_endpoint = os.getenv("POD_IP") + ":" + port
             trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

-            print("pserver_endpoints", pserver_endpoints)
-            print("current_endpoint", current_endpoint)
-            print("trainer_id", trainer_id)
-            print("pserver_ips", pserver_ips)
-            print("port", port)
+            logging.info("pserver_endpoints:{}".format(pserver_endpoints))
+            logging.info("current_endpoint:{}".format(current_endpoint))
+            logging.info("trainer_id:{}".format(trainer_id))
+            logging.info("pserver_ips:{}".format(pserver_ips))
+            logging.info("port:{}".format(port))

             t = fluid.DistributeTranspiler()
             t.transpile(
@@ -715,30 +715,17 @@
                 current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                     "PADDLE_PORT")
                 if not current_endpoint:
-                    print("need env SERVER_ENDPOINT")
+                    logging.critical("need env SERVER_ENDPOINT")
                     exit(1)
                 pserver_prog = t.get_pserver_program(current_endpoint)
                 pserver_startup = t.get_startup_program(current_endpoint,
                                                         pserver_prog)
-                print("pserver start:")
-                program_to_code(pserver_startup)
-                print("pserver train:")
-                program_to_code(pserver_prog)
-                #sys.exit(0)
-
                 exe.run(pserver_startup)
                 exe.run(pserver_prog)
             elif training_role == "TRAINER":
                 logging.info("distributed: trainer started")
                 trainer_prog = t.get_trainer_program()
-                '''
-                print("trainer start:")
-                program_to_code(pserver_startup)
-                print("trainer train:")
-                program_to_code(trainer_prog)
-                sys.exit(0)
-                '''
                 train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                            avg_cost, token_num, predict, pyreader)