diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md index 63a460f7a6200ceb4731f409c3745fd8208ce054..02b17dceb9e235fca321a346cac2cf2df4ddafd5 100644 --- a/benchmark/cluster/vgg16/fluid/README.md +++ b/benchmark/cluster/vgg16/fluid/README.md @@ -12,4 +12,5 @@ Check the logs for the distributed training progress and analyze the performance ## Enable verbos logs -Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail. \ No newline at end of file +Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail. + diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/fluid/k8s_tools.py index 8a64dbd361ac9f9e6a80a1262cf123382f92fc16..4bee96a7a80968e605e4d057ed4c86851365c468 100644 --- a/benchmark/cluster/vgg16/fluid/k8s_tools.py +++ b/benchmark/cluster/vgg16/fluid/k8s_tools.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + #!/bin/env python import os import sys @@ -33,6 +47,7 @@ def wait_pods_running(label_selector, desired): print 'current cnt: %d sleep for 5 seconds...' % count time.sleep(5) + def count_pods_by_phase(label_selector, phase): pod_list = fetch_pods_info(label_selector) filtered_pod_list = filter(lambda x: x[0] == phase, pod_list) @@ -45,12 +60,14 @@ def fetch_pserver_ips(): pserver_ips = [item[1] for item in pod_list] return ",".join(pserver_ips) + def fetch_master_ip(): label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME pod_list = fetch_pods_info(label_selector) master_ips = [item[1] for item in pod_list] return master_ips[0] + def fetch_trainer_id(): label_selector = "paddle-job=%s" % PADDLE_JOB_NAME pod_list = fetch_pods_info(label_selector) @@ -75,4 +92,3 @@ if __name__ == "__main__": print count_pods_by_phase(sys.argv[2], sys.argv[3]) elif command == "wait_pods_running": wait_pods_running(sys.argv[2], sys.argv[3]) - diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/fluid/reader.py index c5161ddea267e6c18738b02c34235ec88244be44..3e20f830fce8ec59e406dbd92dc9c233c3ebe718 100644 --- a/benchmark/cluster/vgg16/fluid/reader.py +++ b/benchmark/cluster/vgg16/fluid/reader.py @@ -1,2 +1,16 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import paddle.v2 as paddle paddle.dataset.cifar.train10() diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py index a973f9d2a697c8658d401728137c1de28c30f3b5..3c7b5bf2f14efb00f670f19f3ff6ca2d2116b208 100644 --- a/benchmark/cluster/vgg16/fluid/vgg16.py +++ b/benchmark/cluster/vgg16/fluid/vgg16.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """VGG16 benchmark in Fluid""" from __future__ import print_function @@ -11,6 +25,7 @@ import argparse import functools import os + def str2bool(v): if v.lower() in ('yes', 'true', 't', 'y', '1'): return True @@ -19,6 +34,7 @@ def str2bool(v): else: raise argparse.ArgumentTypeError('Boolean value expected.') + parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( '--batch_size', type=int, default=128, help="Batch size for training.") @@ -122,7 +138,6 @@ def main(): place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = fluid.Executor(place) - # test def test(exe): accuracy.reset(exe) @@ -148,20 +163,21 @@ def main(): accuracy.reset(exe) for batch_id, data in enumerate(train_reader()): ts = time.time() - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) loss, acc = exe.run(trainer_prog, feed={"pixel": img_data, - "label": y_data}, + "label": y_data}, fetch_list=[avg_cost] + accuracy.metrics) iters += 1 num_samples += len(data) print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" % - (pass_id, iters, loss, acc, time.time() - ts) + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" + % (pass_id, iters, loss, acc, time.time() - ts) ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time @@ -170,7 +186,7 @@ def main(): print( "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" % (pass_id, num_samples / pass_elapsed, pass_train_acc, - pass_test_acc)) + pass_test_acc)) if args.local: # Parameter initialization @@ -179,8 +195,8 @@ def main(): # data reader train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), buf_size=5120), batch_size=args.batch_size) test_reader = paddle.batch( @@ -196,19 +212,25 @@ def main(): pserver_endpoints = ",".join(eplist) print("pserver endpoints: ", pserver_endpoints) trainers = int(os.getenv("TRAINERS")) # total trainer count - current_endpoint = os.getenv("POD_IP") + ":6174" # current pserver endpoint - training_role = os.getenv("TRAINING_ROLE", - "TRAINER") # get the training role: trainer/pserver + current_endpoint = os.getenv( + "POD_IP") + ":6174" # current pserver endpoint + training_role = os.getenv( + "TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver t = fluid.DistributeTranspiler() t.transpile( - optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers) + optimize_ops, + params_grads, + pservers=pserver_endpoints, + trainers=trainers) if training_role == "PSERVER": if not current_endpoint: print("need env SERVER_ENDPOINT") exit(1) pserver_prog = t.get_pserver_program(current_endpoint) - pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) print("starting server side startup") exe.run(pserver_startup) print("starting parameter server...") @@ -220,13 +242,13 @@ def main(): # data reader train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + paddle.dataset.cifar.train10() if args.data_set == 'cifar10' + else paddle.dataset.flowers.train(), buf_size=5120), batch_size=args.batch_size) test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else + paddle.dataset.flowers.test(), batch_size=args.batch_size) trainer_prog = t.get_trainer_program()