提交 b38452df 编写于 作者: T typhoonzero

fix styles

上级 cb34f6a2
......@@ -13,3 +13,4 @@ Check the logs for the distributed training progress and analyze the performance
## Enable verbos logs
Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/env python
import os
import sys
......@@ -33,6 +47,7 @@ def wait_pods_running(label_selector, desired):
print 'current cnt: %d sleep for 5 seconds...' % count
time.sleep(5)
def count_pods_by_phase(label_selector, phase):
pod_list = fetch_pods_info(label_selector)
filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
......@@ -45,12 +60,14 @@ def fetch_pserver_ips():
pserver_ips = [item[1] for item in pod_list]
return ",".join(pserver_ips)
def fetch_master_ip():
label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
pod_list = fetch_pods_info(label_selector)
master_ips = [item[1] for item in pod_list]
return master_ips[0]
def fetch_trainer_id():
label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
pod_list = fetch_pods_info(label_selector)
......@@ -75,4 +92,3 @@ if __name__ == "__main__":
print count_pods_by_phase(sys.argv[2], sys.argv[3])
elif command == "wait_pods_running":
wait_pods_running(sys.argv[2], sys.argv[3])
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
paddle.dataset.cifar.train10()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
......@@ -11,6 +25,7 @@ import argparse
import functools
import os
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
......@@ -19,6 +34,7 @@ def str2bool(v):
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
......@@ -122,7 +138,6 @@ def main():
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
exe = fluid.Executor(place)
# test
def test(exe):
accuracy.reset(exe)
......@@ -148,8 +163,9 @@ def main():
accuracy.reset(exe)
for batch_id, data in enumerate(train_reader()):
ts = time.time()
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
img_data = np.array(
map(lambda x: x[0].reshape(data_shape), data)).astype(
"float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
......@@ -160,8 +176,8 @@ def main():
iters += 1
num_samples += len(data)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
(pass_id, iters, loss, acc, time.time() - ts)
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
% (pass_id, iters, loss, acc, time.time() - ts)
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
......@@ -179,8 +195,8 @@ def main():
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
......@@ -196,19 +212,25 @@ def main():
pserver_endpoints = ",".join(eplist)
print("pserver endpoints: ", pserver_endpoints)
trainers = int(os.getenv("TRAINERS")) # total trainer count
current_endpoint = os.getenv("POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv("TRAINING_ROLE",
current_endpoint = os.getenv(
"POD_IP") + ":6174" # current pserver endpoint
training_role = os.getenv(
"TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver
t = fluid.DistributeTranspiler()
t.transpile(
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
optimize_ops,
params_grads,
pservers=pserver_endpoints,
trainers=trainers)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print("starting server side startup")
exe.run(pserver_startup)
print("starting parameter server...")
......@@ -220,13 +242,13 @@ def main():
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
paddle.dataset.flowers.test(),
batch_size=args.batch_size)
trainer_prog = t.get_trainer_program()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册