Created by: JZ-LIANG
PR types
New features
PR changes
APIs
Describe
Add the original LARS optimizer to the fleet meta optimizers; no functional changes to existing behavior.
how to run:
python -m paddle.distributed.launch resnet_lars.py
# resnet_lars.py
import os
import time
import fleet_lightning as lightning
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
import paddle.fleet as fleet
# Example: train ResNet50 with a Momentum optimizer wrapped by fleet's
# distributed optimizer, with the LARS strategy switched on.

# Single source of truth for the batch size: it is used both to build the
# data loader and to compute the throughput figure in the training log.
batch_size = 256
num_epoch = 3

configs = lightning.parse_train_configs()

# Set up fleet in collective mode.
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

model = lightning.applications.Resnet50()
loader = model.load_imagenet_from_file(
    "/paddle/docker_data/paddle/data/dataset/ImageNet/train.txt",
    batch_size=batch_size)

# Base optimizer; learning rate and momentum come from the parsed configs.
optimizer = fluid.optimizer.Momentum(
    learning_rate=configs.lr, momentum=configs.momentum)

# Turn on LARS through the distributed strategy and set its hyperparameters.
dist_strategy = fleet.DistributedStrategy()
dist_strategy.lars = True
dist_strategy.lars_configs = {
    "lars_coeff": 0.001,
    "lars_weight_decay": 0.0005,
}

optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
optimizer.minimize(model.loss)

# Device index is read from FLAGS_selected_gpus, defaulting to GPU 0 when unset.
# NOTE(review): presumably the launcher exports this variable per worker -
# confirm.
place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Kept from the original script; not referenced again in this snippet.
scope = fluid.global_scope()

for epoch in range(num_epoch):
    for i, data in enumerate(loader()):
        start_time = time.time()
        # NOTE(review): `_graph` is a private attribute of the program;
        # presumably it is populated by the distributed optimizer's
        # minimize() call above - confirm.
        loss = exe.run(model.loss.block.program._graph,
                       feed=data,
                       fetch_list=[model.loss.name])
        end_time = time.time()
        # Report the real epoch and the step separately; the step index used
        # to be printed under the "epoch" label.
        print(
            "worker_index: %d, epoch: %d, step: %d, loss: %f, speed: %f img/s"
            % (fleet.worker_index(), epoch, i, loss[0],
               batch_size / (end_time - start_time)))