Created by: mapingshuo
PR types
Bug fixes
PR changes
APIs
Describe
When using the Fleet API (version 2.0), I found that the generated program lacked the 'backward' and 'optimize' operators, which means optimizer.minimize()
did not work correctly.
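A quick way to see the symptom is to list the op types in the default main program right after minimize() returns. The sketch below is mine, not part of the original report, and only assumes the standard fluid Program/Block API:

import paddle.fluid as fluid

# collect the op types of the current main program
op_types = [op.type for op in fluid.default_main_program().global_block().ops]
# backward ops in Paddle end with '_grad'; the Adam optimizer op is 'adam'
print("backward ops:", [t for t in op_types if t.endswith("_grad")])
print("optimizer ops:", [t for t in op_types if t == "adam"])

With this bug, both lists come back empty even though minimize() was called.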
Test code:
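# runtime flags from the original report: parallel graph off, pre-allocate
# 98% of GPU memory, synchronous NCCL allreduce, eager tensor GC, and
# parameter-fusion sizes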
import os
os.environ['FLAGS_enable_parallel_graph'] = "0"
os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = "0.98"
os.environ['FLAGS_sync_nccl_allreduce'] = "1"
os.environ['FLAGS_eager_delete_tensor_gb'] = "0"
os.environ['FLAGS_fuse_parameter_memory_size'] = "32"
os.environ['FLAGS_fuse_parameter_groups_size'] = "50"
import numpy as np
import fleet_lightning as lighting
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
import time
import paddle.fleet as fleet
import paddle
# fleet-lightning helps users focus on training large-scale models
# if you want to learn how to write a model, lightning is not for you
# fleet-lightning focuses more on the engineering side
configs = lighting.parse_train_configs()
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
# load Bert_large / Bert_base model
model = lighting.applications.Bert_large()
#model = lighting.applications.Bert_base()
data_loader = model.load_digital_dataset_from_file(
    data_dir='train/',
    vocab_path='vocab.txt',
    max_seq_len=512,
    batch_size=14,
)
place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 2
exec_strategy.num_iteration_per_drop_scope = 1
dist_strategy = fleet.DistributedStrategy()
dist_strategy.exec_strategy = exec_strategy
dist_strategy.nccl_comm_num = 3
optimizer = fluid.optimizer.Adam(learning_rate=configs.lr)
optimizer = fleet.distributed_optimizer(optimizer, dist_strategy)
optimizer.minimize(model.loss)
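# After a correct minimize(), the main program should contain backward
# (*_grad) ops and optimizer ops such as adam; with this bug it had neither.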
print(optimizer)
print("after minimize")
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
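# Dump the main program so it can be inspected for the missing backward /
# adam operators.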
with open("main_program.txt", 'w') as f:
    f.write(str(paddle.default_main_program()))
scope = fluid.global_scope()
total_time = 0
for i, data in enumerate(data_loader()):
    if i >= 10:
        start_time = time.time()
    cost_val = exe.run(paddle.default_main_program(),
                       feed=data,
                       fetch_list=[model.loss.name])
    if i >= 10:
        end_time = time.time()
        total_time += (end_time - start_time)
        print(
            "worker_index: %d, step%d cost = %f, total time cost = %f, step per second: %f, speed: %f"
            % (fleet.worker_index(), i, cost_val[0], total_time,
               (i - 9) / total_time, 1 / (end_time - start_time)))
    print("step: %d, encoder_layer_16_ffn_fc_1.w_0: %s" % (
        i, scope.var("encoder_layer_16_ffn_fc_1.w_0").get_tensor().__array__()))
Launch command:
python -m paddle.distributed.launch --selected_gpus=2,3 no-backward.py
Result:
The dumped main_program.txt contains no backward or optimize ops (such as adam). This PR fixes that.
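As a rough check of the fix, the dumped program text can simply be scanned for the op names that were missing (file name taken from the script above):

# look for the previously missing ops in the dumped program text
with open("main_program.txt") as f:
    program_text = f.read()
print("has adam ops:", "adam" in program_text)
print("has backward ops:", "_grad" in program_text)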