TypeError: slice indices must be integers or None or have an __index__ method
Created by: HerbertArthur
When calling fleet.split_files(file_list), I get TypeError: slice indices must be integers or None or have an __index__ method.
The full error output is:
I0907 02:48:48.708067 9968 communicator.h:252] AsyncCommunicator Initialized
>>> file_list: ['../../data_test_utf_2M/rank/train/data_10.txt', '../../data_test_utf_2M/rank/train/data_14.txt', '../../data_test_utf_2M/rank/train/data_6.txt', '../../data_test_utf_2M/rank/train/data_1.txt', '../../data_test_utf_2M/rank/train/data_11.txt', '../../data_test_utf_2M/rank/train/data_9.txt', '../../data_test_utf_2M/rank/train/data_2.txt', '../../data_test_utf_2M/rank/train/data_8.txt', '../../data_test_utf_2M/rank/train/data_3.txt', '../../data_test_utf_2M/rank/train/data_7.txt', '../../data_test_utf_2M/rank/train/data_13.txt', '../../data_test_utf_2M/rank/train/data_15.txt', '../../data_test_utf_2M/rank/train/data_16.txt', '../../data_test_utf_2M/rank/train/data_12.txt', '../../data_test_utf_2M/rank/train/data_4.txt', '../../data_test_utf_2M/rank/train/data_5.txt']
>>> file_list.size() 16
Traceback (most recent call last):
  File "dist_train.py", line 138, in <module>
    dist_train()
  File "dist_train.py", line 130, in dist_train
    args.sparse_feature_dim, args.vocab, args.num_thread)
  File "dist_train.py", line 52, in dataset_reader
    file_list = fleet.split_files(file_list)
  File "/usr/local/envs/paddle/lib/python3.7/site-packages/paddle/fluid/incubate/fleet/base/fleet_base.py", line 175, in split_files
    trainer_files[i] = files[begin:begin + blocks[i]]
TypeError: slice indices must be integers or None or have an __index__ method
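Looking at the failing line, trainer_files[i] = files[begin:begin + blocks[i]] would raise exactly this TypeError if blocks[i] is a float rather than an int, which is what true division (/) produces under Python 3. A minimal snippet that reproduces the same TypeError (just an illustration, not the actual library code):

files = ["data_1.txt", "data_2.txt", "data_3.txt", "data_4.txt"]
block = len(files) / 2          # 2.0 -- a float under Python 3, an int under Python 2
begin = 0
print(files[begin:begin + block])
# TypeError: slice indices must be integers or None or have an __index__ method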
My training code is as follows:
def dataset_reader(data_path, batch_size, inputs, sparse_feature_dim, vocab, num_thread):
    """The dataset reader"""
    ... ...
    # let each trainer get different training data
    file_list = ["%s/%s" % (data_path, x) for x in os.listdir(data_path)]
    print(">>> file_list: ", file_list)
    print(">>> file_list.size()", len(file_list))
    file_list = fleet.split_files(file_list)
    return dataset, file_list


def dist_train():
    """
    train
    """
    # determine the machine's role in the distributed env
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)
    # set the running mode of the distributed env
    strategy = distribute_transpiler.DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True
    ... ...
    inputs = LRModel.inputs()
    lr_model = LRModel(args.embedding_size, args.sparse_feature_dim)
    predict = lr_model.inference()
    auc, batch_auc, avg_cost, accuracy = lr_model.cal_loss(predict)
    # eval_program = main_program.clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    def train_loop(dataset, file_list):
        ... ...
        for i in range(epochs):
            random.shuffle(file_list)
            dataset.set_filelist(file_list)
            ... ...

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        start_program = fleet.startup_program
        exe.run(start_program)
        dataset, file_list = dataset_reader(args.train_data_path, args.batch_size, inputs,
                                            args.sparse_feature_dim, args.vocab, args.num_thread)
        train_loop(dataset, file_list)
        fleet.stop_worker()
        logger.info(">>> Distribute train success...")


if __name__ == '__main__':
    dist_train()
I tried PaddlePaddle 1.6 and 1.8 both on Paddle Cloud and on a local machine (CentOS 7.6), and I hit the same error in every case.
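As a temporary workaround I am considering splitting the file list myself instead of calling fleet.split_files. A rough, untested sketch, assuming fleet.worker_index() and fleet.worker_num() return the trainer id and trainer count in this fleet version (fleet here is the same object already imported in dist_train.py):

def split_files_manually(file_list):
    """Give each trainer a contiguous, near-equal slice of the files."""
    trainer_id = fleet.worker_index()
    trainer_num = fleet.worker_num()
    block = len(file_list) // trainer_num        # floor division keeps slice indices ints
    remainder = len(file_list) % trainer_num
    begin = trainer_id * block + min(trainer_id, remainder)
    end = begin + block + (1 if trainer_id < remainder else 0)
    return file_list[begin:end]

# used in dataset_reader in place of:  file_list = fleet.split_files(file_list)
file_list = split_files_manually(file_list)

Is this a reasonable stopgap, or is there a fix for split_files itself?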