Problem with Fleet distributed training of GraphSAGE
Created by: shuoyin
Hi, I ran the graphsage demo provided by PGL and it trains fine locally. I then converted the local program to a Fleet distributed one, keeping the network structure and hyperparameters unchanged, and launched one pserver and one worker. With the Fleet distributed program the loss does not decrease. What could be the problem?
Below is the log from running the local version of graphsage.
This is the log from my Fleet distributed run.
Below is the distributed part of the main function; I only modified the main function:
def main(args):
    data = load_data(args.normalize, args.symmetry)
    log.info("preprocess finish")
    log.info("Train Examples: %s" % len(data["train_index"]))
    log.info("Val Examples: %s" % len(data["val_index"]))
    log.info("Test Examples: %s" % len(data["test_index"]))
    log.info("Num nodes %s" % data["graph"].num_nodes)
    log.info("Num edges %s" % data["graph"].num_edges)
    log.info("Average Degree %s" % np.mean(data["graph"].indegree()))

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
    train_program = fluid.default_main_program()
    startup_program = fluid.default_startup_program()
    samples = []
    if args.samples_1 > 0:
        samples.append(args.samples_1)
    if args.samples_2 > 0:
        samples.append(args.samples_2)

    # build the model exactly as in the local demo
    with fluid.program_guard(train_program, startup_program):
        feature, feature_init = paddle_helper.constant(
            "feat",
            dtype=data['feature'].dtype,
            value=data['feature'],
            hide_batch_size=False)
        graph_wrapper = pgl.graph_wrapper.GraphWrapper(
            "sub_graph", place, node_feat=data['graph'].node_feat_info())
        model_loss, model_acc = build_graph_model(
            graph_wrapper,
            num_class=data["num_class"],
            feature=feature,
            hidden_size=args.hidden_size,
            graphsage_type=args.graphsage_type,
            k_hop=len(samples))

    test_program = train_program.clone(for_test=True)

    # build the role maker from the environment variables of this process
    trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
    trainers = int(os.environ["PADDLE_TRAINERS"])
    training_role = os.environ["PADDLE_TRAINING_ROLE"]
    log.info(training_role)
    training_role = role_maker.Role.WORKER if training_role == "TRAINER" else role_maker.Role.SERVER
    ports = os.getenv("PADDLE_PSERVER_PORTS")
    pserver_ip = os.getenv("PADDLE_PSERVER_IP", "")
    pserver_endpoints = []
    for port in ports.split(","):
        pserver_endpoints.append(':'.join([pserver_ip, port]))
    role = role_maker.UserDefinedRoleMaker(
        current_id=trainer_id,
        role=training_role,
        worker_num=trainers,
        server_endpoints=pserver_endpoints)

    # synchronous parameter-server mode
    config = DistributeTranspilerConfig()
    config.sync_mode = True

    fleet.init(role)
    optimizer = fluid.optimizer.SGD(learning_rate=args.lr)
    optimizer = fleet.distributed_optimizer(optimizer, config)
    optimizer.minimize(model_loss)

    exe = fluid.Executor(place)
    if fleet.is_server():
        log.info('running server')
        fleet.init_server()
        fleet.run_server()
    if fleet.is_worker():
        log.info('running worker')
        fleet.init_worker()
        exe.run(fleet.startup_program)
        feature_init(place)
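For reference, the snippet above only learns the cluster layout from environment variables, so a one-pserver / one-worker run would have to export something like the following before each process calls main(). This is a minimal sketch of my assumption about the launch environment; the IP and port values are illustrative, not taken from my actual job:

import os

# process 1: the parameter server ("PSERVER" maps to role_maker.Role.SERVER above)
os.environ["PADDLE_PSERVER_IP"] = "127.0.0.1"   # assumed single-machine setup
os.environ["PADDLE_PSERVER_PORTS"] = "36001"    # one port -> one pserver endpoint
os.environ["PADDLE_TRAINERS"] = "1"             # one worker in total
os.environ["PADDLE_TRAINING_ROLE"] = "PSERVER"
os.environ["PADDLE_TRAINER_ID"] = "0"

# process 2: the single worker ("TRAINER" maps to role_maker.Role.WORKER above)
# os.environ["PADDLE_TRAINING_ROLE"] = "TRAINER"
# os.environ["PADDLE_TRAINER_ID"] = "0"

Each process then runs the same main(args) and branches on fleet.is_server() / fleet.is_worker().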