GUNREAL 模型的实现
Created by: ddayzzz
请问如何实现以下的模型呢?我已经有了一个初步的设计(在GA3C
的基础上进行的修改):
现在有以下几个疑问:
1. ParallelExecutor 的设计是否正确?
# Build the executor used for learning and the executor pools used for
# sampling. (Fragment of a class __init__/setup method.)
# NOTE(review): assumes `use_cuda`, `build_strategy`, `exec_strategy`, the
# `*_predict_thread_num` counts and the `self.*_program` attributes are all
# defined earlier in the enclosing method — confirm against the full class.
self.learn_exe = fluid.ParallelExecutor(
    use_cuda=use_cuda,
    main_program=self.learn_program,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

def _make_sample_executors(program, count):
    # Create `count` ParallelExecutors over `program`, each inside a fresh
    # scope so that concurrently-running executors do not share (and
    # clobber) each other's intermediate variables.
    executors = []
    for _ in range(count):
        with fluid.scope_guard(fluid.global_scope().new_scope()):
            executors.append(fluid.ParallelExecutor(
                use_cuda=use_cuda,
                main_program=program,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy))
    return executors

# Executors that collect training data from the parallel environments;
# per the original note, they yield the base / pc / vr / rp data.
self.base_sample_exes = _make_sample_executors(
    self.base_sample_program, base_predict_thread_num)
self.pc_sample_exes = _make_sample_executors(
    self.pc_sample_program, pc_predict_thread_num)
self.vr_sample_exes = _make_sample_executors(
    self.vr_sample_program, vr_predict_thread_num)
Program 是否正确:
# Programs for the three sampling heads and for learning. The *_outputs
# lists hold fetch-target *names*, which is what the executors' fetch
# lists expect. (Fragment of a class __init__/setup method.)
self.base_sample_program = fluid.Program()
self.vr_sample_program = fluid.Program()
self.pc_sample_program = fluid.Program()
self.learn_program = fluid.Program()

# Base head: sample an action index and the state value for a batch of
# observations.
with fluid.program_guard(self.base_sample_program):
    base_states = layers.data(
        name='base_states', shape=self.obs_shape, dtype='float32')
    base_sample_actions, base_values = self.alg.sample(base_states)
    self.base_sample_outputs = [base_sample_actions.name, base_values.name]

# Pixel-control head: only the max-over-actions Q map is fetched.
with fluid.program_guard(self.pc_sample_program):
    pc_states = layers.data(
        name='pc_states', shape=self.obs_shape, dtype='float32')
    _pc_q, pc_q_max = self.alg.predict_pc_q_and_pc_q_max(pc_states)
    self.pc_outputs = [pc_q_max.name]

# Value-replay head: predicted state value only.
with fluid.program_guard(self.vr_sample_program):
    vr_states = layers.data(
        name='vr_states', shape=self.obs_shape, dtype='float32')
    vr_v = self.alg.predict_vr_value(vr_states)
    self.vr_outputs = [vr_v.name]

# Learning program: declare every feed slot, wrap them in a py_reader,
# then build the combined loss.
with fluid.program_guard(self.learn_program):
    base_states = layers.data(
        name='base_states', shape=self.obs_shape, dtype='float32')
    # The algorithm one-hot-encodes actions itself (per the original note).
    base_actions = layers.data(name='base_actions', shape=[6], dtype='float32')
    base_R = layers.data(name='base_R', shape=[], dtype='float32')
    base_values = layers.data(name='base_values', shape=[], dtype='float32')
    # Auxiliary-task inputs: pixel control (pc), value replay (vr),
    # reward prediction (rp).
    pc_states = layers.data(
        name='pc_states', shape=self.obs_shape, dtype='float32')
    pc_R = layers.data(name='pc_R', shape=[20, 20], dtype='float32')
    pc_actions = layers.data(name='pc_actions', shape=[6], dtype='float32')
    vr_states = layers.data(
        name='vr_states', shape=self.obs_shape, dtype='float32')
    vr_R = layers.data(name='vr_R', shape=[], dtype='float32')
    rp_states = layers.data(
        name='rp_states', shape=self.obs_shape, dtype='float32')
    rp_C = layers.data(name='rp_C', shape=[3], dtype='float32')
    lr = layers.data(
        name='lr', shape=[1], dtype='float32', append_batch_size=False)
    entropy_coeff = layers.data(
        name='entropy_coeff', shape=[], dtype='float32')

    # Feed the training data through an in-memory reader.
    self.learn_reader = fluid.layers.create_py_reader_by_data(
        capacity=32,
        feed_list=[
            base_states, base_actions, base_R, base_values,
            pc_states, pc_actions, pc_R,
            vr_states, vr_R,
            rp_states, rp_C,
            lr, entropy_coeff,
        ])
    (base_states, base_actions, base_R, base_values,
     pc_states, pc_actions, pc_R,
     vr_states, vr_R,
     rp_states, rp_C,
     lr, entropy_coeff) = fluid.layers.read_file(self.learn_reader)

    # NOTE(review): the reader yields (pc_states, pc_actions, pc_R) but
    # learn() is called with (pc_states, pc_R, pc_actions) — verify this
    # matches alg.learn's parameter order.
    (total_loss, pi_loss, vf_loss, entropy,
     pc_loss, vr_loss, rp_loss) = self.alg.learn(
        base_states, base_actions, base_R, base_values,
        pc_states, pc_R, pc_actions,
        vr_states, vr_R,
        rp_states, rp_C,
        lr, entropy_coeff)
    self.learn_outputs = [
        total_loss.name, pi_loss.name, vf_loss.name, entropy.name,
        pc_loss.name, vr_loss.name, rp_loss.name,
    ]