GUNREAL 模型的实现
Created by: ddayzzz
请问如何实现以下的模型呢?我已经有了一个初步的设计(在GA3C
的基础上进行的修改):
现在有以下几个疑问:
1. ParallelExecutor 的设计是否正确?
# Build the executor used for learning and the executor pools used for
# sampling. (Fragment of a class __init__/setup method.)
# NOTE(review): assumes `use_cuda`, `build_strategy`, `exec_strategy`, the
# `*_predict_thread_num` counts and the `self.*_program` attributes are all
# defined earlier in the enclosing method — confirm against the full class.
self.learn_exe = fluid.ParallelExecutor(
    use_cuda=use_cuda,
    main_program=self.learn_program,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

def _make_sample_executors(program, count):
    # Create `count` ParallelExecutors over `program`, each inside a fresh
    # scope so that concurrently-running executors do not share (and
    # clobber) each other's intermediate variables.
    executors = []
    for _ in range(count):
        with fluid.scope_guard(fluid.global_scope().new_scope()):
            executors.append(fluid.ParallelExecutor(
                use_cuda=use_cuda,
                main_program=program,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy))
    return executors

# Executors that collect training data from the parallel environments;
# per the original note, they yield the base / pc / vr / rp data.
self.base_sample_exes = _make_sample_executors(
    self.base_sample_program, base_predict_thread_num)
self.pc_sample_exes = _make_sample_executors(
    self.pc_sample_program, pc_predict_thread_num)
self.vr_sample_exes = _make_sample_executors(
    self.vr_sample_program, vr_predict_thread_num)
Program 是否正确:
# Programs for the three sampling heads and for learning. The *_outputs
# lists hold fetch-target *names*, which is what the executors' fetch
# lists expect. (Fragment of a class __init__/setup method.)
self.base_sample_program = fluid.Program()
self.vr_sample_program = fluid.Program()
self.pc_sample_program = fluid.Program()
self.learn_program = fluid.Program()

# Base head: sample an action index and the state value for a batch of
# observations.
with fluid.program_guard(self.base_sample_program):
    base_states = layers.data(
        name='base_states', shape=self.obs_shape, dtype='float32')
    base_sample_actions, base_values = self.alg.sample(base_states)
    self.base_sample_outputs = [base_sample_actions.name, base_values.name]

# Pixel-control head: only the max-over-actions Q map is fetched.
with fluid.program_guard(self.pc_sample_program):
    pc_states = layers.data(
        name='pc_states', shape=self.obs_shape, dtype='float32')
    _pc_q, pc_q_max = self.alg.predict_pc_q_and_pc_q_max(pc_states)
    self.pc_outputs = [pc_q_max.name]

# Value-replay head: predicted state value only.
with fluid.program_guard(self.vr_sample_program):
    vr_states = layers.data(
        name='vr_states', shape=self.obs_shape, dtype='float32')
    vr_v = self.alg.predict_vr_value(vr_states)
    self.vr_outputs = [vr_v.name]

# Learning program: declare every feed slot, wrap them in a py_reader,
# then build the combined loss.
with fluid.program_guard(self.learn_program):
    base_states = layers.data(
        name='base_states', shape=self.obs_shape, dtype='float32')
    # The algorithm one-hot-encodes actions itself (per the original note).
    base_actions = layers.data(name='base_actions', shape=[6], dtype='float32')
    base_R = layers.data(name='base_R', shape=[], dtype='float32')
    base_values = layers.data(name='base_values', shape=[], dtype='float32')
    # Auxiliary-task inputs: pixel control (pc), value replay (vr),
    # reward prediction (rp).
    pc_states = layers.data(
        name='pc_states', shape=self.obs_shape, dtype='float32')
    pc_R = layers.data(name='pc_R', shape=[20, 20], dtype='float32')
    pc_actions = layers.data(name='pc_actions', shape=[6], dtype='float32')
    vr_states = layers.data(
        name='vr_states', shape=self.obs_shape, dtype='float32')
    vr_R = layers.data(name='vr_R', shape=[], dtype='float32')
    rp_states = layers.data(
        name='rp_states', shape=self.obs_shape, dtype='float32')
    rp_C = layers.data(name='rp_C', shape=[3], dtype='float32')
    lr = layers.data(
        name='lr', shape=[1], dtype='float32', append_batch_size=False)
    entropy_coeff = layers.data(
        name='entropy_coeff', shape=[], dtype='float32')

    # Feed the training data through an in-memory reader.
    self.learn_reader = fluid.layers.create_py_reader_by_data(
        capacity=32,
        feed_list=[
            base_states, base_actions, base_R, base_values,
            pc_states, pc_actions, pc_R,
            vr_states, vr_R,
            rp_states, rp_C,
            lr, entropy_coeff,
        ])
    (base_states, base_actions, base_R, base_values,
     pc_states, pc_actions, pc_R,
     vr_states, vr_R,
     rp_states, rp_C,
     lr, entropy_coeff) = fluid.layers.read_file(self.learn_reader)

    # NOTE(review): the reader yields (pc_states, pc_actions, pc_R) but
    # learn() is called with (pc_states, pc_R, pc_actions) — verify this
    # matches alg.learn's parameter order.
    (total_loss, pi_loss, vf_loss, entropy,
     pc_loss, vr_loss, rp_loss) = self.alg.learn(
        base_states, base_actions, base_R, base_values,
        pc_states, pc_R, pc_actions,
        vr_states, vr_R,
        rp_states, rp_C,
        lr, entropy_coeff)
    self.learn_outputs = [
        total_loss.name, pi_loss.name, vf_loss.name, entropy.name,
        pc_loss.name, vr_loss.name, rp_loss.name,
    ]