Opened Jul 08, 2020 by saxon_zh (Guest)

[ValueError] Question about using supervised learning in the PG algorithm

Created by: zbp-xxxp

I trained a network with supervised learning so that it outputs act from obs, but the feed in sample() and predict() only contains obs. After I added act myself:

    def sample(self, obs, act):
        place = fluid.CPUPlace()  # where the executor runs
        x = fluid.data(name='obs', shape=[None, 4], dtype='float32')  # input shape and dtype
        y = fluid.data(name='act', shape=[None, 1], dtype='float32')  # label shape and dtype
        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])

        # sample_model = fluid.io.load_persistables(fluid.Executor(fluid.CPUPlace()), "./sample_model", self.pred_program)
        # act_prob = sample_model.run(
        #     self.pred_program,
        #     feed={'obs': obs.astype('float32')},
        #     fetch_list=[self.act_prob])[0]
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': obs.astype('float32'),
                  'act': act.astype('float32')},
            fetch_list=[self.act_prob])[0]
        # act_prob = np.squeeze(act_prob, axis=0)  # drop the leading batch dimension
        # act = np.random.choice(range(self.act_dim), p=act_prob)  # sample an action from the probabilities
        fluid.io.save_persistables(act_prob, "./sample_model")
        # return act

The error reported:

    ValueError: var act not in this block
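My guess (not verified) is that every variable named in feed= has to be declared inside the program that fluid_executor.run() actually executes, while the x/y created with fluid.data inside sample() end up in the default program rather than in self.pred_program. A minimal sketch of what I mean, declaring act under the same program_guard as obs inside build_program() (I am not sure this is the intended way to do it):

    # Sketch only (my guess): declare 'act' in pred_program so the feed can find it in that block.
    with fluid.program_guard(self.pred_program):
        obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
        act = layers.data(name='act', shape=[1], dtype='float32')  # is this the right place for it?
        self.act_prob = self.alg.predict(obs)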

The complete code is as follows:

    import parl
    from parl import layers
    import numpy as np
    import paddle.fluid as fluid
    import gym
    import six
    from parl.utils import logger
    from parl.algorithms import PolicyGradient
    from pyglet.window import key

    class Model(parl.Model):
        def __init__(self, act_dim):
            act_dim = act_dim
            hid1_size = 256
            hid2_size = 64

            self.fc1 = layers.fc(size=hid1_size, act='relu')
            self.fc2 = layers.fc(size=hid2_size, act='relu')
            self.fc3 = layers.fc(size=act_dim, act='softmax')

        def forward(self, obs):
            h1 = self.fc1(obs)
            h2 = self.fc2(h1)
            out = self.fc3(h2)
            return out

    class Agent(parl.Agent):
        def __init__(self, algorithm, obs_dim, act_dim):
            self.obs_dim = obs_dim
            self.act_dim = act_dim
            super(Agent, self).__init__(algorithm)

        def build_program(self):
            self.pred_program = fluid.Program()
            self.learn_program = fluid.Program()

            with fluid.program_guard(self.pred_program):  # graph used to predict actions; define input/output variables
                obs = layers.data(
                    name='obs', shape=[self.obs_dim], dtype='float32')
                self.act_prob = self.alg.predict(obs)

            with fluid.program_guard(self.learn_program):  # graph used to update the policy network; define input/output variables
                obs = layers.data(
                    name='obs', shape=[self.obs_dim], dtype='float32')
                act = layers.data(name='act', shape=[1], dtype='int64')
                reward = layers.data(name='reward', shape=[], dtype='float32')
                self.cost = self.alg.learn(obs, act, reward)

        def sample(self, obs, act):
            place = fluid.CPUPlace()  # where the executor runs
            x = fluid.data(name='obs', shape=[None, 4], dtype='float32')  # input shape and dtype
            y = fluid.data(name='act', shape=[None, 1], dtype='float32')  # label shape and dtype
            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])

            # sample_model = fluid.io.load_persistables(fluid.Executor(fluid.CPUPlace()), "./sample_model", self.pred_program)
            # act_prob = sample_model.run(
            #     self.pred_program,
            #     feed={'obs': obs.astype('float32')},
            #     fetch_list=[self.act_prob])[0]
            act_prob = self.fluid_executor.run(
                self.pred_program,
                feed={'obs': obs.astype('float32'),
                      'act': act.astype('float32')},
                fetch_list=[self.act_prob])[0]
            # act_prob = np.squeeze(act_prob, axis=0)  # drop the leading batch dimension
            # act = np.random.choice(range(self.act_dim), p=act_prob)  # sample an action from the probabilities
            fluid.io.save_persistables(act_prob, "./sample_model")
            # return act

        def predict(self, obs):
            obs = np.expand_dims(obs, axis=0)
            # predict_model = fluid.io.load_persistables(fluid.Executor(fluid.CPUPlace()), "./sample_model", self.pred_program)
            # act_prob = predict_model.run(
            #     self.pred_program,
            #     feed={'obs': obs.astype('float32')},
            #     fetch_list=[self.act_prob])[0]
            act_prob = self.fluid_executor.run(
                self.pred_program,
                feed={'obs': obs.astype('float32')},
                fetch_list=[self.act_prob])[0]
            act_prob = np.squeeze(act_prob, axis=0)
            act = np.argmax(act_prob)  # pick the action with the highest probability
            return act

        def learn(self, obs, act, reward):
            act = np.expand_dims(act, axis=-1)
            feed = {
                'obs': obs.astype('float32'),
                'act': act.astype('int64'),
                'reward': reward.astype('float32')
            }
            # learn_model = fluid.io.load_persistables(fluid.Executor(fluid.CPUPlace()), "./sample_model", self.pred_program)
            # cost = learn_model.run(
            #     self.learn_program, feed=feed, fetch_list=[self.cost])[0]
            cost = self.fluid_executor.run(
                self.learn_program, feed=feed, fetch_list=[self.cost])[0]
            fluid.io.save_persistables(cost, "./learn_model")
            return cost

    a = np.array([0.0, 0.0, 0.0])

    def key_press(k, mod):
        global restart
        if k == 0xff0d: restart = True
        if k == key.LEFT:  a[0] = -1.0
        if k == key.RIGHT: a[0] = +1.0
        if k == key.UP:    a[1] = +1.0
        if k == key.DOWN:  a[2] = +0.8  # set 1.0 for wheels to block to zero rotation

    def key_release(k, mod):
        if k == key.LEFT and a[0] == -1.0:  a[0] = 0
        if k == key.RIGHT and a[0] == +1.0: a[0] = 0
        if k == key.UP:   a[1] = 0
        if k == key.DOWN: a[2] = 0

    def WarnUp(env):
        env.render()
        env.viewer.window.on_key_press = key_press
        env.viewer.window.on_key_release = key_release
        obs_list, action_list, reward_list = [], [], []
        lists = []
        isopen = True
        for i in range(1):
            obs = env.reset()
            total_reward = 0.0
            steps = 0
            restart = False
            while True:
                obs_list.append(obs)
                action_list.append(a)

                s, r, done, info = env.step(a)
                reward_list.append(r)
                total_reward += r
                if steps % 1 == 0 or done:
                    print("\naction " + str(["{:+0.2f}".format(x) for x in a]))
                    print("step {} total_reward {:+0.2f}".format(steps, total_reward))
                steps += 1
                isopen = env.render()
                if done or restart or isopen == False:
                    break
        lists = [obs_list, action_list, reward_list]
        np.save("data_3.npy", lists)
        # return obs_list, action_list, reward_list

    def preprocess(image):  # input 96*96*3, output 28*32
        image = image[:84]
        image = image[::3, ::3, :]

        # grass to black
        mask = np.all(image == [102, 229, 102], axis=2)
        image[mask] = [0, 0, 0]
        mask = np.all(image == [102, 204, 102], axis=2)
        image[mask] = [0, 0, 0]

        image = image[:, :, 0]
        image[image != 0] = 1  # binarize: everything except black becomes white

        return image.astype(np.float).ravel()

    def convert_action(action):
        # print(action)
        if 0 == action:
            action = [0.0, 1.0, 0.0]   # STRAIGHT
            return action
        elif 1 == action:
            action = [1.0, 0.0, 0.0]   # RIGHT
            return action
        elif 2 == action:
            action = [-1.0, 0.0, 0.0]  # LEFT
            return action
        elif 3 == action:
            action = [0.0, 0.0, 0.0]   # Keep
            return action
        else:
            print("action error")

    def train(agent):
        feature_names = ['straight', 'left', 'right', 'keep']
        feature_num = len(feature_names)
        lists = []
        obs_lists = []
        act_lists = []
        for i in range(1, 4):
            list = np.load("data_%d.npy" % i)
            # print(len(list[0]))
            # print(len(list[1]))
            for j in range(len(list[0])):
                obs = list[0][j]
                obs = preprocess(obs)  # from shape (96, 96, 3) to (28*32,)
                obs = np.expand_dims(obs, axis=0)  # add a leading dimension
                obs_lists.append(obs)
                act = list[1][j]
                act_lists.append([obs, act])

        lists = [obs_lists, act_lists]
        data = np.array(lists)

        for i in range(len(data[0])):
            obs = data[0][i]
            act = data[0][i]
            agent.sample(obs, act)

    def main():
        LEARNING_RATE = 1e-3
        # env = gym.make('CarRacing-v0')
        # WarnUp(env)
        obs_dim = 28 * 32
        act_dim = 4  # simply straight, left, right and keep
        logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

        # build the agent with the PARL framework
        model = Model(act_dim=act_dim)
        alg = PolicyGradient(model, lr=LEARNING_RATE)
        agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

        train(agent)

    if __name__ == '__main__':
        main()

The .npy files store the obs, act and reward recorded while a human operated the environment by hand. I am finding it difficult to combine supervised learning with reinforcement learning. Does PARL have an example of this? Or, put differently, how should I solve the problem above? Thanks!
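For reference, this is roughly what I imagine a separate supervised (behavior-cloning) step could look like, built as its own program inside the Agent. This is only a sketch under my own assumptions: the build_sl_program / learn_sl names are made up, I assume the algorithm keeps its model as self.alg.model, that the recorded continuous actions are first mapped back to one of the 4 discrete indices, and that build_sl_program() is called from build_program() so the startup program still initializes everything. I have not verified this against any PARL example.

    # Rough sketch, not tested: a separate program for supervised updates from the .npy data.
    def build_sl_program(self):
        self.sl_program = fluid.Program()
        with fluid.program_guard(self.sl_program):
            obs = layers.data(name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')  # discrete action index as label
            act_prob = self.alg.model.forward(obs)  # assumes the algorithm exposes its model as .model
            cost = fluid.layers.cross_entropy(input=act_prob, label=act)
            self.sl_cost = fluid.layers.reduce_mean(cost)
            fluid.optimizer.Adam(learning_rate=1e-3).minimize(self.sl_cost)

    def learn_sl(self, obs, act):
        feed = {'obs': obs.astype('float32'), 'act': act.astype('int64')}
        return self.fluid_executor.run(
            self.sl_program, feed=feed, fetch_list=[self.sl_cost])[0]

train() would then call agent.learn_sl(obs, act) with the recorded obs/act pairs instead of agent.sample(obs, act); whether that is the intended way in PARL is exactly what I am asking.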

Reference: paddlepaddle/PARL#334