Unverified commit 4157cdae, authored by Will-Nie, committed by GitHub

feature(nyp): add apple key to door treasure env(#128)

* add apple key to door treasure and polish

* add test, revise reward, build four envs

* add 7x7-1 ADTKT
Parent 045937e3
from gym.envs.registration import register
register(id='MiniGrid-AKTDT-7x7-1-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_7x7_1')
register(id='MiniGrid-AKTDT-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure')
register(id='MiniGrid-AKTDT-13x13-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_13x13')
register(id='MiniGrid-AKTDT-13x13-1-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_13x13_1')
register(id='MiniGrid-AKTDT-19x19-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_19x19')
register(id='MiniGrid-AKTDT-19x19-3-v0', entry_point='dizoo.minigrid.envs:AppleKeyToDoorTreasure_19x19_3')
\ No newline at end of file
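A minimal usage sketch, assuming a standard DI-engine install with gym-minigrid and that importing dizoo.minigrid.envs executes the register() calls above (the exact import path is an assumption, not confirmed by this diff):

import gym
import dizoo.minigrid.envs  # assumed import path; the point is to run the register() calls above

env = gym.make('MiniGrid-AKTDT-7x7-1-v0')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()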
......@@ -4,7 +4,7 @@ from easydict import EasyDict
from ding.entry import serial_pipeline_reward_model_ngu
print(torch.cuda.is_available(), torch.__version__)
collector_env_num = 32 #TODO
collector_env_num = 32 #TODO
evaluator_env_num = 5
nstep = 5
minigrid_ppo_rnd_config = dict(
......@@ -25,15 +25,13 @@ minigrid_ppo_rnd_config = dict(
learning_rate=5e-4,
obs_shape=2739,
action_shape=7,
batch_size=320, # transitions
batch_size=320, # transitions
update_per_collect=int(10), # 32*100/320=10
only_use_last_five_frames_for_icm_rnd=False,
clear_buffer_per_iters=10,
nstep=nstep,
hidden_size_list=[128, 128, 64],
type='rnd',
),
episodic_reward_model=dict(
intrinsic_reward_type='add',
......@@ -41,7 +39,6 @@ minigrid_ppo_rnd_config = dict(
obs_shape=2739,
action_shape=7,
batch_size=320, # transitions
update_per_collect=int(10), # 32*100/320=10
only_use_last_five_frames_for_icm_rnd=False,
clear_buffer_per_iters=10,
......@@ -84,12 +81,13 @@ minigrid_ppo_rnd_config = dict(
end=0.05,
decay=1e5,
),
replay_buffer=dict(replay_buffer_size=30000,
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
alpha=0.6,
# (Float type) How much correction is used: 0 means no correction while 1 means full correction
beta=0.4,
)
replay_buffer=dict(
replay_buffer_size=30000,
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
alpha=0.6,
# (Float type) How much correction is used: 0 means no correction while 1 means full correction
beta=0.4,
)
),
),
)
......@@ -105,7 +103,7 @@ minigrid_ppo_rnd_create_config = dict(
policy=dict(type='ngu'),
rnd_reward_model=dict(type='rnd'),
episodic_reward_model=dict(type='episodic'),
collector=dict(type='sample_ngu',)
collector=dict(type='sample_ngu', )
)
minigrid_ppo_rnd_create_config = EasyDict(minigrid_ppo_rnd_create_config)
create_config = minigrid_ppo_rnd_create_config
......
......@@ -6,7 +6,6 @@ minigrid_ppo_config = dict(
exp_name="minigrid_fourrooms_onppo",
# exp_name="minigrid_doorkey88_onppo",
# exp_name="minigrid_doorkey_onppo",
env=dict(
collector_env_num=8,
evaluator_env_num=5,
......@@ -36,11 +35,11 @@ minigrid_ppo_config = dict(
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
value_norm=True,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=int(3200),
n_sample=int(3200),
# here self.traj_length = 3200//8 = 400, because in minigrid env the max_length is 300.
# in ding/worker/collector/sample_serial_collector.py
# self._traj_len = max(
......
......@@ -22,7 +22,6 @@ minigrid_r2d2_config = dict(
obs_shape=2739,
action_shape=7,
encoder_hidden_size_list=[128, 128, 512],
),
discount_factor=0.997,
burnin_step=2, # TODO(pu) 20
......@@ -54,13 +53,14 @@ minigrid_r2d2_config = dict(
start=0.95,
end=0.05,
decay=1e5,
),
replay_buffer=dict(replay_buffer_size=100000,
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
alpha=0.6,
# (Float type) How much correction is used: 0 means no correction while 1 means full correction
beta=0.4,
)
),
replay_buffer=dict(
replay_buffer_size=100000,
# (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization
alpha=0.6,
# (Float type) How much correction is used: 0 means no correction while 1 means full correction
beta=0.4,
)
),
),
)
......
from easydict import EasyDict
from ding.entry import serial_pipeline_reward_model_onpolicy
import torch
print(torch.__version__,torch.cuda.is_available())
collector_env_num=8
print(torch.__version__, torch.cuda.is_available())
collector_env_num = 8
minigrid_ppo_rnd_config = dict(
# exp_name='minigrid_empty8_rnd_onppo_b01_weight1000_maxlen100',
# exp_name='minigrid_fourrooms_rnd_onppo_b01_weight1000_maxlen100',
......@@ -10,7 +10,6 @@ minigrid_ppo_rnd_config = dict(
# exp_name='minigrid_doorkey_rnd_onppo_b01_weight1000_maxlen300',
# exp_name='minigrid_kcs3r3_rnd_onppo_b01',
# exp_name='minigrid_om2dlh_rnd_onppo_b01',
env=dict(
collector_env_num=collector_env_num,
evaluator_env_num=5,
......@@ -54,14 +53,14 @@ minigrid_ppo_rnd_config = dict(
batch_size=320, # 64,
learning_rate=3e-4,
value_weight=0.5,
entropy_weight=0.001,
entropy_weight=0.001,
clip_ratio=0.2,
adv_norm=True,
value_norm=True,
value_norm=True,
),
collect=dict(
collector_env_num=collector_env_num,
n_sample=int(3200),
n_sample=int(3200),
# here self.traj_length = 3200//8 = 400, because in minigrid env the max_length is 300.
# in ding/worker/collector/sample_serial_collector.py
# self._traj_len = max(
......
from .minigrid_env import MiniGridEnv
from dizoo.minigrid.envs.app_key_to_door_treasure import AppleKeyToDoorTreasure, AppleKeyToDoorTreasure_13x13, AppleKeyToDoorTreasure_19x19, AppleKeyToDoorTreasure_13x13_1, AppleKeyToDoorTreasure_19x19_3, AppleKeyToDoorTreasure_7x7_1
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gym_minigrid.minigrid import *
from gym_minigrid.minigrid import WorldObj
class Ball(WorldObj):
def __init__(self, color='blue'):
super(Ball, self).__init__('ball', color)
def can_pickup(self):
return False
def render(self, img):
fill_coords(img, point_in_circle(0.5, 0.5, 0.31), COLORS[self.color])
class AppleKeyToDoorTreasure(MiniGridEnv):
"""
A four-room gridworld environment with apples, a key, a locked door and a treasure (goal).
Agent and goal positions can be specified; if not, they are set at random.
"""
def __init__(self, agent_pos=None, goal_pos=None, grid_size=19, apple=2):
self._agent_default_pos = agent_pos
self._goal_default_pos = goal_pos
self.apple = apple
super().__init__(grid_size=grid_size, max_steps=100)
def _gen_grid(
self, width, height
):  # Note: inherited from MiniGridEnv; if width and height are None, both default to grid_size
# Create the grid
self.grid = Grid(width, height)
# Generate the surrounding walls
self.grid.horz_wall(0, 0)
self.grid.horz_wall(0, height - 1)
self.grid.vert_wall(0, 0)
self.grid.vert_wall(width - 1, 0)
room_w = width // 2
room_h = height // 2
# For each row of rooms
for j in range(0, 2):
# For each column
for i in range(0, 2):
xL = i * room_w
yT = j * room_h
xR = xL + room_w
yB = yT + room_h
# Right wall and door
if i + 1 < 2:
if j + 1 < 2:
self.grid.vert_wall(xR, yT, room_h)
#pos = (xR, self._rand_int(yT + 1, yB))
else:
self.grid.vert_wall(xR, yT, room_h)
pos = (xR, self._rand_int(yT + 1, yB))
self.grid.set(*pos, None)
# Bottom wall and door
if j + 1 < 2:
if i + 1 < 2:
self.grid.horz_wall(xL, yB, room_w)
pos = (self._rand_int(xL + 1, xR), yB)
self.grid.set(*pos, None)
else:
self.grid.horz_wall(xL, yB, room_w)
pos = (self._rand_int(xL + 1, xR), yB)
self.put_obj(Door('yellow', is_locked=True), *pos)
# Place a yellow key in the bottom-right room
pos1 = (self._rand_int(room_w + 1, 2 * room_w), self._rand_int(room_h + 1, 2 * room_h))  # self._rand_int samples from [low, high)
self.put_obj(Key('yellow'), *pos1)
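# Scatter the apples (red balls) in the top-left room; moving onto one yields the apple reward (see _reward_ball below)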
pos2_dummy_list = [] # to avoid overlap of apples
for i in range(self.apple):
pos2 = (self._rand_int(1, room_w), self._rand_int(1, room_h))
while pos2 in pos2_dummy_list:
pos2 = (self._rand_int(1, room_w), self._rand_int(1, room_h))
self.put_obj(Ball('red'), *pos2)
pos2_dummy_list.append(pos2)
# Randomize the player start position and orientation
if self._agent_default_pos is not None:
self.agent_pos = self._agent_default_pos
self.grid.set(*self._agent_default_pos, None)
self.agent_dir = self._rand_int(0, 4) # assuming random start direction
else:
self.place_agent()
if self._goal_default_pos is not None:
goal = Goal()
self.put_obj(goal, *self._goal_default_pos)
goal.init_pos, goal.cur_pos = self._goal_default_pos
else:
self.place_obj(Goal())
self.mission = 'Reach the goal'
def _reward_ball(self):
"""
Compute the reward to be given upon finding the apple
"""
return 1
def _reward_goal(self):
"""
Compute the reward to be given upon success
"""
return 10
def step(self, action):
self.step_count += 1
reward = 0
done = False
# Get the position in front of the agent
fwd_pos = self.front_pos
# Get the contents of the cell in front of the agent
fwd_cell = self.grid.get(*fwd_pos)
# Rotate left
if action == self.actions.left:
self.agent_dir -= 1
if self.agent_dir < 0:
self.agent_dir += 4
# Rotate right
elif action == self.actions.right:
self.agent_dir = (self.agent_dir + 1) % 4
# Move forward
elif action == self.actions.forward:
if fwd_cell == None or fwd_cell.can_overlap():  # can_overlap() is False for Ball and Key objects
self.agent_pos = fwd_pos
if fwd_cell != None and fwd_cell.type == 'goal':
done = True
reward = self._reward_goal()
if fwd_cell != None and fwd_cell.type == 'ball':
reward = self._reward_ball()
self.grid.set(*fwd_pos, None)
self.agent_pos = fwd_pos
if fwd_cell != None and fwd_cell.type == 'lava':
done = True
# Pick up an object
elif action == self.actions.pickup:
if fwd_cell and fwd_cell.can_pickup():
if self.carrying is None:
self.carrying = fwd_cell
self.carrying.cur_pos = np.array([-1, -1])
self.grid.set(*fwd_pos, None)
# Drop an object
elif action == self.actions.drop:
if not fwd_cell and self.carrying:
self.grid.set(*fwd_pos, self.carrying)
self.carrying.cur_pos = fwd_pos
self.carrying = None
# Toggle/activate an object: Here, this will open the door if you have the right key
elif action == self.actions.toggle:
if fwd_cell:
fwd_cell.toggle(self, fwd_pos)
# Done action (not used by default)
elif action == self.actions.done:
pass
else:
assert False, "unknown action"
if self.step_count >= self.max_steps:
done = True
obs = self.gen_obs()
return obs, reward, done, {}
class AppleKeyToDoorTreasure_13x13(AppleKeyToDoorTreasure):
def __init__(self):
super().__init__(agent_pos=(2, 8), goal_pos=(7, 1), grid_size=13, apple=2)
class AppleKeyToDoorTreasure_19x19(AppleKeyToDoorTreasure):
def __init__(self):
super().__init__(agent_pos=(2, 14), goal_pos=(10, 1), grid_size=19, apple=2)
class AppleKeyToDoorTreasure_13x13_1(AppleKeyToDoorTreasure):
def __init__(self):
super().__init__(agent_pos=(2, 8), goal_pos=(7, 1), grid_size=13, apple=1)
class AppleKeyToDoorTreasure_7x7_1(AppleKeyToDoorTreasure):
def __init__(self):
super().__init__(agent_pos=(1, 5), goal_pos=(4, 1), grid_size=7, apple=1)
class AppleKeyToDoorTreasure_19x19_3(AppleKeyToDoorTreasure):
def __init__(self):
super().__init__(agent_pos=(2, 14), goal_pos=(10, 1), grid_size=19, apple=3)
if __name__ == '__main__':
AppleKeyToDoorTreasure()._gen_grid(13, 13)  # Note that MiniGrid sets a seed automatically
......@@ -8,7 +8,7 @@ import gym
import numpy as np
from matplotlib import animation
import matplotlib.pyplot as plt
from gym_minigrid.wrappers import FlatObsWrapper, RGBImgPartialObsWrapper, ImgObsWrapper
from gym_minigrid.wrappers import FlatObsWrapper, RGBImgPartialObsWrapper, ImgObsWrapper, ViewSizeWrapper
from gym_minigrid.window import Window
from ding.envs import BaseEnv, BaseEnvTimestep, BaseEnvInfo
......@@ -89,6 +89,144 @@ MINIGRID_INFO_DICT = {
max_step=100,
use_wrappers=None,
),
'MiniGrid-AKTDT-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2739, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,  # the treasure (goal) gives reward 10, each apple gives 1
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-AKTDT-7x7-1-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2619, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-AKTDT-13x13-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2667, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-AKTDT-13x13-1-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2667, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-AKTDT-19x19-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2739, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-AKTDT-19x19-3-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2739, ), value={
'min': 0,
'max': 8,
'dtype': np.float32
}),
act_space=EnvElementInfo(
shape=(1, ),
value={
'min': 0,
'max': 7, # [0, 7)
'dtype': np.int64,
}
),
rew_space=EnvElementInfo(shape=(1, ), value={
'min': 0,
'max': 10,
'dtype': np.float32
}),
max_step=500,
use_wrappers=None,
),
'MiniGrid-DoorKey-8x8-v0': MiniGridEnvInfo(
agent_num=1,
obs_space=EnvElementInfo(shape=(2739, ), value={
......@@ -231,6 +369,14 @@ class MiniGridEnv(BaseEnv):
def reset(self) -> np.ndarray:
if not self._init_flag:
self._env = gym.make(self._env_id)
if self._env_id in ['MiniGrid-AKTDT-13x13-v0', 'MiniGrid-AKTDT-13x13-1-v0']:
self._env = ViewSizeWrapper(
self._env, agent_view_size=5
)  # customize the agent's field of view size; this must be an odd number and also determines the observation space, see gym_minigrid.wrappers for more details
if self._env_id == 'MiniGrid-AKTDT-7x7-1-v0':
self._env = ViewSizeWrapper(
self._env, agent_view_size=3
)
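# With FlatObsWrapper (assuming gym_minigrid's default maxStrLen=96 and numCharCodes=27),
# the flat observation length is view_size * view_size * 3 + 96 * 27: 2739 for the default
# view of 7, 2667 for the view of 5 set above, and 2619 for the view of 3 set above,
# matching the obs_space shapes registered in MINIGRID_INFO_DICT.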
if self._flat_obs:
self._env = FlatObsWrapper(self._env)
# self._env = RGBImgPartialObsWrapper(self._env)
......@@ -282,7 +428,7 @@ class MiniGridEnv(BaseEnv):
self.display_frames_as_gif(self._frames, path)
self._save_replay_count += 1
obs = to_ndarray(obs).astype(np.float32)
rew = to_ndarray([rew]) # wrapped to be transfered to a array with shape (1,)
rew = to_ndarray([rew])  # wrapped to be transferred to an array with shape (1,)
return BaseEnvTimestep(obs, rew, done, info)
def info(self) -> MiniGridEnvInfo:
......
......@@ -2,7 +2,23 @@ import pytest
import os
import numpy as np
from dizoo.minigrid.envs import MiniGridEnv
from easydict import EasyDict
import copy
# The following two configs are used by the TestMiniGridAKTDTEnv tests below
config = dict(
env_id='MiniGrid-AKTDT-13x13-v0',
flat_obs=True,
)
cfg = EasyDict(copy.deepcopy(config))
cfg.cfg_type = 'MiniGridEnvDict'
config2 = dict(
env_id='MiniGrid-AKTDT-7x7-1-v0',
flat_obs=True,
)
cfg2 = EasyDict(copy.deepcopy(config2))
cfg2.cfg_type = 'MiniGridEnvDict'
@pytest.mark.envtest
class TestMiniGridEnv:
......@@ -33,3 +49,61 @@ class TestMiniGridEnv:
env.reset()
print(env.info())
env.close()
@pytest.mark.envtest
class TestMiniGridAKTDTEnv:
def test_adtkt_13(self):
env = MiniGridEnv(cfg)
env.seed(314)
path = './video'
if not os.path.exists(path):
os.mkdir(path)
env.enable_save_replay(path)
assert env._seed == 314
obs = env.reset()
act_val = env.info().act_space.value
min_val, max_val = act_val['min'], act_val['max']
for i in range(env._max_step):
random_action = np.random.randint(min_val, max_val, size=(1, ))
timestep = env.step(random_action)
print(timestep)
print(timestep.obs.max())
assert isinstance(timestep.obs, np.ndarray)
assert isinstance(timestep.done, bool)
assert timestep.obs.shape == (2667, )
assert timestep.reward.shape == (1, )
assert timestep.reward >= env.info().rew_space.value['min']
assert timestep.reward <= env.info().rew_space.value['max']
if timestep.done:
env.reset()
print(env.info())
env.close()
def test_adtkt_7(self):
env = MiniGridEnv(cfg2)
env.seed(314)
path = './video'
if not os.path.exists(path):
os.mkdir(path)
env.enable_save_replay(path)
assert env._seed == 314
obs = env.reset()
act_val = env.info().act_space.value
min_val, max_val = act_val['min'], act_val['max']
for i in range(env._max_step):
random_action = np.random.randint(min_val, max_val, size=(1, ))
timestep = env.step(random_action)
print(timestep)
print(timestep.obs.max())
assert isinstance(timestep.obs, np.ndarray)
assert isinstance(timestep.done, bool)
assert timestep.obs.shape == (2619, )
assert timestep.reward.shape == (1, )
assert timestep.reward >= env.info().rew_space.value['min']
assert timestep.reward <= env.info().rew_space.value['max']
if timestep.done:
env.reset()
print(env.info())
env.close()