Unverified · Commit dd4de1a0 authored by simonat2011 and committed by GitHub

add ACER algorithm(szj) (#14)

* add enduro env config; add enduro's ppo, dqn, qrdqn, rainbow, impala configs

* modified as reviewer mentioned

* add qacd network

* fix bugs

* fix bugs

* update acer algorithm

* update ACER code

* update acer config

* fix bug

* update pong acer's config

* edit commit

* update code as mentioned

* fix the comment table and trust region

* fix format

* fix typing lint

* fix format,flake8

* fix format

* fix whitespace problem

* test(nyz): add acer unittest and algotest

* style(nyz): correct flake8 style
Co-authored-by: shenziju <simonshen2011@foxmail.com>
Co-authored-by: Swain <niuyazhe314@outlook.com>
Parent dc161ea5
......@@ -14,6 +14,7 @@ from dizoo.classic_control.cartpole.config.cartpole_c51_config import cartpole_c
from dizoo.classic_control.cartpole.config.cartpole_qrdqn_config import cartpole_qrdqn_config, cartpole_qrdqn_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_sqn_config import cartpole_sqn_config, cartpole_sqn_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_ppg_config import cartpole_ppg_config, cartpole_ppg_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_acer_config import cartpole_acer_config, cartpole_acer_create_config # noqa
from dizoo.classic_control.cartpole.entry.cartpole_ppg_main import main as ppg_main
from dizoo.classic_control.cartpole.entry.cartpole_ppo_main import main as ppo_main
from dizoo.classic_control.cartpole.config.cartpole_r2d2_config import cartpole_r2d2_config, cartpole_r2d2_create_config # noqa
......@@ -251,3 +252,13 @@ def test_sqn():
assert False, "pipeline fail"
finally:
os.popen('rm -rf log ckpt*')
@pytest.mark.unittest
def test_acer():
config = [deepcopy(cartpole_acer_config), deepcopy(cartpole_acer_create_config)]
config[0].policy.learn.update_per_collect = 1
try:
serial_pipeline(config, seed=0, max_iterations=1)
except Exception:
assert False, "pipeline fail"
......@@ -14,6 +14,7 @@ from dizoo.classic_control.cartpole.config.cartpole_c51_config import cartpole_c
from dizoo.classic_control.cartpole.config.cartpole_qrdqn_config import cartpole_qrdqn_config, cartpole_qrdqn_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_sqn_config import cartpole_sqn_config, cartpole_sqn_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_ppg_config import cartpole_ppg_config, cartpole_ppg_create_config # noqa
from dizoo.classic_control.cartpole.config.cartpole_acer_config import cartpole_acer_config, cartpole_acer_create_config # noqa
from dizoo.classic_control.cartpole.entry.cartpole_ppg_main import main as ppg_main
from dizoo.classic_control.cartpole.entry.cartpole_ppo_main import main as ppo_main
from dizoo.classic_control.cartpole.config.cartpole_r2d2_config import cartpole_r2d2_config, cartpole_r2d2_create_config # noqa
......@@ -262,3 +263,14 @@ def test_qrdqn():
assert False, "pipeline fail"
with open("./algo_record.log", "a+") as f:
f.write("21. qrdqn\n")
@pytest.mark.algotest
def test_acer():
config = [deepcopy(cartpole_acer_config), deepcopy(cartpole_acer_create_config)]
try:
serial_pipeline(config, seed=0)
except Exception:
assert False, "pipeline fail"
with open("./algo_record.log", "a+") as f:
f.write("22. acer\n")
......@@ -8,3 +8,4 @@ from .qmix import Mixer, QMix, CollaQ
from .coma import COMA
from .atoc import ATOC
from .sqn import SQN
from .acer import ACER
from typing import Union, Dict, Optional
import torch
import torch.nn as nn
from ding.utils import SequenceType, squeeze, MODEL_REGISTRY
from ..common import ReparameterizationHead, RegressionHead, DiscreteHead, MultiHead, \
FCEncoder, ConvEncoder
@MODEL_REGISTRY.register('acer')
class ACER(nn.Module):
r"""
Overview:
The ACER model.
Interfaces:
``__init__``, ``forward``, ``compute_actor``, ``compute_critic``
"""
mode = ['compute_actor', 'compute_critic']
def __init__(
self,
obs_shape: Union[int, SequenceType],
action_shape: Union[int, SequenceType],
encoder_hidden_size_list: SequenceType = [128, 128, 64],
actor_head_hidden_size: int = 64,
actor_head_layer_num: int = 1,
critic_head_hidden_size: int = 64,
critic_head_layer_num: int = 1,
activation: Optional[nn.Module] = nn.ReLU(),
norm_type: Optional[str] = None,
) -> None:
r"""
Overview:
Init the ACER Model according to arguments.
Arguments:
- obs_shape (:obj:`Union[int, SequenceType]`): Observation space shape.
- action_shape (:obj:`Union[int, SequenceType]`): Action space shape.
- encoder_hidden_size_list (:obj:`SequenceType`): Collection of ``hidden_size`` for the actor/critic encoders.
- actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to the actor ``Head``.
- actor_head_layer_num (:obj:`int`):
The number of layers used in the actor head to compute the logit output.
- critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to the critic ``Head``.
- critic_head_layer_num (:obj:`int`):
The number of layers used in the critic head to compute the Q value output.
- activation (:obj:`Optional[nn.Module]`):
The type of activation function to use in ``MLP`` after each ``layer_fn``;
if ``None``, it defaults to ``nn.ReLU()``.
- norm_type (:obj:`Optional[str]`):
The type of normalization to use, see ``ding.torch_utils.fc_block`` for more details.
"""
super(ACER, self).__init__()
obs_shape: int = squeeze(obs_shape)
action_shape: int = squeeze(action_shape)
if isinstance(obs_shape, int) or len(obs_shape) == 1:
encoder_cls = FCEncoder
elif len(obs_shape) == 3:
encoder_cls = ConvEncoder
else:
raise RuntimeError(
"not support obs_shape for pre-defined encoder: {}, please customize your own DQN".format(obs_shape)
)
self.actor_encoder = encoder_cls(
obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type
)
self.critic_encoder = encoder_cls(
obs_shape, encoder_hidden_size_list, activation=activation, norm_type=norm_type
)
self.critic_head = RegressionHead(
critic_head_hidden_size, action_shape, critic_head_layer_num, activation=activation, norm_type=norm_type
)
self.actor_head = DiscreteHead(
actor_head_hidden_size, action_shape, actor_head_layer_num, activation=activation, norm_type=norm_type
)
self.actor = nn.ModuleList([self.actor_encoder, self.actor_head])
self.critic = nn.ModuleList([self.critic_encoder, self.critic_head])
def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict:
r"""
Overview:
Use observation to predict output.
Dispatch to ``compute_actor`` or ``compute_critic`` according to ``mode``.
Arguments:
Forward with ``'compute_actor'``:
- inputs (:obj:`torch.Tensor`): The observation tensor, i.e. ``(B, N=obs_shape)``.
Forward with ``'compute_critic'``:
- inputs (:obj:`torch.Tensor`): The observation tensor, i.e. ``(B, N=obs_shape)``.
- mode (:obj:`str`): Name of the forward mode.
Returns:
- outputs (:obj:`Dict`): Outputs of network forward.
Forward with ``'compute_actor'``, Necessary Keys:
- logit (:obj:`torch.Tensor`): Logit encoding tensor.
Forward with ``'compute_critic'``, Necessary Keys:
- q_value (:obj:`torch.Tensor`): Q value tensor.
Actor Shapes:
- obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape``
- logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``
Critic Shapes:
- inputs (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``obs_shape``
- q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``
Actor Examples:
>>> # Forward with 'compute_actor' mode
>>> model = ACER(64, 64)
>>> inputs = torch.randn(4, 64)
>>> actor_outputs = model(inputs,'compute_actor')
>>> assert actor_outputs['logit'].shape == torch.Size([4, 64])
Critic Examples:
>>> inputs = torch.randn(4,N)
>>> model = ACER(obs_shape=(N, ),action_shape=5)
>>> model(inputs, mode='compute_critic')['q_value'] # q value
tensor([[-0.0681, -0.0431, -0.0530, 0.1454, -0.1093],
[-0.0647, -0.0281, -0.0527, 0.1409, -0.1162],
[-0.0596, -0.0321, -0.0676, 0.1386, -0.1113],
[-0.0874, -0.0406, -0.0487, 0.1346, -0.1135]],
grad_fn=<AddmmBackward>)
"""
assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode)
return getattr(self, mode)(inputs)
def compute_actor(self, inputs: torch.Tensor) -> Dict:
r"""
Overview:
Use the actor network to compute the action logit from the observation (``'compute_actor'`` mode).
Arguments:
- inputs (:obj:`torch.Tensor`): The observation tensor, i.e. ``(B, N=obs_shape)``.
Returns:
- outputs (:obj:`Dict`): Outputs of forward pass encoder and head.
ReturnsKeys:
- logit (:obj:`torch.FloatTensor`): Logit encoding tensor.
Shapes:
- inputs (:obj:`torch.Tensor`): :math:`(B, N0)`, where B is batch size and N0 is ``obs_shape``
- logit (:obj:`torch.FloatTensor`): :math:`(B, N1)`, where B is batch size and N1 is ``action_shape``
Examples:
>>> # Forward with 'compute_actor' mode
>>> model = ACER(64, 64)
>>> inputs = torch.randn(4, 64)
>>> actor_outputs = model(inputs,'compute_actor')
>>> assert actor_outputs['logit'].shape == torch.Size([4, 64])
"""
x = self.actor_encoder(inputs)
x = self.actor_head(x)
return x
def compute_critic(self, inputs: torch.Tensor) -> Dict:
r"""
Overview:
Use the critic network to compute Q values from the observation (``'compute_critic'`` mode).
Arguments:
- inputs (:obj:`torch.Tensor`): The observation tensor ``obs``.
Returns:
- outputs (:obj:`Dict`): Q-value output.
ReturnKeys:
- q_value (:obj:`torch.Tensor`): Q value tensor of shape ``(B, action_shape)``.
Shapes:
- obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape``
- q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape``.
Examples:
>>> inputs = torch.randn(4, N)
>>> model = ACER(obs_shape=(N, ),action_shape=5)
>>> model(inputs, mode='compute_critic')['q_value'] # q value
tensor([[-0.0681, -0.0431, -0.0530, 0.1454, -0.1093],
[-0.0647, -0.0281, -0.0527, 0.1409, -0.1162],
[-0.0596, -0.0321, -0.0676, 0.1386, -0.1113],
[-0.0874, -0.0406, -0.0487, 0.1346, -0.1135]],
grad_fn=<AddmmBackward>)
"""
obs = inputs
x = self.critic_encoder(obs)
x = self.critic_head(x)
return {"q_value": x['pred']}
......@@ -18,6 +18,7 @@ from .qmix import QMIXPolicy
from .coma import COMAPolicy
from .collaq import CollaQPolicy
from .atoc import ATOCPolicy
from .acer import ACERPolicy
from .il import ILPolicy
......
This diff is collapsed.
......@@ -20,6 +20,7 @@ from .qmix import QMIXPolicy
from .collaq import CollaQPolicy
from .coma import COMAPolicy
from .atoc import ATOCPolicy
from .acer import ACERPolicy
class EpsCommandModePolicy(CommandModePolicy):
......@@ -163,3 +164,8 @@ class COMACommandModePolicy(COMAPolicy, EpsCommandModePolicy):
@POLICY_REGISTRY.register('atoc_command')
class ATOCCommandModePolicy(ATOCPolicy, DummyCommandModePolicy):
pass
@POLICY_REGISTRY.register('acer_command')
class ACERCommandModePolicy(ACERPolicy, DummyCommandModePolicy):
pass
......@@ -15,3 +15,5 @@ from .adder import get_gae, get_gae_with_default_last_value, get_nstep_return_da
from .value_rescale import value_transform, value_inv_transform
from .vtrace import vtrace_data, vtrace_error
from .beta_function import beta_function_map
from .retrace import compute_q_retraces
from .acer import acer_policy_error, acer_value_error, acer_trust_region_update
from typing import Tuple, List
from collections import namedtuple
import torch
from torch.functional import Tensor
import torch.nn.functional as F
EPS = 1e-8
def acer_policy_error(
q_values: torch.Tensor,
q_retraces: torch.Tensor,
v_pred: torch.Tensor,
target_pi: torch.Tensor,
actions: torch.Tensor,
ratio: torch.Tensor,
c_clip_ratio: float = 10.0
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Overview:
Get ACER policy loss
Arguments:
- q_values (:obj:`torch.Tensor`): Q values
- q_retraces (:obj:`torch.Tensor`): Q values computed by the retrace method
- v_pred (:obj:`torch.Tensor`): V values
- target_pi (:obj:`torch.Tensor`): The new policy's probability
- actions (:obj:`torch.Tensor`): The actions in the replay buffer
- ratio (:obj:`torch.Tensor`): Ratio of the new policy to the behavior policy
- c_clip_ratio (:obj:`float`): Clip value for the ratio
Returns:
- actor_loss (:obj:`torch.Tensor`): Policy loss from the q_retrace term
- bc_loss (:obj:`torch.Tensor`): Bias-correction policy loss
Shapes:
- q_values (:obj:`torch.FloatTensor`): :math:`(T, B, N)`, where B is batch size and N is action dim
- q_retraces (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
- v_pred (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
- target_pi (:obj:`torch.FloatTensor`): :math:`(T, B, N)`
- actions (:obj:`torch.LongTensor`): :math:`(T, B)`
- ratio (:obj:`torch.FloatTensor`): :math:`(T, B, N)`
- actor_loss (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
- bc_loss (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
"""
actions = actions.unsqueeze(-1)
with torch.no_grad():
advantage_retraces = q_retraces - v_pred # shape T,B,1
advantage_native = q_values - v_pred # shape T,B,env_action_shape
actor_loss = ratio.gather(-1, actions).clamp(max=c_clip_ratio) * advantage_retraces * \
    (target_pi.gather(-1, actions) + EPS).log()  # shape T,B,1
# bias correction term; gradients do not flow through the detached target_pi factor
bias_correction_loss = (1.0 - c_clip_ratio / (ratio + EPS)).clamp(min=0.0) * target_pi.detach() * \
    advantage_native * (target_pi + EPS).log()  # shape T,B,env_action_shape
bias_correction_loss = bias_correction_loss.sum(-1, keepdim=True)
return actor_loss, bias_correction_loss
def acer_value_error(q_values: torch.Tensor, q_retraces: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
"""
Overview:
Get ACER critic loss
Arguments:
- q_values (:obj:`torch.Tensor`): Q values
- q_retraces (:obj:`torch.Tensor`): Q values computed by the retrace method
- actions (:obj:`torch.Tensor`): The actions in the replay buffer
Returns:
- critic_loss (:obj:`torch.Tensor`): critic loss
Shapes:
- q_values (:obj:`torch.FloatTensor`): :math:`(T, B, N)`, where B is batch size and N is action dim
- q_retraces (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
- actions (:obj:`torch.LongTensor`): :math:`(T, B)`
- critic_loss (:obj:`torch.FloatTensor`): :math:`(T, B, 1)`
"""
actions = actions.unsqueeze(-1)
critic_loss = 0.5 * (q_retraces - q_values.gather(-1, actions)).pow(2)
return critic_loss
def acer_trust_region_update(
actor_gradients: List[torch.Tensor], target_pi: torch.Tensor, avg_pi: torch.Tensor, trust_region_value: float
) -> List[torch.Tensor]:
"""
Overview:
Calculate the gradient with the trust region constraint
Arguments:
- actor_gradients (:obj:`list(torch.Tensor)`): Gradient values for the different parts
- target_pi (:obj:`torch.Tensor`): The new policy's probability
- avg_pi (:obj:`torch.Tensor`): The average policy's probability
- trust_region_value (:obj:`float`): the range of trust region
Returns:
- update_gradients (:obj:`list(torch.Tensor)`): gradients with trust region constraint
Shapes:
- target_pi (:obj:`torch.FloatTensor`): :math:`(T, B, N)`
- avg_pi (:obj:`torch.FloatTensor`): :math:`(T, B, N)`
"""
with torch.no_grad():
KL_gradients = [(avg_pi / (target_pi + EPS))]
update_gradients = []
# TODO: there is currently only one element in this list; more elements may be used in the future
actor_gradient = actor_gradients[0]
KL_gradient = KL_gradients[0]
scale = actor_gradient.mul(KL_gradient).sum(-1, keepdim=True) - trust_region_value
scale = torch.div(scale, KL_gradient.mul(KL_gradient).sum(-1, keepdim=True)).clamp(min=0.0)
update_gradients.append(actor_gradient - scale * KL_gradient)
return update_gradients
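if __name__ == '__main__':
    # A minimal usage sketch with dummy tensors, assuming unroll length T=4,
    # batch size B=2 and action dim N=3; shapes follow the docstrings above.
    T, B, N = 4, 2, 3
    q_values = torch.randn(T, B, N)
    q_retraces = torch.randn(T, B, 1)
    v_pred = torch.randn(T, B, 1)
    target_pi = torch.softmax(torch.randn(T, B, N), dim=-1)
    actions = torch.randint(0, N, (T, B))
    ratio = torch.rand(T, B, N) + 0.5
    actor_loss, bc_loss = acer_policy_error(q_values, q_retraces, v_pred, target_pi, actions, ratio)
    critic_loss = acer_value_error(q_values, q_retraces, actions)
    assert actor_loss.shape == bc_loss.shape == critic_loss.shape == (T, B, 1)
    # The trust region update rescales the policy gradient so that its projection
    # onto the KL direction (w.r.t. the average policy) stays within the bound.
    grad = [torch.randn(T, B, N)]
    avg_pi = torch.softmax(torch.randn(T, B, N), dim=-1)
    new_grad = acer_trust_region_update(grad, target_pi, avg_pi, trust_region_value=1.0)
    assert new_grad[0].shape == (T, B, N)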
import torch
import torch.nn.functional as F
from collections import namedtuple
from .isw import compute_importance_weights
from ding.hpc_rl import hpc_wrapper
def compute_q_retraces(
q_values: torch.Tensor,
v_pred: torch.Tensor,
rewards: torch.Tensor,
actions: torch.Tensor,
weights: torch.Tensor,
ratio: torch.Tensor,
gamma: float = 0.9
) -> torch.Tensor:
rewards = rewards.unsqueeze(-1) # shape T,B,1
actions = actions.unsqueeze(-1) # shape T,B,1
weights = weights.unsqueeze(-1) # shape T,B,1
q_retraces = torch.zeros_like(v_pred) # shape (T+1),B,1
n_len = q_retraces.size()[0] # T+1
tmp_retraces = v_pred[-1, ...] # shape B,1
q_retraces[-1, ...] = v_pred[-1, ...]
q_gather = torch.zeros_like(v_pred)
q_gather[0:-1, ...] = q_values[0:-1, ...].gather(-1, actions) # shape (T+1),B,1
ratio_gather = ratio.gather(-1, actions) # shape T,B,1
for idx in reversed(range(n_len - 1)):
q_retraces[idx, ...] = rewards[idx, ...] + gamma * weights[idx, ...] * tmp_retraces
tmp_retraces = ratio_gather[idx, ...].clamp(max=1.0) * \
    (q_retraces[idx, ...] - q_gather[idx, ...]) + v_pred[idx, ...]
return q_retraces # shape (T+1),B,1
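if __name__ == '__main__':
    # A minimal usage sketch with dummy tensors, assuming unroll length T=4 and
    # batch size B=2: q_values/v_pred carry the bootstrap step (T+1), while
    # rewards, actions, weights and ratio cover the T collected transitions.
    T, B, N = 4, 2, 3
    q_values = torch.randn(T + 1, B, N)
    v_pred = torch.randn(T + 1, B, 1)
    rewards = torch.randn(T, B)
    actions = torch.randint(0, N, (T, B))
    weights = torch.ones(T, B)  # per-step weights (e.g. 1 - done mask)
    ratio = torch.rand(T, B, N) + 0.5
    q_ret = compute_q_retraces(q_values, v_pred, rewards, actions, weights, ratio, gamma=0.99)
    assert q_ret.shape == (T + 1, B, 1)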
from copy import deepcopy
from ding.entry import serial_pipeline
from easydict import EasyDict
pong_acer_config = dict(
env=dict(
collector_env_num=16,
evaluator_env_num=4,
n_evaluator_episode=8,
stop_value=20,
env_id='PongNoFrameskip-v4',
frame_stack=4,
manager=dict(shared_memory=False, )
),
policy=dict(
cuda=True,
on_policy=False,
priority=False,
model=dict(
obs_shape=[4, 84, 84],
action_shape=6,
encoder_hidden_size_list=[128, 128, 512],
critic_head_hidden_size=512,
critic_head_layer_num=2,
actor_head_hidden_size=512,
actor_head_layer_num=2,
),
unroll_len=32,
learn=dict(
# (int) collect n_sample data, train model update_per_collect times
# here we follow impala serial pipeline
update_per_collect=10,
# (int) the number of data for a train iteration
batch_size=64,
# grad_clip_type='clip_norm',
# clip_value=10,
learning_rate_actor=0.0001,
learning_rate_critic=0.0003,
# (float) loss weight of the entropy regularization, the weight of the policy loss is set to 1
entropy_weight=0.01,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.9,
# (bool) whether to use the trust region constraint for the policy update
trust_region=True,
# (float) clip ratio of importance weights
c_clip_ratio=10,
),
collect=dict(
# (int) collect n_sample data, train model n_iteration times
n_sample=16,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.9,
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=5000, )),
other=dict(replay_buffer=dict(
type='naive',
replay_buffer_size=10000,
), ),
),
)
main_config = EasyDict(pong_acer_config)
pong_acer_create_config = dict(
env=dict(
type='atari',
import_names=['dizoo.atari.envs.atari_env'],
),
env_manager=dict(type='subprocess'),
policy=dict(type='acer'),
)
create_config = EasyDict(pong_acer_create_config)
if __name__ == '__main__':
serial_pipeline((main_config, create_config), seed=0)
......@@ -10,9 +10,7 @@ pong_impala_config = dict(
stop_value=20,
env_id='PongNoFrameskip-v4',
frame_stack=4,
manager=dict(
shared_memory=False,
)
manager=dict(shared_memory=False, )
),
policy=dict(
cuda=True,
......@@ -22,10 +20,10 @@ pong_impala_config = dict(
obs_shape=[4, 84, 84],
action_shape=6,
encoder_hidden_size_list=[128, 128, 512],
critic_head_hidden_size = 512,
critic_head_hidden_size=512,
critic_head_layer_num=2,
actor_head_hidden_size=512,
actor_head_layer_num =2,
actor_head_layer_num=2,
),
learn=dict(
# (int) collect n_sample data, train model update_per_collect times
......@@ -84,5 +82,4 @@ pong_impala_create_config = dict(
create_config = EasyDict(pong_impala_create_config)
if __name__ == '__main__':
from ding.entry import serial_pipeline
serial_pipeline((main_config, create_config), seed=0)
from easydict import EasyDict
from ding.entry import serial_pipeline
nstep = 3
lunarlander_acer_default_config = dict(
env=dict(
# Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess'
manager=dict(shared_memory=True, ),
# Env number respectively for collector and evaluator.
collector_env_num=8,
evaluator_env_num=5,
n_evaluator_episode=5,
stop_value=5,
),
policy=dict(
# Whether to use cuda for network.
cuda=False,
# Whether the RL algorithm is on-policy or off-policy.
on_policy=False,
# Model config used for model creation. Remember to change this, especially "obs_shape" and "action_shape", according to the specific env.
model=dict(
obs_shape=8,
action_shape=4,
encoder_hidden_size_list=[512, 64],
),
# Reward's future discount factor, a.k.a. gamma.
discount_factor=0.99,
# How many steps in td error.
nstep=nstep,
unroll_len=32,
# learn_mode config
learn=dict(
# (int) collect n_sample data, train model update_per_collect times
# here we follow impala serial pipeline
update_per_collect=10,
# (int) the number of data for a train iteration
batch_size=32,
# grad_clip_type='clip_norm',
# clip_value=10,
learning_rate_actor=0.0001,
learning_rate_critic=0.0001,
# (float) loss weight of the entropy regularization, the weight of the policy loss is set to 1
entropy_weight=0.0,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.99,
# (float) clip ratio of importance weights
c_clip_ratio=10,
),
collect=dict(
# (int) collect n_sample data, train model n_iteration times
n_sample=16,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.99,
gae_lambda=0.95,
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=5000, )),
other=dict(replay_buffer=dict(
type='naive',
replay_buffer_size=50000,
), ),
),
)
lunarlander_acer_default_config = EasyDict(lunarlander_acer_default_config)
main_config = lunarlander_acer_default_config
lunarlander_acer_create_config = dict(
env=dict(
type='lunarlander',
import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'],
),
env_manager=dict(type='subprocess'),
policy=dict(type='acer'),
)
lunarlander_acer_create_config = EasyDict(lunarlander_acer_create_config)
create_config = lunarlander_acer_create_config
if __name__ == "__main__":
serial_pipeline([main_config, create_config], seed=0)
......@@ -57,9 +57,7 @@ lunarlander_dqn_default_config = dict(
end=0.1,
decay=50_000,
),
replay_buffer=dict(
replay_buffer_size=100000,
)
replay_buffer=dict(replay_buffer_size=100000, )
),
),
)
......@@ -78,4 +76,4 @@ lunarlander_dqn_create_config = EasyDict(lunarlander_dqn_create_config)
create_config = lunarlander_dqn_create_config
if __name__ == "__main__":
serial_pipeline([main_config, create_config], seed=0)
\ No newline at end of file
serial_pipeline([main_config, create_config], seed=0)
......@@ -10,4 +10,5 @@ from .cartpole_qrdqn_config import cartpole_qrdqn_config, cartpole_qrdqn_create_
from .cartpole_sqn_config import cartpole_sqn_config, cartpole_sqn_create_config
from .cartpole_ppg_config import cartpole_ppg_config, cartpole_ppg_create_config
from .cartpole_r2d2_config import cartpole_r2d2_config, cartpole_r2d2_create_config
from .cartpole_acer_config import cartpole_acer_config, cartpole_acer_create_config
# from .cartpole_ppo_default_loader import cartpole_ppo_default_loader
from easydict import EasyDict
cartpole_acer_config = dict(
env=dict(
collector_env_num=8,
evaluator_env_num=5,
n_evaluator_episode=5,
stop_value=195,
),
policy=dict(
cuda=False,
model=dict(
obs_shape=4,
action_shape=2,
encoder_hidden_size_list=[64, 64],
),
# (int) the trajectory length to calculate Q retrace target
unroll_len=32,
learn=dict(
# (int) collect n_sample data, train model update_per_collect times
# here we follow ppo serial pipeline
update_per_collect=4,
# (int) the number of data for a train iteration
batch_size=16,
learning_rate_actor=0.0005,
learning_rate_critic=0.0005,
# (float) loss weight of the entropy regularization, the weight of the policy loss is set to 1
entropy_weight=0.0,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.9,
# (bool) whether to use the trust region constraint for the policy update
trust_region=True,
# (float) clip ratio of importance weights
c_clip_ratio=10,
),
collect=dict(
# (int) collect n_sample data, train model n_iteration times
n_sample=16,
# (float) discount factor for future reward, defaults in [0, 1]
discount_factor=0.9,
collector=dict(collect_print_freq=1000, ),
),
eval=dict(evaluator=dict(eval_freq=200, )),
other=dict(replay_buffer=dict(replay_buffer_size=10000, ), ),
),
)
cartpole_acer_config = EasyDict(cartpole_acer_config)
main_config = cartpole_acer_config
cartpole_acer_create_config = dict(
env=dict(
type='cartpole',
import_names=['dizoo.classic_control.cartpole.envs.cartpole_env'],
),
env_manager=dict(type='base'),
policy=dict(type='acer'),
)
cartpole_acer_create_config = EasyDict(cartpole_acer_create_config)
create_config = cartpole_acer_create_config
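if __name__ == "__main__":
    # A minimal launch sketch, mirroring the pong/lunarlander ACER configs above:
    # train cartpole ACER with the serial pipeline.
    from ding.entry import serial_pipeline
    serial_pipeline((main_config, create_config), seed=0)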
......@@ -3,7 +3,6 @@ from ding.config import parallel_transform
from copy import deepcopy
from ding.entry import parallel_pipeline
gfootball_ppo_config = dict(
env=dict(
collector_env_num=1,
......@@ -65,10 +64,7 @@ gfootball_ppo_create_config = dict(
type='gfootball_sp',
),
env_manager=dict(type='base'),
policy=dict(
type='ppo_lstm_command',
import_names=['dizoo.gfootball.policy.ppo_lstm']
),
policy=dict(type='ppo_lstm_command', import_names=['dizoo.gfootball.policy.ppo_lstm']),
learner=dict(type='base', import_names=['ding.worker.learner.base_learner']),
collector=dict(
type='one_vs_one',
......@@ -101,7 +97,6 @@ gfootball_ppo_system_config = dict(
gfootball_ppo_system_config = EasyDict(gfootball_ppo_system_config)
system_config = gfootball_ppo_system_config
if __name__ == '__main__':
config = tuple([deepcopy(main_config), deepcopy(create_config), deepcopy(system_config)])
parallel_pipeline(config, seed=0)
\ No newline at end of file
parallel_pipeline(config, seed=0)
from .ppo_lstm import PPOPolicy, PPOCommandModePolicy
\ No newline at end of file
from .ppo_lstm import PPOPolicy, PPOCommandModePolicy
......@@ -294,7 +294,6 @@ class PPOPolicy(Policy):
else:
return get_nstep_return_data(data, self._nstep)
def _init_eval(self) -> None:
r"""
Overview:
......@@ -326,6 +325,7 @@ class PPOPolicy(Policy):
output = to_device(output, 'cpu')
output = default_decollate(output)
return {i: d for i, d in zip(data_id, output)}
def _reset_eval(self, data_id: Optional[List[int]] = None) -> None:
self._eval_model.reset(data_id=data_id)
......@@ -338,7 +338,6 @@ class PPOPolicy(Policy):
]
@POLICY_REGISTRY.register('ppo_lstm_command')
class PPOCommandModePolicy(PPOPolicy, DummyCommandModePolicy):
pass
\ No newline at end of file
pass
......@@ -12,15 +12,15 @@ pong_dqn_config = dict(
warp_frame=False,
use_ram=True,
pomdp=dict(noise_scale=0.01, zero_p=0.2, reward_noise=0.01, duplicate_p=0.2),
manager=dict(
shared_memory=False,
)
manager=dict(shared_memory=False, )
),
policy=dict(
cuda=True,
priority=False,
model=dict(
obs_shape=[512, ],
obs_shape=[
512,
],
action_shape=6,
encoder_hidden_size_list=[128, 128, 512],
),
......@@ -32,9 +32,7 @@ pong_dqn_config = dict(
learning_rate=0.0001,
target_update_freq=500,
),
collect=dict(
n_sample=100,
),
collect=dict(n_sample=100, ),
eval=dict(evaluator=dict(eval_freq=4000, )),
other=dict(
eps=dict(
......
......@@ -12,16 +12,16 @@ pong_ppo_config = dict(
warp_frame=False,
use_ram=True,
pomdp=dict(noise_scale=0.01, zero_p=0.2, reward_noise=0.01, duplicate_p=0.2),
manager=dict(
shared_memory=False,
)
manager=dict(shared_memory=False, )
),
policy=dict(
cuda=True,
on_policy=False,
# (bool) whether use on-policy training pipeline(behaviour policy and training policy are the same)
model=dict(
obs_shape=[512, ],
obs_shape=[
512,
],
action_shape=6,
encoder_hidden_size_list=[512, 512, 256],
actor_head_hidden_size=256,
......@@ -50,13 +50,11 @@ pong_ppo_config = dict(
discount_factor=0.99,
),
eval=dict(evaluator=dict(eval_freq=200, )),
other=dict(
replay_buffer=dict(
replay_buffer_size=100000,
max_use=3,
min_sample_ratio=1,
),
),
other=dict(replay_buffer=dict(
replay_buffer_size=100000,
max_use=3,
min_sample_ratio=1,
), ),
),
)
main_config = EasyDict(pong_ppo_config)
......