diff --git a/ding/policy/impala.py b/ding/policy/impala.py
index 4d693144bb7ed6637ca5a88aaf94b9be8801a39c..a37a1b257353025f8c1db03b03f81938a30dfb0f 100644
--- a/ding/policy/impala.py
+++ b/ding/policy/impala.py
@@ -39,7 +39,6 @@ class IMPALAPolicy(Policy):
            | valid in serial training | means more off-policy
         == ==================== ======== ============== ======================================== =======================
     """
-    unroll_len = 32
     config = dict(
         type='impala',
         cuda=False,
@@ -49,6 +48,8 @@ class IMPALAPolicy(Policy):
         priority=False,
         # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True.
         priority_IS_weight=False,
+        # (int) the trajectory length to calculate v-trace target
+        unroll_len=32,
         learn=dict(
             # (bool) Whether to use multi gpu
             multi_gpu=False,
@@ -66,8 +67,6 @@ class IMPALAPolicy(Policy):
             discount_factor=0.9,
             # (float) additional discounting parameter
             lambda_=0.95,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=unroll_len,
             # (float) clip ratio of importance weights
             rho_clip_ratio=1.0,
             # (float) clip ratio of importance weights
@@ -78,8 +77,6 @@ class IMPALAPolicy(Policy):
         collect=dict(
             # (int) collect n_sample data, train model n_iteration times
             n_sample=16,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=unroll_len,
             # (float) discount factor for future reward, defaults int [0, 1]
             discount_factor=0.9,
             gae_lambda=0.95,
@@ -116,7 +113,7 @@ class IMPALAPolicy(Policy):
         self._learn_model = model_wrap(self._model, wrapper_name='base')
 
         self._action_shape = self._cfg.model.action_shape
-        self._unroll_len = self._cfg.learn.unroll_len
+        self._unroll_len = self._cfg.unroll_len
 
         # Algorithm config
         self._priority = self._cfg.priority
@@ -290,7 +287,6 @@ class IMPALAPolicy(Policy):
             Collect mode init method. Called by ``self.__init__``, initialize algorithm arguments and collect_model.
             Use multinomial_sample to choose action.
         """
-        self._collect_unroll_len = self._cfg.collect.unroll_len
         self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample')
         self._collect_model.reset()
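The substantive change in this file is that `unroll_len` moves from the `learn` and `collect` sub-configs to the top level of the policy config, so both modes read one shared value instead of two copies that had to be kept in sync. A minimal sketch of the new access pattern, assuming an `EasyDict`-style config as DI-engine uses (the field values here are illustrative, not from the PR):

```python
from easydict import EasyDict

# Sketch only: unroll_len now lives at the top level of the policy config,
# so the learn-mode and collect-mode init paths share a single value.
cfg = EasyDict(dict(
    type='impala',
    unroll_len=32,  # one source of truth for the v-trace trajectory length
    learn=dict(batch_size=4),
    collect=dict(n_sample=16),
))

learn_unroll_len = cfg.unroll_len    # was: cfg.learn.unroll_len
collect_unroll_len = cfg.unroll_len  # was: cfg.collect.unroll_len
assert learn_unroll_len == collect_unroll_len == 32
```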
""" - self._collect_unroll_len = self._cfg.collect.unroll_len self._collect_model = model_wrap(self._model, wrapper_name='multinomial_sample') self._collect_model.reset() diff --git a/dizoo/atari/config/serial/enduro/enduro_impala_config.py b/dizoo/atari/config/serial/enduro/enduro_impala_config.py index 87f54bbc157464fb5b7365f2a0fe5ef4a89b7fd7..3e2874e8c4aa00dd817a37c2f1ac6d7a48b57e7c 100644 --- a/dizoo/atari/config/serial/enduro/enduro_impala_config.py +++ b/dizoo/atari/config/serial/enduro/enduro_impala_config.py @@ -14,6 +14,8 @@ enduro_impala_config = dict( ), policy=dict( cuda=True, + # (int) the trajectory length to calculate v-trace target + unroll_len=64, model=dict( obs_shape=[4, 84, 84], action_shape=9, @@ -40,8 +42,6 @@ enduro_impala_config = dict( discount_factor=0.99, # (float) additional discounting parameter lambda_=1.0, - # (int) the trajectory length to calculate v-trace target - unroll_len=64, # (float) clip ratio of importance weights rho_clip_ratio=1.0, # (float) clip ratio of importance weights @@ -52,8 +52,6 @@ enduro_impala_config = dict( collect=dict( # (int) collect n_sample data, train model n_iteration times n_sample=16, - # (int) the trajectory length to calculate v-trace target - unroll_len=64, # (float) discount factor for future reward, defaults int [0, 1] discount_factor=0.99, gae_lambda=0.95, diff --git a/dizoo/atari/config/serial/pong/pong_impala_config.py b/dizoo/atari/config/serial/pong/pong_impala_config.py index 7b5a536b7e978f958ef914b4903b6c2ce1fbdb9e..169b53d3cfc13fee43077c2898b1a6d1901f8839 100644 --- a/dizoo/atari/config/serial/pong/pong_impala_config.py +++ b/dizoo/atari/config/serial/pong/pong_impala_config.py @@ -15,6 +15,8 @@ pong_impala_config = dict( policy=dict( cuda=True, priority=False, + # (int) the trajectory length to calculate v-trace target + unroll_len=64, model=dict( obs_shape=[4, 84, 84], action_shape=6, @@ -41,8 +43,6 @@ pong_impala_config = dict( discount_factor=0.9, # (float) additional discounting parameter lambda_=0.95, - # (int) the trajectory length to calculate v-trace target - unroll_len=64, # (float) clip ratio of importance weights rho_clip_ratio=1.0, # (float) clip ratio of importance weights @@ -53,8 +53,6 @@ pong_impala_config = dict( collect=dict( # (int) collect n_sample data, train model n_iteration times n_sample=16, - # (int) the trajectory length to calculate v-trace target - unroll_len=64, # (float) discount factor for future reward, defaults int [0, 1] discount_factor=0.9, gae_lambda=0.95, diff --git a/dizoo/atari/config/serial/qbert/qbert_impala_config.py b/dizoo/atari/config/serial/qbert/qbert_impala_config.py index c957b7c3e6309d83956d0715f980ed561b2ed471..b206f43744be9ed41a9396a938d8659a9e1bc527 100644 --- a/dizoo/atari/config/serial/qbert/qbert_impala_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_impala_config.py @@ -14,6 +14,8 @@ qbert_impala_config = dict( ), policy=dict( cuda=True, + # (int) the trajectory length to calculate v-trace target + unroll_len=64, model=dict( obs_shape=[4, 84, 84], action_shape=6, @@ -40,8 +42,6 @@ qbert_impala_config = dict( discount_factor=0.9, # (float) additional discounting parameter lambda_=0.95, - # (int) the trajectory length to calculate v-trace target - unroll_len=64, # (float) clip ratio of importance weights rho_clip_ratio=1.0, # (float) clip ratio of importance weights @@ -52,8 +52,6 @@ qbert_impala_config = dict( collect=dict( # (int) collect n_sample data, train model n_iteration times n_sample=16, - # (int) the trajectory length to calculate v-trace 
diff --git a/dizoo/atari/config/serial/qbert/qbert_impala_config.py b/dizoo/atari/config/serial/qbert/qbert_impala_config.py
index c957b7c3e6309d83956d0715f980ed561b2ed471..b206f43744be9ed41a9396a938d8659a9e1bc527 100644
--- a/dizoo/atari/config/serial/qbert/qbert_impala_config.py
+++ b/dizoo/atari/config/serial/qbert/qbert_impala_config.py
@@ -14,6 +14,8 @@ qbert_impala_config = dict(
     ),
     policy=dict(
         cuda=True,
+        # (int) the trajectory length to calculate v-trace target
+        unroll_len=64,
         model=dict(
             obs_shape=[4, 84, 84],
             action_shape=6,
@@ -40,8 +42,6 @@ qbert_impala_config = dict(
             discount_factor=0.9,
             # (float) additional discounting parameter
             lambda_=0.95,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=64,
             # (float) clip ratio of importance weights
             rho_clip_ratio=1.0,
             # (float) clip ratio of importance weights
@@ -52,8 +52,6 @@ qbert_impala_config = dict(
         collect=dict(
             # (int) collect n_sample data, train model n_iteration times
             n_sample=16,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=64,
             # (float) discount factor for future reward, defaults int [0, 1]
             discount_factor=0.9,
             gae_lambda=0.95,
diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_impala_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_impala_config.py
index 42d4434835d6e889471097eb26f4e169ec8e3e0e..dfa9c907d65db3f95b060352f24889f34ea63fb7 100644
--- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_impala_config.py
+++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_impala_config.py
@@ -14,6 +14,8 @@ space_invaders_impala_config = dict(
     ),
     policy=dict(
         cuda=True,
+        # (int) the trajectory length to calculate v-trace target
+        unroll_len=64,
         model=dict(
             obs_shape=[4, 84, 84],
             action_shape=6,
@@ -40,8 +42,6 @@ space_invaders_impala_config = dict(
             discount_factor=0.9,
             # (float) additional discounting parameter
             lambda_=0.95,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=64,
             # (float) clip ratio of importance weights
             rho_clip_ratio=1.0,
             # (float) clip ratio of importance weights
@@ -52,8 +52,6 @@ space_invaders_impala_config = dict(
         collect=dict(
             # (int) collect n_sample data, train model n_iteration times
             n_sample=16,
-            # (int) the trajectory length to calculate v-trace target
-            unroll_len=64,
             # (float) discount factor for future reward, defaults int [0, 1]
             discount_factor=0.9,
             gae_lambda=0.95,
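As a rough sanity check of what these numbers mean together (assumed semantics, not code from this PR): if each collected sample is one unrolled trajectory segment, a training iteration under these configs consumes `n_sample * unroll_len` environment steps.

```python
# Back-of-the-envelope sketch under the assumption that one collected
# sample corresponds to one trajectory segment of length unroll_len.
n_sample = 16    # from the collect configs above
unroll_len = 64  # from the new top-level policy field
env_steps_per_train_iteration = n_sample * unroll_len
print(env_steps_per_train_iteration)  # 1024
```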