Commit 3f4caf56 authored by Varuna Jayasiri

ppo configs

Parent 5c429e7a
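This commit moves the PPO trainer's hard-coded hyperparameters into an `experiment.configs` dictionary and wraps the tunable ones in `FloatDynamicHyperParam`. The sketch below is not part of the commit; it only illustrates the calling convention the new code relies on, assuming the constructor and call semantics visible in the diff (a default value, an optional allowed range, and a call that returns the current value):

from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam

# a dynamic hyperparameter holds a default value; a second argument gives an allowed range
clip_range = FloatDynamicHyperParam(0.1)
learning_rate = FloatDynamicHyperParam(2.5e-4, (0, 1e-3))

# calling the object returns its current value, so code that reads it on every
# update always sees the latest setting (e.g. one adjusted while training runs)
print(clip_range(), learning_rate())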
@@ -19,6 +19,7 @@ from torch import optim
 from torch.distributions import Categorical

 from labml import monit, tracker, logger, experiment
+from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam
 from labml_helpers.module import Module
 from labml_nn.rl.game import Worker
 from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
@@ -89,24 +90,40 @@ class Trainer:
     ## Trainer
     """

-    def __init__(self):
+    def __init__(self, *,
+                 updates: int, epochs: int, n_workers: int, worker_steps: int, batches: int,
+                 value_loss_coef: FloatDynamicHyperParam,
+                 entropy_bonus_coef: FloatDynamicHyperParam,
+                 clip_range: FloatDynamicHyperParam,
+                 learning_rate: FloatDynamicHyperParam,
+                 ):
         # #### Configurations

         # number of updates
-        self.updates = 10000
+        self.updates = updates
         # number of epochs to train the model with sampled data
-        self.epochs = 4
+        self.epochs = epochs
         # number of worker processes
-        self.n_workers = 8
+        self.n_workers = n_workers
         # number of steps to run on each process for a single update
-        self.worker_steps = 128
+        self.worker_steps = worker_steps
         # number of mini batches
-        self.n_mini_batch = 4
+        self.batches = batches
         # total number of samples for a single update
         self.batch_size = self.n_workers * self.worker_steps
         # size of a mini batch
-        self.mini_batch_size = self.batch_size // self.n_mini_batch
-        assert (self.batch_size % self.n_mini_batch == 0)
+        self.mini_batch_size = self.batch_size // self.batches
+        assert (self.batch_size % self.batches == 0)
+
+        # Value loss coefficient
+        self.value_loss_coef = value_loss_coef
+        # Entropy bonus coefficient
+        self.entropy_bonus_coef = entropy_bonus_coef
+
+        # Clipping range
+        self.clip_range = clip_range
+        # Learning rate
+        self.learning_rate = learning_rate

         # #### Initialize
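With the defaults registered in `main()` further down (8 workers, 128 steps per worker, 4 mini batches), the sizes computed in the constructor work out as in this standalone arithmetic check (not part of the diff):

n_workers, worker_steps, batches = 8, 128, 4

# samples collected per update, across all workers
batch_size = n_workers * worker_steps        # 8 * 128 = 1024
# each update is split evenly into mini batches
mini_batch_size = batch_size // batches      # 1024 // 4 = 256
# the assert in __init__ guarantees the split is exact
assert batch_size % batches == 0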
@@ -204,7 +221,7 @@ class Trainer:

         return samples_flat

-    def train(self, samples: Dict[str, torch.Tensor], learning_rate: float, clip_range: float):
+    def train(self, samples: Dict[str, torch.Tensor]):
         """
         ### Train the model based on samples
         """
@@ -228,12 +245,11 @@ class Trainer:
                     mini_batch[k] = v[mini_batch_indexes]

                 # train
-                loss = self._calc_loss(clip_range=clip_range,
-                                       samples=mini_batch)
+                loss = self._calc_loss(mini_batch)

                 # Set learning rate
                 for pg in self.optimizer.param_groups:
-                    pg['lr'] = learning_rate
+                    pg['lr'] = self.learning_rate()
                 # Zero out the previously calculated gradients
                 self.optimizer.zero_grad()
                 # Calculate gradients
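Because `self.learning_rate` is now called inside the mini-batch loop, every optimizer step picks up whatever value the dynamic hyperparameter holds at that moment. A minimal self-contained sketch of the same parameter-group pattern with a plain Adam optimizer; the model and names below are illustrative only:

import torch
from torch import nn, optim

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=2.5e-4)

def set_lr(lr: float):
    # overwrite the learning rate of every parameter group before the next step
    for pg in optimizer.param_groups:
        pg['lr'] = lr

set_lr(1e-4)  # e.g. a value read from a dynamic hyperparameter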
@@ -248,7 +264,7 @@ class Trainer:
         """#### Normalize advantage function"""
         return (adv - adv.mean()) / (adv.std() + 1e-8)

-    def _calc_loss(self, samples: Dict[str, torch.Tensor], clip_range: float) -> torch.Tensor:
+    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:
         """
         ### Calculate total loss
         """
@@ -270,7 +286,7 @@ class Trainer:
         log_pi = pi.log_prob(samples['actions'])

         # Calculate policy loss
-        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, clip_range)
+        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())

         # Calculate Entropy Bonus
         #
@@ -280,12 +296,14 @@ class Trainer:
         entropy_bonus = entropy_bonus.mean()

         # Calculate value function loss
-        value_loss = self.value_loss(value, samples['values'], sampled_return, clip_range)
+        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())

         # $\mathcal{L}^{CLIP+VF+EB} (\theta) =
         #  \mathcal{L}^{CLIP} (\theta) +
         #  c_1 \mathcal{L}^{VF} (\theta) - c_2 \mathcal{L}^{EB}(\theta)$
-        loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus
+        loss = (policy_loss
+                + self.value_loss_coef() * value_loss
+                - self.entropy_bonus_coef() * entropy_bonus)

         # for monitoring
         approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
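With the default coefficients registered below (`value_loss_coef = 0.5`, `entropy_bonus_coef = 0.01`), the new expression reproduces the removed hard-coded objective, $\mathcal{L}(\theta) = \mathcal{L}^{CLIP}(\theta) + 0.5 \, \mathcal{L}^{VF}(\theta) - 0.01 \, \mathcal{L}^{EB}(\theta)$, with the difference that $c_1$ and $c_2$ can now be adjusted while the run is in progress.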
@@ -309,17 +327,11 @@ class Trainer:
         tracker.set_queue('length', 100, True)

         for update in monit.loop(self.updates):
-            progress = update / self.updates
-
-            # decreasing `learning_rate` and `clip_range` $\epsilon$
-            learning_rate = 2.5e-4 * (1 - progress)
-            clip_range = 0.1 * (1 - progress)
-
             # sample with current policy
             samples = self.sample()

             # train the model
-            self.train(samples, learning_rate, clip_range)
+            self.train(samples)

             # Save tracked indicators.
             tracker.save()
@@ -339,8 +351,43 @@
 def main():
     # Create the experiment
     experiment.create(name='ppo')
+    # Configurations
+    configs = {
+        # number of updates
+        'updates': 10000,
+        # number of epochs to train the model with sampled data
+        'epochs': 4,
+        # number of worker processes
+        'n_workers': 8,
+        # number of steps to run on each process for a single update
+        'worker_steps': 128,
+        # number of mini batches
+        'batches': 4,
+        # Value loss coefficient
+        'value_loss_coef': FloatDynamicHyperParam(0.5),
+        # Entropy bonus coefficient
+        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),
+        # Clip range
+        'clip_range': FloatDynamicHyperParam(0.1),
+        # Learning rate
+        'learning_rate': FloatDynamicHyperParam(2.5e-4, (0, 1e-3)),
+    }
+    experiment.configs(configs)
+
     # Initialize the trainer
-    m = Trainer()
+    m = Trainer(
+        updates=configs['updates'],
+        epochs=configs['epochs'],
+        n_workers=configs['n_workers'],
+        worker_steps=configs['worker_steps'],
+        batches=configs['batches'],
+        value_loss_coef=configs['value_loss_coef'],
+        entropy_bonus_coef=configs['entropy_bonus_coef'],
+        clip_range=configs['clip_range'],
+        learning_rate=configs['learning_rate'],
+    )
     # Run and monitor the experiment
     with experiment.start():
         m.run_training_loop()
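The diff does not show how `main()` is invoked; assuming the module keeps a standard script entry point, the experiment is launched directly and the dynamic hyperparameters keep their defaults until they are changed at run time:

# hypothetical entry point, not shown in this diff
if __name__ == '__main__':
    main()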