"""
---
title: PPO Experiment with Atari Breakout
summary: Annotated implementation to train a PPO agent on the Atari Breakout game.
---

# PPO Experiment with Atari Breakout

This experiment trains a Proximal Policy Optimization (PPO) agent on the Atari Breakout game from OpenAI Gym.
It runs the [game environments on multiple processes](../game.html) to sample efficiently.
"""

from typing import Dict

import numpy as np
import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical

from labml import monit, tracker, logger, experiment
from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam
from labml_helpers.module import Module
from labml_nn.rl.game import Worker
from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
from labml_nn.rl.ppo.gae import GAE

# Select device
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


class Model(Module):
    """
    ## Model
    """

    def __init__(self):
        super().__init__()

        # The first convolution layer takes a
        # 84x84 frame and produces a 20x20 frame
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)

        # The second convolution layer takes a
        # 20x20 frame and produces a 9x9 frame
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)

        # The third convolution layer takes a
        # 9x9 frame and produces a 7x7 frame
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
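        # The frame sizes noted above follow from the standard convolution
        #  output-size formula `out = (in - kernel_size) // stride + 1` (no padding):
        #  (84 - 8) // 4 + 1 = 20, (20 - 4) // 2 + 1 = 9, and (9 - 3) // 1 + 1 = 7.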

        # A fully connected layer takes the flattened
        # frame from third convolution layer, and outputs
        # 512 features
        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)

        # A fully connected layer to get logits for $\pi$
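        #  (the default Breakout action space has 4 discrete actions:
        #  NOOP, FIRE, RIGHT and LEFT)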
        self.pi_logits = nn.Linear(in_features=512, out_features=4)

        # A fully connected layer to get value function
        self.value = nn.Linear(in_features=512, out_features=1)

        # ReLU activation is used after each convolution and the first linear layer
        self.activation = nn.ReLU()

    def __call__(self, obs: torch.Tensor):
        h = self.activation(self.conv1(obs))
        h = self.activation(self.conv2(h))
        h = self.activation(self.conv3(h))
        h = h.reshape((-1, 7 * 7 * 64))

        h = self.activation(self.lin(h))

        pi = Categorical(logits=self.pi_logits(h))
        value = self.value(h).reshape(-1)

        return pi, value


def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
    """Scale observations from `[0, 255]` to `[0, 1]`"""
    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.


class Trainer:
    """
    ## Trainer
    """

    def __init__(self, *,
                 updates: int, epochs: int, n_workers: int, worker_steps: int, batches: int,
                 value_loss_coef: FloatDynamicHyperParam,
                 entropy_bonus_coef: FloatDynamicHyperParam,
                 clip_range: FloatDynamicHyperParam,
                 learning_rate: FloatDynamicHyperParam,
                 ):
        # #### Configurations

        # number of updates
        self.updates = updates
        # number of epochs to train the model with sampled data
        self.epochs = epochs
        # number of worker processes
        self.n_workers = n_workers
        # number of steps to run on each process for a single update
        self.worker_steps = worker_steps
        # number of mini batches
        self.batches = batches
        # total number of samples for a single update
        self.batch_size = self.n_workers * self.worker_steps
        # size of a mini batch
        self.mini_batch_size = self.batch_size // self.batches
        assert self.batch_size % self.batches == 0
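        # For example, with the defaults used in `main()` below
        #  (8 workers, 128 steps, 4 mini batches), each update trains on
        #  8 * 128 = 1024 samples split into mini batches of 256.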

        # Value loss coefficient
        self.value_loss_coef = value_loss_coef
        # Entropy bonus coefficient
        self.entropy_bonus_coef = entropy_bonus_coef

        # Clipping range
        self.clip_range = clip_range
        # Learning rate
        self.learning_rate = learning_rate

        # #### Initialize

        # create workers
        self.workers = [Worker(47 + i) for i in range(self.n_workers)]
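        #  (each worker is seeded differently, `47 + i`, so the games do not
        #  play out in lockstep)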

        # initialize tensors for observations
        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
        for worker in self.workers:
            worker.child.send(("reset", None))
        for i, worker in enumerate(self.workers):
            self.obs[i] = worker.child.recv()

        # model
        self.model = Model().to(device)

        # optimizer
        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

        # GAE with $\gamma = 0.99$ and $\lambda = 0.95$
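        #  GAE computes
        #  $\hat{A}_t = \sum_{l=0}^{\infty} (\gamma \lambda)^l \delta_{t+l}$ with
        #  $\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)$,
        #  truncated at the end of the sampled trajectory.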
        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)

        # PPO Loss
        self.ppo_loss = ClippedPPOLoss()

        # Value Loss
        self.value_loss = ClippedValueFunctionLoss()

    def sample(self) -> Dict[str, torch.Tensor]:
        """
        ### Sample data with current policy
        """

        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
        done = np.zeros((self.n_workers, self.worker_steps), dtype=bool)
        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)
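        #  (`values` has one extra step so the value of the state after the
        #  final step can be stored for bootstrapping in GAE)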

        with torch.no_grad():
            # sample `worker_steps` from each worker
            for t in range(self.worker_steps):
                # `self.obs` keeps track of the last observation from each worker,
                #  which is the input for the model to sample the next action
                obs[:, t] = self.obs
                # sample actions from $\pi_{\theta_{OLD}}$ for each worker;
                #  this returns arrays of size `n_workers`
                pi, v = self.model(obs_to_torch(self.obs))
                values[:, t] = v.cpu().numpy()
                a = pi.sample()
                actions[:, t] = a.cpu().numpy()
                log_pis[:, t] = pi.log_prob(a).cpu().numpy()

                # run sampled actions on each worker
                for w, worker in enumerate(self.workers):
                    worker.child.send(("step", actions[w, t]))

                for w, worker in enumerate(self.workers):
                    # get results after executing the actions
                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()

                    # collect episode info, which is available if an episode finished;
                    #  this includes total reward and length of the episode -
                    #  look at `Game` to see how it works.
                    if info:
                        tracker.add('reward', info['reward'])
                        tracker.add('length', info['length'])

            # Get the value of the state after the final step
            _, v = self.model(obs_to_torch(self.obs))
            values[:, self.worker_steps] = v.cpu().numpy()

        # calculate advantages
        advantages = self.gae(done, rewards, values)

        # Collect the sampled data
        samples = {
            'obs': obs,
            'actions': actions,
            'values': values[:, :-1],
            'log_pis': log_pis,
            'advantages': advantages
        }

        # The samples are currently in `[workers, time_step]` tables;
        #  we flatten them before training
        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat

    def train(self, samples: Dict[str, torch.Tensor]):
        """
        ### Train the model based on samples
        """

        # It learns faster with a higher number of epochs,
        #  but becomes a little unstable; that is,
        #  the average episode reward does not monotonically increase
        #  over time.
        # Reducing the clipping range might solve this.
        for _ in range(self.epochs):
            # shuffle for each epoch
            indexes = torch.randperm(self.batch_size)

            # for each mini batch
            for start in range(0, self.batch_size, self.mini_batch_size):
                # get mini batch
                end = start + self.mini_batch_size
                mini_batch_indexes = indexes[start: end]
                mini_batch = {}
                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_indexes]

                # train
                loss = self._calc_loss(mini_batch)

                # Set learning rate
                for pg in self.optimizer.param_groups:
                    pg['lr'] = self.learning_rate()
                # Zero out the previously calculated gradients
                self.optimizer.zero_grad()
                # Calculate gradients
                loss.backward()
                # Clip gradients
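                #  by their global norm (`max_norm=0.5`) so a single mini batch
                #  cannot push the parameters too far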
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
                # Update parameters based on gradients
                self.optimizer.step()

    @staticmethod
    def _normalize(adv: torch.Tensor):
        """#### Normalize advantage function"""
        return (adv - adv.mean()) / (adv.std() + 1e-8)

    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        ### Calculate total loss
        """

        # $R_t$ returns sampled from $\pi_{\theta_{OLD}}$
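        #  Since the GAE advantage estimate satisfies $\hat{A}_t = R_t - V(s_t)$
        #  (with the $\lambda$-return as $R_t$), adding the sampled values back
        #  recovers the return targets used for the value function loss.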
        sampled_return = samples['values'] + samples['advantages']

        # $\bar{A_t} = \frac{\hat{A_t} - \mu(\hat{A_t})}{\sigma(\hat{A_t})}$,
        # where $\hat{A_t}$ is advantages sampled from $\pi_{\theta_{OLD}}$.
        # Refer to sampling function in [Main class](#main) below
        #  for the calculation of $\hat{A}_t$.
        sampled_normalized_advantage = self._normalize(samples['advantages'])

        # Sampled observations are fed into the model to get $\pi_\theta(a_t|s_t)$ and $V^{\pi_\theta}(s_t)$;
        #  we treat the observation as the state
        pi, value = self.model(samples['obs'])

        # $\log \pi_\theta (a_t|s_t)$, where $a_t$ are actions sampled from $\pi_{\theta_{OLD}}$
        log_pi = pi.log_prob(samples['actions'])

        # Calculate policy loss
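        #  The clipped surrogate objective from the PPO paper is
        #  $\mathcal{L}^{CLIP}(\theta) = \mathbb{E}\Bigl[\min\bigl(r_t(\theta) \bar{A}_t,
        #  \text{clip}(r_t(\theta), 1 - \epsilon, 1 + \epsilon) \bar{A}_t\bigr)\Bigr]$
        #  where $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{OLD}}(a_t|s_t)}$;
        #  the policy loss minimized here is its negative.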
        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())

        # Calculate Entropy Bonus
        #
        # $\mathcal{L}^{EB}(\theta) =
        #  \mathbb{E}\Bigl[ S\bigl[\pi_\theta\bigr] (s_t) \Bigr]$
        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()

        # Calculate value function loss
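        #  The clipped value loss is of the form
        #  $\mathcal{L}^{VF}(\theta) = \mathbb{E}\Bigl[\max\bigl((V_\theta(s_t) - R_t)^2,
        #  (V_{clip}(s_t) - R_t)^2\bigr)\Bigr]$ where
        #  $V_{clip}(s_t) = V_{\theta_{OLD}}(s_t) + \text{clip}\bigl(V_\theta(s_t) - V_{\theta_{OLD}}(s_t), -\epsilon, \epsilon\bigr)$,
        #  so the value estimate is also kept close to the sampled values.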
        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())

        # $\mathcal{L}^{CLIP+VF+EB} (\theta) =
        #  \mathcal{L}^{CLIP} (\theta) +
        #  c_1 \mathcal{L}^{VF} (\theta) - c_2 \mathcal{L}^{EB}(\theta)$
        loss = (policy_loss
                + self.value_loss_coef() * value_loss
                - self.entropy_bonus_coef() * entropy_bonus)

        # for monitoring
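        #  $\frac{1}{2} \mathbb{E}\bigl[(\log \pi_{\theta_{OLD}}(a_t|s_t) - \log \pi_\theta(a_t|s_t))^2\bigr]$
        #  is a cheap approximation of the KL divergence between the old and
        #  the current policy; it is only tracked, not optimized.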
        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()

        # Add to tracker
        tracker.add({'policy_reward': -policy_loss,
                     'value_loss': value_loss,
                     'entropy_bonus': entropy_bonus,
                     'kl_div': approx_kl_divergence,
                     'clip_fraction': self.ppo_loss.clip_fraction})

        return loss

    def run_training_loop(self):
        """
        ### Run training loop
        """

        # last 100 episode information
        tracker.set_queue('reward', 100, True)
        tracker.set_queue('length', 100, True)

        for update in monit.loop(self.updates):
            # sample with current policy
            samples = self.sample()

            # train the model
            self.train(samples)

            # Save tracked indicators.
            tracker.save()
            # Add a new line to the screen periodically
            if (update + 1) % 1_000 == 0:
                logger.log()

    def destroy(self):
        """
        ### Destroy
        Stop the workers
        """
        for worker in self.workers:
            worker.child.send(("close", None))


def main():
    # Create the experiment
    experiment.create(name='ppo')
    # Configurations
    configs = {
        # number of updates
        'updates': 10000,
        # number of epochs to train the model with sampled data
        'epochs': 4,
        # number of worker processes
        'n_workers': 8,
        # number of steps to run on each process for a single update
        'worker_steps': 128,
        # number of mini batches
        'batches': 4,
        # Value loss coefficient
        'value_loss_coef': FloatDynamicHyperParam(0.5),
        # Entropy bonus coefficient
        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),
        # Clip range
        'clip_range': FloatDynamicHyperParam(0.1),
        # Learning rate
        'learning_rate': FloatDynamicHyperParam(2.5e-4, (0, 1e-3)),
    }
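    # The `FloatDynamicHyperParam` values above are read by calling them
    #  (e.g. `clip_range()`) on every update, so they are intended to be
    #  adjustable while the experiment is running.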

    experiment.configs(configs)

    # Initialize the trainer
    m = Trainer(
        updates=configs['updates'],
        epochs=configs['epochs'],
        n_workers=configs['n_workers'],
        worker_steps=configs['worker_steps'],
        batches=configs['batches'],
        value_loss_coef=configs['value_loss_coef'],
        entropy_bonus_coef=configs['entropy_bonus_coef'],
        clip_range=configs['clip_range'],
        learning_rate=configs['learning_rate'],
    )

    # Run and monitor the experiment
    with experiment.start():
        m.run_training_loop()
    # Stop the workers
    m.destroy()


# ## Run it
if __name__ == "__main__":
    main()