From d4387050c2ad0f025601dbb541a7e62fa6e39e3a Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Sat, 27 Mar 2021 11:37:58 +0530
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9A=20docs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/rl/ppo/experiment.html           | 656 ++++++++++++++++----------
 docs/sitemap.xml                      |   4 +-
 docs/transformers/feedback/index.html | 241 ++++++----
 3 files changed, 550 insertions(+), 351 deletions(-)

diff --git a/docs/rl/ppo/experiment.html b/docs/rl/ppo/experiment.html
index aeef2e86..fe584587 100644
--- a/docs/rl/ppo/experiment.html
+++ b/docs/rl/ppo/experiment.html
@@ -86,10 +86,11 @@ It runs the game environments on multiple processes t
 19from torch.distributions import Categorical
 20
 21from labml import monit, tracker, logger, experiment
-22from labml_helpers.module import Module
-23from labml_nn.rl.game import Worker
-24from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
-25from labml_nn.rl.ppo.gae import GAE
+22from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam
+23from labml_helpers.module import Module
+24from labml_nn.rl.game import Worker
+25from labml_nn.rl.ppo import ClippedPPOLoss, ClippedValueFunctionLoss
+26from labml_nn.rl.ppo.gae import GAE
@@ -100,10 +101,10 @@ It runs the game environments on multiple processes t

Select device

-
28if torch.cuda.is_available():
-29    device = torch.device("cuda:0")
-30else:
-31    device = torch.device("cpu")
+
29if torch.cuda.is_available():
+30    device = torch.device("cuda:0")
+31else:
+32    device = torch.device("cpu")
@@ -114,7 +115,7 @@ It runs the game environments on multiple processes t

Model

-
34class Model(Module):
+
35class Model(Module):
@@ -125,8 +126,8 @@ It runs the game environments on multiple processes t
-
39    def __init__(self):
-40        super().__init__()
+
40    def __init__(self):
+41        super().__init__()
@@ -138,7 +139,7 @@ It runs the game environments on multiple processes t 84x84 frame and produces a 20x20 frame

-
44        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
+
45        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
@@ -150,7 +151,7 @@ It runs the game environments on multiple processes t 20x20 frame and produces a 9x9 frame

-
48        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
+
49        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
@@ -162,7 +163,7 @@ It runs the game environments on multiple processes t 9x9 frame and produces a 7x7 frame

-
52        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
+
53        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
@@ -175,7 +176,7 @@ frame from third convolution layer, and outputs 512 features

-
57        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
+
58        self.lin = nn.Linear(in_features=7 * 7 * 64, out_features=512)
@@ -186,7 +187,7 @@ frame from third convolution layer, and outputs

A fully connected layer to get logits for $\pi$

-
60        self.pi_logits = nn.Linear(in_features=512, out_features=4)
+
61        self.pi_logits = nn.Linear(in_features=512, out_features=4)
@@ -197,7 +198,7 @@ frame from third convolution layer, and outputs

A fully connected layer to get the value function

-
63        self.value = nn.Linear(in_features=512, out_features=1)
+
64        self.value = nn.Linear(in_features=512, out_features=1)
@@ -208,7 +209,7 @@ frame from third convolution layer, and outputs
-
66        self.activation = nn.ReLU()
+
67        self.activation = nn.ReLU()
@@ -219,18 +220,18 @@ frame from third convolution layer, and outputs
-
68    def __call__(self, obs: torch.Tensor):
-69        h = self.activation(self.conv1(obs))
-70        h = self.activation(self.conv2(h))
-71        h = self.activation(self.conv3(h))
-72        h = h.reshape((-1, 7 * 7 * 64))
-73
-74        h = self.activation(self.lin(h))
-75
-76        pi = Categorical(logits=self.pi_logits(h))
-77        value = self.value(h).reshape(-1)
-78
-79        return pi, value
+
69    def __call__(self, obs: torch.Tensor):
+70        h = self.activation(self.conv1(obs))
+71        h = self.activation(self.conv2(h))
+72        h = self.activation(self.conv3(h))
+73        h = h.reshape((-1, 7 * 7 * 64))
+74
+75        h = self.activation(self.lin(h))
+76
+77        pi = Categorical(logits=self.pi_logits(h))
+78        value = self.value(h).reshape(-1)
+79
+80        return pi, value
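As a quick sanity check of the frame sizes quoted in the comments above (84x84 -> 20x20 -> 9x9 -> 7x7, then 512 features), here is a minimal sketch using the Model and device defined in this file; the batch size of 2 and the all-zero observation are placeholders, not part of the diff:

model = Model().to(device)
obs = torch.zeros(2, 4, 84, 84, device=device)  # two stacked-frame observations
pi, value = model(obs)
assert value.shape == (2,)        # one value estimate per observation
assert pi.logits.shape == (2, 4)  # four discrete actions, matching out_features=4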
@@ -241,7 +242,7 @@ frame from third convolution layer, and outputs

Scale observations from [0, 255] to [0, 1]

-
82def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
+
83def obs_to_torch(obs: np.ndarray) -> torch.Tensor:
@@ -252,7 +253,7 @@ frame from third convolution layer, and outputs
-
84    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.
+
85    return torch.tensor(obs, dtype=torch.float32, device=device) / 255.
@@ -263,7 +264,7 @@ frame from third convolution layer, and outputs

Trainer

-
87class Trainer:
+
88class Trainer:
@@ -274,7 +275,13 @@ frame from third convolution layer, and outputs
-
92    def __init__(self):
+
93    def __init__(self, *,
+94                 updates: int, epochs: int, n_workers: int, worker_steps: int, batches: int,
+95                 value_loss_coef: FloatDynamicHyperParam,
+96                 entropy_bonus_coef: FloatDynamicHyperParam,
+97                 clip_range: FloatDynamicHyperParam,
+98                 learning_rate: FloatDynamicHyperParam,
+99                 ):
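The constructor now takes labml dynamic hyper-parameters instead of hard-coded floats. As the import on new line 22 and calls such as self.clip_range() further down suggest, a FloatDynamicHyperParam is a callable that returns its current value, so the trainer always reads the latest setting. A minimal sketch of the pattern (the printed values are just the defaults passed in):

from labml.internal.configs.dynamic_hyperparam import FloatDynamicHyperParam

clip_range = FloatDynamicHyperParam(0.1)                   # default value
learning_rate = FloatDynamicHyperParam(2.5e-4, (0, 1e-3))  # default value and allowed range

print(clip_range(), learning_rate())  # read the current values by calling the objects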
@@ -296,7 +303,7 @@ frame from third convolution layer, and outputs

number of updates

-
96        self.updates = 10000
+
103        self.updates = updates
@@ -307,7 +314,7 @@ frame from third convolution layer, and outputs

number of epochs to train the model with sampled data

-
98        self.epochs = 4
+
105        self.epochs = epochs
@@ -318,7 +325,7 @@ frame from third convolution layer, and outputs

number of worker processes

-
100        self.n_workers = 8
+
107        self.n_workers = n_workers
@@ -329,7 +336,7 @@ frame from third convolution layer, and outputs

number of steps to run on each process for a single update

-
102        self.worker_steps = 128
+
109        self.worker_steps = worker_steps
@@ -340,7 +347,7 @@ frame from third convolution layer, and outputs

number of mini batches

-
104        self.n_mini_batch = 4
+
111        self.batches = batches
@@ -351,7 +358,7 @@ frame from third convolution layer, and outputs

total number of samples for a single update

-
106        self.batch_size = self.n_workers * self.worker_steps
+
113        self.batch_size = self.n_workers * self.worker_steps
@@ -362,8 +369,8 @@ frame from third convolution layer, and outputs

size of a mini batch

-
108        self.mini_batch_size = self.batch_size // self.n_mini_batch
-109        assert (self.batch_size % self.n_mini_batch == 0)
+
115        self.mini_batch_size = self.batch_size // self.batches
+116        assert (self.batch_size % self.batches == 0)
@@ -371,10 +378,10 @@ frame from third convolution layer, and outputs -

Initialize

+

Value loss coefficient

-
+
119        self.value_loss_coef = value_loss_coef
@@ -382,10 +389,10 @@ frame from third convolution layer, and outputs -

create workers

+

Entropy bonus coefficient

-
114        self.workers = [Worker(47 + i) for i in range(self.n_workers)]
+
121        self.entropy_bonus_coef = entropy_bonus_coef
@@ -393,14 +400,10 @@ frame from third convolution layer, and outputs -

initialize tensors for observations

+

Clipping range

-
117        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
-118        for worker in self.workers:
-119            worker.child.send(("reset", None))
-120        for i, worker in enumerate(self.workers):
-121            self.obs[i] = worker.child.recv()
+
124        self.clip_range = clip_range
@@ -408,10 +411,10 @@ frame from third convolution layer, and outputs -

model

+

Learning rate

-
124        self.model = Model().to(device)
+
126        self.learning_rate = learning_rate
@@ -419,10 +422,10 @@ frame from third convolution layer, and outputs -

optimizer

+

Initialize

-
127        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)
+
@@ -430,10 +433,10 @@ frame from third convolution layer, and outputs -

GAE with $\gamma = 0.99$ and $\lambda = 0.95$

+

create workers

-
130        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)
+
131        self.workers = [Worker(47 + i) for i in range(self.n_workers)]
@@ -441,10 +444,14 @@ frame from third convolution layer, and outputs -

PPO Loss

+

initialize tensors for observations

-
133        self.ppo_loss = ClippedPPOLoss()
+
134        self.obs = np.zeros((self.n_workers, 4, 84, 84), dtype=np.uint8)
+135        for worker in self.workers:
+136            worker.child.send(("reset", None))
+137        for i, worker in enumerate(self.workers):
+138            self.obs[i] = worker.child.recv()
@@ -452,21 +459,21 @@ frame from third convolution layer, and outputs -

Value Loss

+

model

-
136        self.value_loss = ClippedValueFunctionLoss()
+
141        self.model = Model().to(device)
-
+
-

Sample data with current policy

+

optimizer

-
138    def sample(self) -> Dict[str, torch.Tensor]:
+
144        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)
@@ -474,17 +481,10 @@ frame from third convolution layer, and outputs - +

GAE with $\gamma = 0.99$ and $\lambda = 0.95$

-
143        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
-144        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
-145        done = np.zeros((self.n_workers, self.worker_steps), dtype=np.bool)
-146        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
-147        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
-148        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)
-149
-150        with torch.no_grad():
+
147        self.gae = GAE(self.n_workers, self.worker_steps, 0.99, 0.95)
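For reference, a minimal sketch of the recursion the GAE helper implements; this only mirrors labml_nn.rl.ppo.gae for illustration, with done and rewards of shape [n_workers, worker_steps] and values of shape [n_workers, worker_steps + 1] as allocated in sample() below:

import numpy as np

def gae(done: np.ndarray, rewards: np.ndarray, values: np.ndarray,
        gamma: float = 0.99, lambda_: float = 0.95) -> np.ndarray:
    n_workers, worker_steps = rewards.shape
    advantages = np.zeros((n_workers, worker_steps), dtype=np.float32)
    last_advantage = np.zeros(n_workers, dtype=np.float32)
    # work backwards through time: A_t = delta_t + gamma * lambda * A_{t+1}
    for t in reversed(range(worker_steps)):
        mask = 1.0 - done[:, t]  # zero out the bootstrap after an episode ends
        delta = rewards[:, t] + gamma * values[:, t + 1] * mask - values[:, t]
        last_advantage = delta + gamma * lambda_ * last_advantage * mask
        advantages[:, t] = last_advantage
    return advantages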
@@ -492,10 +492,10 @@ frame from third convolution layer, and outputs -

sample worker_steps from each worker

+

PPO Loss

-
152            for t in range(self.worker_steps):
+
150        self.ppo_loss = ClippedPPOLoss()
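ClippedPPOLoss is implemented in labml_nn.rl.ppo; as a rough sketch (an illustration of the clipped surrogate objective, not the library's exact code), it computes something like:

import torch

def clipped_ppo_loss(log_pi: torch.Tensor, sampled_log_pi: torch.Tensor,
                     advantage: torch.Tensor, clip: float) -> torch.Tensor:
    # probability ratio r_t(theta) = pi_theta(a_t|s_t) / pi_theta_OLD(a_t|s_t)
    ratio = torch.exp(log_pi - sampled_log_pi)
    clipped_ratio = ratio.clamp(min=1.0 - clip, max=1.0 + clip)
    # the surrogate objective is maximized, so return its negative as a loss
    policy_reward = torch.min(ratio * advantage, clipped_ratio * advantage)
    return -policy_reward.mean()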
@@ -503,27 +503,21 @@ frame from third convolution layer, and outputs -

self.obs keeps track of the last observation from each worker, which is the input for the model to sample the next action

+

Value Loss

-
155                obs[:, t] = self.obs
+
153        self.value_loss = ClippedValueFunctionLoss()
-
+
-

sample actions from $\pi_{\theta_{OLD}}$ for each worker; this returns arrays of size n_workers

+

Sample data with current policy

-
158                pi, v = self.model(obs_to_torch(self.obs))
-159                values[:, t] = v.cpu().numpy()
-160                a = pi.sample()
-161                actions[:, t] = a.cpu().numpy()
-162                log_pis[:, t] = pi.log_prob(a).cpu().numpy()
+
155    def sample(self) -> Dict[str, torch.Tensor]:
@@ -531,13 +525,17 @@ frame from third convolution layer, and outputs -

run sampled actions on each worker

+
-
165                for w, worker in enumerate(self.workers):
-166                    worker.child.send(("step", actions[w, t]))
-167
-168                for w, worker in enumerate(self.workers):
+
160        rewards = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
+161        actions = np.zeros((self.n_workers, self.worker_steps), dtype=np.int32)
+162        done = np.zeros((self.n_workers, self.worker_steps), dtype=np.bool)
+163        obs = np.zeros((self.n_workers, self.worker_steps, 4, 84, 84), dtype=np.uint8)
+164        log_pis = np.zeros((self.n_workers, self.worker_steps), dtype=np.float32)
+165        values = np.zeros((self.n_workers, self.worker_steps + 1), dtype=np.float32)
+166
+167        with torch.no_grad():
@@ -545,10 +543,10 @@ frame from third convolution layer, and outputs -

get results after executing the actions

+

sample worker_steps from each worker

-
170                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()
+
169            for t in range(self.worker_steps):
@@ -556,14 +554,11 @@ frame from third convolution layer, and outputs -

collect episode info, which is available if an episode finished; this includes total reward and length of the episode - look at Game to see how it works.

+

self.obs keeps track of the last observation from each worker, which is the input for the model to sample the next action

-
175                    if info:
-176                        tracker.add('reward', info['reward'])
-177                        tracker.add('length', info['length'])
+
172                obs[:, t] = self.obs
@@ -571,11 +566,15 @@ frame from third convolution layer, and outputs -

Get the value after the final step

+

sample actions from $\pi_{\theta_{OLD}}$ for each worker; this returns arrays of size n_workers

-
180            _, v = self.model(obs_to_torch(self.obs))
-181            values[:, self.worker_steps] = v.cpu().numpy()
+
175                pi, v = self.model(obs_to_torch(self.obs))
+176                values[:, t] = v.cpu().numpy()
+177                a = pi.sample()
+178                actions[:, t] = a.cpu().numpy()
+179                log_pis[:, t] = pi.log_prob(a).cpu().numpy()
@@ -583,10 +582,13 @@ frame from third convolution layer, and outputs -

calculate advantages

+

run sampled actions on each worker

-
184        advantages = self.gae(done, rewards, values)
+
182                for w, worker in enumerate(self.workers):
+183                    worker.child.send(("step", actions[w, t]))
+184
+185                for w, worker in enumerate(self.workers):
@@ -594,16 +596,10 @@ frame from third convolution layer, and outputs - +

get results after executing the actions

-
187        samples = {
-188            'obs': obs,
-189            'actions': actions,
-190            'values': values[:, :-1],
-191            'log_pis': log_pis,
-192            'advantages': advantages
-193        }
+
187                    self.obs[w], rewards[w, t], done[w, t], info = worker.child.recv()
@@ -611,30 +607,26 @@ frame from third convolution layer, and outputs -

samples are currently in a [workers, time_step] table; we should flatten it for training

+

collect episode info, which is available if an episode finished; this includes total reward and length of the episode - look at Game to see how it works.

-
197        samples_flat = {}
-198        for k, v in samples.items():
-199            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
-200            if k == 'obs':
-201                samples_flat[k] = obs_to_torch(v)
-202            else:
-203                samples_flat[k] = torch.tensor(v, device=device)
-204
-205        return samples_flat
+
192                    if info:
+193                        tracker.add('reward', info['reward'])
+194                        tracker.add('length', info['length'])
-
+
-

Train the model based on samples

+

Get the value after the final step

-
207    def train(self, samples: Dict[str, torch.Tensor], learning_rate: float, clip_range: float):
+
197            _, v = self.model(obs_to_torch(self.obs))
+198            values[:, self.worker_steps] = v.cpu().numpy()
@@ -642,14 +634,10 @@ we should flatten it for training

-

It learns faster with a higher number of epochs, but becomes a little unstable; that is, the average episode reward does not monotonically increase over time. Maybe reducing the clipping range would solve it.

+

calculate advantages

-
217        for _ in range(self.epochs):
+
201        advantages = self.gae(done, rewards, values)
@@ -657,10 +645,16 @@ May be reducing the clipping range might solve it.

-

shuffle for each epoch

+
-
219            indexes = torch.randperm(self.batch_size)
+
204        samples = {
+205            'obs': obs,
+206            'actions': actions,
+207            'values': values[:, :-1],
+208            'log_pis': log_pis,
+209            'advantages': advantages
+210        }
@@ -668,25 +662,30 @@ May be reducing the clipping range might solve it.

-

for each mini batch

+

samples are currently in a [workers, time_step] table; we should flatten it for training

-
222            for start in range(0, self.batch_size, self.mini_batch_size):
+
214        samples_flat = {}
+215        for k, v in samples.items():
+216            v = v.reshape(v.shape[0] * v.shape[1], *v.shape[2:])
+217            if k == 'obs':
+218                samples_flat[k] = obs_to_torch(v)
+219            else:
+220                samples_flat[k] = torch.tensor(v, device=device)
+221
+222        return samples_flat
-
+
-

get mini batch

+

Train the model based on samples

-
224                end = start + self.mini_batch_size
-225                mini_batch_indexes = indexes[start: end]
-226                mini_batch = {}
-227                for k, v in samples.items():
-228                    mini_batch[k] = v[mini_batch_indexes]
+
224    def train(self, samples: Dict[str, torch.Tensor]):
@@ -694,11 +693,14 @@ May be reducing the clipping range might solve it.

-

train

+

It learns faster with a higher number of epochs, but becomes a little unstable; that is, the average episode reward does not monotonically increase over time. Maybe reducing the clipping range would solve it.

-
231                loss = self._calc_loss(clip_range=clip_range,
-232                                       samples=mini_batch)
+
234        for _ in range(self.epochs):
@@ -706,11 +708,10 @@ May be reducing the clipping range might solve it.

-

Set learning rate

+

shuffle for each epoch

-
235                for pg in self.optimizer.param_groups:
-236                    pg['lr'] = learning_rate
+
236            indexes = torch.randperm(self.batch_size)
@@ -718,10 +719,10 @@ May be reducing the clipping range might solve it.

-

Zero out the previously calculated gradients

+

for each mini batch

-
238                self.optimizer.zero_grad()
+
239            for start in range(0, self.batch_size, self.mini_batch_size):
@@ -729,10 +730,14 @@ May be reducing the clipping range might solve it.

-

Calculate gradients

+

get mini batch

-
240                loss.backward()
+
241                end = start + self.mini_batch_size
+242                mini_batch_indexes = indexes[start: end]
+243                mini_batch = {}
+244                for k, v in samples.items():
+245                    mini_batch[k] = v[mini_batch_indexes]
@@ -740,10 +745,10 @@ May be reducing the clipping range might solve it.

-

Clip gradients

+

train

-
242                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
+
248                loss = self._calc_loss(mini_batch)
@@ -751,22 +756,22 @@ May be reducing the clipping range might solve it.

-

Update parameters based on gradients

+

Set learning rate

-
244                self.optimizer.step()
+
251                for pg in self.optimizer.param_groups:
+252                    pg['lr'] = self.learning_rate()
-
+
-

Normalize advantage function

+

Zero out the previously calculated gradients

-
246    @staticmethod
-247    def _normalize(adv: torch.Tensor):
+
254                self.optimizer.zero_grad()
@@ -774,21 +779,21 @@ May be reducing the clipping range might solve it.

- +

Calculate gradients

-
249        return (adv - adv.mean()) / (adv.std() + 1e-8)
+
256                loss.backward()
-
+
-

Calculate total loss

+

Clip gradients

-
251    def _calc_loss(self, samples: Dict[str, torch.Tensor], clip_range: float) -> torch.Tensor:
+
258                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
@@ -796,24 +801,22 @@ May be reducing the clipping range might solve it.

-

$R_t$ returns sampled from $\pi_{\theta_{OLD}}$

+

Update parameters based on gradients

-
257        sampled_return = samples['values'] + samples['advantages']
+
260                self.optimizer.step()
-
+
-

$\bar{A_t} = \frac{\hat{A_t} - \mu(\hat{A_t})}{\sigma(\hat{A_t})}$, where $\hat{A_t}$ are the advantages sampled from $\pi_{\theta_{OLD}}$. Refer to the sampling function in the Main class below for the calculation of $\hat{A}_t$.

+

Normalize advantage function

-
263        sampled_normalized_advantage = self._normalize(samples['advantages'])
+
262    @staticmethod
+263    def _normalize(adv: torch.Tensor):
@@ -821,22 +824,21 @@ Refer to sampling function in Main class below -

Sampled observations are fed into the model to get $\pi_\theta(a_t|s_t)$ and $V^{\pi_\theta}(s_t)$; we are treating observations as state

+
-
267        pi, value = self.model(samples['obs'])
+
265        return (adv - adv.mean()) / (adv.std() + 1e-8)
-
+
-

$-\log \pi_\theta (a_t|s_t)$, $a_t$ are actions sampled from $\pi_{\theta_{OLD}}$

+

Calculate total loss

-
270        log_pi = pi.log_prob(samples['actions'])
+
267    def _calc_loss(self, samples: Dict[str, torch.Tensor]) -> torch.Tensor:
@@ -844,10 +846,10 @@ Refer to sampling function in Main class below -

Calculate policy loss

+

$R_t$ returns sampled from $\pi_{\theta_{OLD}}$

-
273        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, clip_range)
+
273        sampled_return = samples['values'] + samples['advantages']
@@ -855,13 +857,13 @@ Refer to sampling function in Main class below -

Calculate Entropy Bonus

-

$\mathcal{L}^{EB}(\theta) = \mathbb{E}\Bigl[ S\bigl[\pi_\theta\bigr] (s_t) \Bigr]$

+

$\bar{A_t} = \frac{\hat{A_t} - \mu(\hat{A_t})}{\sigma(\hat{A_t})}$, where $\hat{A_t}$ are the advantages sampled from $\pi_{\theta_{OLD}}$. Refer to the sampling function in the Main class below for the calculation of $\hat{A}_t$.

-
279        entropy_bonus = pi.entropy()
-280        entropy_bonus = entropy_bonus.mean()
+
279        sampled_normalized_advantage = self._normalize(samples['advantages'])
@@ -869,10 +871,11 @@ Refer to sampling function in Main class below -

Calculate value function loss

+

Sampled observations are fed into the model to get $\pi_\theta(a_t|s_t)$ and $V^{\pi_\theta}(s_t)$; we are treating observations as state

-
283        value_loss = self.value_loss(value, samples['values'], sampled_return, clip_range)
+
283        pi, value = self.model(samples['obs'])
@@ -880,12 +883,10 @@ Refer to sampling function in Main class below -

$\mathcal{L}^{CLIP+VF+EB} (\theta) = \mathcal{L}^{CLIP} (\theta) + c_1 \mathcal{L}^{VF} (\theta) - c_2 \mathcal{L}^{EB}(\theta)$

+

$-\log \pi_\theta (a_t|s_t)$, $a_t$ are actions sampled from $\pi_{\theta_{OLD}}$

-
288        loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus
+
286        log_pi = pi.log_prob(samples['actions'])
@@ -893,10 +894,10 @@ Refer to sampling function in Main class below -

for monitoring

+

Calculate policy loss

-
291        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
+
289        policy_loss = self.ppo_loss(log_pi, samples['log_pis'], sampled_normalized_advantage, self.clip_range())
@@ -904,27 +905,24 @@ Refer to sampling function in Main class below -

Add to tracker

+

Calculate Entropy Bonus

+

$\mathcal{L}^{EB}(\theta) = \mathbb{E}\Bigl[ S\bigl[\pi_\theta\bigr] (s_t) \Bigr]$

-
294        tracker.add({'policy_reward': -policy_loss,
-295                     'value_loss': value_loss,
-296                     'entropy_bonus': entropy_bonus,
-297                     'kl_div': approx_kl_divergence,
-298                     'clip_fraction': self.ppo_loss.clip_fraction})
-299
-300        return loss
+
295        entropy_bonus = pi.entropy()
+296        entropy_bonus = entropy_bonus.mean()
-
+
-

Run training loop

+

Calculate value function loss

-
302    def run_training_loop(self):
+
299        value_loss = self.value_loss(value, samples['values'], sampled_return, self.clip_range())
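ClippedValueFunctionLoss also comes from labml_nn.rl.ppo; a rough sketch of the clipped value loss it stands for (again an illustration, not the exact implementation):

import torch

def clipped_value_loss(value: torch.Tensor, sampled_value: torch.Tensor,
                       sampled_return: torch.Tensor, clip: float) -> torch.Tensor:
    # keep the new value estimate within +/- clip of the value sampled with the old policy
    clipped_value = sampled_value + (value - sampled_value).clamp(min=-clip, max=clip)
    vf_loss = torch.max((value - sampled_return) ** 2, (clipped_value - sampled_return) ** 2)
    return 0.5 * vf_loss.mean()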
@@ -932,14 +930,14 @@ Refer to sampling function in Main class below -

last 100 episode information

+

$\mathcal{L}^{CLIP+VF+EB} (\theta) = \mathcal{L}^{CLIP} (\theta) + c_1 \mathcal{L}^{VF} (\theta) - c_2 \mathcal{L}^{EB}(\theta)$

-
308        tracker.set_queue('reward', 100, True)
-309        tracker.set_queue('length', 100, True)
-310
-311        for update in monit.loop(self.updates):
-312            progress = update / self.updates
+
304        loss = (policy_loss
+305                + self.value_loss_coef() * value_loss
+306                - self.entropy_bonus_coef() * entropy_bonus)
@@ -947,11 +945,10 @@ Refer to sampling function in Main class below -

decreasing learning_rate and clip_range $\epsilon$

+

for monitoring

-
315            learning_rate = 2.5e-4 * (1 - progress)
-316            clip_range = 0.1 * (1 - progress)
+
309        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
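The quantity on new line 309 is the usual second-order approximation of the KL divergence between the old and current policies. With $x = \log \pi_\theta(a_t|s_t) - \log \pi_{\theta_{OLD}}(a_t|s_t)$ and expectations taken over samples from $\pi_{\theta_{OLD}}$, we have $\mathbb{E}[e^x] = 1$, so expanding $e^x \approx 1 + x + \tfrac{1}{2}x^2$ gives $\mathbb{E}[x] \approx -\tfrac{1}{2}\mathbb{E}[x^2]$ and hence

$D_{KL}\bigl(\pi_{\theta_{OLD}} \,\|\, \pi_\theta\bigr) = -\mathbb{E}[x] \approx \tfrac{1}{2}\,\mathbb{E}\Bigl[\bigl(\log \pi_{\theta_{OLD}}(a_t|s_t) - \log \pi_\theta(a_t|s_t)\bigr)^2\Bigr]$,

which is exactly .5 * ((samples['log_pis'] - log_pi) ** 2).mean().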
@@ -959,21 +956,27 @@ Refer to sampling function in Main class below -

sample with current policy

+

Add to tracker

-
319            samples = self.sample()
+
312        tracker.add({'policy_reward': -policy_loss,
+313                     'value_loss': value_loss,
+314                     'entropy_bonus': entropy_bonus,
+315                     'kl_div': approx_kl_divergence,
+316                     'clip_fraction': self.ppo_loss.clip_fraction})
+317
+318        return loss
-
+
-

train the model

+

Run training loop

-
322            self.train(samples, learning_rate, clip_range)
+
320    def run_training_loop(self):
@@ -981,10 +984,13 @@ Refer to sampling function in Main class below -

Save tracked indicators.

+

last 100 episode information

-
325            tracker.save()
+
326        tracker.set_queue('reward', 100, True)
+327        tracker.set_queue('length', 100, True)
+328
+329        for update in monit.loop(self.updates):
@@ -992,23 +998,21 @@ Refer to sampling function in Main class below -

Add a new line to the screen periodically

+

sample with current policy

-
327            if (update + 1) % 1_000 == 0:
-328                logger.log()
+
331            samples = self.sample()
-
+
-

Destroy

-

Stop the workers

+

train the model

-
330    def destroy(self):
+
334            self.train(samples)
@@ -1016,11 +1020,10 @@ Refer to sampling function in Main class below - +

Save tracked indicators.

-
335        for worker in self.workers:
-336            worker.child.send(("close", None))
+
337            tracker.save()
@@ -1028,21 +1031,23 @@ Refer to sampling function in Main class below - +

Add a new line to the screen periodically

-
339def main():
+
339            if (update + 1) % 1_000 == 0:
+340                logger.log()
-
+
-

Create the experiment

+

Destroy

+

Stop the workers

-
341    experiment.create(name='ppo')
+
342    def destroy(self):
@@ -1050,10 +1055,11 @@ Refer to sampling function in Main class below -

Initialize the trainer

+
-
343    m = Trainer()
+
347        for worker in self.workers:
+348            worker.child.send(("close", None))
@@ -1061,11 +1067,10 @@ Refer to sampling function in Main class below -

Run and monitor the experiment

+
-
345    with experiment.start():
-346        m.run_training_loop()
+
351def main():
@@ -1073,10 +1078,10 @@ Refer to sampling function in Main class below -

Stop the workers

+

Create the experiment

-
348    m.destroy()
+
353    experiment.create(name='ppo')
@@ -1084,11 +1089,168 @@ Refer to sampling function in Main class below +

Configurations

+
+
+
355    configs = {
+
+ +
+
+ +

number of updates

+
+
+
357        'updates': 10000,
+
+
+
+
+ +

number of epochs to train the model with sampled data

+
+
+
359        'epochs': 4,
+
+
+
+
+ +

number of worker processes

+
+
+
361        'n_workers': 8,
+
+
+
+
+ +

number of steps to run on each process for a single update

+
+
+
363        'worker_steps': 128,
+
+
+
+
+ +

number of mini batches

+
+
+
365        'batches': 4,
+
+
+
+
+ +

Value loss coefficient

+
+
+
367        'value_loss_coef': FloatDynamicHyperParam(0.5),
+
+
+
+
+ +

Entropy bonus coefficient

+
+
+
369        'entropy_bonus_coef': FloatDynamicHyperParam(0.01),
+
+
+
+
+ +

Clip range

+
+
+
371        'clip_range': FloatDynamicHyperParam(0.1),
+
+
+
+
+ +

Learning rate

+
+
+
373        'learning_rate': FloatDynamicHyperParam(2.5e-4, (0, 1e-3)),
+374    }
+375
+376    experiment.configs(configs)
+
+
+
+
+ +

Initialize the trainer

+
+
+
379    m = Trainer(
+380        updates=configs['updates'],
+381        epochs=configs['epochs'],
+382        n_workers=configs['n_workers'],
+383        worker_steps=configs['worker_steps'],
+384        batches=configs['batches'],
+385        value_loss_coef=configs['value_loss_coef'],
+386        entropy_bonus_coef=configs['entropy_bonus_coef'],
+387        clip_range=configs['clip_range'],
+388        learning_rate=configs['learning_rate'],
+389    )
+
+
+
+
+ +

Run and monitor the experiment

+
+
+
392    with experiment.start():
+393        m.run_training_loop()
+
+
+
+
+ +

Stop the workers

+
+
+
395    m.destroy()
+
+
+
+
+

Run it

-
352if __name__ == "__main__":
-353    main()
+
399if __name__ == "__main__":
+400    main()
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 86375058..14e5a78e 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -435,7 +435,7 @@
     https://nn.labml.ai/transformers/feedback/index.html
-    2021-02-27T16:30:00+00:00
+    2021-03-15T16:30:00+00:00
     1.00
@@ -722,7 +722,7 @@
     https://nn.labml.ai/rl/ppo/experiment.html
-    2020-12-10T16:30:00+00:00
+    2021-03-27T16:30:00+00:00
     1.00

diff --git a/docs/transformers/feedback/index.html b/docs/transformers/feedback/index.html
index 934af315..46d9a107 100644
--- a/docs/transformers/feedback/index.html
+++ b/docs/transformers/feedback/index.html
@@ -422,9 +422,9 @@ A_{j} &= Q^\top K_j \\
158    def forward(self, *,
-159                 query: torch.Tensor,
-160                 key: torch.Tensor,
-161                 value: torch.Tensor):
+159                query: torch.Tensor,
+160                key: torch.Tensor,
+161                value: torch.Tensor):
@@ -610,9 +610,9 @@ Results in a tensor of shape [seq_len, batch_size, heads]

229    def forward(self, *,
-230                 x: torch.Tensor,
-231                 key: Optional[torch.Tensor],
-232                 value: Optional[torch.Tensor]):
+230                x: torch.Tensor,
+231                key: Optional[torch.Tensor],
+232                value: Optional[torch.Tensor]):
@@ -1115,7 +1115,7 @@ gradients with respect to the last result in the stack.

-
373    def __init__(self, max_len: int):
+
374    def __init__(self, max_len: int):
@@ -1126,12 +1126,12 @@ gradients with respect to the last result in the stack.

-
377        self.max_len = max_len
-378        self.memory = None
-379        self.memory_grad = None
-380        self.last = None
-381        self.n = -1
-382        self.last_get_n = -1
+
378        self.max_len = max_len
+379        self.memory = None
+380        self.memory_grad = None
+381        self.last = None
+382        self.n = -1
+383        self.last_get_n = -1
@@ -1145,7 +1145,7 @@ gradients with respect to the last result in the stack.

-
384    def append(self, n: int, value: torch.Tensor):
+
385    def append(self, n: int, value: torch.Tensor):
@@ -1157,7 +1157,7 @@ gradients with respect to the last result in the stack.

Otherwise this implementation fails

-
392        assert n == 0 or self.last_get_n == n - 1, f"{n}, {self.last_get_n}"
+
393        assert n == 0 or self.last_get_n == n - 1, f"{n}, {self.last_get_n}"
@@ -1168,7 +1168,7 @@ Otherwise this implementation fails

Do this without gradients

-
395        with torch.no_grad():
+
396        with torch.no_grad():
@@ -1179,7 +1179,7 @@ Otherwise this implementation fails

Initialize the shared memory tensor to keep the stack

-
397            if self.memory is None or self.memory.shape[1:] != value.shape:
+
398            if self.memory is None or self.memory.shape[1:] != value.shape:
@@ -1190,7 +1190,7 @@ Otherwise this implementation fails

This should only happen when the stack is empty

-
399                assert n == 0
+
400                assert n == 0
@@ -1201,7 +1201,7 @@ Otherwise this implementation fails

Create a tensor for the stack

-
401                self.memory = value.new_zeros(self.max_len, *value.shape, requires_grad=False)
+
402                self.memory = value.new_zeros(self.max_len, *value.shape, requires_grad=False)
@@ -1212,7 +1212,7 @@ Otherwise this implementation fails

Create a tensor to accumulate the gradients

-
403                self.memory_grad = value.new_zeros(self.memory.shape, requires_grad=False)
+
404                self.memory_grad = value.new_zeros(self.memory.shape, requires_grad=False)
@@ -1225,7 +1225,7 @@ Otherwise this implementation fails

we found this easier to use.

-
408            elif n == 0:
+
409            elif n == 0:
@@ -1236,7 +1236,7 @@ we found this easier to use.

Reset accumulated gradients

-
410                self.memory_grad.fill_(0.)
+
411                self.memory_grad.fill_(0.)
@@ -1247,7 +1247,7 @@ we found this easier to use.

Set the value in the correct position of the stack

-
413            self.memory.data[n] = value.detach()
+
414            self.memory.data[n] = value.detach()
@@ -1258,7 +1258,7 @@ we found this easier to use.

Keep track of the stack (for debugging)

-
415            self.n = n
+
416            self.n = n
@@ -1271,7 +1271,7 @@ We need this to be passed on to StackFunction in order to get the gradients propagated backwards.

-
420        self.last = value
+
421        self.last = value
@@ -1282,7 +1282,7 @@ to get the gradients propagated backwards.

Returns the stack

-
422    def get(self):
+
423    def get(self):
@@ -1294,7 +1294,7 @@ to get the gradients propagated backwards.

This is used for a sanity check in append.

-
429        self.last_get_n = self.n
+
430        self.last_get_n = self.n
@@ -1306,7 +1306,7 @@ This is used for a sanity check in append.

is called by PyTorch during backpropagation.

-
432        return StackFunction.apply(self.memory, self.memory_grad, self.last, self.n)
+
433        return StackFunction.apply(self.memory, self.memory_grad, self.last, self.n)
@@ -1314,49 +1314,51 @@ is called by PyTorch during backpropagation.

-

Updated Feedback Transformer Module

-

This is the updated feedback transformer module that caches the keys and values.

+

To release memory

-
435class FeedbackTransformerKV(Module):
+
435    def free(self):
-
+
-
    -
  • layer is the feedback transformer layer, which we clone for each layer
  • -
  • n_layers is the number of layers in the transformer
  • -
  • d_model is the number of features in the transformer
  • -
  • ‘heads’ is the number of attention heads
  • -
+
-
442    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int, d_model: int, heads: int):
+
440        self.memory = None
+441        self.memory_grad = None
+442        self.last = None
-
+
- +

Updated Feedback Transformer Module

+

This is the updated feedback transformer module that caches the keys and values.

-
450        super().__init__()
+
445class FeedbackTransformerKV(Module):
-
+
-

Make copies of the transformer layer

+
    +
  • layer is the feedback transformer layer, which we clone for each layer
  • +
  • n_layers is the number of layers in the transformer
  • +
  • d_model is the number of features in the transformer
  • +
  • ‘heads’ is the number of attention heads
  • +
-
452        self.layers = clone_module_list(layer, n_layers)
+
452    def __init__(self, layer: FeedbackTransformerLayer, n_layers: int, d_model: int, heads: int):
@@ -1364,10 +1366,10 @@ is called by PyTorch during backpropagation.

-

Final normalization layer

+
-
454        self.norm = nn.LayerNorm([layer.size])
+
460        super().__init__()
@@ -1375,11 +1377,10 @@ is called by PyTorch during backpropagation.

-

Memory vectors are computed as a weighted sum of representations of each layer. This is the weights parameter for that.

+

Make copies of the transformer layer

-
457        self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
+
462        self.layers = clone_module_list(layer, n_layers)
@@ -1387,10 +1388,10 @@ This is the weights parameter for that.

-

Softmax for weights before taking the weighted sum

+

Final normalization layer

-
459        self.softmax = nn.Softmax(0)
+
464        self.norm = nn.LayerNorm([layer.size])
@@ -1398,10 +1399,11 @@ This is the weights parameter for that.

-

Number of features in a head

+

Memory vectors are computed as a weighted sum of representations of each layer. This is the weights parameter for that.

-
462        d_k = d_model // heads
+
467        self.weights = nn.Parameter(torch.ones(n_layers + 1), requires_grad=True)
@@ -1409,10 +1411,10 @@ This is the weights parameter for that.

-

Module to transform embeddings (memory) to get keys

+

Softmax for weights before taking the weighted sum

-
464        self.key = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
+
469        self.softmax = nn.Softmax(0)
@@ -1420,10 +1422,10 @@ This is the weights parameter for that.

-

Module to transform embeddings (memory) to get values

+

Number of features in a head

-
466        self.value = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
+
472        d_k = d_model // heads
@@ -1431,10 +1433,10 @@ This is the weights parameter for that.

-

Memory for stacked keys

+

Module to transform embeddings (memory) to get keys

-
469        self.mem_key = Stack(512)
+
474        self.key = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
@@ -1442,23 +1444,21 @@ This is the weights parameter for that.

-

Memory for stacked values

+

Module to transform embeddings (memory) to get values

-
471        self.mem_value = Stack(512)
+
476        self.value = PrepareForMultiHeadAttention(d_model, heads, d_k, bias=False)
-
+
-
    -
  • x_seq is the input with shape [seq_len, batch_size, d_model]
  • -
+

Memory for stacked keys

-
473    def forward(self, x_seq: torch.Tensor):
+
479        self.mem_key = Stack(512)
@@ -1466,21 +1466,23 @@ This is the weights parameter for that.

-

Split the input to a list along the sequence axis

+

Memory for stacked values

-
479        x_seq = torch.unbind(x_seq, dim=0)
+
481        self.mem_value = Stack(512)
-
+
-

List to store the outputs

+
    +
  • x_seq is the input with shape [seq_len, batch_size, d_model]
  • +
-
481        res = []
+
483    def forward(self, x_seq: torch.Tensor):
@@ -1488,10 +1490,10 @@ This is the weights parameter for that.

-

For each input step

+

Split the input to a list along the sequence axis

-
483        for step, x in enumerate(x_seq):
+
489        x_seq = torch.unbind(x_seq, dim=0)
@@ -1499,10 +1501,10 @@ This is the weights parameter for that.

-

List to store layer outputs

+

List to store the outputs

-
485            layer_outputs = [x]
+
491        res = []
@@ -1510,11 +1512,10 @@ This is the weights parameter for that.

-

Stack of keys and values

+

For each input step

-
488            key_tensor = None
-489            value_tensor = None
+
493        for step, x in enumerate(x_seq):
@@ -1522,12 +1523,10 @@ This is the weights parameter for that.

-

Get the keys and values tensors if we are beyond the initial step

+

List to store layer outputs

-
491            if step > 0:
-492                key_tensor = self.mem_key.get()
-493                value_tensor = self.mem_value.get()
+
495            layer_outputs = [x]
@@ -1535,10 +1534,11 @@ This is the weights parameter for that.

-

Run through each layer

+

Stack of keys and values

-
496            for layer in self.layers:
+
498            key_tensor = None
+499            value_tensor = None
@@ -1546,10 +1546,12 @@ This is the weights parameter for that.

-

Get layer output

+

Get the keys and values tensors if we are beyond the initial step

-
498                x = layer(x=x, key=key_tensor, value=value_tensor)
+
501            if step > 0:
+502                key_tensor = self.mem_key.get()
+503                value_tensor = self.mem_value.get()
@@ -1557,10 +1559,10 @@ This is the weights parameter for that.

-

Append them to the list of layer outputs

+

Run through each layer

-
500                layer_outputs.append(x)
+
506            for layer in self.layers:
@@ -1568,10 +1570,10 @@ This is the weights parameter for that.

-

Stack the layer outputs to a tensor

+

Get layer output

-
503            layer_outputs = torch.stack(layer_outputs)
+
508                x = layer(x=x, key=key_tensor, value=value_tensor)
@@ -1579,10 +1581,10 @@ This is the weights parameter for that.

-

Calculate the memory vector as a weighted sum of layer outputs

+

Append them to the list of layer outputs

-
505            mem = torch.einsum('lbd,l->bd', layer_outputs, self.softmax(self.weights))
+
510                layer_outputs.append(x)
@@ -1590,10 +1592,10 @@ This is the weights parameter for that.

-

Calculate the keys from memory and add it to the stack

+

Stack the layer outputs to a tensor

-
507            self.mem_key.append(step, self.key(mem))
+
513            layer_outputs = torch.stack(layer_outputs)
@@ -1601,10 +1603,10 @@ This is the weights parameter for that.

-

Calculate the values from memory and add it to the stack

+

Calculate the memory vector as a weighted sum of layer outputs

-
509            self.mem_value.append(step, self.value(mem))
+
515            mem = torch.einsum('lbd,l->bd', layer_outputs, self.softmax(self.weights))
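For readers unfamiliar with einsum notation, the memory vector on new line 515 is a softmax-weighted sum over the layer dimension; an equivalent (illustrative) spelling inside the same method would be:

weights = self.softmax(self.weights)                       # [n_layers + 1]
mem = (layer_outputs * weights[:, None, None]).sum(dim=0)  # [batch_size, d_model]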
@@ -1612,10 +1614,10 @@ This is the weights parameter for that.

-

Append the output to results

+

Calculate the keys from memory and add it to the stack

-
511            res.append(x)
+
517            self.mem_key.append(step, self.key(mem))
@@ -1623,10 +1625,10 @@ This is the weights parameter for that.

-

Stack the output tensors

+

Calculate the values from memory and add it to the stack

-
514        res = torch.stack(res)
+
519            self.mem_value.append(step, self.value(mem))
@@ -1634,10 +1636,45 @@ This is the weights parameter for that.

+

Append the output to results

+
+
+
521            res.append(x)
+
+ +
+
+ +

Stack the output tensors

+
+
+
524        res = torch.stack(res)
+
+
+
+
+

Normalize the output

-
516        return self.norm(res)
+
526        return self.norm(res)
+
+
+
+
+ + +
+
+
528    def free(self):
+529        self.mem_key.free()
+530        self.mem_value.free()
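The new free() methods let a caller drop the cached key/value stacks once it is done with a sequence. A hypothetical usage sketch (layer, x_seq and the constructor arguments are placeholders, not part of this diff):

model = FeedbackTransformerKV(layer, n_layers=6, d_model=512, heads=8)
with torch.no_grad():
    out = model(x_seq)  # x_seq: [seq_len, batch_size, d_model]
model.free()            # releases the Stack buffers so they can be garbage collected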
-- GitLab