# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import numpy as np
import parl
import torch
import torch.optim as optim
from tqdm import tqdm

from utils import *
from connect4_model import Connect4Model

args = dotdict({
    'lr': 0.001,
    'dropout': 0.3,
    'epochs': 5,
    'batch_size': 64,
    'num_channels': 64,
})


class AlphaZero(parl.Algorithm):
    def __init__(self, model):
        self.model = model

    def learn(self, boards, target_pis, target_vs, optimizer):
        self.model.train()  # train mode

        # compute model output
        out_log_pi, out_v = self.model(boards)

        pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0]
        v_loss = torch.sum(
            (target_vs - out_v.view(-1))**2) / target_vs.size()[0]
        total_loss = pi_loss + v_loss

        # compute gradient and do SGD step
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        return total_loss, pi_loss, v_loss

    def predict(self, board):
        self.model.eval()  # eval mode

        with torch.no_grad():
            log_pi, v = self.model(board)

        pi = torch.exp(log_pi)
        return pi, v


class AlphaZeroAgent(parl.Agent):
    def __init__(self, algorithm, game, cuda):
        super(AlphaZeroAgent, self).__init__(algorithm)
        self.cuda = cuda
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()

    def learn(self, examples):
        """
        Args:
            examples: list of examples, each example is of form (board, pi, v)
        """
        optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr)

        for epoch in range(args.epochs):
            print('EPOCH ::: ' + str(epoch + 1))

            batch_count = int(len(examples) / args.batch_size)
            pbar = tqdm(range(batch_count), desc='Training Net')
            for _ in pbar:
                sample_ids = np.random.randint(
                    len(examples), size=args.batch_size)
                boards, pis, vs = list(
                    zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(
                    np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(
                    np.array(vs).astype(np.float64))

                if self.cuda:
                    boards, target_pis, target_vs = boards.contiguous().cuda(
                    ), target_pis.contiguous().cuda(), target_vs.contiguous(
                    ).cuda()

                total_loss, pi_loss, v_loss = self.algorithm.learn(
                    boards, target_pis, target_vs, optimizer)

                # record loss with tqdm
                pbar.set_postfix(
                    Loss_pi=pi_loss.item(), Loss_v=v_loss.item())

    def predict(self, board):
        """
        Args:
            board (np.array): input board

        Return:
            pi (np.array): probability of actions
            v (np.array): estimated value of input
        """
        # preparing input
        board = torch.FloatTensor(board.astype(np.float64))
        if self.cuda:
            board = board.contiguous().cuda()
        board = board.view(1, self.board_x, self.board_y)

        pi, v = self.algorithm.predict(board)

        return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]


def create_agent(game, cuda=True):
    cuda = cuda and torch.cuda.is_available()
    model = Connect4Model(game, args)
    if cuda:
        model.cuda()
    algorithm = AlphaZero(model)
    alphazero_agent = AlphaZeroAgent(algorithm, game, cuda)
    return alphazero_agent
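

# Example usage (a minimal sketch, not part of the original module). It assumes
# a Connect4Game class from the surrounding Connect4 benchmark that provides
# getBoardSize(), getActionSize() and getInitBoard(); adjust names to the actual
# game implementation used alongside this agent.
#
#   from connect4_game import Connect4Game
#
#   game = Connect4Game()
#   agent = create_agent(game, cuda=False)
#   pi, v = agent.predict(game.getInitBoard())
#   # pi: per-action probabilities for the current board, v: scalar value estimate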