# parallel_run.py

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import gym
import numpy as np
import os
import time
from tqdm import tqdm

import parl
import paddle.fluid as fluid
from parl.utils import get_gpu_count
from parl.utils import tensorboard, logger

from dqn import DQN  # slight changes from parl.algorithms.DQN
from atari_agent import AtariAgent
from atari_model import AtariModel
from replay_memory import ReplayMemory, Experience
from utils import get_player

# Capacity handed to ReplayMemory; also the number of environment steps
# collect_exp() runs to fill the memory.
MEMORY_SIZE = int(1e6)
# One twentieth of the memory capacity (not referenced in the visible part of
# this script).
MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20
# Image size passed to both get_player() and ReplayMemory.
IMAGE_SIZE = (84, 84)
# Number of leading frames sliced out of a sampled batch as the current
# observation; also passed to ReplayMemory.
CONTEXT_LEN = 4
# Passed to get_player() as frame_skip.
FRAME_SKIP = 4
# Not referenced in the visible part of this script.
UPDATE_FREQ = 4
# Passed to DQN as gamma.
GAMMA = 0.99
# Base learning rate; multiplied by gpu_num before being handed to DQN.
LEARNING_RATE = 3e-4

# gpu_num multiplies both the sampled batch size and the learning rate.
# get_gpu_count() may report 0 when no GPU is visible (CPU-only host), which
# would yield an empty training batch and a zero learning rate, so clamp it
# to at least 1.
gpu_num = max(get_gpu_count(), 1)


def run_train_step(agent, rpm):
    """Train ``agent`` on batches drawn from the replay memory ``rpm``.

    Performs ``args.train_total_steps`` learning steps. On every 100th step
    (starting with step 0) a test batch is sampled, the agent is evaluated on
    it, and the latest training cost and the evaluation cost are logged.

    Args:
        agent: object providing ``learn`` and ``supervised_eval``.
        rpm: replay memory providing ``sample_batch`` and
            ``sample_test_batch``.
    """

    def split_frames(stacked_obs):
        # The first CONTEXT_LEN frames are the current observation; the
        # frames from index 1 onward are the next observation.
        current = stacked_obs[:, :CONTEXT_LEN, :, :]
        following = stacked_obs[:, 1:, :, :]
        return current, following

    for step in range(args.train_total_steps):
        # use the first 80% data to train
        stacked, actions, rewards, terminals = rpm.sample_batch(
            args.batch_size * gpu_num)
        obs, next_obs = split_frames(stacked)
        cost = agent.learn(obs, actions, rewards, next_obs, terminals)

        if step % 100 != 0:
            continue

        # use the last 20% data to evaluate
        stacked, actions, rewards, terminals = rpm.sample_test_batch(
            args.batch_size)
        obs, next_obs = split_frames(stacked)
        eval_cost = agent.supervised_eval(obs, actions, rewards, next_obs,
                                          terminals)
        message = "train step {}, train costs are {}, eval cost is {}.".format(
            step, cost, eval_cost)
        logger.info(message)


def collect_exp(env, rpm, agent):
    """Fill the replay memory by running ``agent`` in ``env``.

    Runs MEMORY_SIZE environment steps. At each step the action is sampled
    from the agent given the recent observations held by ``rpm`` plus the
    current observation, and the resulting transition is appended to ``rpm``.
    When an episode ends the environment is reset before the next step.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        rpm: replay memory exposing ``recent_obs()`` and ``append()``.
        agent: agent exposing ``sample(context)``.
    """
    obs = env.reset()
    # collect data to fulfill replay memory
    for _ in tqdm(range(MEMORY_SIZE)):
        # Stack the recent observations and the current one along a new
        # leading axis to build the agent's input.
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)
        action = agent.sample(context)

        next_obs, reward, isOver, _info = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # Under the gym API, calling step() after an episode has ended gives
        # undefined results, so start a fresh episode instead of stepping on.
        obs = env.reset() if isOver else next_obs


def main():
    """Build the environment, replay memory and agent, then train or collect.

    A checkpoint at ``./model.ckpt`` is restored when that file exists. With
    ``args.train`` set, the agent is trained on the replay memory and the
    model is saved; otherwise experience is collected into the replay memory
    and the memory is saved.
    """
    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    rpm = ReplayMemory(
        MEMORY_SIZE,
        IMAGE_SIZE,
        CONTEXT_LEN,
        load_file=True,  # load replay memory data from file
        file_path="memory.npz")
    num_actions = env.action_space.n

    # Model -> algorithm -> agent; the learning rate is scaled by gpu_num.
    q_model = AtariModel(num_actions)
    dqn = DQN(
        q_model,
        act_dim=num_actions,
        gamma=GAMMA,
        lr=LEARNING_RATE * gpu_num)
    agent = AtariAgent(
        dqn, act_dim=num_actions, total_step=args.train_total_steps)

    checkpoint = './model.ckpt'
    if os.path.isfile(checkpoint):
        logger.info("load model from file")
        agent.restore(checkpoint)

    if not args.train:
        # Collection mode: fill the replay memory and persist it.
        logger.info("collect experience")
        collect_exp(env, rpm, agent)
        rpm.save_memory()
        logger.info("finish collecting, save successfully")
        return

    # Training mode: learn from the stored memory and persist the model.
    logger.info("train with memory data")
    run_train_step(agent, rpm)
    logger.info("finish training. Save the model.")
    agent.save(checkpoint)


if __name__ == '__main__':
    # Command-line options as (flag, keyword-arguments) pairs. They are
    # registered in this order so the --help listing stays the same.
    cli_options = (
        ('--rom', {
            'help': 'path of the rom of the atari game',
            'required': True,
        }),
        ('--batch_size', {
            'type': int,
            'default': 64,
            'help': 'batch size for each GPU',
        }),
        ('--train', {
            'action': "store_true",
            'help': 'update the value function (default: False)',
        }),
        ('--train_total_steps', {
            'type': int,
            'default': int(1e6),
            'help': 'maximum environmental steps of games',
        }),
    )

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    # `args` stays a module-level global: run_train_step() and main() read it.
    args = parser.parse_args()
    main()