diff --git a/.copyright.hook b/.copyright.hook index 3be6d0ae5bf352aa08ee44ab2144670f1bf03510..1b0acacb97a1b3059fcc88fb44b6168fa0419473 100644 --- a/.copyright.hook +++ b/.copyright.hook @@ -1,6 +1,5 @@ from __future__ import absolute_import from __future__ import print_function -from __future__ import unicode_literals import argparse import io, re diff --git a/.teamcity/Dockerfile b/.teamcity/Dockerfile index c3d1c209eb04bf7379969a28d0be4ce1bfe10c0d..99eec25ba86ed4d2acf77faf25f14d9092b09595 100644 --- a/.teamcity/Dockerfile +++ b/.teamcity/Dockerfile @@ -18,3 +18,7 @@ FROM parl/parl-test:cuda9.0-cudnn7-v2 COPY ./requirements.txt /root/ + +RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip +RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \ + && cd build && cmake .. && make && cp libgtest*.a /usr/local/lib diff --git a/.teamcity/build.sh b/.teamcity/build.sh index 6a33424797690bcd088381bd8173ae7d881c2dbc..1f3c0cd20e3dfc0fa3eb378d21d5e490d8afea33 100755 --- a/.teamcity/build.sh +++ b/.teamcity/build.sh @@ -69,7 +69,7 @@ function run_test_with_gpu() { Running unit tests with GPU... ======================================== EOF - ctest --output-on-failure -j10 + ctest --output-on-failure -j20 --verbose cd ${REPO_ROOT} rm -rf ${REPO_ROOT}/build } @@ -90,7 +90,7 @@ function run_test_with_cpu() { ===================================================== EOF if [ $# -eq 1 ];then - ctest --output-on-failure -j10 + ctest --output-on-failure -j20 --verbose else ctest --output-on-failure fi @@ -145,7 +145,8 @@ function main() { ;; test) # test code compability in environments with various python versions - declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") + #declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") + declare -a envs=("py27" "py36") for env in "${envs[@]}";do cd /work source ~/.bashrc @@ -158,7 +159,7 @@ function main() { echo ======================================== pip install . 
if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ] - then + then pip install -r .teamcity/requirements.txt run_test_with_cpu $env run_test_with_cpu $env "DIS_TESTING_SERIALLY" @@ -169,6 +170,10 @@ function main() { pip install -r .teamcity/requirements_torch.txt run_test_with_cpu $env "DIS_TESTING_TORCH" fi + # clean env + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + xparl stop done run_test_with_gpu diff --git a/.teamcity/requirements.txt b/.teamcity/requirements.txt index 354e3632e02ce8e678df2024a6d16657281c1a0e..8ed94543532fee0c02b048a36dba05832ae3d161 100644 --- a/.teamcity/requirements.txt +++ b/.teamcity/requirements.txt @@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97 gym details parameterized -timeout_decorator diff --git a/.teamcity/requirements_torch.txt b/.teamcity/requirements_torch.txt index dd2808a12eaab7e3158d09334ffb916917427417..5cdd9ea56ad6cc2db2ecd1fc6f7e046ff84507b7 100644 --- a/.teamcity/requirements_torch.txt +++ b/.teamcity/requirements_torch.txt @@ -2,4 +2,3 @@ gym details parameterized -timeout_decorator diff --git a/.scripts/update_readme_paddle_version.py b/.teamcity/update_readme_paddle_version.py similarity index 94% rename from .scripts/update_readme_paddle_version.py rename to .teamcity/update_readme_paddle_version.py index 56d56914c65956a2bb753bc58269d59034766b1c..901d2d672d9f3eff1021241ac80b6e9f75d0886a 100644 --- a/.scripts/update_readme_paddle_version.py +++ b/.teamcity/update_readme_paddle_version.py @@ -37,7 +37,8 @@ if __name__ == '__main__': exclude_examples = [ 'NeurIPS2019-Learn-to-Move-Challenge', - 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode' + 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline', + 'EagerMode' ] for example in os.listdir('../examples/'): if example not in exclude_examples: diff --git a/.teamcity/windows_test.sh b/.teamcity/windows_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..a6d12a6f6c9c212e406f8e900a03c3f4f0cfc44b --- /dev/null +++ b/.teamcity/windows_test.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: You need install mingw-cmake. + +function init() { + RED='\033[0;31m' + BLUE='\033[0;34m' + BOLD='\033[1m' + NONE='\033[0m' + + REPO_ROOT=`pwd` +} + + +function abort(){ + echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 + echo "Please use pre-commit to check what is wrong." 1>&2 + exit 1 +} + +function run_test_with_cpu() { + export CUDA_VISIBLE_DEVICES="-1" + + mkdir -p ${REPO_ROOT}/build + cd ${REPO_ROOT}/build + if [ $# -eq 1 ];then + cmake -G "MinGW Makefiles" .. + else + cmake -G "MinGW Makefiles" .. 
-$2=ON + fi + cat < [English](./README.md) | 简体中文 -[**文档**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) +[**文档**](https://parl.readthedocs.io/en/stable/index.html) > PARL 是一个高性能、灵活的强化学习框架。 # 特点 @@ -48,7 +48,7 @@ class Agent(object): parl.connect('localhost:8037') agent = Agent() agent.say_hello() -ans = agent.sum(1,5) # run remotely and not comsume any local computation resources +ans = agent.sum(1,5) # run remotely and not comsume any local computation resources ``` 两步调度外部的计算资源: 1. 使用`parl.remote_class`修饰一个类,之后这个类就被转化为可以运行在其他CPU或者机器上的类。 @@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour # 安装: ### 依赖 -- Python 2.7 or 3.5+. -- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) +- Python 2.7 or 3.5+. (**Windows系统**目前仅支持python3.6+以上的环境) +- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) ``` @@ -83,6 +83,6 @@ pip install parl - [冠军解决方案:NIPS2018强化学习假肢挑战赛](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/) - [冠军解决方案:NIPS2019强化学习仿生人控制赛事](examples/NeurIPS2019-Learn-to-Move-Challenge/) -NeurlIPS2018 Half-Cheetah Breakout +NeurlIPS2018 Half-Cheetah Breakout
NeurlIPS2018 diff --git a/README.md b/README.md index a5cbdd76a71c01a04c33f79fe701322a57795010..ed8ae1e28a6864e0a1d171a172d17dfe1bc03b8f 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

English | [简体中文](./README.cn.md) -[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) +[**Documentation**](https://parl.readthedocs.io/en/stable/index.html) > PARL is a flexible and high-efficient reinforcement learning framework. @@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c # Install: ### Dependencies -- Python 2.7 or 3.5+. +- Python 2.7 or 3.5+(On **Windows**, PARL only supprorts the enviroment with python3.6+). - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone) diff --git a/benchmark/torch/AlphaZero/.pic/good_moves.png b/benchmark/torch/AlphaZero/.pic/good_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..f007fc4a6f2dbc9df9a6a8163de08dcf59cb82dc Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/good_moves.png differ diff --git a/benchmark/torch/AlphaZero/.pic/perfect_moves.png b/benchmark/torch/AlphaZero/.pic/perfect_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..72c3913ea58498446e92d170255c71606e194fe0 Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/perfect_moves.png differ diff --git a/benchmark/torch/AlphaZero/Arena.py b/benchmark/torch/AlphaZero/Arena.py new file mode 100644 index 0000000000000000000000000000000000000000..a0791803eb1061485f2f6a647540d9bc9d4f45ee --- /dev/null +++ b/benchmark/torch/AlphaZero/Arena.py @@ -0,0 +1,105 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +from tqdm import tqdm +from parl.utils import logger + + +class Arena(): + """ + An Arena class where any 2 agents can be pit against each other. + """ + + def __init__(self, player1, player2, game, display=None): + """ + Input: + player 1,2: two functions that takes board as input, return action + game: Game object + display: a function that takes board as input and prints it (e.g. + display in othello/OthelloGame). Is necessary for verbose + mode. + + see othello/OthelloPlayers.py for an example. See pit.py for pitting + human players/other baselines with each other. + """ + self.player1 = player1 + self.player2 = player2 + self.game = game + self.display = display + + def playGame(self, verbose=False): + """ + Executes one episode of a game. + + Returns: + either + winner: player who won the game (1 if player1, -1 if player2) + or + draw result returned from the game that is neither 1, -1, nor 0. 
+ """ + players = [self.player2, None, self.player1] + curPlayer = 1 + board = self.game.getInitBoard() + it = 0 + while self.game.getGameEnded(board, curPlayer) == 0: + it += 1 + if verbose: + assert self.display + print("Turn ", str(it), "Player ", str(curPlayer)) + self.display(board) + action = players[curPlayer + 1](self.game.getCanonicalForm( + board, curPlayer)) + + valids = self.game.getValidMoves( + self.game.getCanonicalForm(board, curPlayer), 1) + + if valids[action] == 0: + logger.error('Action {} is not valid!'.format(action)) + logger.debug('valids = {}'.format(valids)) + assert valids[action] > 0 + board, curPlayer = self.game.getNextState(board, curPlayer, action) + if verbose: + assert self.display + print("Game over: Turn ", str(it), "Result ", + str(self.game.getGameEnded(board, 1))) + self.display(board) + return curPlayer * self.game.getGameEnded(board, curPlayer) + + def playGames(self, num, verbose=False): + """ + Plays num games in which player1 starts num/2 games and player2 starts + num/2 games. + + Returns: + oneWon: games won by player1 + twoWon: games won by player2 + draws: games won by nobody + """ + + num = int(num / 2) + oneWon = 0 + twoWon = 0 + draws = 0 + for _ in tqdm(range(num), desc="Arena.playGames (1)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == 1: + oneWon += 1 + elif gameResult == -1: + twoWon += 1 + else: + draws += 1 + + self.player1, self.player2 = self.player2, self.player1 + + for _ in tqdm(range(num), desc="Arena.playGames (2)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == -1: + oneWon += 1 + elif gameResult == 1: + twoWon += 1 + else: + draws += 1 + + return oneWon, twoWon, draws diff --git a/benchmark/torch/AlphaZero/Coach.py b/benchmark/torch/AlphaZero/Coach.py new file mode 100644 index 0000000000000000000000000000000000000000..01394b076db969db42a7277b5d95f82bd661db3d --- /dev/null +++ b/benchmark/torch/AlphaZero/Coach.py @@ -0,0 +1,246 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import threading +import queue +import pickle +from pickle import Pickler, Unpickler +from random import shuffle +from parl.utils import tensorboard + +import numpy as np +from tqdm import tqdm + +import parl +from parl.utils import logger + +from actor import Actor +from utils import split_group, get_test_dataset +from alphazero_agent import create_agent + + +class Coach(): + """ + This class executes the self-play, learning and evaluating. 
+ """ + + def __init__(self, game, args): + self.game = game + self.args = args + + # neural network of current generation + self.current_agent = create_agent(self.game) + # neural network of previous generation + self.previous_agent = create_agent(self.game) + + # history of examples from args.numItersForTrainExamplesHistory latest iterations + self.trainExamplesHistory = [] + + self.remote_actors_signal_queues = [] + self.remote_actors_return_queue = queue.Queue() + + self.test_dataset = get_test_dataset() + + def _run_remote_tasks(self, signal_queue): + # The remote actor will actually run on the local machine or other machines of xparl cluster + remote_actor = Actor(self.game, self.args) + + while True: + # receive running task signal + # signal: specify task type and task input data (optional) + signal = signal_queue.get() + + if signal["task"] == "self-play": + episode_num_each_actor = self.args.numEps // self.args.actors_num + result = remote_actor.self_play( + self.current_agent.get_weights(), episode_num_each_actor) + self.remote_actors_return_queue.put({"self-play": result}) + + elif signal["task"] == "pitting": + games_num_each_actor = self.args.arenaCompare // self.args.actors_num + result = remote_actor.pitting( + self.previous_agent.get_weights(), + self.current_agent.get_weights(), games_num_each_actor) + self.remote_actors_return_queue.put({"pitting": result}) + + elif signal["task"] == "evaluate_test_dataset": + test_dataset = signal["test_dataset"] + result = remote_actor.evaluate_test_dataset( + self.current_agent.get_weights(), test_dataset) + self.remote_actors_return_queue.put({ + "evaluate_test_dataset": + result + }) + else: + raise NotImplementedError + + def _create_remote_actors(self): + # connect to xparl cluster to submit jobs + parl.connect(self.args.master_address) + + for i in range(self.args.actors_num): + signal_queue = queue.Queue() + self.remote_actors_signal_queues.append(signal_queue) + + remote_thread = threading.Thread( + target=self._run_remote_tasks, args=(signal_queue, )) + remote_thread.setDaemon(True) + remote_thread.start() + + def learn(self): + """Each iteration: + 1. Performs numEps episodes of self-play. + 2. Retrains neural network with examples in trainExamplesHistory + (which has a maximum length of numItersForTrainExamplesHistory). + 3. Evaluates the new neural network with the test dataset. + 4. Pits the new neural network against the old one and accepts it + only if it wins >= updateThreshold fraction of games. + """ + + # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel. 
+ self._create_remote_actors() + + for iteration in range(1, self.args.numIters + 1): + logger.info('Starting Iter #{} ...'.format(iteration)) + + #################### + logger.info('Step1: self-play in parallel...') + iterationTrainExamples = [] + # update weights of remote actors to the latest weights, and ask them to run self-play task + for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "self-play"}) + # wait for all remote actors (a total of self.args.actors_num) to return the self-play results + for _ in range(self.args.actors_num): + result = self.remote_actors_return_queue.get() + iterationTrainExamples.extend(result["self-play"]) + + # save the iteration examples to the history + self.trainExamplesHistory.append(iterationTrainExamples) + if len(self.trainExamplesHistory + ) > self.args.numItersForTrainExamplesHistory: + logger.warning("Removing the oldest entry in trainExamples.") + self.trainExamplesHistory.pop(0) + self.saveTrainExamples(iteration) # backup history to a file + + #################### + logger.info('Step2: train neural network...') + # shuffle examples before training + trainExamples = [] + for e in self.trainExamplesHistory: + trainExamples.extend(e) + shuffle(trainExamples) + + # training new network, keeping a copy of the old one + self.current_agent.save( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + self.previous_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + + self.current_agent.learn(trainExamples) + + #################### + logger.info('Step3: evaluate test dataset in parallel...') + cnt = 0 + # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset + for i, data in enumerate( + split_group( + self.test_dataset, + len(self.test_dataset) // self.args.actors_num)): + self.remote_actors_signal_queues[i].put({ + "task": + "evaluate_test_dataset", + "test_dataset": + data + }) + cnt += len(data) + perfect_moves_cnt, good_moves_cnt = 0, 0 + # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results + for _ in range(self.args.actors_num): + (perfect_moves, + good_moves) = self.remote_actors_return_queue.get( + )["evaluate_test_dataset"] + perfect_moves_cnt += perfect_moves + good_moves_cnt += good_moves + logger.info('perfect moves rate: {}, good moves rate: {}'.format( + perfect_moves_cnt / cnt, good_moves_cnt / cnt)) + tensorboard.add_scalar('perfect_moves_rate', + perfect_moves_cnt / cnt, iteration) + tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt, + iteration) + + #################### + logger.info( + 'Step4: pitting against previous generation in parallel...') + # transfer weights of previous generation and current generation to the remote actors, and ask them to pit. 
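+            # Acceptance rule used below: with the default updateThreshold of 0.6, the new
+            # network is kept only if it wins at least 60% of the decided (non-draw) games.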
+ for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "pitting"}) + previous_wins, current_wins, draws = 0, 0, 0 + for _ in range(self.args.actors_num): + (pwins_, cwins_, + draws_) = self.remote_actors_return_queue.get()["pitting"] + previous_wins += pwins_ + current_wins += cwins_ + draws += draws_ + + logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % + (current_wins, previous_wins, draws)) + if previous_wins + current_wins == 0 or float(current_wins) / ( + previous_wins + current_wins) < self.args.updateThreshold: + logger.info('REJECTING NEW MODEL') + self.current_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + else: + logger.info('ACCEPTING NEW MODEL') + self.current_agent.save( + os.path.join(self.args.checkpoint, 'best.pth.tar')) + self.current_agent.save( + os.path.join(self.args.checkpoint, + self.getCheckpointFile(iteration))) + + def getCheckpointFile(self, iteration): + return 'checkpoint_' + str(iteration) + '.pth.tar' + + def saveTrainExamples(self, iteration): + folder = self.args.checkpoint + if not os.path.exists(folder): + os.makedirs(folder) + filename = os.path.join( + folder, + self.getCheckpointFile(iteration) + ".examples") + with open(filename, "wb+") as f: + Pickler(f).dump(self.trainExamplesHistory) + f.closed + + def loadModel(self): + self.current_agent.restore( + os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1])) + + def loadTrainExamples(self): + modelFile = os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1]) + examplesFile = modelFile + ".examples" + if not os.path.isfile(examplesFile): + logger.warning( + "File {} with trainExamples not found!".format(examplesFile)) + r = input("Continue? [y|n]") + if r != "y": + sys.exit() + else: + logger.info("File with trainExamples found. Loading it...") + with open(examplesFile, "rb") as f: + self.trainExamplesHistory = Unpickler(f).load() + logger.info('Loading done!') diff --git a/benchmark/torch/AlphaZero/MCTS.py b/benchmark/torch/AlphaZero/MCTS.py new file mode 100644 index 0000000000000000000000000000000000000000..b011efe15dbdc10ccbe2c07e6d30b2e2aaa82d9d --- /dev/null +++ b/benchmark/torch/AlphaZero/MCTS.py @@ -0,0 +1,164 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import math +import time + +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. 
+ + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + for i in range(self.args.numMCTSSims): + dir_noise = (i == 0 and self.dirichlet_noise) + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? 
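+                    # Upper confidence bound computed above (PUCT, as in the AlphaZero paper):
+                    #   u(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
+                    # where Q(s, a) is taken as 0 for an edge that has not been visited yet.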
+ + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 diff --git a/benchmark/torch/AlphaZero/README.md b/benchmark/torch/AlphaZero/README.md new file mode 100644 index 0000000000000000000000000000000000000000..72d9c807fb5066c51b49520b8aca3a5e666e133c --- /dev/null +++ b/benchmark/torch/AlphaZero/README.md @@ -0,0 +1,58 @@ +## AlphaZero baseline for Connect4 game (distributed version) +- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo. +- We take advantage of the parallelism capacity of [PARL](https://github.com/PaddlePaddle/PARL) to support running self-play and evaluating tasks in parallel. +- We also provide scripts to pack your well-trained model to a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly. + +### Dependencies +- python3 +- [parl==1.3](https://github.com/PaddlePaddle/PARL) +- torch +- tqdm + +### Training +1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`) + +2. Start xparl cluster +```bash +# You can change following `cpu_num` and `args.actor_nums` in the main.py +# based on the CPU number of your machine. + +xparl start --port 8010 --cpu_num 25 +``` + +```bash +# [OPTIONAL] You can also run the following script in other machines to add more CPU resource +# to the xparl cluster, so you can increase the parallelism (args.actor_nums). + +xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM] +``` + +3. Run training script +```bash +python main.py +``` + +4. Visualize (good moves rate and perfect moves rate) +``` +tensorboard --logdir . +``` + +### Submitting +To submit the well-trained model to the Kaggle, you can use our provided script to generate `submission.py`, for example: +```bash +python gen_submission.py saved_model/best.pth.tar +``` + +### Performance +- Following are `good moves rate` and `perfect moves rate` indicators in tensorbaord, please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for specific meaning. + +good moves rate perfect moves rate + +> It takes about 1 day to run 25 iterations on the machine with 25 cpus. + +- It can reach about score 1368 (rank 5 on 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition. 
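+
+- Before uploading, you can sanity-check the generated `submission.py` locally. A minimal sketch, assuming the `kaggle-environments` package is installed (it is not part of the training dependencies listed above):
+
+```python
+from kaggle_environments import make
+
+env = make("connectx", debug=True)
+# pit the generated submission file against the built-in random agent
+env.run(["submission.py", "random"])
+print(env.render(mode="ansi"))
+```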
+ + +### Reference +- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general) +- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) diff --git a/benchmark/torch/AlphaZero/actor.py b/benchmark/torch/AlphaZero/actor.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed719b92d292903f81f7c92a983927bf5c9cab5 --- /dev/null +++ b/benchmark/torch/AlphaZero/actor.py @@ -0,0 +1,165 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl +import os +from alphazero_agent import create_agent +from MCTS import MCTS +from Arena import Arena +from utils import win_loss_draw + + +@parl.remote_class +class Actor(object): + def __init__(self, game, args): + os.environ['OMP_NUM_THREADS'] = "1" + self.game = game + self.args = args + + # neural network of previous generation + self.previous_agent = create_agent(self.game, cuda=False) + # neural network of current generation + self.current_agent = create_agent(self.game, cuda=False) + + # MCTS of previous generation + self.previous_mcts = MCTS( + self.game, self.previous_agent, self.args, dirichlet_noise=True) + # MCTS of current generation + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + + def self_play(self, current_weights, game_num): + """Collecting training data by self-play. 
+ + Args: + current_weights (numpy.array): latest weights of neural network + game_num (int): game number of self-play + + Returns: + train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v) + """ + + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + train_examples = [] + for _ in range(game_num): + # reset node state of MCTS + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + train_examples.extend(self._executeEpisode()) + return train_examples + + def pitting(self, previous_weights, current_weights, games_num): + """Fighting between previous generation agent and current generation agent + + Args: + previous_weights (numpy.array): weights of previous generation neural network + current_weights (numpy.array): weights of current generation neural network + game_num (int): game number of fighting + + Returns: + tuple of (game number of previous agent won, game number of current agent won, game number of draw) + """ + # update weights of previous and current neural network + self.previous_agent.set_weights(previous_weights) + self.current_agent.set_weights(current_weights) + + # reset node state of MCTS + self.previous_mcts = MCTS(self.game, self.previous_agent, self.args) + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + arena = Arena( + lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)), + lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)), + self.game) + previous_wins, current_wins, draws = arena.playGames(games_num) + + return (previous_wins, current_wins, draws) + + def evaluate_test_dataset(self, current_weights, test_dataset): + """Evaluate performance of latest neural nerwork + + Args: + current_weights (numpy.array): latest weights of neural network + test_dataset (list): game number of self-play + + Returns: + tuple of (number of perfect moves, number of good moves) + """ + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + perfect_move_count, good_move_count = 0, 0 + for data in test_dataset: + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + x = self.game.getCanonicalForm(data['board'], data['player']) + agent_move = int( + np.argmax(self.current_mcts.getActionProb(x, temp=0))) + + moves = data["move_score"] + perfect_score = max(moves) + perfect_moves = [i for i in range(7) if moves[i] == perfect_score] + + if agent_move in perfect_moves: + perfect_move_count += 1 + if win_loss_draw( + moves[agent_move]) == win_loss_draw(perfect_score): + good_move_count += 1 + + return (perfect_move_count, good_move_count) + + def _executeEpisode(self): + """ + + This function executes one episode of self-play, starting with player 1. + As the game goes on, each turn is added as a training example to + trainExamples. The game is played till the game ends. After the game + ends, the outcome of the game is used to assign values to each example + in trainExamples. + + It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter + uses temp=0. + + Returns: + trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v) + pi is the MCTS informed policy vector, v is +1 if + the player eventually won the game, else -1. 
+ """ + trainExamples = [] + board = self.game.getInitBoard() + self.curPlayer = 1 + episodeStep = 0 + + while True: + episodeStep += 1 + canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer) + temp = int(episodeStep < self.args.tempThresholdStep) + + pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp) + sym = self.game.getSymmetries(canonicalBoard, pi) + for b, p in sym: # board, pi + trainExamples.append([b, self.curPlayer, p, None]) + + action = np.random.choice(len(pi), p=pi) + board, self.curPlayer = self.game.getNextState( + board, self.curPlayer, action) + + r = self.game.getGameEnded(board, self.curPlayer) + + if r != 0: + return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer))) + for x in trainExamples] diff --git a/benchmark/torch/AlphaZero/alphazero_agent.py b/benchmark/torch/AlphaZero/alphazero_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7e497e4818f30ae8d71bee109f4ff6f9795962 --- /dev/null +++ b/benchmark/torch/AlphaZero/alphazero_agent.py @@ -0,0 +1,150 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import parl +import torch +import torch.optim as optim + +from tqdm import tqdm +from utils import * +from connect4_model import Connect4Model + +args = dotdict({ + 'lr': 0.001, + 'dropout': 0.3, + 'epochs': 5, + 'batch_size': 64, + 'num_channels': 64, +}) + + +class AlphaZero(parl.Algorithm): + def __init__(self, model): + self.model = model + + def learn(self, boards, target_pis, target_vs, optimizer): + self.model.train() # train mode + + # compute model output + out_log_pi, out_v = self.model(boards) + + pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0] + + v_loss = torch.sum( + (target_vs - out_v.view(-1))**2) / target_vs.size()[0] + + total_loss = pi_loss + v_loss + + # compute gradient and do SGD step + optimizer.zero_grad() + total_loss.backward() + optimizer.step() + + return total_loss, pi_loss, v_loss + + def predict(self, board): + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + return pi, v + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent + + +class AlphaZeroAgent(parl.Agent): + def __init__(self, algorithm, game, cuda): + super(AlphaZeroAgent, self).__init__(algorithm) + self.cuda = cuda + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def learn(self, examples): + """ + Args: + examples: list of examples, each example is of form (board, pi, v) + """ + optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr) + + for epoch in range(args.epochs): + print('EPOCH ::: ' + str(epoch + 1)) + + batch_count = int(len(examples) / args.batch_size) + + pbar = 
tqdm(range(batch_count), desc='Training Net') + for _ in pbar: + sample_ids = np.random.randint( + len(examples), size=args.batch_size) + boards, pis, vs = list(zip(*[examples[i] for i in sample_ids])) + boards = torch.FloatTensor(np.array(boards).astype(np.float64)) + target_pis = torch.FloatTensor(np.array(pis)) + target_vs = torch.FloatTensor(np.array(vs).astype(np.float64)) + + if self.cuda: + boards, target_pis, target_vs = boards.contiguous().cuda( + ), target_pis.contiguous().cuda(), target_vs.contiguous( + ).cuda() + + total_loss, pi_loss, v_loss = self.algorithm.learn( + boards, target_pis, target_vs, optimizer) + + # record loss with tqdm + pbar.set_postfix(Loss_pi=pi_loss.item(), Loss_v=v_loss.item()) + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + pi, v = self.algorithm.predict(board) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent diff --git a/benchmark/torch/AlphaZero/connect4_game.py b/benchmark/torch/AlphaZero/connect4_game.py new file mode 100644 index 0000000000000000000000000000000000000000..c10e8ca4afbca839ef71b18fd8f39f7493f30a4d --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_game.py @@ -0,0 +1,239 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." + available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. 
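+        # WinState examples: WinState(True, 1) means player 1 has won, WinState(True, -1)
+        # means player -1 has won, WinState(True, None) is a draw, and the value returned
+        # below, WinState(False, None), means the game is still in progress.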
+ return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. + + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. 
+ return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. + """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") diff --git a/benchmark/torch/AlphaZero/connect4_model.py b/benchmark/torch/AlphaZero/connect4_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0f7705bfc40d1645d77c79ac7e47f1f721a317 --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_model.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
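+
+# The model below is a small policy-value network for the Connect4 board: four 3x3
+# convolutional layers (the last two, without padding, shrink the board by 4 in each
+# direction), two fully-connected layers, and two heads that output log-softmax action
+# probabilities over the columns and a tanh value estimate in [-1, 1].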
+ +import parl + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +class Connect4Model(parl.Model): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + """ + Args: + s(torch.Tensor): batch_size x board_x x board_y + """ + # batch_size x 1 x board_x x board_y + s = s.view(-1, 1, self.board_x, self.board_y) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn1(self.conv1(s))) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2(self.conv2(s))) + # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn3(self.conv3(s))) + # batch_size x num_channels x (board_x-4) x (board_y-4) + s = F.relu(self.bn4(self.conv4(s))) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) diff --git a/parl/framework/model_base.py b/benchmark/torch/AlphaZero/gen_submission.py similarity index 50% rename from parl/framework/model_base.py rename to benchmark/torch/AlphaZero/gen_submission.py index e4057a7706c2e26e66db340128679919290cb1bd..03728ec2cda4f155229ba7b4d18c7f2a22734e05 100644 --- a/parl/framework/model_base.py +++ b/benchmark/torch/AlphaZero/gen_submission.py @@ -12,13 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings +import sys +import base64 +import inspect +import os -warnings.simplefilter('default') +assert len(sys.argv) == 2, "please specify model path." 
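+# Example usage (as in the README): python gen_submission.py saved_model/best.pth.tar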
+model_path = sys.argv[1] -warnings.warn( - "module `parl.framework.model_base.Model` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Model` instead.", - DeprecationWarning, - stacklevel=2) +with open(model_path, 'rb') as f: + raw_bytes = f.read() + encoded_weights = base64.encodebytes(raw_bytes) -from parl.core.fluid.model import * +# encode weights of model to byte string +submission_file = """ +import base64 +decoded = base64.b64decode({}) + +""".format(encoded_weights) + +# insert code snippet of loading weights +with open('submission_template.py', 'r') as f: + submission_file += ''.join(f.readlines()) + +# generate final submission file +with open('submission.py', 'w') as f: + f.write(submission_file) diff --git a/benchmark/torch/AlphaZero/main.py b/benchmark/torch/AlphaZero/main.py new file mode 100644 index 0000000000000000000000000000000000000000..433e2ff0efb35e6a39df53a845a25a8110b20993 --- /dev/null +++ b/benchmark/torch/AlphaZero/main.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Coach import Coach +from connect4_game import Connect4Game +from utils import * + +from parl.utils import logger + +args = dotdict({ + # master address of xparl cluster + 'master_address': 'localhost:8010', + # number of remote actors (execute tasks [self-play/pitting/evaluate_test_dataset] in parallel). + 'actors_num': 25, + + # total number of iteration + 'numIters': 200, + # Number of complete self-play games to simulate during a new iteration. + 'numEps': 500, + # Number of games to play during arena (pitting) play to determine if new neural network will be accepted. + 'arenaCompare': 50, + # Number of games moves for MCTS to simulate. + 'numMCTSSims': 800, + # temp=1 (Temperature, τ (tau)) if episodeStep < tempThresholdStep, and thereafter uses temp=0. + 'tempThresholdStep': 15, + # During arena playoff, new neural net will be accepted if threshold or more of games are won. + 'updateThreshold': 0.6, + # CPUCT parameter + 'cpuct': 4, + # alpha parameter of dirichlet noise which is added to the policy (pi) + 'dirichletAlpha': 1.0, + # history of examples from numItersForTrainExamplesHistory latest iterations (training data) + 'numItersForTrainExamplesHistory': 20, + + # folder to save model and training examples + 'checkpoint': './saved_model/', + # whether to load saved model and training examples + 'load_model': False, + 'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'), +}) + +# Plays arenaCompare games in which player1 starts arenaCompare/2 games and player2 starts arenaCompare/2 games. 
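+# For example, with the default settings above (numEps=500, arenaCompare=50, actors_num=25),
+# each remote actor runs 500 / 25 = 20 self-play games, 50 / 25 = 2 arena games (one starting
+# as each player), and evaluates 1000 / 25 = 40 boards of the test dataset.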
+assert args.arenaCompare % 2 == 0 + +# make sure the tasks can be split evenly among different remote actors +assert args.numEps % args.actors_num == 0 +assert (args.arenaCompare // 2) % args.actors_num == 0 +assert 1000 % args.actors_num == 0 # there are 1000 boards state in test_dataset + + +def main(): + game = Connect4Game() + + c = Coach(game, args) + + if args.load_model: + logger.info('Loading checkpoint {}...'.format(args.load_folder_file)) + c.loadModel() + logger.info("Loading 'trainExamples' from file {}...".format( + args.load_folder_file)) + c.loadTrainExamples() + + c.learn() + + +if __name__ == "__main__": + main() diff --git a/benchmark/torch/AlphaZero/submission_template.py b/benchmark/torch/AlphaZero/submission_template.py new file mode 100644 index 0000000000000000000000000000000000000000..d9ba9e7eb85b0815403d98ae015c80f07f068334 --- /dev/null +++ b/benchmark/torch/AlphaZero/submission_template.py @@ -0,0 +1,559 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import os +os.environ['OMP_NUM_THREADS'] = "1" + + +# ===== utils.py ===== +class dotdict(dict): + def __getattr__(self, name): + return self[name] + + +# ===== MCTS.py ====== +import math +import time +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. + + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + dir_noise = self.dirichlet_noise + start_time = time.time() + while time.time() - start_time < timelimit: + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. 
This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? + + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 + + +# ===== connect4_game.py ====== +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." 
+ available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. + return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. 
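+        (A column is playable as long as its top cell is still empty.)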
+ + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. + return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. 
+ """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") + + +# ===== connect4_model ====== +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +#class Connect4Model(parl.Model): # Kaggle doesn't support parl package +class Connect4Model(nn.Module): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + # s: batch_size x board_x x board_y + s = s.view(-1, 1, self.board_x, + self.board_y) # batch_size x 1 x board_x x board_y + s = F.relu(self.bn1( + self.conv1(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2( + self.conv2(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn3(self.conv3( + s))) # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn4(self.conv4( + s))) # batch_size x num_channels x (board_x-4) x (board_y-4) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) + + +# ===== simple agent ====== +args = dotdict({ + 'dropout': 0.3, + 'num_channels': 64, +}) + + +class SimpleAgent(): + def __init__(self, game, cuda=True): + self.cuda = cuda and torch.cuda.is_available() + self.model = Connect4Model(game, args) + if self.cuda: + self.model.cuda() + + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + def load_checkpoint(self, buffer): + map_location = None if self.cuda else 'cpu' + checkpoint = torch.load(buffer, map_location=map_location) + self.model.load_state_dict(checkpoint) + + +# ===== predict 
function ====== +import base64 +import io + +game = Connect4Game() + +# AlphaZero players +agent = SimpleAgent(game) +buffer = io.BytesIO(decoded) +agent.load_checkpoint(buffer) +mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0}) +mcts = MCTS(game, agent, mcts_args) + + +def alphazero_agent(obs, config): + board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + player = 1 + if obs.mark == 2: + player = -1 + + x = game.getCanonicalForm(board, player) + + action = np.argmax( + mcts.getActionProb(x, temp=0, timelimit=config.timeout - 0.5)) + return int(action) diff --git a/benchmark/torch/AlphaZero/utils.py b/benchmark/torch/AlphaZero/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae500cdae19f002538c563b6cbae725c7b0d9af --- /dev/null +++ b/benchmark/torch/AlphaZero/utils.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class dotdict(dict): + def __getattr__(self, name): + try: + return self[name] + except KeyError: + raise AttributeError(name) + + +def win_loss_draw(score): + if score > 0: + return 'win' + if score < 0: + return 'loss' + return 'draw' + + +""" +split one list to multiple lists +""" +split_group = lambda the_list, group_size: zip(*(iter(the_list), ) * group_size) + +import numpy as np +import json +from connect4_game import Connect4Game + + +def get_test_dataset(): + game = Connect4Game() + test_dataset = [] + with open("refmoves1k_kaggle") as f: + for line in f: + data = json.loads(line) + + board = data["board"] + board = np.reshape(board, game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + # find out how many moves are played to set the correct mark. 
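+            # e.g. ply == 3 (odd) means three stones are already on the board,
+            # so it is the second player's turn and the canonical mark is -1.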
+ ply = len([x for x in data["board"] if x > 0]) + if ply & 1: + player = -1 + else: + player = 1 + + test_dataset.append({ + 'board': board, + 'player': player, + 'move_score': data['move score'], + }) + return test_dataset diff --git a/benchmark/torch/a2c/train.py b/benchmark/torch/a2c/train.py index f2985367f8304edb6bccc93f894a7d04f5f305c8..9a498023988bc72a0a0aa43d4850c25ced8d2856 100644 --- a/benchmark/torch/a2c/train.py +++ b/benchmark/torch/a2c/train.py @@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind from parl.utils.window_stat import WindowStat from parl.utils.time_stat import TimeStat from parl.utils import machine_info -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.algorithms import A2C from atari_model import ActorCritic @@ -205,19 +205,19 @@ class Learner(object): } if metric['mean_episode_rewards'] is not None: - tensorboard.add_scalar('train/mean_reward', - metric['mean_episode_rewards'], - self.sample_total_steps) - tensorboard.add_scalar('train/total_loss', metric['total_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/pi_loss', metric['pi_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/vf_loss', metric['vf_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/entropy', metric['entropy'], - self.sample_total_steps) - tensorboard.add_scalar('train/learn_rate', metric['lr'], - self.sample_total_steps) + summary.add_scalar('train/mean_reward', + metric['mean_episode_rewards'], + self.sample_total_steps) + summary.add_scalar('train/total_loss', metric['total_loss'], + self.sample_total_steps) + summary.add_scalar('train/pi_loss', metric['pi_loss'], + self.sample_total_steps) + summary.add_scalar('train/vf_loss', metric['vf_loss'], + self.sample_total_steps) + summary.add_scalar('train/entropy', metric['entropy'], + self.sample_total_steps) + summary.add_scalar('train/learn_rate', metric['lr'], + self.sample_total_steps) logger.info(metric) diff --git a/benchmark/torch/dqn/replay_memory.py b/benchmark/torch/dqn/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/benchmark/torch/dqn/replay_memory.py +++ b/benchmark/torch/dqn/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k 
in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py index 9db3b8f776fa669772bb2748cbfed0a7067f5909..ba64b95c93a9b4879621331ad30cce3cbcbcac16 100644 --- a/benchmark/torch/dqn/train.py +++ b/benchmark/torch/dqn/train.py @@ -22,11 +22,11 @@ import parl import numpy as np from tqdm import tqdm -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from parl.algorithms import DQN, DDQN from agent import AtariAgent -from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState +from atari_wrapper import FireResetEnv, FrameStack, LimitLength from model import AtariModel from replay_memory import ReplayMemory, Experience from utils import get_player @@ -43,57 +43,57 @@ GAMMA = 0.99 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, 
batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(cost) total_reward += reward - state = next_state + obs = next_obs if isOver: mean_loss = np.mean(all_cost) if all_cost else None return total_reward, steps, mean_loss def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - pred_Q = agent.predict(state) + pred_Q = agent.predict(obs) action = pred_Q.max(1)[1].item() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) total_reward += reward if isOver: return total_reward -def get_fixed_states(rpm, batch_size): - states = [] +def get_fixed_obs(rpm, batch_size): + obs = [] for _ in range(3): - batch_all_state = rpm.sample_batch(batch_size)[0] - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - states.append(batch_state) - fixed_states = np.concatenate(states, axis=0) - return fixed_states + batch_all_obs = rpm.sample_batch(batch_size)[0] + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + obs.append(batch_obs) + fixed_obs = np.concatenate(obs, axis=0) + return fixed_obs -def evaluate_fixed_Q(agent, states): +def evaluate_fixed_Q(agent, obs): with torch.no_grad(): - max_pred_Q = agent.alg.model(states).max(1)[0].mean() + max_pred_Q = agent.alg.model(obs).max(1)[0].mean() return max_pred_Q.item() @@ -131,9 +131,9 @@ def main(): total_reward, steps, _ = run_train_episode(env, agent, rpm) pbar.update(steps) - # Get fixed states to check value function. - fixed_states = get_fixed_states(rpm, args.batch_size) - fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) + # Get fixed obs to check value function. 
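+    # (This batch stays fixed for the whole run; the average max-Q the model
+    # assigns to it is logged via summary as a training-stability diagnostic.)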
+ fixed_obs = get_fixed_obs(rpm, args.batch_size) + fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device) # train test_flag = 0 @@ -152,18 +152,17 @@ def main(): for _ in range(3): eval_rewards.append(run_evaluate_episode(test_env, agent)) - tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards), - total_steps) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, total_steps) - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) - tensorboard.add_scalar('dqn/Q value', - evaluate_fixed_Q(agent, fixed_states), - total_steps) - tensorboard.add_scalar('dqn/grad_norm', - get_grad_norm(agent.alg.model), - total_steps) + summary.add_scalar('dqn/eval', np.mean(eval_rewards), + total_steps) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) + summary.add_scalar('dqn/exploration', agent.exploration, + total_steps) + summary.add_scalar('dqn/Q value', + evaluate_fixed_Q(agent, fixed_obs), + total_steps) + summary.add_scalar('dqn/grad_norm', + get_grad_norm(agent.alg.model), total_steps) if __name__ == '__main__': diff --git a/benchmark/torch/ppo/arguments.py b/benchmark/torch/ppo/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d5d33df54b4652a416f0f9bbb49c3d1bd4a522 --- /dev/null +++ b/benchmark/torch/ppo/arguments.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
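+
+# Illustrative usage (a minimal sketch; train.py in this directory is the real
+# entry point and simply calls get_args()):
+#
+#   from arguments import get_args
+#   args = get_args()        # e.g. launched as: python train.py --env-name Hopper-v2
+#   print(args.env_name, args.lr, args.cuda)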
+ +import argparse +import torch + + +def get_args(): + parser = argparse.ArgumentParser(description='RL') + parser.add_argument( + '--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)') + parser.add_argument( + '--eps', + type=float, + default=1e-5, + help='RMSprop optimizer epsilon (default: 1e-5)') + parser.add_argument( + '--gamma', + type=float, + default=0.99, + help='discount factor for rewards (default: 0.99)') + parser.add_argument( + '--gae-lambda', + type=float, + default=0.95, + help='gae lambda parameter (default: 0.95)') + parser.add_argument( + '--entropy-coef', + type=float, + default=0., + help='entropy term coefficient (default: 0.)') + parser.add_argument( + '--value-loss-coef', + type=float, + default=0.5, + help='value loss coefficient (default: 0.5)') + parser.add_argument( + '--max-grad-norm', + type=float, + default=0.5, + help='max norm of gradients (default: 0.5)') + parser.add_argument( + '--seed', type=int, default=1, help='random seed (default: 1)') + parser.add_argument( + '--num-steps', + type=int, + default=2048, + help='number of maximum forward steps in ppo (default: 2048)') + parser.add_argument( + '--ppo-epoch', + type=int, + default=10, + help='number of ppo epochs (default: 10)') + parser.add_argument( + '--num-mini-batch', + type=int, + default=32, + help='number of batches for ppo (default: 32)') + parser.add_argument( + '--clip-param', + type=float, + default=0.2, + help='ppo clip parameter (default: 0.2)') + parser.add_argument( + '--log-interval', + type=int, + default=1, + help='log interval, one log per n updates (default: 1)') + parser.add_argument( + '--eval-interval', + type=int, + default=10, + help='eval interval, one eval per n updates (default: 10)') + parser.add_argument( + '--num-env-steps', + type=int, + default=10e5, + help='number of environment steps to train (default: 10e5)') + parser.add_argument( + '--env-name', + default='Hopper-v2', + help='environment to train on (default: Hopper-v2)') + parser.add_argument( + '--use-linear-lr-decay', + action='store_true', + default=False, + help='use a linear schedule on the learning rate') + args = parser.parse_args() + + args.cuda = torch.cuda.is_available() + + return args diff --git a/benchmark/torch/ppo/evaluation.py b/benchmark/torch/ppo/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..8aa020ca66a0c3a97d8deea55e37dabc4cf7512b --- /dev/null +++ b/benchmark/torch/ppo/evaluation.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import torch + +import utils +from wrapper import make_env + + +def evaluate(agent, ob_rms, env_name, seed, device): + if seed != None: + seed += 1 + eval_envs = make_env(env_name, seed, None) + vec_norm = utils.get_vec_normalize(eval_envs) + if vec_norm is not None: + vec_norm.eval() + vec_norm.ob_rms = ob_rms + + eval_episode_rewards = [] + + obs = eval_envs.reset() + eval_masks = torch.zeros(1, 1, device=device) + + while len(eval_episode_rewards) < 10: + with torch.no_grad(): + action = agent.predict(obs) + + # Obser reward and next obs + obs, _, done, infos = eval_envs.step(action) + + eval_masks = torch.tensor( + [[0.0] if done_ else [1.0] for done_ in done], + dtype=torch.float32, + device=device) + + for info in infos: + if 'episode' in info.keys(): + eval_episode_rewards.append(info['episode']['r']) + + eval_envs.close() + + print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( + len(eval_episode_rewards), np.mean(eval_episode_rewards))) + return np.mean(eval_episode_rewards) diff --git a/benchmark/torch/ppo/mujoco_agent.py b/benchmark/torch/ppo/mujoco_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..096f683f958829c0780ecc59d9ed144367c15f38 --- /dev/null +++ b/benchmark/torch/ppo/mujoco_agent.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import parl
+import torch
+
+
+class MujocoAgent(parl.Agent):
+    def __init__(self, algorithm, device):
+        self.alg = algorithm
+        self.device = device
+
+    def predict(self, obs):
+        obs = torch.from_numpy(obs).float().to(self.device)
+        action = self.alg.predict(obs)
+        return action.cpu().numpy()
+
+    def sample(self, obs):
+        obs = torch.from_numpy(obs).to(self.device)
+        value, action, action_log_probs = self.alg.sample(obs)
+        return value.cpu().numpy(), action.cpu().numpy(), \
+            action_log_probs.cpu().numpy()
+
+    def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
+              rollouts):
+        value_loss_epoch = 0
+        action_loss_epoch = 0
+        dist_entropy_epoch = 0
+
+        for e in range(ppo_epoch):
+            data_generator = rollouts.sample_batch(next_value, gamma,
+                                                   gae_lambda, num_mini_batch)
+
+            for sample in data_generator:
+                obs_batch, actions_batch, \
+                    value_preds_batch, return_batch, old_action_log_probs_batch, \
+                        adv_targ = sample
+
+                # move the sampled numpy batches onto the agent's device
+                # (self.device rather than a hard-coded 'cuda', so CPU runs also work)
+                obs_batch = torch.from_numpy(obs_batch).to(self.device)
+                actions_batch = torch.from_numpy(actions_batch).to(
+                    self.device)
+                value_preds_batch = torch.from_numpy(value_preds_batch).to(
+                    self.device)
+                return_batch = torch.from_numpy(return_batch).to(self.device)
+                old_action_log_probs_batch = torch.from_numpy(
+                    old_action_log_probs_batch).to(self.device)
+                adv_targ = torch.from_numpy(adv_targ).to(self.device)
+
+                value_loss, action_loss, dist_entropy = self.alg.learn(
+                    obs_batch, actions_batch, value_preds_batch, return_batch,
+                    old_action_log_probs_batch, adv_targ)
+
+                value_loss_epoch += value_loss
+                action_loss_epoch += action_loss
+                dist_entropy_epoch += dist_entropy
+
+        num_updates = ppo_epoch * num_mini_batch
+
+        value_loss_epoch /= num_updates
+        action_loss_epoch /= num_updates
+        dist_entropy_epoch /= num_updates
+
+        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
+
+    def value(self, obs):
+        obs = torch.from_numpy(obs).to(self.device)
+        return self.alg.value(obs).cpu().numpy()
diff --git a/benchmark/torch/ppo/mujoco_model.py b/benchmark/torch/ppo/mujoco_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..83b762da2bd5a922d2a20605df641b6aec0ad949
--- /dev/null
+++ b/benchmark/torch/ppo/mujoco_model.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
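+
+# Interface sketch (illustrative, not executed here): policy() returns the action
+# mean plus a state-independent log-std, and value() returns V(s). The assumption
+# is that the PPO algorithm consumes them roughly as:
+#
+#   mean, log_std = model.policy(obs)                     # obs: FloatTensor [batch, obs_dim]
+#   dist = torch.distributions.Normal(mean, log_std.exp())
+#   action = dist.sample()
+#   value = model.value(obs)                              # [batch, 1]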
+ +import parl +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Normal + + +class MujocoModel(parl.Model): + def __init__(self, obs_dim, act_dim): + super(MujocoModel, self).__init__() + self.actor = Actor(obs_dim, act_dim) + self.critic = Critic(obs_dim) + + def policy(self, obs): + return self.actor(obs) + + def value(self, obs): + return self.critic(obs) + + +class Actor(parl.Model): + def __init__(self, obs_dim, act_dim): + super(Actor, self).__init__() + self.fc1 = nn.Linear(obs_dim, 64) + self.fc2 = nn.Linear(64, 64) + + self.fc_mean = nn.Linear(64, act_dim) + self.log_std = nn.Parameter(torch.zeros(act_dim)) + + def forward(self, obs): + x = torch.tanh(self.fc1(obs)) + x = torch.tanh(self.fc2(x)) + + mean = self.fc_mean(x) + return mean, self.log_std + + +class Critic(parl.Model): + def __init__(self, obs_dim): + super(Critic, self).__init__() + self.fc1 = nn.Linear(obs_dim, 64) + self.fc2 = nn.Linear(64, 64) + self.fc3 = nn.Linear(64, 1) + + def forward(self, obs): + x = torch.tanh(self.fc1(obs)) + x = torch.tanh(self.fc2(x)) + value = self.fc3(x) + + return value diff --git a/benchmark/torch/ppo/storage.py b/benchmark/torch/ppo/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..b986b670d545fb88938785fc812a320103023d5d --- /dev/null +++ b/benchmark/torch/ppo/storage.py @@ -0,0 +1,107 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
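+
+# compute_returns() below implements Generalized Advantage Estimation (GAE):
+#
+#   delta_t  = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
+#   A_t      = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}
+#   return_t = A_t + V(s_t)
+#
+# mask is 0 right after an episode terminates, and bad_mask additionally zeroes
+# the accumulated advantage when the termination came from a time limit rather
+# than a real terminal state.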
+ +import numpy as np + +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler + + +class RolloutStorage(object): + def __init__(self, num_steps, obs_dim, act_dim): + self.num_steps = num_steps + self.obs_dim = obs_dim + self.act_dim = act_dim + + self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32') + self.actions = np.zeros((num_steps, act_dim), dtype='float32') + self.value_preds = np.zeros((num_steps + 1, ), dtype='float32') + self.returns = np.zeros((num_steps + 1, ), dtype='float32') + self.action_log_probs = np.zeros((num_steps, ), dtype='float32') + self.rewards = np.zeros((num_steps, ), dtype='float32') + + self.masks = np.ones((num_steps + 1, ), dtype='bool') + self.bad_masks = np.ones((num_steps + 1, ), dtype='bool') + + self.step = 0 + + def append(self, obs, actions, action_log_probs, value_preds, rewards, + masks, bad_masks): + """ + print("obs") + print(obs) + print("masks") + print(masks) + print("rewards") + print(rewards) + exit() + """ + self.obs[self.step + 1] = obs + self.actions[self.step] = actions + self.rewards[self.step] = rewards + self.action_log_probs[self.step] = action_log_probs + self.value_preds[self.step] = value_preds + self.masks[self.step + 1] = masks + self.bad_masks[self.step + 1] = bad_masks + + self.step = (self.step + 1) % self.num_steps + + def sample_batch(self, + next_value, + gamma, + gae_lambda, + num_mini_batch, + mini_batch_size=None): + # calculate return and advantage first + self.compute_returns(next_value, gamma, gae_lambda) + advantages = self.returns[:-1] - self.value_preds[:-1] + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-5) + + # generate sample batch + mini_batch_size = self.num_steps // num_mini_batch + sampler = BatchSampler( + SubsetRandomSampler(range(self.num_steps)), + mini_batch_size, + drop_last=True) + for indices in sampler: + obs_batch = self.obs[:-1][indices] + actions_batch = self.actions[indices] + value_preds_batch = self.value_preds[:-1][indices] + returns_batch = self.returns[:-1][indices] + old_action_log_probs_batch = self.action_log_probs[indices] + + value_preds_batch = value_preds_batch.reshape(-1, 1) + returns_batch = returns_batch.reshape(-1, 1) + old_action_log_probs_batch = old_action_log_probs_batch.reshape( + -1, 1) + + adv_targ = advantages[indices] + adv_targ = adv_targ.reshape(-1, 1) + + yield obs_batch, actions_batch, value_preds_batch, returns_batch, old_action_log_probs_batch, adv_targ + + def after_update(self): + self.obs[0] = np.copy(self.obs[-1]) + self.masks[0] = np.copy(self.masks[-1]) + self.bad_masks[0] = np.copy(self.bad_masks[-1]) + + def compute_returns(self, next_value, gamma, gae_lambda): + self.value_preds[-1] = next_value + gae = 0 + for step in reversed(range(self.rewards.size)): + delta = self.rewards[step] + gamma * self.value_preds[ + step + 1] * self.masks[step + 1] - self.value_preds[step] + gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae + gae = gae * self.bad_masks[step + 1] + self.returns[step] = gae + self.value_preds[step] diff --git a/benchmark/torch/ppo/train.py b/benchmark/torch/ppo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6bb5dafbf4fbc6b96dc664030910446a7cfd46e1 --- /dev/null +++ b/benchmark/torch/ppo/train.py @@ -0,0 +1,128 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
+
+import copy
+import os
+from collections import deque
+
+import gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+import utils
+from arguments import get_args
+from wrapper import make_env
+from mujoco_model import MujocoModel
+from parl.algorithms import PPO
+from mujoco_agent import MujocoAgent
+from storage import RolloutStorage
+from evaluation import evaluate
+
+
+def main():
+    args = get_args()
+
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    torch.set_num_threads(1)
+    device = torch.device("cuda:0" if args.cuda else "cpu")
+
+    envs = make_env(args.env_name, args.seed, args.gamma)
+
+    model = MujocoModel(envs.observation_space.shape[0],
+                        envs.action_space.shape[0])
+    model.to(device)
+
+    algorithm = PPO(
+        model,
+        args.clip_param,
+        args.value_loss_coef,
+        args.entropy_coef,
+        initial_lr=args.lr,
+        eps=args.eps,
+        max_grad_norm=args.max_grad_norm)
+
+    agent = MujocoAgent(algorithm, device)
+
+    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
+                              envs.action_space.shape[0])
+
+    obs = envs.reset()
+    rollouts.obs[0] = np.copy(obs)
+
+    episode_rewards = deque(maxlen=10)
+
+    num_updates = int(args.num_env_steps) // args.num_steps
+    for j in range(num_updates):
+
+        if args.use_linear_lr_decay:
+            # decrease learning rate linearly
+            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
+                                         args.lr)
+
+        for step in range(args.num_steps):
+            # Sample actions
+            with torch.no_grad():
+                value, action, action_log_prob = agent.sample(
+                    rollouts.obs[step])  # rollouts.obs[step] holds the latest env obs (stored by append below)
+
+            # Observe reward and next obs
+            obs, reward, done, infos = envs.step(action)
+
+            for info in infos:
+                if 'episode' in info.keys():
+                    episode_rewards.append(info['episode']['r'])
+
+            # If done then clean the history of observations.
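+            # masks == 0 marks a real episode termination (stops value bootstrapping),
+            # while bad_masks == 0 marks a termination injected by the TimeLimit
+            # wrapper; RolloutStorage uses it to discard the GAE term at that step.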
+ masks = torch.FloatTensor( + [[0.0] if done_ else [1.0] for done_ in done]) + bad_masks = torch.FloatTensor( + [[0.0] if 'bad_transition' in info.keys() else [1.0] + for info in infos]) + rollouts.append(obs, action, action_log_prob, value, reward, masks, + bad_masks) + + with torch.no_grad(): + next_value = agent.value(rollouts.obs[-1]) + + value_loss, action_loss, dist_entropy = agent.learn( + next_value, args.gamma, args.gae_lambda, args.ppo_epoch, + args.num_mini_batch, rollouts) + + rollouts.after_update() + + if j % args.log_interval == 0 and len(episode_rewards) > 1: + total_num_steps = (j + 1) * args.num_steps + print( + "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" + .format(j, total_num_steps, len(episode_rewards), + np.mean(episode_rewards), np.median(episode_rewards), + np.min(episode_rewards), np.max(episode_rewards), + dist_entropy, value_loss, action_loss)) + + if (args.eval_interval is not None and len(episode_rewards) > 1 + and j % args.eval_interval == 0): + ob_rms = utils.get_vec_normalize(envs).ob_rms + eval_mean_reward = evaluate(agent, ob_rms, args.env_name, + args.seed, device) + + +if __name__ == "__main__": + main() diff --git a/benchmark/torch/ppo/utils.py b/benchmark/torch/ppo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2e276a7f0779cfb55b3ef92012f22a61b7937c62 --- /dev/null +++ b/benchmark/torch/ppo/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os + +import torch +import torch.nn as nn + +from wrapper import VecNormalize + + +def get_vec_normalize(venv): + if isinstance(venv, VecNormalize): + return venv + elif hasattr(venv, 'venv'): + return get_vec_normalize(venv.venv) + + return None + + +def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): + """Decreases the learning rate linearly""" + lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def init(module, weight_init, bias_init, gain=1): + weight_init(module.weight.data, gain=gain) + bias_init(module.bias.data) + return module diff --git a/benchmark/torch/ppo/wrapper.py b/benchmark/torch/ppo/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a890db1d0e5ee2cc2131794d9317a76a55e16e83 --- /dev/null +++ b/benchmark/torch/ppo/wrapper.py @@ -0,0 +1,180 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py + +import numpy as np +import gym +from gym.core import Wrapper +import time + + +class TimeLimitMask(gym.Wrapper): + def step(self, action): + obs, rew, done, info = self.env.step(action) + if done and self.env._max_episode_steps == self.env._elapsed_steps: + info['bad_transition'] = True + return obs, rew, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class MonitorEnv(gym.Wrapper): + def __init__(self, env): + Wrapper.__init__(self, env=env) + self.tstart = time.time() + self.rewards = None + + def step(self, action): + ob, rew, done, info = self.env.step(action) + self.update(ob, rew, done, info) + return (ob, rew, done, info) + + def update(self, ob, rew, done, info): + self.rewards.append(rew) + if done: + eprew = sum(self.rewards) + eplen = len(self.rewards) + epinfo = { + "r": round(eprew, 6), + "l": eplen, + "t": round(time.time() - self.tstart, 6) + } + assert isinstance(info, dict) + info['episode'] = epinfo + self.reset() + + def reset(self, **kwargs): + self.rewards = [] + return self.env.reset(**kwargs) + + +class VectorEnv(gym.Wrapper): + def step(self, action): + ob, rew, done, info = self.env.step(action) + ob = np.array(ob) + ob = ob[np.newaxis, :] + rew = np.array([rew]) + + done = np.array([done]) + + info = [info] + return (ob, rew, done, info) + + +class RunningMeanStd(object): + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + def __init__(self, epsilon=1e-4, shape=()): + self.mean = np.zeros(shape, 'float64') + self.var = np.ones(shape, 'float64') + self.count = epsilon + + def update(self, x): + batch_mean = np.mean(x, axis=0) + batch_var = np.var(x, axis=0) + batch_count = x.shape[0] + self.update_from_moments(batch_mean, batch_var, batch_count) + + def update_from_moments(self, batch_mean, batch_var, batch_count): + self.mean, self.var, self.count = update_mean_var_count_from_moments( + self.mean, self.var, self.count, batch_mean, batch_var, + batch_count) + + +def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, + batch_count): + delta = batch_mean - mean + tot_count = count + batch_count + + new_mean = mean + delta * batch_count / tot_count + m_a = var * count + m_b = batch_var * batch_count + M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count + new_var = M2 / tot_count + new_count = tot_count + + return new_mean, new_var, new_count + + +class VecNormalize(gym.Wrapper): + def __init__(self, + env, + ob=True, + ret=True, + clipob=10., + cliprew=10., + gamma=0.99, + epsilon=1e-8): + Wrapper.__init__(self, env=env) + observation_space = env.observation_space.shape[0] + + self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None + self.ret_rms = RunningMeanStd(shape=()) if ret else None + + self.clipob = clipob + self.cliprew = cliprew + self.gamma = gamma + self.epsilon = epsilon + self.ret = np.zeros(1) + self.training = True + + def step(self, action): + ob, rew, new, info = self.env.step(action) + self.ret = self.ret * 
self.gamma + rew + # normalize observation + ob = self._obfilt(ob) + # normalize reward + if self.ret_rms: + self.ret_rms.update(self.ret) + rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon), + -self.cliprew, self.cliprew) + self.ret[new] = 0. + return ob, rew, new, info + + def reset(self): + self.ret = np.zeros(1) + ob = self.env.reset() + return self._obfilt(ob) + + def _obfilt(self, ob, update=True): + if self.ob_rms: + if self.training and update: + self.ob_rms.update(ob) + ob = np.clip((ob - self.ob_rms.mean) / + np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, + self.clipob) + return ob + else: + return ob + + def train(self): + self.training = True + + def eval(self): + self.trainint = False + + +def make_env(env_name, seed, gamma): + env = gym.make(env_name) + env.seed(seed) + env = TimeLimitMask(env) + env = MonitorEnv(env) + env = VectorEnv(env) + if gamma is None: + env = VecNormalize(env, ret=False) + else: + env = VecNormalize(env, gamma=gamma) + + return env diff --git a/benchmark/torch/td3/train.py b/benchmark/torch/td3/train.py index c844d8c079a4b10e1e0ade957202cd7d2dcd27fb..48bd1f77103f1e50bd28f55cc12bee09315496e7 100644 --- a/benchmark/torch/td3/train.py +++ b/benchmark/torch/td3/train.py @@ -15,7 +15,7 @@ import gym import argparse import numpy as np -from parl.utils import logger, tensorboard, ReplayMemory +from parl.utils import logger, summary, ReplayMemory from mujoco_model import MujocoModel from mujoco_agent import MujocoAgent @@ -103,8 +103,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -112,8 +111,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/docs/EvoKit/minimal_example.rst b/docs/EvoKit/minimal_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..0eb7c66902fe71ebe097586f8385f43952068860 --- /dev/null +++ b/docs/EvoKit/minimal_example.rst @@ -0,0 +1,190 @@ +minimal example +--------------------- + +``本教程的目标: +演示如何通过EvoKit库来解决经典的CartPole 问题。`` + +*本教程假定读者曾经使用过PaddlePaddle, 了解基本的进化算法迭代流程。* + +CartPole 介绍 +############# +CartPole又叫倒立摆。小车上放了一根杆,杆会因重力而倒下。为了不让杆倒下,我们要通过移动小车,来保持其是直立的。如下图所示。 +在每一个时间步,模型的输入是一个4维的向量,表示当前小车和杆的状态,模型输出的信号用于控制小车往左或者右移动。当杆没有倒下的时候,每个时间步,环境会给1分的奖励;当杆倒下后,环境不会给任何的奖励,游戏结束。 + +.. image:: ../../examples/QuickStart/performance.gif + :width: 300px + +step1: 生成预测网络 +######################## +根据上面的环境介绍,我们需要构造一个神经网络,输入为4维的向量,输出为2维的概率分布向量(表示左/右)移动的概率。 +在这里,我们使用Paddle来实现预测网络,并保存到本地。 + +.. 
code-block:: python + + from paddle import fluid + + def net(obs, act_dim): + hid1 = fluid.layers.fc(obs, size=20) + prob = fluid.layers.fc(hid1, size=act_dim, act='softmax') + return prob + + if __name__ == '__main__': + obs_dim = 4 + act_dim = 2 + obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32') + prob = net(obs, act_dim) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + fluid.io.save_inference_model( + dirname='init_model', + feeded_var_names=['obs'], + target_vars=[prob], + params_filename='params', + model_filename='model', + executor=exe) + +step2: 构造ESAgent +################### + +- 调用 ``load_config`` 加载配置文件。 +- 调用 ``load_inference_model`` 函数加载模型参数。 +- 调用 ``init_solver`` 初始化solver。 + +配置文件主要是用于指定进化算法类型(比如Gaussian或者CMA),使用的optimizer类型(Adam或者SGD)。 + +.. code-block:: c++ + + ESAgent agent = ESAgent(); + agent.load_config(config); + agent.load_inference_model(model_dir); + agent.init_solver(); + + // 附:EvoKit配置项示范 + solver { + type: BASIC_ES + optimizer { // 线下Adam更新 + type: ADAM + base_lr: 0.05 + adam { + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-08 + } + } + sampling { // 线上高斯采样 + type: GAUSSIAN_SAMPLING + gaussian_sampling { + std: 0.5 + cached: true + seed: 1024 + cache_size : 100000 + } + } + } + + +step3: 生成用于采样的Agent +################### + +主要关注三个接口: + +- 调用 ``clone`` 生成一个用于sampling的agent。 +- 调用 ``add_noise`` 给这个agent的参数空间增加噪声,同时返回该噪声对应的唯一信息,这个信息得记录在log中,用于线下更新。 +- 调用 ``predict`` 提供预测接口。 + +.. code-block:: c++ + + auto sampling_agent = agent.clone(); + auto sampling_info = sampling_agent.add_noise(); + sampling_agent.predict(feature); + +step4: 用采样的数据更新模型参数 +################### + +用户提供两组数据: + +- 采样参数过程中用于线下复现采样噪声的sampling_info +- 扰动参数后,新参数的评估结果 + +.. code-block:: c++ + + agent.update(sampling_infos, rewards); + +主代码以及注释 +################# + +以下的代码演示通过多线程同时采样, 提升解决问题的效率。 + +.. 
code-block:: c++ + + int main(int argc, char* argv[]) { + std::vector envs; + // 构造10个环境,用于多线程训练 + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + // 初始化ESAgent + std::string model_dir = "./demo/cartpole/init_model"; + std::string config_path = "./demo/cartpole/config.prototxt"; + std::shared_ptr agent = std::make_shared(); + agent->load_config(config_path); // 加载配置 + + agent->load_inference_model(FLAGS_model_dir); // 加载初始预测模型 + agent->init_solver(); // 初始化solver,注意要在load_inference_model后执行 + + // 生成10个agent用于同时采样 + std::vector> sampling_agents; + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector sampling_infos; + std::vector rewards(ITER, 0.0f); + sampling_infos.resize(ITER); + omp_set_num_threads(10); + + // 共迭代100轮 + for (int epoch = 0; epoch < 100; ++epoch) { + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo sampling_info; + sampling_agent->add_noise(sampling_info); + float reward = evaluate(envs[i], sampling_agent); + // 保存采样的sampling_info以及对应的评估结果reward + sampling_infos[i] = sampling_info; + rewards[i] = reward; + } + // 更新模型参数,注意:参数更新后会自动同步到sampling_agent中 + agent->update(sampling_infos, rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // 打印每一轮reward + } + } + +如何运行demo +################# + +- 下载代码 + + 在icode上clone代码,我们的仓库路径是: ``baidu/nlp/deep-es`` ``TO DO: 修改库路径`` + +- 编译demo + + 通过bcloud的云端集群编译即可,命令为: ``bb`` + +- 运行demo + + 编译完成后,我们需要增加动态库查找路径: + + ``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH`` + + 运行demo: ``./output/bin/cartpole/train`` + +问题解决 +#################### + +在使用过程中有任何问题,请加hi群: 1692822 (PARL官方答疑群)进行咨询,开发同学会直接回答任何的使用问题。 diff --git a/docs/EvoKit/online_example.rst b/docs/EvoKit/online_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4963f8cb909a240f318b1e85c77ba310c460160 --- /dev/null +++ b/docs/EvoKit/online_example.rst @@ -0,0 +1,124 @@ +Example for Online Products +######################### + +``本教程的目标: 演示通过EvoKit库上线后,如何迭代算法,更新模型参数。`` + +在产品线中,线上无法实时拿到用户日志,经常是通过保存用户点击/时长日志,在线下根据用户数据更新模型,然后再推送到线上,完成算法的更新。 +本教程继续围绕经典的CartPole环境,展示如何通过在线采样/离线更新的方式,来更新迭代ES算法。 + +demo的完整代码示例放在demp/online_example文件夹中。 +``TO DO: 文件夹`` + +初始化solver +--------------------- +构造solver,对它初始化,并保存到文件。初始化solver仅需在开始时调用一次。 + +.. code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->init_solver(); + agent->save_solver(FLAGS_model_dir); + + +线上采样 +--------------------- +加载模型和solver,记录线上采样返回的sampling_info以及评估的reward,并通过二进制的方式记录到log文件中。 + +.. 
code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->load_solver(FLAGS_model_dir); + + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo sampling_info; + sampling_agent->add_noise(sampling_info); + float reward = evaluate(envs[i], sampling_agent); + sampling_infos[i] = sampling_info; + rewards[i] = reward; + } + + // save sampling information and log in binary fomrat + std::ofstream log_stream(FLAGS_log_path, std::ios::binary); + for (int i = 0; i < ITER; ++i) { + std::string data; + sampling_infos[i].SerializeToString(&data); + int size = data.size(); + log_stream.write((char*) &rewards[i], sizeof(float)); + log_stream.write((char*) &size, sizeof(int)); + log_stream.write(data.c_str(), size); + } + log_stream.close(); + + +线下更新 +----------------------- +在加载好之前记录的log之后,调用 ``update`` 函数进行更新,然后通过 ``save_inference_model`` 和 ``save_solver`` 函数保存更新后的参数到本地,推送到线上。 + +.. code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->load_solver(FLAGS_model_dir); + + // load training data + std::vector sampling_infos; + std::vector rewards(ITER, 0.0f); + sampling_infos.resize(ITER); + std::ifstream log_stream(FLAGS_log_path); + CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path; + char buffer[1000]; + for (int i = 0; i < ITER; ++i) { + int size; + log_stream.read((char*) &rewards[i], sizeof(float)); + log_stream.read((char*) &size, sizeof(int)); + log_stream.read(buffer, size); + buffer[size] = 0; + std::string data(buffer); + sampling_infos[i].ParseFromString(data); + } + + // update model and save parameter + agent->update(sampling_infos, rewards); + agent->save_inference_model(FLAGS_updated_model_dir); + agent->save_solver(FLAGS_updated_model_dir); + + +主代码 +----------------------- + +将以上代码分别编译成可执行文件。 + +- 初始化solver: ``init_solver`` 。 +- 线上采样: ``online_sampling`` 。 +- 线下更新: ``offline update`` 。 + +.. code-block:: shell + + #------------------------init solver------------------------ + ./init_solver \ + --model_dir="./model_warehouse/model_dir_0" \ + --config_path="config.prototxt" + + + for ((epoch=0;epoch<200;++epoch));do + #------------------------online sampling------------------------ + ./online_sampling \ + --log_path="./sampling_log" \ + --model_dir="./model_warehouse/model_dir_$epoch" \ + --config_path="./config.prototxt" + + #------------------------offline update------------------------ + next_epoch=$((epoch+1)) + ./offline_update \ + --log_path='./sampling_log' \ + --model_dir="./model_warehouse/model_dir_$epoch" \ + --updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \ + --config_path="./config.prototxt" + done diff --git a/docs/EvoKit/overview.rst b/docs/EvoKit/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..ce6fa07211456e12a0fbc29f6ecc37b501e45f24 --- /dev/null +++ b/docs/EvoKit/overview.rst @@ -0,0 +1,21 @@ +Overview +------------------ + +``EvoKit`` 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打 **快速上线验证** 。 + +.. image:: ../../evo_kit/DeepES.gif + :align: center + :width: 400px + +特性 +######### + +**1. 多种进化算法支持。** 支持高斯采样、CMA、GA等算法,更多算法持续接入中。 + +**2. 主流优化器支持。** 支持SGD/Momentum/Adam等多个主流优化器,有效提升算法收敛效率。 + +**3. 一站式上线。** 整合了线上采样和线下更新流程, 提供Bcloud/Cmake等编译方式, 助力快速上线。 + +**4. 
深度学习框架全系列兼容。** 裸写的网络,paddle/lego/Torch等深度学习框架,EvoKit都支持。 + +**5. 同步/异步更新方式。** 支持多个采样模型/多份采样数据异步更新,完美契合业务场景。 diff --git a/docs/conf.py b/docs/conf.py index e4e009f0d8d2edc5ae158b0ab5d680c9c45fcdc2..29f697d1db5fc60304f1da625ed92cf14f2f819b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -101,3 +101,37 @@ def setup(app): add_module_names = False + +latex_engine = 'xelatex' +latex_use_xindy = False +latex_elements = { + 'preamble': '\\usepackage[UTF8]{ctex}\n', +} + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + #'preamble': '', + 'preamble': + r''' + \hypersetup{unicode=true} + \usepackage{CJKutf8} + \DeclareUnicodeCharacter{00A0}{\nobreakspace} + \DeclareUnicodeCharacter{2203}{\ensuremath{\exists}} + \DeclareUnicodeCharacter{2200}{\ensuremath{\forall}} + \DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}} + \DeclareUnicodeCharacter{2713}{x} + \DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}} + \DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}} + \DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}} + \DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}} + \DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}} + \begin{CJK}{UTF8}{gbsn} + \AtEndDocument{\end{CJK}} + ''', +} diff --git a/docs/index.rst b/docs/index.rst index e7d6c144112fca11f836b6890c68b2e4c2010832..5009dde813c18dfb97c9066a7dfb9abecf22657a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -46,7 +46,7 @@ Abstractions :maxdepth: 1 :caption: Installation - installation.rst + installation.rst .. toctree:: :maxdepth: 1 @@ -58,9 +58,10 @@ Abstractions :maxdepth: 1 :caption: Tutorial - getting_started.rst - new_alg.rst - save_param.rst + tutorial/getting_started.rst + tutorial/new_alg.rst + tutorial/save_param.rst + tutorial/tensorboard.rst .. toctree:: :maxdepth: 2 @@ -83,3 +84,11 @@ Abstractions model.rst algorithm.rst agent.rst + +.. toctree:: + :maxdepth: 2 + :caption: EvoKit + + EvoKit/overview.rst + EvoKit/minimal_example.rst + EvoKit/online_example.rst diff --git a/docs/tutorial/add_histogram.jpg b/docs/tutorial/add_histogram.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c33b0e3ad81f3ca0878c2623f6c4a6a80de19b0 Binary files /dev/null and b/docs/tutorial/add_histogram.jpg differ diff --git a/docs/tutorial/add_scalar.jpg b/docs/tutorial/add_scalar.jpg new file mode 100644 index 0000000000000000000000000000000000000000..27cb4a270150c00baf37332d79ed821c3bc901ba Binary files /dev/null and b/docs/tutorial/add_scalar.jpg differ diff --git a/docs/getting_started.rst b/docs/tutorial/getting_started.rst similarity index 98% rename from docs/getting_started.rst rename to docs/tutorial/getting_started.rst index a70a438ba7952a54d199d0fee345c0ee4e87b398..f406c47407d8b98d7bb26f99e8a54b64e11423c8 100644 --- a/docs/getting_started.rst +++ b/docs/tutorial/getting_started.rst @@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi Summary ----------- -.. image:: ../examples/QuickStart/performance.gif +.. image:: ../../examples/QuickStart/performance.gif :width: 300px -.. image:: ./images/quickstart.png +.. image:: ../images/quickstart.png :width: 300px In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem. 
diff --git a/docs/new_alg.rst b/docs/tutorial/new_alg.rst similarity index 98% rename from docs/new_alg.rst rename to docs/tutorial/new_alg.rst index 973c062b88cf5ad7f59e94161d4d019c72fbf717..1acf09796c3ed10ba6135ec367902e6f1d985d47 100644 --- a/docs/new_alg.rst +++ b/docs/tutorial/new_alg.rst @@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods: Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. diff --git a/docs/save_param.rst b/docs/tutorial/save_param.rst similarity index 95% rename from docs/save_param.rst rename to docs/tutorial/save_param.rst index 3824eb9d3fe23c47f375877a75c6c88aab06c0b4..82e411ab2010ef3f9b4dcca0fd0c23f319eac7b7 100644 --- a/docs/save_param.rst +++ b/docs/tutorial/save_param.rst @@ -22,5 +22,5 @@ Here is a demonstration of usage: agent.restore('./model.ckpt') # restore the parameters from ./model.ckpt to another_agent - another_agent = AtariAgent() + another_agent = AtariAgent() another_agent.restore('./model.ckpt') diff --git a/docs/tutorial/tensorboard.rst b/docs/tutorial/tensorboard.rst new file mode 100644 index 0000000000000000000000000000000000000000..8952a5e00b624e1c02b74c451da0d168ee6a4817 --- /dev/null +++ b/docs/tutorial/tensorboard.rst @@ -0,0 +1,55 @@ +summary +=============== + +Visualize the results with tensorboard. + +add_scalar +------------- + +Common used arguments: + +* summary.add_scalar(tag, scalar_value, global_step=None) + * tag *(string)* – Data identifier + * scalar_value *(float or string/blobname)* – Value to save + * global_step *(int)* – Global step value to record + +Example: + +.. code-block:: python + + from parl.utils import summary + + x = range(100) + for i in x: + summary.add_scalar('y=2x', i * 2, i) + +Expected result: + + .. image:: add_scalar.jpg + :scale: 50 % + +add_histogram +---------------- + +Common used arguments: + +* summary.add_scalar(tag, scalar_value, global_step=None) + * tag *(string)* – Data identifier + * values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram + * global_step *(int)* – Global step value to record + +Example: + +.. code-block:: python + + from parl.utils import summary + import numpy as np + + for i in range(10): + x = np.random.random(1000) + summary.add_histogram('distribution centers', x + i, i) + +Expected result: + + .. image:: add_histogram.jpg + :scale: 50 % diff --git a/evo_kit/CMakeLists.txt b/evo_kit/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9672c11aa0ea54ee6de4f6c6d60e92c18d47e60 --- /dev/null +++ b/evo_kit/CMakeLists.txt @@ -0,0 +1,89 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit) + +########## options ########## +option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF) +option(WITH_TORCH "Compile EvoKit with Torch framework." 
OFF) + +message("WITH_PADDLE: "${WITH_PADDLE}) +message("WITH_TORCH: "${WITH_TORCH}) + +if (NOT (WITH_PADDLE OR WITH_TORCH)) + message("ERROR: You should choose at least one framework to compile EvoKit.") + return() +elseif(WITH_PADDLE AND WITH_TORCH) + message("ERROR: You cannot choose more than one framework to compile EvoKit.") + return() +endif() + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + + +file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc") +include_directories("core/include") +include_directories("core/proto") +include_directories("benchmark") + +########## PaddleLite config ########## +if (WITH_PADDLE) + add_definitions(-g -O3 -pthread) + + include_directories("paddle/include") + include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include" + "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include") + link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib" + "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib") + + file(GLOB framework_src "paddle/src/*.cc") + set(TARGET EvoKit_paddle) +########## Torch config ########## +elseif (WITH_TORCH) + # list(APPEND CMAKE_PREFIX_PATH "./libtorch") + # find_package(Torch REQUIRED ON) # TODO: not necessary for now + + include_directories("torch/include") + + file(GLOB framework_src "torch/src/*.cc") + set(TARGET EvoKit_torch) +else () + message("ERROR: You should choose at least one framework to compile EvoKit.") +endif() + + +add_library(${TARGET} STATIC ${src} ${framework_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog) + + +# ########## PaddleLite libraries ########## +# if (WITH_PADDLE) +# target_link_libraries(${TARGET} -lpaddle_full_api_shared) +# target_link_libraries(${TARGET} -lmklml_intel) +# target_link_libraries(${TARGET} -ldl) +# ########## Torch libraries ########## +# elseif (WITH_TORCH) +# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}") +# endif() + +file(GLOB include "core/include/evo_kit/*.h") +file(GLOB proto_include "core/proto/evo_kit/*.h") +file(GLOB torch_include "torch/include/evo_kit/*.h") +file(GLOB paddle_include "paddle/include/evo_kit/*.h") +file(GLOB benchmark_include "benchmark/*.h") +file(GLOB findcmake "cmake/Torch/*.cmake") + +set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit") +install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib") +install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit") +install(FILES ${torch_include} DESTINATION "torch/evo_kit") +install(FILES ${paddle_include} DESTINATION "paddle/evo_kit") +install(FILES ${benchmark_include} DESTINATION "include") +install(FILES ${findcmake} DESTINATION "cmake/Torch") diff --git a/evo_kit/DeepES.gif b/evo_kit/DeepES.gif new file mode 100644 index 0000000000000000000000000000000000000000..7240118f3fce55b587690450e0c9cafc2f0694db Binary files /dev/null and b/evo_kit/DeepES.gif differ diff --git a/evo_kit/README.md b/evo_kit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecac85379c048f22e444b9286d8e5225a7e7daa8 --- /dev/null +++ b/evo_kit/README.md @@ -0,0 +1,41 @@ +# EvoKit +EvoKit 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打快速上线验证 。 +

+![PARL](DeepES.gif)

+ +## 使用示范 +```c++ +//实例化一个预测,根据配置文件加载模型,采样方式(Gaussian\CMA sampling..)、更新方式(SGD\Adam)等 +auto agent = ESAgent(config); + +for (int i = 0; i < 10; ++i) { + auto sampling_agnet = agent->clone(); // clone出一个sampling agent + SamplingInfo info; + sampling_agent->add_noise(info); // 参数扰动,同时保存随机种子到info中 + int reward = evaluate(env, sampling_agent); //评估参数 + noisy_info.push_back(info); // 记录随机噪声对应种子 + noisy_rewards.push_back(reward); // 记录评估结果 +} +//根据评估结果、随机种子更新参数,然后重复以上过程,直到收敛。 +agent->update(noisy_info, noisy_rewards); +``` + +## 一键运行demo列表 +- **PaddleLite**: sh ./scripts/build.sh paddle +- **Torch**: sh ./scripts/build.sh torch +- **裸写网络**: + +## 相关依赖: +- Protobuf2 +- OpenMP +- [glog](https://github.com/gflags/gflags/blob/master/INSTALL.md) +- [gflag](https://github.com/google/glog) + +## 额外依赖: + +### 使用PaddleLite +下载PaddleLite的X86预编译库,或者编译PaddleLite源码,得到inference_lite_lib文件夹,放在当前目录中。(可参考:[PaddleLite使用X86预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)) + +### 使用torch +下载[libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip)或者编译torch源码,得到libtorch文件夹,放在当前目录中。 diff --git a/evo_kit/benchmark/cartpole.h b/evo_kit/benchmark/cartpole.h new file mode 100644 index 0000000000000000000000000000000000000000..f289715aeac29cb76d5148a5ae8b4adc5233243c --- /dev/null +++ b/evo_kit/benchmark/cartpole.h @@ -0,0 +1,98 @@ +// Third party code +// This code is copied or modified from openai/gym's cartpole.py +#include +#include +#include +#include + +const double kPi = 3.1415926535898; + +class CartPole { +public: + double gravity = 9.8; + double masscart = 1.0; + double masspole = 0.1; + double total_mass = (masspole + masscart); + double length = 0.5; // actually half the pole's length; + double polemass_length = (masspole * length); + double force_mag = 10.0; + double tau = 0.02; // seconds between state updates; + + // Angle at which to fail the episode + double theta_threshold_radians = 12 * 2 * kPi / 360; + double x_threshold = 2.4; + int steps_beyond_done = -1; + + std::vector state = {0, 0, 0, 0}; + double reward; + bool done; + int step_ = 0; + + const float* getState() { + return state.data(); + } + + double getReward() { + return reward; + } + + double isDone() { + return done; + } + + void reset() { + std::random_device rd; + std::default_random_engine generator(rd()); + std::uniform_real_distribution distribution(-0.05, 0.05); + for (int i = 0; i < 4; ++i) { + state[i] = distribution(generator); + } + + steps_beyond_done = -1; + step_ = 0; + } + + CartPole() { + reset(); + } + + void step(int action) { + float x = state[0]; + float x_dot = state[1]; + float theta = state[2]; + float theta_dot = state[3]; + + auto force = (action == 1) ? force_mag : -force_mag; + auto costheta = std::cos(theta); + auto sintheta = std::sin(theta); + auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) / + total_mass; + auto thetaacc = (gravity * sintheta - costheta * temp) / + (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass)); + auto xacc = temp - polemass_length * thetaacc * costheta / total_mass; + + x = x + tau * x_dot; + x_dot = x_dot + tau * xacc; + theta = theta + tau * theta_dot; + theta_dot = theta_dot + tau * thetaacc; + + state = {x, x_dot, theta, theta_dot}; + + done = x < -x_threshold || x > x_threshold || + theta < -theta_threshold_radians || theta > theta_threshold_radians || + step_ > 200; + + if (!done) { + reward = 1.0; + } else if (steps_beyond_done == -1) { + // Pole just fell! 
+ steps_beyond_done = 0; + reward = 0; + } else { + if (steps_beyond_done == 0) { + assert(false); // Can't do this + } + } + step_++; + } +}; diff --git a/evo_kit/cmake/Torch/EvoKitConfig.cmake b/evo_kit/cmake/Torch/EvoKitConfig.cmake new file mode 100644 index 0000000000000000000000000000000000000000..9f1c954430aec05a38d03c26a6b406343c01ad20 --- /dev/null +++ b/evo_kit/cmake/Torch/EvoKitConfig.cmake @@ -0,0 +1,45 @@ +# FindEvoKit +# ------- +# +# Finds the EvoKit library +# +# This will define the following variables: +# +# EVOKIT_FOUND -- True if the system has the EvoKit library +# EVOKIT_INCLUDE_DIRS -- The include directories for EvoKit +# EVOKIT_LIBRARY -- Libraries to link against +# +# and the following imported targets: +# +# EvoKit + +include(FindPackageHandleStandardArgs) + +if (DEFINED ENV{EVOKIT_INSTALL_PREFIX}) + set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX}) +else() + # Assume we are in /cmake/Torch/EvoKitConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) +endif() + +# Include directories. +if (EXISTS "${EVOKIT_INSTALL_PREFIX}/include") + set(EVOKIT_INCLUDE_DIRS + ${EVOKIT_INSTALL_PREFIX}/include + ${EVOKIT_INSTALL_PREFIX}/torch) +else() + set(EVOKIT_INCLUDE_DIRS + ${EVOKIT_INSTALL_PREFIX}/include + ${EVOKIT_INSTALL_PREFIX}/torch) +endif() + +find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib") + +include_directories("${EVOKIT_INSTALL_PREFIX}/torch") +include_directories("${EVOKIT_INSTALL_PREFIX}/include") + +find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS) +message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}") +message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}") +message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}") diff --git a/evo_kit/core/include/evo_kit/adam_optimizer.h b/evo_kit/core/include/evo_kit/adam_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..b268b69f61d35e5d6df8eeb56b1869e7bcb828ff --- /dev/null +++ b/evo_kit/core/include/evo_kit/adam_optimizer.h @@ -0,0 +1,53 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_ADAM_OPTIMIZER_H +#define EVO_KIT_ADAM_OPTIMIZER_H + +#include +#include +#include "evo_kit/optimizer.h" + +namespace evo_kit { + +/*@brief AdamOptimizer. + * Implements Adam algorithm. + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * beta1: coefficients used for computing running averages of gradient (default: 0.9). + * beta2: coefficients used for computing running averages of gradient's square (default: 0.999). + * epsilon: term added to the denominator to improve numerical stability (default: 1e-8). 
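+ *
+ *@Note:
+ * Update sketch (as implemented by compute_step() plus the base-class
+ * weight step in Optimizer::update(), with t the update counter):
+ *   m = beta1 * m + (1 - beta1) * g
+ *   v = beta2 * v + (1 - beta2) * g * g
+ *   w -= base_lr * (sqrt(1 - beta2^t) / (1 - beta1^t)) * m / (sqrt(v) + epsilon)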
+ */ +class AdamOptimizer: public Optimizer { +public: + AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999, + float epsilon = 1e-8): Optimizer(base_lr), \ + _beta1(beta1), _beta2(beta2), _epsilon(epsilon) {} + ~AdamOptimizer(); + +protected: + void compute_step(float* gradient, int size, std::string param_name); + +private: + float _beta1; + float _beta2; + float _epsilon; + std::unordered_map _momentum; + std::unordered_map _velocity; +}; + +}//namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..c033fb7f23e9d3d91754237cad61e181a823db2d --- /dev/null +++ b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H +#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H + +#include +#include +#include +#include +#include +#include "sampling_method.h" +#include "utils.h" + +namespace evo_kit { + +class CachedGaussianSampling: public SamplingMethod { + +public: + CachedGaussianSampling(); + + ~CachedGaussianSampling(); + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + bool load_config(const EvoKitConfig& config); + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + bool sampling(int* key, float* noise, int64_t size); + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. + */ + bool resampling(int key, float* noise, int64_t size); + +private: + float _std; + int _cache_size; + float* _noise_cache = nullptr; + + bool _create_noise_cache(); +}; + +} + +#endif diff --git a/evo_kit/core/include/evo_kit/gaussian_sampling.h b/evo_kit/core/include/evo_kit/gaussian_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..c0fc66f058f2d1b9224d19d5c029cdca1853f638 --- /dev/null +++ b/evo_kit/core/include/evo_kit/gaussian_sampling.h @@ -0,0 +1,73 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H +#define EVO_KIT_GAUSSIAN_SAMPLING_H + +#include +#include +#include +#include +#include "evo_kit/sampling_method.h" +#include "evo_kit/utils.h" + +namespace evo_kit { + +class GaussianSampling: public SamplingMethod { + +public: + GaussianSampling() {} + + ~GaussianSampling() {} + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + bool load_config(const EvoKitConfig& config); + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + bool sampling(int* key, float* noise, int64_t size); + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. + */ + bool resampling(int key, float* noise, int64_t size); + +private: + float _std; +}; + +} + +#endif diff --git a/evo_kit/core/include/evo_kit/optimizer.h b/evo_kit/core/include/evo_kit/optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..5c41bc5d405b00bef71affa0fa6cb82a13afd1b2 --- /dev/null +++ b/evo_kit/core/include/evo_kit/optimizer.h @@ -0,0 +1,79 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_OPTIMIZER_H +#define EVO_KIT_OPTIMIZER_H + +#include +#include + +namespace evo_kit { + +/*@brief Optimizer. Base class for optimizers. + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * + * .. warning: update () is based on the parameter level, + * you need to perform update () on each parameter. + * + * Subclasses are required to implement the following functions: + * 1. 
compute_steps + */ +class Optimizer { +public: + Optimizer() : _base_lr(1e-3), _update_times(0) {} + Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {} + virtual ~Optimizer() { + _params_size.clear(); + } + + template + bool update(T weights, float* gradient, int size, std::string param_name = "") { + /*@ Performs a single optimization step (parameter update) at the parameter level. + * + *@Args: + * weights (array): parameter weights. + * gradient (array): gradient for updating weights. + * size: size of gradient. + * param_name: the name corresponding to the weights. + */ + if (_params_size.count(param_name) == 0) { + _params_size[param_name] = size; + } else if (_params_size[param_name] != size) { + LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size()) \ + << ". Size of weights[" << param_name << "] is " << _params_size[param_name] << ", not " << size; + return false; + } + + ++_update_times; + compute_step(gradient, size, param_name); + + for (int i = 0; i < size; ++i) { + weights[i] -= _base_lr * gradient[i]; + } + + return true; + } // template function + +protected: + virtual void compute_step(float* graident, int size, std::string param_name = "") = 0; + float _base_lr; + float _update_times; + std::unordered_map _params_size; +}; + + +}//namespace +#endif diff --git a/evo_kit/core/include/evo_kit/optimizer_factory.h b/evo_kit/core/include/evo_kit/optimizer_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..6e3e099110d17efefd8dce9d5090b06fc27c0d21 --- /dev/null +++ b/evo_kit/core/include/evo_kit/optimizer_factory.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_OPTIMIZER_FACTORY_H +#define EVO_KIT_OPTIMIZER_FACTORY_H + +#include +#include +#include +#include "evo_kit/adam_optimizer.h" +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/optimizer.h" +#include "evo_kit/sgd_optimizer.h" + +namespace evo_kit { +/* @brief: create an optimizer according to the configuration" + * @args: + * config: configuration for the optimizer + * + */ +std::shared_ptr create_optimizer(const OptimizerConfig& optimizer_config); + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/sampling_factory.h b/evo_kit/core/include/evo_kit/sampling_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..e7e859cddcb88784b2d01b9642bcbc1b23e378cb --- /dev/null +++ b/evo_kit/core/include/evo_kit/sampling_factory.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SAMPLING_FACTORY_H +#define EVO_KIT_SAMPLING_FACTORY_H + +#include +#include +#include +#include "evo_kit/cached_gaussian_sampling.h" +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/sampling_method.h" + +namespace evo_kit { +/* @brief: create an sampling_method according to the configuration" + * @args: + * config: configuration for the EvoKit + * + */ +std::shared_ptr create_sampling_method(const EvoKitConfig& Config); + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/sampling_method.h b/evo_kit/core/include/evo_kit/sampling_method.h new file mode 100644 index 0000000000000000000000000000000000000000..dc07dfe7cfefff694eef6cf7ca17ee35848eea98 --- /dev/null +++ b/evo_kit/core/include/evo_kit/sampling_method.h @@ -0,0 +1,90 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SAMPLING_METHOD_H +#define EVO_KIT_SAMPLING_METHOD_H + +#include +#include +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit { + +/*Base class for sampling algorithms. All algorithms are required to override the following functions: + * + * 1. load_config + * 2. sampling + * 3. resampling + * + * View an demostrative algorithm in gaussian_sampling.h + * */ + +class SamplingMethod { + +public: + + SamplingMethod(): _seed(0) {} + + virtual ~SamplingMethod() {} + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + virtual bool load_config(const EvoKitConfig& config) = 0; + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + virtual bool sampling(int* key, float* noise, int64_t size) = 0; + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. 
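+ *
+ *@Note:
+ * Typical flow: sampling() runs on the online side and the returned key is
+ * logged (e.g. inside a SamplingInfo message); resampling() is then called
+ * offline with the same key to reproduce the identical noise before update().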
+ */ + virtual bool resampling(int key, float* noise, int64_t size) = 0; + + bool set_seed(int seed) { + _seed = seed; + srand(_seed); + return true; + } + + int get_seed() { + return _seed; + } + +protected: + int _seed; + +}; + +} +#endif diff --git a/evo_kit/core/include/evo_kit/sgd_optimizer.h b/evo_kit/core/include/evo_kit/sgd_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..cd0d68803775df66d1bc90c748fe9801e17176c9 --- /dev/null +++ b/evo_kit/core/include/evo_kit/sgd_optimizer.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SGD_OPTIMIZER_H +#define EVO_KIT_SGD_OPTIMIZER_H + +#include +#include +#include "evo_kit/optimizer.h" + +namespace evo_kit { + +/*@brief SGDOptimizer. + * Implements stochastic gradient descent (optionally with momentum). + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * momentum: momentum factor (default: 0.9). + */ +class SGDOptimizer: public Optimizer { +public: + SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {} + ~SGDOptimizer(); + +protected: + void compute_step(float* gradient, int size, std::string param_name); + +private: + float _momentum; + std::unordered_map _velocity; +}; + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/utils.h b/evo_kit/core/include/evo_kit/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fd704fd384de70683445d65d5609f97b9979907a --- /dev/null +++ b/evo_kit/core/include/evo_kit/utils.h @@ -0,0 +1,97 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_UTILS_H +#define EVO_KIT_UTILS_H + +#include +#include +#include +#include +#include +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit { + +/*Return ranks that is normliazed to [-0.5, 0.5] with the rewards as input. + Args: + reward: an array of rewards +*/ +bool compute_centered_ranks(std::vector& reward); + +std::string read_file(const std::string& filename); + +/* Load a protobuf-based configuration from the file. + * Args: + * config_file: file path. + * proto_config: protobuff message for configuration. 
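+ * Usage sketch (assuming a prototxt such as demo/cartpole_config.prototxt):
+ *    EvoKitConfig config;
+ *    bool success = load_proto_conf("./cartpole_config.prototxt", config);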
+ * return + */ +template +bool load_proto_conf(const std::string& config_file, T& proto_config) { + bool success = true; + std::ifstream fin(config_file); + + if (!fin || fin.fail()) { + LOG(ERROR) << "open prototxt config failed: " << config_file; + success = false; + } else { + fin.seekg(0, std::ios::end); + size_t file_size = fin.tellg(); + fin.seekg(0, std::ios::beg); + + char* file_content_buffer = new char[file_size]; + fin.read(file_content_buffer, file_size); + + std::string proto_str(file_content_buffer, file_size); + + if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) { + LOG(ERROR) << "Failed to load config: " << config_file; + success = false; + } + + delete[] file_content_buffer; + fin.close(); + } + + return success; +} + +template +bool save_proto_conf(const std::string& config_file, T& proto_config) { + bool success = true; + std::ofstream ofs(config_file, std::ofstream::out); + + if (!ofs || ofs.fail()) { + LOG(ERROR) << "open prototxt config failed: " << config_file; + success = false; + } else { + std::string config_str; + success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str); + + if (!success) { + return success; + } + + ofs << config_str; + } + + return success; +} + +std::vector list_all_model_dirs(std::string path); + +} + +#endif diff --git a/evo_kit/core/proto/evo_kit/evo_kit.proto b/evo_kit/core/proto/evo_kit/evo_kit.proto new file mode 100644 index 0000000000000000000000000000000000000000..fc4f68d9247e63b1d98b35ebd338052ffb7eb9a6 --- /dev/null +++ b/evo_kit/core/proto/evo_kit/evo_kit.proto @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package evo_kit; + +message EvoKitConfig { + //sampling configuration + optional int32 seed = 1 [default = 18]; + optional int32 buffer_size = 2 [default = 100000]; + optional GaussianSamplingConfig gaussian_sampling = 3; + // Optimizer Configuration + optional OptimizerConfig optimizer = 4; + // AsyncESAgent Configuration + optional AsyncESConfig async_es = 5; +} + +message GaussianSamplingConfig { + optional float std = 1 [default = 1.0]; + optional bool cached = 2 [default = false]; + optional int32 cache_size = 3 [default = 100000]; +} + +message OptimizerConfig{ + optional string type = 1 [default = "SGD"]; + optional float base_lr = 2 [default = 1e-3]; // The base learning rate. + optional float momentum = 3 [default = 0.9]; // The momentum value for SGD. 
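+  // Note: momentum is only used when type = "SGD"; the Adam fields below
+  // are only used when type = "Adam" (see optimizer_factory.cc).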
+ + // ------------Adam Optimizer--------- + optional float beta1 = 4 [default = 0.9]; + optional float beta2 = 5 [default = 0.999]; + optional float epsilon = 6 [default = 1e-8]; +} + +message SamplingInfo{ + repeated int32 key = 1; + optional int32 model_iter_id = 2; +} + +message AsyncESConfig{ + optional string model_warehouse = 1 [default = "./model_warehouse"]; + repeated string model_md5 = 2; + optional int32 max_to_keep = 3 [default = 5]; + optional int32 model_iter_id = 4 [default = 0]; +} diff --git a/evo_kit/core/src/adam_optimizer.cc b/evo_kit/core/src/adam_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f36e4d1d3e01ae2cceeba16d95d7aaa24a2c09 --- /dev/null +++ b/evo_kit/core/src/adam_optimizer.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/adam_optimizer.h" + +namespace evo_kit { + +AdamOptimizer::~AdamOptimizer() { + for (auto iter = _momentum.begin(); iter != _momentum.end(); iter++) { + delete[] iter->second; + } + + for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) { + delete[] iter->second; + } + + _momentum.clear(); + _velocity.clear(); +} + +void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") { + if (_momentum.count(param_name) == 0) { + _momentum[param_name] = new float [size]; + memset(_momentum[param_name], 0, size * sizeof(float)); + } + + if (_velocity.count(param_name) == 0) { + _velocity[param_name] = new float [size]; + memset(_velocity[param_name], 0, size * sizeof(float)); + } + + int true_update_times = int(_update_times / _velocity.size()); + float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1, + _update_times)); + + for (int i = 0; i < size; ++i) { + _momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i]; + _velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] * + gradient[i]; + gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon); + } +} + +}//namespace diff --git a/evo_kit/core/src/cached_gaussian_sampling.cc b/evo_kit/core/src/cached_gaussian_sampling.cc new file mode 100644 index 0000000000000000000000000000000000000000..844eca20e2935c4b5e7ac39d5fa07df1c2b13913 --- /dev/null +++ b/evo_kit/core/src/cached_gaussian_sampling.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/cached_gaussian_sampling.h" + +namespace evo_kit { + +CachedGaussianSampling::CachedGaussianSampling() {} + +CachedGaussianSampling::~CachedGaussianSampling() { + delete[] _noise_cache; +} + +bool CachedGaussianSampling::load_config(const EvoKitConfig& config) { + bool success = true; + _std = config.gaussian_sampling().std(); + success = set_seed(config.seed()); + CHECK(success) << "[EvoKit] Fail to set seed while load config."; + _cache_size = config.gaussian_sampling().cache_size(); + _noise_cache = new float [_cache_size]; + memset(_noise_cache, 0, _cache_size * sizeof(float)); + success = _create_noise_cache(); + CHECK(success) << "[EvoKit] Fail to create noise_cache while load config."; + return success; +} + +bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) { + bool success = true; + + if (_noise_cache == nullptr) { + LOG(ERROR) << "[EvoKit] Please use load_config() first."; + success = false; + return success; + } + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + if ((size >= _cache_size) || (size < 0)) { + LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size << + "), cache_size: " << _cache_size; + success = false; + return success; + } + + int rand_key = rand(); + std::default_random_engine generator(rand_key); + std::uniform_int_distribution uniform(0, _cache_size - size); + int index = uniform(generator); + *key = index; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = *(_noise_cache + index + i); + } + + return success; +} + +bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) { + bool success = true; + + if (_noise_cache == nullptr) { + LOG(ERROR) << "[EvoKit] Please use load_config() first."; + success = false; + return success; + } + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + if ((size >= _cache_size) || (size < 0)) { + LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size << + "), cache_size: " << _cache_size; + success = false; + return success; + } + + if ((key > _cache_size - size) || (key < 0)) { + LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, " + << _cache_size - size << + "], cache_size: " << _cache_size << ", size: " << size; + success = false; + return success; + } + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = *(_noise_cache + key + i); + } + + return success; +} + +bool CachedGaussianSampling::_create_noise_cache() { + std::default_random_engine generator(_seed); + std::normal_distribution norm; + + for (int64_t i = 0; i < _cache_size; ++i) { + *(_noise_cache + i) = norm(generator) * _std; + } + + return true; +} + +} diff --git a/evo_kit/core/src/gaussian_sampling.cc b/evo_kit/core/src/gaussian_sampling.cc new file mode 100644 index 0000000000000000000000000000000000000000..776c2c4da940fafd23e073dd97002876ddfc8673 --- /dev/null +++ b/evo_kit/core/src/gaussian_sampling.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/gaussian_sampling.h" + +namespace evo_kit { + +bool GaussianSampling::load_config(const EvoKitConfig& config) { + bool success = true; + _std = config.gaussian_sampling().std(); + success = set_seed(config.seed()); + return success; +} + +bool GaussianSampling::sampling(int* key, float* noise, int64_t size) { + bool success = true; + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + int rand_key = rand(); + *key = rand_key; + std::default_random_engine generator(rand_key); + std::normal_distribution norm; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = norm(generator) * _std; + } + + return success; +} + +bool GaussianSampling::resampling(int key, float* noise, int64_t size) { + bool success = true; + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + } else { + std::default_random_engine generator(key); + std::normal_distribution norm; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = norm(generator) * _std; + } + } + + return success; +} + +} diff --git a/evo_kit/core/src/optimizer_factory.cc b/evo_kit/core/src/optimizer_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6137d623fc1b023cc8d8edc8c988aced66a482c0 --- /dev/null +++ b/evo_kit/core/src/optimizer_factory.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/optimizer_factory.h" + +namespace evo_kit { + +std::shared_ptr create_optimizer(const OptimizerConfig& optimizer_config) { + std::shared_ptr optimizer; + std::string opt_type = optimizer_config.type(); + std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(), ::tolower); + + if (opt_type == "sgd") { + optimizer = std::make_shared(optimizer_config.base_lr(), \ + optimizer_config.momentum()); + } else if (opt_type == "adam") { + optimizer = std::make_shared(optimizer_config.base_lr(), \ + optimizer_config.beta1(), \ + optimizer_config.beta2(), \ + optimizer_config.epsilon()); + } else { + LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError + } + + return optimizer; +} + +}//namespace diff --git a/evo_kit/core/src/sampling_factory.cc b/evo_kit/core/src/sampling_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a0b8109a61a6ecaa80d82b8a8042c89574ea5a6 --- /dev/null +++ b/evo_kit/core/src/sampling_factory.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/sampling_factory.h" + +namespace evo_kit { + + +std::shared_ptr create_sampling_method(const EvoKitConfig& config) { + std::shared_ptr sampling_method; + bool cached = config.gaussian_sampling().cached(); + + if (cached) { + sampling_method = std::make_shared(); + } else { + sampling_method = std::make_shared(); + } + + bool success = sampling_method->load_config(config); + + if (success) { + return sampling_method; + } else { + LOG(ERROR) << "[EvoKit] Fail to create sampling_method"; + return nullptr; + } + +} + +}//namespace diff --git a/evo_kit/core/src/sgd_optimizer.cc b/evo_kit/core/src/sgd_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b3174bffa3d7b3f3b353b18aab8eb428ba70437 --- /dev/null +++ b/evo_kit/core/src/sgd_optimizer.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/sgd_optimizer.h" + +namespace evo_kit { + +SGDOptimizer::~SGDOptimizer() { + for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) { + delete[] iter->second; + } + + _velocity.clear(); +} + +void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") { + if (_velocity.count(param_name) == 0) { + _velocity[param_name] = new float [size]; + memset(_velocity[param_name], 0, size * sizeof(float)); + } + + for (int i = 0; i < size; ++i) { + _velocity[param_name][i] = _momentum * _velocity[param_name][i] + (1 - _momentum) * gradient[i]; + gradient[i] = _velocity[param_name][i]; + } +} + + +}//namespace diff --git a/evo_kit/core/src/utils.cc b/evo_kit/core/src/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..e47b7d097f0f164a83fb96f6ae538e5a5f2370ea --- /dev/null +++ b/evo_kit/core/src/utils.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/utils.h" +#include + +namespace evo_kit { + +bool compute_centered_ranks(std::vector& reward) { + std::vector> reward_index; + float gap = 1.0 / (reward.size() - 1); + float normlized_rank = -0.5; + int id = 0; + + for (auto& rew : reward) { + reward_index.push_back(std::make_pair(rew, id)); + ++id; + } + + std::sort(reward_index.begin(), reward_index.end()); + + for (int i = 0; i < reward.size(); ++i) { + id = reward_index[i].second; + reward[id] = normlized_rank; + normlized_rank += gap; + } + + return true; +} + +std::vector list_all_model_dirs(std::string path) { + std::vector model_dirs; + DIR* dpdf; + struct dirent* epdf; + dpdf = opendir(path.data()); + + if (dpdf != NULL) { + while (epdf = readdir(dpdf)) { + std::string dir(epdf->d_name); + + if (dir.find("model_iter_id") != std::string::npos) { + model_dirs.push_back(path + "/" + dir); + } + } + } + + closedir(dpdf); + return model_dirs; +} + +std::string read_file(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + + if (!ifile.is_open()) { + LOG(ERROR) << "Open file: [" << filename << "] failed."; + return ""; + } + + std::ostringstream buf; + char ch = '\n'; + + while (buf && ifile.get(ch)) { + buf.put(ch); + } + + ifile.close(); + return buf.str(); +} + +}//namespace diff --git a/evo_kit/demo/cartpole_config.prototxt b/evo_kit/demo/cartpole_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..2707cb60171a47675f1f5a0625de487ad04904f5 --- /dev/null +++ b/evo_kit/demo/cartpole_config.prototxt @@ -0,0 +1,17 @@ +seed: 1024 +gaussian_sampling { + std: 0.5 + cached: true + cache_size: 100000 +} +optimizer { + type: "Adam" + base_lr: 0.05 + momentum: 0.9 + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-08 +} +async_es { + model_iter_id: 0 +} diff --git a/evo_kit/demo/paddle/cartpole_async_solver.cc b/evo_kit/demo/paddle/cartpole_async_solver.cc new file mode 100644 index 0000000000000000000000000000000000000000..22d2507de2ea7f6684e8d835f78f88efd8fc5eb2 --- /dev/null +++ b/evo_kit/demo/paddle/cartpole_async_solver.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "evo_kit/async_es_agent.h" +#include "cartpole.h" +#include "paddle_api.h" + +using namespace evo_kit; +using namespace paddle::lite_api; + +const int ITER = 10; + +// Use PaddlePredictor of CartPole model to predict the action. 
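+// The observation is copied into a 1x4 input tensor, the network returns
+// probabilities for the two discrete CartPole actions, and the caller
+// picks the greedy action with arg_max().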
+std::vector forward(std::shared_ptr predictor, const float* obs) { + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 4}); + input_tensor->CopyFromCpu(obs); + + predictor->Run(); + + std::vector probs(2, 0.0); + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + output_tensor->CopyToCpu(probs.data()); + return probs; +} + +int arg_max(const std::vector& vec) { + return static_cast(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()))); +} + + +float evaluate(CartPole& env, std::shared_ptr agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + + std::shared_ptr paddle_predictor; + paddle_predictor = agent->get_predictor(); + + while (true) { + std::vector probs = forward(paddle_predictor, obs); + int act = arg_max(probs); + env.step(act); + float reward = env.getReward(); + bool done = env.isDone(); + total_reward += reward; + + if (done) { + break; + } + + obs = env.getState(); + } + + return total_reward; +} + + +int main(int argc, char* argv[]) { + std::vector envs; + + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + std::shared_ptr agent = + std::make_shared("./demo/paddle/cartpole_init_model", + "./demo/cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector< std::shared_ptr > sampling_agents; + + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_info; + std::vector last_noisy_info; + std::vector noisy_rewards(ITER, 0.0f); + std::vector last_noisy_rewards; + noisy_info.resize(ITER); + + omp_set_num_threads(10); + + for (int epoch = 0; epoch < 100; ++epoch) { + last_noisy_info.clear(); + last_noisy_rewards.clear(); + + if (epoch != 0) { + for (int i = 0; i < ITER; ++i) { + last_noisy_info.push_back(noisy_info[i]); + last_noisy_rewards.push_back(noisy_rewards[i]); + } + } + + #pragma omp parallel for schedule(dynamic, 1) + + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo info; + bool success = sampling_agent->add_noise(info); + float reward = evaluate(envs[i], sampling_agent); + + noisy_info[i] = info; + noisy_rewards[i] = reward; + } + + for (int i = 0; i < ITER; ++i) { + last_noisy_info.push_back(noisy_info[i]); + last_noisy_rewards.push_back(noisy_rewards[i]); + } + + // NOTE: all parameters of sampling_agents will be updated + bool success = agent->update(last_noisy_info, last_noisy_rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/paddle/cartpole_init_model.zip b/evo_kit/demo/paddle/cartpole_init_model.zip new file mode 100644 index 0000000000000000000000000000000000000000..16a7720959786471f8f500e7aa031615d53a1928 Binary files /dev/null and b/evo_kit/demo/paddle/cartpole_init_model.zip differ diff --git a/evo_kit/demo/paddle/cartpole_solver_parallel.cc b/evo_kit/demo/paddle/cartpole_solver_parallel.cc new file mode 100644 index 0000000000000000000000000000000000000000..33aa89990f23c744f494b9d9d75002103a0bfbcc --- /dev/null +++ b/evo_kit/demo/paddle/cartpole_solver_parallel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "cartpole.h" +#include "evo_kit/es_agent.h" +#include "paddle_api.h" + +using namespace evo_kit; +using namespace paddle::lite_api; + +const int ITER = 10; + +// Use PaddlePredictor of CartPole model to predict the action. +std::vector forward(std::shared_ptr predictor, const float* obs) { + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 4}); + input_tensor->CopyFromCpu(obs); + + predictor->Run(); + + std::vector probs(2, 0.0); + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + output_tensor->CopyToCpu(probs.data()); + return probs; +} + +int arg_max(const std::vector& vec) { + return static_cast(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()))); +} + + +float evaluate(CartPole& env, std::shared_ptr agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + + std::shared_ptr paddle_predictor; + paddle_predictor = agent->get_predictor(); + + while (true) { + std::vector probs = forward(paddle_predictor, obs); + int act = arg_max(probs); + env.step(act); + float reward = env.getReward(); + bool done = env.isDone(); + total_reward += reward; + + if (done) { + break; + } + + obs = env.getState(); + } + + return total_reward; +} + + +int main(int argc, char* argv[]) { + std::vector envs; + + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + std::shared_ptr agent = std::make_shared("./demo/paddle/cartpole_init_model", + "./demo/cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector< std::shared_ptr > sampling_agents; + + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_keys; + std::vector noisy_rewards(ITER, 0.0f); + noisy_keys.resize(ITER); + + omp_set_num_threads(10); + + for (int epoch = 0; epoch < 100; ++epoch) { + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo key; + bool success = sampling_agent->add_noise(key); + float reward = evaluate(envs[i], sampling_agent); + + noisy_keys[i] = key; + noisy_rewards[i] = reward; + } + + // NOTE: all parameters of sampling_agents will be updated + bool success = agent->update(noisy_keys, noisy_rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/paddle/gen_cartpole_init_model.py b/evo_kit/demo/paddle/gen_cartpole_init_model.py new file mode 100644 index 0000000000000000000000000000000000000000..62228b4f0cf953ffa3c1d11ae7bfd949c3e93925 --- /dev/null +++ b/evo_kit/demo/paddle/gen_cartpole_init_model.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import fluid + + +def net(obs, act_dim): + hid1_size = act_dim * 10 + hid1 = fluid.layers.fc(obs, size=hid1_size) + prob = fluid.layers.fc(hid1, size=act_dim, act='softmax') + return prob + + +if __name__ == '__main__': + obs_dim = 4 + act_dim = 2 + + obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32') + + prob = net(obs, act_dim) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + fluid.io.save_inference_model( + dirname='cartpole_init_model', + feeded_var_names=['obs'], + target_vars=[prob], + params_filename='params', + model_filename='model', + executor=exe) diff --git a/evo_kit/demo/torch/CMakeLists.txt b/evo_kit/demo/torch/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ece945581c57a4c0c05fa38d007b00b7266392e --- /dev/null +++ b/evo_kit/demo/torch/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit_demo) +set(TARGET parallel_main) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + +list(APPEND CMAKE_PREFIX_PATH "./libtorch") +find_package(Torch REQUIRED ON) +set(demo "${PROJECT_SOURCE_DIR}/cartpole_solver_parallel.cc") + + +########## main ########## +add_executable(${TARGET} ${demo} ${framework_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog) + +########## Torch libraries ########## +target_link_libraries(${TARGET} "${TORCH_LIBRARIES}") + + +########## EvoKit libraries ########## +list(APPEND CMAKE_PREFIX_PATH "./libevokit/cmake/Torch") +find_package(EvoKit) +target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}") diff --git a/evo_kit/demo/torch/cartpole_solver_parallel.cc b/evo_kit/demo/torch/cartpole_solver_parallel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c8f4c821c4b92e69b4755a1126296853a731102 --- /dev/null +++ b/evo_kit/demo/torch/cartpole_solver_parallel.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
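The script above saves the initial CartPole policy as two files, "model" and "params", inside cartpole_init_model/. A minimal sketch of loading that directory with PaddleLite's CxxConfig, in the same way the Paddle agent later in this diff builds its predictor; the x86/host places mirror the agent code, and load_cartpole_model is an illustrative helper name, not part of the library:

#include <memory>
#include <string>
#include "paddle_api.h"

using namespace paddle::lite_api;

// Build a PaddleLite predictor from the files written by gen_cartpole_init_model.py
// (model_filename='model', params_filename='params').
std::shared_ptr<PaddlePredictor> load_cartpole_model(const std::string& model_dir) {
    CxxConfig config;
    config.set_model_file(model_dir + "/model");
    config.set_param_file(model_dir + "/params");
    config.set_valid_places({
        Place{TARGET(kX86), PRECISION(kFloat)},
        Place{TARGET(kHost), PRECISION(kFloat)}
    });
    return CreatePaddlePredictor<CxxConfig>(config);
}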
+ +#include +#include +#include +#include +#include +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/es_agent.h" +#include "cartpole.h" +#include "model.h" + +using namespace evo_kit; +const int ITER = 10; + +float evaluate(CartPole& env, std::shared_ptr> agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + while (true) { + torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]}); + torch::Tensor action = agent->predict(obs_tensor); + int act = std::get<1>(action.max(-1)).item(); + env.step(act); + float reward = env.getReward(); + auto done = env.isDone(); + total_reward += reward; + if (done) break; + obs = env.getState(); + } + return total_reward; +} + +int main(int argc, char* argv[]) { + //google::InitGoogleLogging(argv[0]); + std::vector envs; + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + auto model = std::make_shared(4, 2); + std::shared_ptr> agent = std::make_shared>(model, + "./cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector>> sampling_agents; + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_info; + std::vector noisy_rewards(ITER, 0.0f); + noisy_info.resize(ITER); + + for (int epoch = 0; epoch < 100; ++epoch) { +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + auto sampling_agent = sampling_agents[i]; + SamplingInfo info; + bool success = sampling_agent->add_noise(info); + float reward = evaluate(envs[i], sampling_agent); + noisy_info[i] = info; + noisy_rewards[i] = reward; + } + + // Will also update parameters of sampling_agents + bool success = agent->update(noisy_info, noisy_rewards); + + // Use original agent to evalute (without noise). + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/torch/model.h b/evo_kit/demo/torch/model.h new file mode 100644 index 0000000000000000000000000000000000000000..27373ceffd66bffd9d8a047a2e4fc5fe3a14005a --- /dev/null +++ b/evo_kit/demo/torch/model.h @@ -0,0 +1,61 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
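In the torch demo above, choosing an action is one forward pass through the (possibly noisy) sampling model followed by an argmax over the action probabilities. A small sketch of that step with the template arguments written out; the agent is assumed to be an evo_kit::ESAgent<Model> as in the demo, and select_action is an illustrative name:

#include <memory>
#include <torch/torch.h>

// Pick a discrete CartPole action from a 4-dim observation.
template <class AgentT>
int select_action(const float* obs, std::shared_ptr<AgentT> agent) {
    torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]});
    torch::Tensor probs = agent->predict(obs_tensor);  // shape [1, act_dim]
    // max(-1) returns (values, indices); the action is the index of the largest probability.
    return std::get<1>(probs.max(-1)).item<int>();
}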
+ +#ifndef _MODEL_H +#define _MODEL_H + +#include + +struct Model : public torch::nn::Module{ + + Model() = delete; + + Model(const int obs_dim, const int act_dim) { + + _obs_dim = obs_dim; + _act_dim = act_dim; + int hid1_size = act_dim * 10; + fc1 = register_module("fc1", torch::nn::Linear(obs_dim, hid1_size)); + fc2 = register_module("fc2", torch::nn::Linear(hid1_size, act_dim)); + } + + torch::Tensor forward(torch::Tensor x) { + x = x.reshape({-1, _obs_dim}); + x = torch::tanh(fc1->forward(x)); + x = torch::softmax(fc2->forward(x), 1); + return x; + } + + std::shared_ptr clone() { + std::shared_ptr model = std::make_shared(_obs_dim, _act_dim); + std::vector parameters1 = parameters(); + std::vector parameters2 = model->parameters(); + for (int i = 0; i < parameters1.size(); ++i) { + torch::Tensor src = parameters1[i].view({-1}); + torch::Tensor des = parameters2[i].view({-1}); + auto src_a = src.accessor(); + auto des_a = des.accessor(); + for (int j = 0; j < src.size(0); ++j) { + des_a[j] = src_a[j]; + } + } + return model; + } + + int _act_dim; + int _obs_dim; + torch::nn::Linear fc1{nullptr}, fc2{nullptr}; +}; + +#endif diff --git a/evo_kit/paddle/include/evo_kit/async_es_agent.h b/evo_kit/paddle/include/evo_kit/async_es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..a8558820bb86f7d4a6f084aea456e2c9a79ed762 --- /dev/null +++ b/evo_kit/paddle/include/evo_kit/async_es_agent.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_ASYNC_ES_AGENT_H +#define EVO_KIT_ASYNC_ES_AGENT_H + +#include +#include +#include "evo_kit/es_agent.h" + +namespace evo_kit { +/* EvoKit agent with PaddleLite as backend. This agent supports asynchronous update. + * Users mainly focus on the following functions: + * 1. clone: clone an agent for multi-thread evaluation + * 2. add_noise: add noise into parameters. + * 3. update: update parameters given data collected during evaluation. + */ +class AsyncESAgent: public ESAgent { +public: + AsyncESAgent() {} + + ~AsyncESAgent(); + + /** + * @args: + * predictor: predictor created by users for prediction. + * config_path: the path of configuration file. + * Note that AsyncESAgent will update the configuration file after calling the update function. + * Please use the up-to-date configuration. + */ + AsyncESAgent( + const std::string& model_dir, + const std::string& config_path); + + /** + * @brief: Clone an agent for sampling. + */ + std::shared_ptr clone(); + + /** + * @brief: update parameters given data collected during evaluation. + * @args: + * noisy_info: sampling information returned by add_noise function. + * noisy_reward: evaluation rewards. + */ + bool update( + std::vector& noisy_info, + std::vector& noisy_rewards); + +private: + std::unordered_map> _previous_predictors; + std::unordered_map _param_delta; + std::string _config_path; + + /** + * @brief: parse model_iter_id given a string of model directory. 
+ * @return: an integer indicating the model_iter_id + */ + int _parse_model_iter_id(const std::string&); + + /** + * @brief: compute the distance between current parameter and previous models. + */ + bool _compute_model_diff(); + + /** + * @brief: remove expired models to avoid overuse of disk space. + * @args: + * max_to_keep: the maximum number of models to keep locally. + */ + bool _remove_expired_model(int max_to_keep); + + /** + * @brief: save up-to-date parameters to the disk. + */ + bool _save(); + + /** + * @brief: load all models in the model warehouse. + */ + bool _load(); + + /** + * @brief: load a model given the model directory. + */ + std::shared_ptr _load_previous_model(std::string model_dir); +}; + +} // namespace +#endif diff --git a/evo_kit/paddle/include/evo_kit/es_agent.h b/evo_kit/paddle/include/evo_kit/es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..9a256712a3d99be12ff4a9f409298602192ec21e --- /dev/null +++ b/evo_kit/paddle/include/evo_kit/es_agent.h @@ -0,0 +1,103 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_ +#define EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_ + +#include +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/optimizer_factory.h" +#include "evo_kit/sampling_factory.h" +#include "evo_kit/utils.h" +#include "paddle_api.h" + +namespace evo_kit { + +typedef paddle::lite_api::PaddlePredictor PaddlePredictor; +typedef paddle::lite_api::CxxConfig CxxConfig; +typedef paddle::lite_api::Tensor Tensor; + +int64_t ShapeProduction(const paddle::lite_api::shape_t& shape); + +/** + * @brief EvoKit agent with PaddleLite as backend. + * Users mainly focus on the following functions: + * 1. clone: clone an agent for multi-thread evaluation + * 2. add_noise: add noise into parameters. + * 3. update: update parameters given data collected during evaluation. + * + */ +class ESAgent { +public: + ESAgent() {} + + ~ESAgent(); + + ESAgent(const std::string& model_dir, const std::string& config_path); + + /** + * @breif Clone a sampling agent + * + * Only cloned ESAgent can call `add_noise` function. + * Each cloned ESAgent will have a copy of original parameters. + * (support sampling in multi-thread way) + */ + std::shared_ptr clone(); + + /** + * @brief Update parameters of predictor based on ES algorithm. + * + * Only not cloned ESAgent can call `update` function. + * Parameters of cloned agents will also be updated. + */ + bool update( + std::vector& noisy_info, + std::vector& noisy_rewards); + + // copied parameters = original parameters + noise + bool add_noise(SamplingInfo& sampling_info); + + /** + * @brief Get paddle predict + * + * if _is_sampling_agent is true, will return predictor with added noise; + * if _is_sampling_agent is false, will return predictor without added noise. 
+ */ + std::shared_ptr get_predictor(); + + // get param size of model + int64_t param_size() { + return _param_size; + } + +protected: + int64_t _calculate_param_size(); + + std::shared_ptr _predictor; + std::shared_ptr _sampling_predictor; + std::shared_ptr _sampling_method; + std::shared_ptr _optimizer; + std::shared_ptr _config; + std::shared_ptr _cxx_config; + std::vector _param_names; + // malloc memory of noise and neg_gradients in advance. + float* _noise; + float* _neg_gradients; + int64_t _param_size; + bool _is_sampling_agent; +}; + +} // namespace + +#endif diff --git a/evo_kit/paddle/src/async_es_agent.cc b/evo_kit/paddle/src/async_es_agent.cc new file mode 100644 index 0000000000000000000000000000000000000000..0bff6e42907f6f83f53ea147051d34d3b4851141 --- /dev/null +++ b/evo_kit/paddle/src/async_es_agent.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/async_es_agent.h" + +namespace evo_kit { + +AsyncESAgent::AsyncESAgent( + const std::string& model_dir, + const std::string& config_path): ESAgent(model_dir, config_path) { + _config_path = config_path; +} +AsyncESAgent::~AsyncESAgent() { + for (const auto kv : _param_delta) { + float* delta = kv.second; + delete[] delta; + } +} + +bool AsyncESAgent::_save() { + using namespace paddle::lite_api; + bool success = true; + + if (_is_sampling_agent) { + LOG(ERROR) << + "[EvoKit] Cloned AsyncESAgent cannot call `save`.Please use original AsyncESAgent."; + success = false; + return success; + } + + int model_iter_id = _config->async_es().model_iter_id() + 1; + //current time + time_t rawtime; + struct tm* timeinfo; + char buffer[80]; + + time(&rawtime); + timeinfo = localtime(&rawtime); + + std::string model_name = "model_iter_id-" + std::to_string(model_iter_id); + std::string model_path = _config->async_es().model_warehouse() + "/" + model_name; + LOG(INFO) << "[save]model_path: " << model_path; + _predictor->SaveOptimizedModel(model_path, LiteModelType::kProtobuf); + // save config + auto async_es = _config->mutable_async_es(); + async_es->set_model_iter_id(model_iter_id); + success = save_proto_conf(_config_path, *_config); + + if (!success) { + LOG(ERROR) << "[]unable to save config for AsyncESAgent"; + success = false; + return success; + } + + int max_to_keep = _config->async_es().max_to_keep(); + success = _remove_expired_model(max_to_keep); + return success; +} + +bool AsyncESAgent::_remove_expired_model(int max_to_keep) { + bool success = true; + std::string model_path = _config->async_es().model_warehouse(); + std::vector model_dirs = list_all_model_dirs(model_path); + int model_iter_id = _config->async_es().model_iter_id() + 1; + + for (const auto& dir : model_dirs) { + int dir_model_iter_id = _parse_model_iter_id(dir); + + if (model_iter_id - dir_model_iter_id >= max_to_keep) { + std::string rm_command = std::string("rm -rf ") + dir; + int ret = system(rm_command.c_str()); + + if (ret == 0) { + LOG(INFO) << 
"[EvoKit] remove expired Model: " << dir; + } else { + LOG(ERROR) << "[EvoKit] fail to remove expired Model: " << dir; + success = false; + return success; + } + } + } + + return success; +} + +bool AsyncESAgent::_compute_model_diff() { + bool success = true; + + for (const auto& kv : _previous_predictors) { + int model_iter_id = kv.first; + std::shared_ptr old_predictor = kv.second; + float* diff = new float[_param_size]; + memset(diff, 0, _param_size * sizeof(float)); + int offset = 0; + + for (const std::string& param_name : _param_names) { + auto des_tensor = old_predictor->GetTensor(param_name); + auto src_tensor = _predictor->GetTensor(param_name); + const float* des_data = des_tensor->data(); + const float* src_data = src_tensor->data(); + int64_t tensor_size = ShapeProduction(src_tensor->shape()); + + for (int i = 0; i < tensor_size; ++i) { + diff[i + offset] = des_data[i] - src_data[i]; + } + + offset += tensor_size; + } + + _param_delta[model_iter_id] = diff; + } + + return success; +} + +bool AsyncESAgent::_load() { + bool success = true; + std::string model_path = _config->async_es().model_warehouse(); + std::vector model_dirs = list_all_model_dirs(model_path); + + if (model_dirs.size() == 0) { + int model_iter_id = _config->async_es().model_iter_id(); + success = model_iter_id == 0 ? true : false; + + if (!success) { + LOG(WARNING) << "[EvoKit] current_model_iter_id is nonzero, but no model is \ + found at the dir: " << model_path; + } + + return success; + } + + for (auto& dir : model_dirs) { + int model_iter_id = _parse_model_iter_id(dir); + + if (model_iter_id == -1) { + LOG(WARNING) << "[EvoKit] fail to parse model_iter_id: " << dir; + success = false; + return success; + } + + std::shared_ptr predictor = _load_previous_model(dir); + + if (predictor == nullptr) { + success = false; + LOG(WARNING) << "[EvoKit] fail to load model: " << dir; + return success; + } + + _previous_predictors[model_iter_id] = predictor; + } + + success = _compute_model_diff(); + return success; +} + +std::shared_ptr AsyncESAgent::_load_previous_model(std::string model_dir) { + using namespace paddle::lite_api; + // 1. Create CxxConfig + CxxConfig config; + config.set_model_file(model_dir + "/model"); + config.set_param_file(model_dir + "/params"); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); + + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = CreatePaddlePredictor(config); + return predictor; +} + +std::shared_ptr AsyncESAgent::clone() { + + std::shared_ptr new_agent = std::make_shared(); + + float* noise = new float [_param_size]; + + new_agent->_predictor = _predictor; + new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor(*_cxx_config); + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_names = _param_names; + new_agent->_param_size = _param_size; + new_agent->_config = _config; + new_agent->_noise = noise; + + return new_agent; +} + +bool AsyncESAgent::update( + std::vector& noisy_info, + std::vector& noisy_rewards) { + + CHECK(!_is_sampling_agent) << "[EvoKit] Cloned ESAgent cannot call update function. 
\ + Please use original ESAgent."; + + bool success = _load(); + CHECK(success) << "[EvoKit] fail to load previous models."; + + int current_model_iter_id = _config->async_es().model_iter_id(); + + // validate model_iter_id for each sample before the update + for (int i = 0; i < noisy_info.size(); ++i) { + int model_iter_id = noisy_info[i].model_iter_id(); + + if (model_iter_id != current_model_iter_id + && _previous_predictors.count(model_iter_id) == 0) { + LOG(WARNING) << "[EvoKit] The sample with model_dir_id: " << model_iter_id \ + << " cannot match any local model"; + success = false; + return success; + } + } + + compute_centered_ranks(noisy_rewards); + memset(_neg_gradients, 0, _param_size * sizeof(float)); + + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + int model_iter_id = noisy_info[i].model_iter_id(); + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i; + float* delta = _param_delta[model_iter_id]; + + // compute neg_gradients + if (model_iter_id == current_model_iter_id) { + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } else { + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += (_noise[j] + delta[j]) * reward; + } + } + } + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetMutableTensor(param_name); + float* tensor_data = tensor->mutable_data(); + int64_t tensor_size = ShapeProduction(tensor->shape()); + _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name); + counter += tensor_size; + } + + success = _save(); + CHECK(success) << "[EvoKit] fail to save model."; + return true; +} + +int AsyncESAgent::_parse_model_iter_id(const std::string& model_path) { + int model_iter_id = -1; + int pow = 1; + + for (int i = model_path.size() - 1; i >= 0; --i) { + if (model_path[i] >= '0' && model_path[i] <= '9') { + if (model_iter_id == -1) { + model_iter_id = 0; + } + } else { + break; + } + + model_iter_id += pow * (model_path[i] - '0'); + pow *= 10; + } + + return model_iter_id; +} + +}//namespace diff --git a/evo_kit/paddle/src/es_agent.cc b/evo_kit/paddle/src/es_agent.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8f3ebd37299224791f1380f284849195383f65b --- /dev/null +++ b/evo_kit/paddle/src/es_agent.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
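The _save/_load bookkeeping above relies on directory names of the form "model_iter_id-N"; _parse_model_iter_id recovers N by reading the trailing digits of the path and returns -1 when there are none. A standalone sketch of that parsing logic, with illustrative function name and sample paths:

#include <iostream>
#include <string>

// Parse the trailing integer of a model directory name such as
// ".../model_iter_id-12"; return -1 when the name does not end in digits.
int parse_trailing_id(const std::string& model_path) {
    int id = -1;
    int pow10 = 1;
    for (int i = static_cast<int>(model_path.size()) - 1; i >= 0; --i) {
        char c = model_path[i];
        if (c < '0' || c > '9') {
            break;
        }
        if (id == -1) {
            id = 0;
        }
        id += pow10 * (c - '0');
        pow10 *= 10;
    }
    return id;
}

int main() {
    std::cout << parse_trailing_id("./model_warehouse/model_iter_id-12") << "\n";  // 12
    std::cout << parse_trailing_id("./model_warehouse/no_id") << "\n";             // -1
    return 0;
}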
+ +#include "evo_kit/es_agent.h" +#include + +namespace evo_kit { + +int64_t ShapeProduction(const paddle::lite_api::shape_t& shape) { + int64_t res = 1; + + for (auto i : shape) { + res *= i; + } + + return res; +} + +ESAgent::~ESAgent() { + delete[] _noise; + + if (!_is_sampling_agent) { + delete[] _neg_gradients; + } +} + +ESAgent::ESAgent(const std::string& model_dir, const std::string& config_path) { + using namespace paddle::lite_api; + // 1. Create CxxConfig + _cxx_config = std::make_shared(); + std::string model_path = model_dir + "/model"; + std::string param_path = model_dir + "/param"; + std::string model_buffer = read_file(model_path); + std::string param_buffer = read_file(param_path); + _cxx_config->set_model_buffer(model_buffer.c_str(), model_buffer.size(), + param_buffer.c_str(), param_buffer.size()); + _cxx_config->set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); + + _predictor = CreatePaddlePredictor(*_cxx_config); + + _is_sampling_agent = false; + // Original agent can't be used to sample, so keep it same with _predictor for evaluating. + _sampling_predictor = _predictor; + + _config = std::make_shared(); + load_proto_conf(config_path, *_config); + + _sampling_method = create_sampling_method(*_config); + + _optimizer = create_optimizer(_config->optimizer()); + + _param_names = _predictor->GetParamNames(); + _param_size = _calculate_param_size(); + + _noise = new float [_param_size]; + _neg_gradients = new float [_param_size]; +} + +std::shared_ptr ESAgent::clone() { + if (_is_sampling_agent) { + LOG(ERROR) << "[EvoKit] only original ESAgent can call `clone` function."; + return nullptr; + } + + std::shared_ptr new_agent = std::make_shared(); + + float* noise = new float [_param_size]; + + new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor(*_cxx_config); + new_agent->_predictor = _predictor; + new_agent->_cxx_config = _cxx_config; + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_names = _param_names; + new_agent->_config = _config; + new_agent->_param_size = _param_size; + new_agent->_noise = noise; + + return new_agent; +} + +bool ESAgent::update( + std::vector& noisy_info, + std::vector& noisy_rewards) { + if (_is_sampling_agent) { + LOG(ERROR) << "[EvoKit] Cloned ESAgent cannot call update function, please use original ESAgent."; + return false; + } + + compute_centered_ranks(noisy_rewards); + + memset(_neg_gradients, 0, _param_size * sizeof(float)); + + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i; + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetMutableTensor(param_name); + float* tensor_data = tensor->mutable_data(); + int64_t tensor_size = ShapeProduction(tensor->shape()); + _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name); + counter += tensor_size; + } + + return true; +} + +bool ESAgent::add_noise(SamplingInfo& sampling_info) { + bool success = true; + + if (!_is_sampling_agent) { + LOG(ERROR) << + 
"[EvoKit] Original ESAgent cannot call add_noise function, please use cloned ESAgent."; + success = false; + return success; + } + + int key = 0; + success = _sampling_method->sampling(&key, _noise, _param_size); + CHECK(success) << "[EvoKit] sampling error occurs while add_noise."; + int model_iter_id = _config->async_es().model_iter_id(); + sampling_info.add_key(key); + sampling_info.set_model_iter_id(model_iter_id); + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr sample_tensor = _sampling_predictor->GetMutableTensor(param_name); + std::unique_ptr tensor = _predictor->GetTensor(param_name); + int64_t tensor_size = ShapeProduction(tensor->shape()); + + for (int64_t j = 0; j < tensor_size; ++j) { + sample_tensor->mutable_data()[j] = tensor->data()[j] + _noise[counter + j]; + } + + counter += tensor_size; + } + + return success; +} + +std::shared_ptr ESAgent::get_predictor() { + return _sampling_predictor; +} + +int64_t ESAgent::_calculate_param_size() { + int64_t param_size = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetTensor(param_name); + param_size += ShapeProduction(tensor->shape()); + } + + return param_size; +} + +}//namespace diff --git a/evo_kit/scripts/build_torch_demo.sh b/evo_kit/scripts/build_torch_demo.sh new file mode 100644 index 0000000000000000000000000000000000000000..b2f4df4444012c49bade049a7b30c9ebf637cafb --- /dev/null +++ b/evo_kit/scripts/build_torch_demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +cd demo/torch + +#---------------libtorch-------------# +if [ ! -d "./libtorch" ];then + echo "Cannot find the torch library: ./libtorch" + echo "Downloading Torch library" + wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip + unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + echo "Torch library Downloaded" +fi + + +#---------------libevokit-------------# +cp -r ../../libevokit ./ +if [ ! -d "./libevokit" ];then + echo "Cannot find the EvoKit library: ./libevokit" + echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme + exit 1 +fi + +# proto +cp ../cartpole_config.prototxt ./ + +#----------------build---------------# +rm -rf build +mkdir build +cd build +cmake ../ +make -j10 +cd - + +#-----------------run----------------# +./build/parallel_main + + +cd ../.. diff --git a/evo_kit/scripts/lib_install.sh b/evo_kit/scripts/lib_install.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb4cc5df7a901618c91b7be8a898d419d607278b --- /dev/null +++ b/evo_kit/scripts/lib_install.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [ $# != 1 ]; then + echo "You must choose one framework (paddle/torch) to compile EvoKit." + exit 0 +fi + +if [ $1 = "paddle" ]; then + #---------------paddlelite-------------# + if [ ! -d "./inference_lite_lib" ];then + echo "Cannot find the PaddleLite library: ./inference_lite_lib" + echo "Please put the PaddleLite libraray to current folder according the instruction in README" + exit 1 + fi + + # Initialization model + if [ ! -d ./demo/paddle/cartpole_init_model ]; then + unzip ./demo/paddle/cartpole_init_model.zip -d ./demo/paddle/ + fi + + FLAGS=" -DWITH_PADDLE=ON" +elif [ $1 = "torch" ]; then + FLAGS=" -DWITH_TORCH=ON" +else + echo "Invalid arguments. [paddle/torch]" + exit 0 +fi + + +#----------------protobuf-------------# +cd core/proto/ +protoc evo_kit/evo_kit.proto --cpp_out . 
+cd - + +#----------------build---------------# +echo ${FLAGS} +rm -rf build +mkdir build +cd build +cmake ../ ${FLAGS} +make -j10 +make install +cd - diff --git a/evo_kit/test/CMakeLists.txt b/evo_kit/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..979e5c59afd5e74b2907054a8398fc7d27fbc6e6 --- /dev/null +++ b/evo_kit/test/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit_demo) +set(TARGET unit_test_main) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(GTest REQUIRED) +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + +# Torch lib +list(APPEND CMAKE_PREFIX_PATH "../libtorch") +find_package(Torch REQUIRED ON) + +# include and source +include_directories("${PROJECT_SOURCE_DIR}/include") +file(GLOB test_src "${PROJECT_SOURCE_DIR}/src/*.cc") + +# make +add_executable(${TARGET} "unit_test.cc" ${core_src} ${agent_src} ${test_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog gtest "${TORCH_LIBRARIES}") + + +########## EvoKit libraries ########## +list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/libevokit/cmake/Torch") +find_package(EvoKit) +target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}") diff --git a/evo_kit/test/include/torch_demo_model.h b/evo_kit/test/include/torch_demo_model.h new file mode 100644 index 0000000000000000000000000000000000000000..cf9d3400ea4358fe109ff6da3f9bec395920336f --- /dev/null +++ b/evo_kit/test/include/torch_demo_model.h @@ -0,0 +1,65 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
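Both demo models in this diff (model.h earlier, and the test model that follows) implement clone() by copying every parameter element by element through flattened views. A minimal sketch of that copy with the accessor template arguments spelled out, assuming float parameters and identical architectures; copy_parameters is an illustrative helper, not part of EvoKit:

#include <memory>
#include <vector>
#include <torch/torch.h>

// Copy all parameters of `src` into `dst` element by element.
// Both modules are assumed to have the same architecture and float parameters.
void copy_parameters(torch::nn::Module& src, torch::nn::Module& dst) {
    std::vector<torch::Tensor> src_params = src.parameters();
    std::vector<torch::Tensor> dst_params = dst.parameters();
    for (size_t i = 0; i < src_params.size(); ++i) {
        torch::Tensor s = src_params[i].view({-1});
        torch::Tensor d = dst_params[i].view({-1});
        auto s_a = s.accessor<float, 1>();
        auto d_a = d.accessor<float, 1>();
        for (int64_t j = 0; j < s.size(0); ++j) {
            d_a[j] = s_a[j];
        }
    }
}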
+ +#ifndef _TORCH_DEMO_MODEL_H +#define _TORCH_DEMO_MODEL_H + +#include + +struct Model : public torch::nn::Module{ + + Model() = delete; + + Model(const int obs_dim, const int act_dim, const int h1_size, const int h2_size) { + _obs_dim = obs_dim; + _act_dim = act_dim; + _h1_size = h1_size; + _h2_size = h2_size; + fc1 = register_module("fc1", torch::nn::Linear(obs_dim, h1_size)); + fc2 = register_module("fc2", torch::nn::Linear(h1_size, h2_size)); + fc3 = register_module("fc3", torch::nn::Linear(h2_size, act_dim)); + } + + torch::Tensor forward(torch::Tensor x) { + x = x.reshape({-1, _obs_dim}); + x = torch::tanh(fc1->forward(x)); + x = torch::tanh(fc2->forward(x)); + x = torch::tanh(fc3->forward(x)); + return x; + } + + std::shared_ptr clone() { + std::shared_ptr model = std::make_shared(_obs_dim, _act_dim, _h1_size, _h2_size); + std::vector parameters1 = parameters(); + std::vector parameters2 = model->parameters(); + for (int i = 0; i < parameters1.size(); ++i) { + torch::Tensor src = parameters1[i].view({-1}); + torch::Tensor des = parameters2[i].view({-1}); + auto src_a = src.accessor(); + auto des_a = des.accessor(); + for (int j = 0; j < src.size(0); ++j) { + des_a[j] = src_a[j]; + } + } + return model; + } + + int _act_dim; + int _obs_dim; + int _h1_size; + int _h2_size; + torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}; +}; + +#endif diff --git a/evo_kit/test/prototxt/torch_sin_cached_config.prototxt b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..6fe80b1e07396b0909cb087f1a9b0c20724a0fc4 --- /dev/null +++ b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt @@ -0,0 +1,16 @@ +seed : 1024 + +gaussian_sampling { + std: 0.005 + cached: true + cache_size : 100000 +} + +optimizer { + type: "Adam", + base_lr: 0.005, + momentum: 0.9, + beta1: 0.9, + beta2: 0.999, + epsilon: 1e-8, +} diff --git a/evo_kit/test/prototxt/torch_sin_config.prototxt b/evo_kit/test/prototxt/torch_sin_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..3704d64e6b6c7f7976422e33c2f5892b7ca4efc5 --- /dev/null +++ b/evo_kit/test/prototxt/torch_sin_config.prototxt @@ -0,0 +1,15 @@ +seed : 1024 + +gaussian_sampling { + std: 0.005 + cached: false +} + +optimizer { + type: "Adam", + base_lr: 0.005, + momentum: 0.9, + beta1: 0.9, + beta2: 0.999, + epsilon: 1e-8, +} diff --git a/evo_kit/test/run_test.sh b/evo_kit/test/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..b39cbc9db8c32c4827aa03a101b45a8011dde7ae --- /dev/null +++ b/evo_kit/test/run_test.sh @@ -0,0 +1,41 @@ +#!/bin/bash +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +#---------------libtorch-------------# +if [ ! -d "./libtorch" ];then +echo "Cannot find the torch library: ../libtorch" + echo "Downloading Torch library" + wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip + unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + echo "Torch library Downloaded" +fi + +#----------------protobuf-------------# +cd core/proto/ +protoc evo_kit/evo_kit.proto --cpp_out . +cd - + +#----------------build---------------# +sh scripts/lib_install.sh torch + +#----------------build test---------------# +cd test + +cp -r ../libevokit ./ +if [ ! 
-d "./libevokit" ];then + echo "Cannot find the EvoKit library: ./libevokit" + echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme + exit 1 +fi + +rm -rf build +mkdir build +cd build +cmake ../ +make -j10 + +#-----------------run----------------# +./unit_test_main + +cd .. diff --git a/evo_kit/test/src/optimizers_test.cc b/evo_kit/test/src/optimizers_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c561e3085bdf5f9102ba29115e7e8fabbf8ed75 --- /dev/null +++ b/evo_kit/test/src/optimizers_test.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/optimizer_factory.h" +#include + +namespace evo_kit { + +TEST(SGDOptimizersTest, Method_update) { + std::shared_ptr config = std::make_shared(); + auto optimizer_config = config->mutable_optimizer(); + optimizer_config->set_base_lr(1.0); + optimizer_config->set_type("sgd"); + std::shared_ptr optimizer = create_optimizer(config->optimizer()); + float sgd_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081}; + float sgd_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143}; + float sgd_new[10] = { 0.01199242, 0.0 , 0.0344831 , 0.05776198, 0.04206595, 0.00973154, 0.09637211,-0.03477474, 0.014892306, 0.03129495}; + + EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1")); + for (int i = 0; i < 10; ++i) { + EXPECT_FLOAT_EQ(sgd_new[i], sgd_wei[i]) << " i: " << i ; + } + EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1")); + EXPECT_FALSE(optimizer->update(sgd_wei, sgd_grad, 9, "fc1")); +} + +TEST(AdamOptimizersTest, Method_update) { + std::shared_ptr config = std::make_shared(); + auto optimizer_config = config->mutable_optimizer(); + optimizer_config->set_base_lr(1.0); + optimizer_config->set_type("adam"); + std::shared_ptr optimizer = create_optimizer(config->optimizer()); + float adam_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081}; + float adam_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143}; + float adam_new[10] = { 0.99999736, 0. 
,-0.95783144, 1.05114082,-0.95755763,-0.98908256, 1.06569656,-0.99872491, 0.99906968, 1.01127923}; + + EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1")); + for (int i = 0; i < 10; ++i) { + EXPECT_FLOAT_EQ(adam_new[i], adam_wei[i]) << " i: " << i ; + } + EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1")); + EXPECT_FALSE(optimizer->update(adam_wei, adam_grad, 9, "fc1")); +} + +} // namespace + diff --git a/evo_kit/test/src/sampling_test.cc b/evo_kit/test/src/sampling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e707a63354836f3e70b42d819bab8b0fc3f79e70 --- /dev/null +++ b/evo_kit/test/src/sampling_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/sampling_method.h" +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/cached_gaussian_sampling.h" +#include + +namespace evo_kit { + +class SamplingTest : public ::testing::Test { + protected: + void init_sampling_method(bool cached) { + config = std::make_shared(); + config->set_seed(1024); + auto sampling_config = config->mutable_gaussian_sampling(); + sampling_config->set_std(1.0); + sampling_config->set_cached(cached); + sampling_config->set_cache_size(cache_size); + if (cached) { + sampler = std::make_shared(); + } else { + sampler = std::make_shared(); + } + } + + std::shared_ptr sampler; + std::shared_ptr config; + float array[3] = {1.0, 2.0, 3.0}; + int cache_size = 100; // default cache_size 100 + int key = 0; +}; + + +TEST_F(SamplingTest, GaussianSampling_load_config) { + init_sampling_method(false); + EXPECT_TRUE(sampler->load_config(*config)); +} + +TEST_F(SamplingTest, GaussianSampling_sampling) { + init_sampling_method(false); + sampler->load_config(*config); + + EXPECT_FALSE(sampler->sampling(&key, nullptr, 0)); + EXPECT_TRUE(sampler->sampling(&key, array, 3)); +} + +TEST_F(SamplingTest, GaussianSampling_resampling) { + init_sampling_method(false); + sampler->load_config(*config); + + EXPECT_FALSE(sampler->resampling(0, nullptr, 0)); + EXPECT_TRUE(sampler->resampling(0, array, 3)); +} + + +TEST_F(SamplingTest, CachedGaussianSampling_load_config) { + init_sampling_method(true); + EXPECT_TRUE(sampler->load_config(*config)); +} + +TEST_F(SamplingTest, CachedGaussianSampling_sampling) { + init_sampling_method(true); + EXPECT_FALSE(sampler->sampling(&key, array, 0)); + + sampler->load_config(*config); + + EXPECT_FALSE(sampler->sampling(&key, nullptr, 0)); + EXPECT_FALSE(sampler->sampling(&key, array, -1)); + EXPECT_FALSE(sampler->sampling(&key, array, cache_size)); + + EXPECT_TRUE(sampler->sampling(&key, array, 0)); + EXPECT_TRUE(sampler->sampling(&key, array, 3)); +} + +TEST_F(SamplingTest, CachedGaussianSampling_resampling) { + init_sampling_method(true); + EXPECT_FALSE(sampler->resampling(0, array, 0)); + + sampler->load_config(*config); + + EXPECT_FALSE(sampler->resampling(0, nullptr, 0)); + 
EXPECT_FALSE(sampler->resampling(0, array, -1)); + EXPECT_FALSE(sampler->resampling(0, array, cache_size)); + + EXPECT_TRUE(sampler->resampling(0, array, 0)); + EXPECT_TRUE(sampler->resampling(0, array, 1)); + EXPECT_TRUE(sampler->resampling(0, array, 2)); + + EXPECT_FALSE(sampler->resampling(-1, array, 3)); + EXPECT_TRUE(sampler->resampling(0, array, 3)); + EXPECT_TRUE(sampler->resampling(1, array, 3)); + EXPECT_TRUE(sampler->resampling(2, array, 3)); + EXPECT_TRUE(sampler->resampling(cache_size-3, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-2, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-1, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-3, array, cache_size-1)); +} + + +} // namespace + diff --git a/evo_kit/test/src/torch_agent_test.cc b/evo_kit/test/src/torch_agent_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..080b85391d720a6b500517a6f27976f76d2258b6 --- /dev/null +++ b/evo_kit/test/src/torch_agent_test.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include +#include + +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/es_agent.h" +#include "torch_demo_model.h" + +#include +#include +#include +#include + +namespace evo_kit { + + +// The fixture for testing class Foo. 
+class TorchDemoTest : public ::testing::Test { +protected: + float evaluate(std::vector& x_list, std::vector& y_list, int size, std::shared_ptr> agent) { + float total_loss = 0.0; + for (int i = 0; i < size; ++i) { + torch::Tensor x_input = torch::tensor(x_list[i], torch::dtype(torch::kFloat32)); + torch::Tensor predict_y = agent->predict(x_input); + auto pred_y = predict_y.accessor(); + float loss = pow((pred_y[0][0] - y_list[i]), 2); + total_loss += loss; + } + return -total_loss / float(size); + } + + float train_loss() { + return -1.0 * evaluate(x_list, y_list, train_data_size, agent); + } + + float test_loss() { + return -1.0 * evaluate(test_x_list, test_y_list, test_data_size, agent); + } + + float train_test_gap() { + float train_lo = train_loss(); + float test_lo = test_loss(); + if ( train_lo > test_lo) { + return train_lo - test_lo; + } else { + return test_lo - train_lo; + } + } + + void init_agent(const int in_dim, const int out_dim, const int h1_size, const int h2_size) { + std::shared_ptr model = std::make_shared(in_dim, out_dim, h1_size, h2_size); + agent = std::make_shared>(model, "../prototxt/torch_sin_config.prototxt"); + } + + void train_agent(std::string config_path) { + std::default_random_engine generator(0); // fix seed + std::uniform_real_distribution uniform(-3.0, 9.0); + std::normal_distribution norm; + for (int i = 0; i < train_data_size; ++i) { + float x_i = uniform(generator); // generate data between [-3, 9] + float y_i = sin(x_i) + norm(generator) * 0.05; // label noise std 0.05 + x_list.push_back(x_i); + y_list.push_back(y_i); + } + for (int i= 0; i < test_data_size; ++i) { + float x_i = uniform(generator); + float y_i = sin(x_i); + test_x_list.push_back(x_i); + test_y_list.push_back(y_i); + } + + std::shared_ptr model = std::make_shared(1, 1, 10, 5); + agent = std::make_shared>(model, config_path); + + // Clone agents to sample (explore). 
+ std::vector>> sampling_agents; + for (int i = 0; i < iter; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_keys; + std::vector noisy_rewards(iter, 0.0f); + noisy_keys.resize(iter); + + LOG(INFO) << "start training..."; + for (int epoch = 0; epoch < 1001; ++epoch) { +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < iter; ++i) { + auto sampling_agent = sampling_agents[i]; + SamplingInfo key; + bool success = sampling_agent->add_noise(key); + float reward = evaluate(x_list, y_list, train_data_size, sampling_agent); + noisy_keys[i] = key; + noisy_rewards[i] = reward; + } + bool success = agent->update(noisy_keys, noisy_rewards); + + if (epoch % 100 == 0) { + float reward = evaluate(test_x_list, test_y_list, test_data_size, agent); + float train_reward = evaluate(x_list, y_list, train_data_size, agent); + LOG(INFO) << "Epoch:" << epoch << " Loss: " << -reward << ", Train loss" << -train_reward; + } + } + } + + // Class members declared here can be used by all tests in the test suite + int train_data_size = 300; + int test_data_size = 100; + int iter = 10; + std::vector x_list; + std::vector y_list; + std::vector test_x_list; + std::vector test_y_list; + std::shared_ptr> agent; +}; + +TEST_F(TorchDemoTest, TrainingEffectUseNormalSampling) { + train_agent("../prototxt/torch_sin_config.prototxt"); + EXPECT_LT(train_loss(), 0.05); + EXPECT_LT(test_loss(), 0.05); + EXPECT_LT(train_test_gap(), 0.03); +} + +TEST_F(TorchDemoTest, TrainingEffectTestUseTableSampling) { + train_agent("../prototxt/torch_sin_cached_config.prototxt"); + EXPECT_LT(train_loss(), 0.05); + EXPECT_LT(test_loss(), 0.05); + EXPECT_LT(train_test_gap(), 0.03); +} + +TEST_F(TorchDemoTest,ParamSizeTest) { + init_agent(1, 1, 10, 5); + EXPECT_EQ(agent->param_size(), 81); + init_agent(2, 3, 10, 5); + EXPECT_EQ(agent->param_size(), 103); + init_agent(1, 1, 1, 1); + EXPECT_EQ(agent->param_size(), 6); + init_agent(100, 2, 256, 64); + EXPECT_EQ(agent->param_size(), 42434); +} + +} // namespace diff --git a/evo_kit/test/src/utils_test.cc b/evo_kit/test/src/utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0c8d2c963a698475831a641c3eefc8abcc3693a --- /dev/null +++ b/evo_kit/test/src/utils_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/utils.h" + +namespace evo_kit { + +// Tests that the Utils::compute_centered_rank() method. +TEST(UtilsTest, Method_compute_centered_ranks) { + float a[5] = {9.0, 8.0, 7.0, 6.0, 5.0}; + std::vector reward_vec(a, a+5); + EXPECT_EQ(compute_centered_ranks(reward_vec), true); +} + + +} // namespace + diff --git a/evo_kit/test/unit_test.cc b/evo_kit/test/unit_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3bbc21f4cdfb8e7709173a258f66560a7f7e27a1 --- /dev/null +++ b/evo_kit/test/unit_test.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/evo_kit/torch/include/evo_kit/es_agent.h b/evo_kit/torch/include/evo_kit/es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..856034f75fc2c025cbb3aed74c5eac4edc888178 --- /dev/null +++ b/evo_kit/torch/include/evo_kit/es_agent.h @@ -0,0 +1,196 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TORCH_ESAGENT_H +#define TORCH_ESAGENT_H + +#include +#include +#include "evo_kit/optimizer_factory.h" +#include "evo_kit/sampling_factory.h" +#include "evo_kit/utils.h" +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit{ + +/** + * @brief DeepES agent for Torch. + * + * Our implemtation is flexible to support any model that subclass torch::nn::Module. + * That is, we can instantiate an agent by: es_agent = ESAgent(model); + * After that, users can clone an agent for multi-thread processing, add parametric noise for exploration, + * and update the parameteres, according to the evaluation resutls of noisy parameters. + */ +template +class ESAgent{ +public: + ESAgent() {} + + ~ESAgent() { + delete[] _noise; + if (!_is_sampling_agent) + delete[] _neg_gradients; + } + + ESAgent(std::shared_ptr model, std::string config_path): _model(model) { + _is_sampling_agent = false; + _config = std::make_shared(); + load_proto_conf(config_path, *_config); + _sampling_method = create_sampling_method(*_config); + _optimizer = create_optimizer(_config->optimizer()); + // Origin agent can't be used to sample, so keep it same with _model for evaluating. + _sampling_model = model; + _param_size = _calculate_param_size(); + + _noise = new float [_param_size]; + _neg_gradients = new float [_param_size]; + } + + /** + * @breif Clone a sampling agent + * + * Only cloned ESAgent can call `add_noise` function. + * Each cloned ESAgent will have a copy of original parameters. 
+ * (support sampling in multi-thread way) + */ + std::shared_ptr clone() { + std::shared_ptr new_agent = std::make_shared(); + + new_agent->_model = _model; + std::shared_ptr new_model = _model->clone(); + new_agent->_sampling_model = new_model; + + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_size = _param_size; + + float* new_noise = new float [_param_size]; + new_agent->_noise = new_noise; + + return new_agent; + } + + /** + * @brief Use the model to predict. + * + * if _is_sampling_agent is true, will use the sampling model with added noise; + * if _is_sampling_agent is false, will use the original model without added noise. + */ + torch::Tensor predict(const torch::Tensor& x) { + return _sampling_model->forward(x); + } + + /** + * @brief Update parameters of model based on ES algorithm. + * + * Only not cloned ESAgent can call `update` function. + * Parameters of cloned agents will also be updated. + */ + bool update(std::vector& noisy_info, std::vector& noisy_rewards) { + if (_is_sampling_agent) { + LOG(ERROR) << "[DeepES] Cloned ESAgent cannot call update function, please use original ESAgent."; + return false; + } + + compute_centered_ranks(noisy_rewards); + + memset(_neg_gradients, 0, _param_size * sizeof(float)); + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[DeepES] resampling error occurs at sample: " << i; + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + auto params = _model->named_parameters(); + int64_t counter = 0; + for (auto& param: params) { + torch::Tensor tensor = param.value().view({-1}); + auto tensor_a = tensor.accessor(); + _optimizer->update(tensor_a, _neg_gradients+counter, tensor.size(0), param.key()); + counter += tensor.size(0); + } + + return true; + } + + // copied parameters = original parameters + noise + bool add_noise(SamplingInfo& sampling_info) { + bool success = true; + if (!_is_sampling_agent) { + LOG(ERROR) << "[DeepES] Original ESAgent cannot call add_noise function, please use cloned ESAgent."; + success = false; + return success; + } + + auto sampling_params = _sampling_model->named_parameters(); + auto params = _model->named_parameters(); + int key = 0; + success = _sampling_method->sampling(&key, _noise, _param_size); + CHECK(success) << "[EvoKit] sampling error occurs while add_noise."; + sampling_info.add_key(key); + int64_t counter = 0; + for (auto& param: sampling_params) { + torch::Tensor sampling_tensor = param.value().view({-1}); + std::string param_name = param.key(); + torch::Tensor tensor = params.find(param_name)->view({-1}); + auto sampling_tensor_a = sampling_tensor.accessor(); + auto tensor_a = tensor.accessor(); + for (int64_t j = 0; j < tensor.size(0); ++j) { + sampling_tensor_a[j] = tensor_a[j] + _noise[counter + j]; + } + counter += tensor.size(0); + } + return success; + } + + // get param size of model + int64_t param_size() { + return _param_size; + } + + +private: + int64_t _calculate_param_size() { + _param_size = 0; + auto params = _model->named_parameters(); + for (auto& param: params) { + torch::Tensor tensor = param.value().view({-1}); + _param_size += tensor.size(0); + } + return _param_size; + } + + std::shared_ptr _model; + 
std::shared_ptr _sampling_model; + bool _is_sampling_agent; + std::shared_ptr _sampling_method; + std::shared_ptr _optimizer; + std::shared_ptr _config; + int64_t _param_size; + // malloc memory of noise and neg_gradients in advance. + float* _noise; + float* _neg_gradients; +}; + +} + +#endif /* TORCH_ESAGENT_H */ diff --git a/examples/A2C/README.md b/examples/A2C/README.md index d38a5d153b3ab39c59b851775567d90edcdda4fb..2328a3ee350851370c5c44bbed6b4e2daae27512 100755 --- a/examples/A2C/README.md +++ b/examples/A2C/README.md @@ -20,7 +20,7 @@ Performance of A2C on various envrionments ## How to use ### Dependencies + [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) -+ [parl](https://github.com/PaddlePaddle/PARL) ++ [parl>=1.2.1](https://github.com/PaddlePaddle/PARL) + gym==0.12.1 + atari-py==0.1.7 diff --git a/examples/A2C/atari_agent.py b/examples/A2C/atari_agent.py index 5604f71016538650b0ed0355dd6cd2856f52c60e..94d2125214a9ef52273763cea6cc0213cc34c963 100755 --- a/examples/A2C/atari_agent.py +++ b/examples/A2C/atari_agent.py @@ -71,7 +71,10 @@ class AtariAgent(parl.Agent): lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) entropy_coeff = layers.data( - name='entropy_coeff', shape=[], dtype='float32') + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) total_loss, pi_loss, vf_loss, entropy = self.alg.learn( obs, actions, advantages, target_values, lr, entropy_coeff) diff --git a/examples/A2C/train.py b/examples/A2C/train.py index 777a22849afcb3ad5b1e237d7e3d0ae9b39fa871..4050a413f0262ec62f76bbc07062578d6a398d5c 100755 --- a/examples/A2C/train.py +++ b/examples/A2C/train.py @@ -25,7 +25,7 @@ from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -55,11 +55,6 @@ class Learner(object): assert get_gpu_count() == 1, 'Only support training in single GPU,\ Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .' - else: - cpu_num = os.environ.get('CPU_NUM') - assert cpu_num is not None and cpu_num == '1', 'Only support training in single CPU,\ - Please set environment variable: `export CPU_NUM=1`.' 
- #========== Learner ========== self.total_loss_stat = WindowStat(100) @@ -191,7 +186,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, @@ -210,7 +205,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/DDPG/mujoco_agent.py b/examples/DDPG/mujoco_agent.py index 2b2c216e0dfcb6bf675483d13454195b9cc634ed..4a92f3ea01217715a8fae16b8079367d5368f05a 100644 --- a/examples/DDPG/mujoco_agent.py +++ b/examples/DDPG/mujoco_agent.py @@ -55,6 +55,7 @@ class MujocoAgent(parl.Agent): act = self.fluid_executor.run( self.pred_program, feed={'obs': obs}, fetch_list=[self.pred_act])[0] + act = np.squeeze(act) return act def learn(self, obs, act, reward, next_obs, terminal): diff --git a/examples/DDPG/mujoco_model.py b/examples/DDPG/mujoco_model.py index ed59dbbf2e2d3381f24bbc67c7503c681ad87c18..6a812f6e465cd4937bfae3bc2eeabeaaaa8e0d8d 100644 --- a/examples/DDPG/mujoco_model.py +++ b/examples/DDPG/mujoco_model.py @@ -45,7 +45,6 @@ class ActorModel(parl.Model): hid1 = self.fc1(obs) hid2 = self.fc2(hid1) means = self.fc3(hid2) - means = means return means diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py index 1b7e5c1cc9d98f91b024cea7dbdda6e443f3fea9..05b25dc81f3e45985812d526e70c422ada225197 100644 --- a/examples/DDPG/train.py +++ b/examples/DDPG/train.py @@ -21,14 +21,12 @@ from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from parl.utils import logger, action_mapping, ReplayMemory -MAX_EPISODES = 5000 -TEST_EVERY_EPISODES = 20 ACTOR_LR = 1e-4 CRITIC_LR = 1e-3 GAMMA = 0.99 TAU = 0.001 MEMORY_SIZE = int(1e6) -MIN_LEARN_SIZE = 1e4 +MEMORY_WARMUP_SIZE = 1e4 BATCH_SIZE = 128 REWARD_SCALE = 0.1 ENV_SEED = 1 @@ -37,12 +35,9 @@ ENV_SEED = 1 def run_train_episode(env, agent, rpm): obs = env.reset() total_reward = 0 - steps = 0 while True: - steps += 1 batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) - action = np.squeeze(action) # Add exploration noise, and clip to [-1.0, 1.0] action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0) @@ -53,7 +48,7 @@ def run_train_episode(env, agent, rpm): rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done) - if rpm.size() > MIN_LEARN_SIZE: + if rpm.size() > MEMORY_WARMUP_SIZE: batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch( BATCH_SIZE) agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, @@ -64,7 +59,7 @@ def run_train_episode(env, agent, rpm): if done: break - return total_reward, steps + return total_reward def run_evaluate_episode(env, agent): @@ -73,7 +68,6 @@ def run_evaluate_episode(env, agent): while True: batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) - action = np.squeeze(action) action = action_mapping(action, env.action_space.low[0], env.action_space.high[0]) @@ -101,19 +95,19 @@ def main(): rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim) - test_flag = 0 - total_steps = 0 - while total_steps < args.train_total_steps: - train_reward, steps = run_train_episode(env, agent, rpm) - total_steps += steps - logger.info('Steps: {} 
Reward: {}'.format(total_steps, train_reward)) + while rpm.size() < MEMORY_WARMUP_SIZE: + run_train_episode(env, agent, rpm) + + episode = 0 + while episode < args.train_total_episode: + for i in range(50): + train_reward = run_train_episode(env, agent, rpm) + episode += 1 + logger.info('Episode: {} Reward: {}'.format(episode, train_reward)) - if total_steps // args.test_every_steps >= test_flag: - while total_steps // args.test_every_steps >= test_flag: - test_flag += 1 - evaluate_reward = run_evaluate_episode(env, agent) - logger.info('Steps {}, Evaluate reward: {}'.format( - total_steps, evaluate_reward)) + evaluate_reward = run_evaluate_episode(env, agent) + logger.info('Episode {}, Evaluate reward: {}'.format( + episode, evaluate_reward)) if __name__ == '__main__': @@ -121,15 +115,10 @@ if __name__ == '__main__': parser.add_argument( '--env', help='Mujoco environment name', default='HalfCheetah-v2') parser.add_argument( - '--train_total_steps', - type=int, - default=int(1e7), - help='maximum training steps') - parser.add_argument( - '--test_every_steps', + '--train_total_episode', type=int, default=int(1e4), - help='the step interval between two consecutive evaluations') + help='maximum training episodes') args = parser.parse_args() diff --git a/examples/DQN/README.md b/examples/DQN/README.md index 351e44754ad82125eec4e1346fd6301e8c1555b7..2281cee4e5080a0030926a7f81ec5d4cdf7d82ec 100644 --- a/examples/DQN/README.md +++ b/examples/DQN/README.md @@ -1,22 +1,16 @@ ## Reproduce DQN with PARL -Based on PARL, the DQN algorithm of deep reinforcement learning has been reproduced, reaching the same level of indicators as the paper in Atari benchmarks. +Based on PARL, we provide a simple demonstration of DQN. + DQN in [Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) -### Atari games introduction -Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games. +### Result -### Benchmark result +Performance of DQN playing CartPole-v0 -Mean episode rewards for 10 million training steps. - -pong - -Performance of DQN on various environments - -

-result +

+result +result

## How to use @@ -25,13 +19,14 @@ Performance of DQN on various environments + [parl](https://github.com/PaddlePaddle/PARL) + gym + tqdm -+ atari-py -+ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment) ### Start Training: ``` -# To train a model for Pong game -python train.py --rom ./rom_files/pong.bin +# To train a model for CartPole-v0 game +python train.py ``` -> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms). + +## DQN-Variants + +For DQN variants such as Double DQN and Dueling DQN, please check [here](https://github.com/PaddlePaddle/PARL/tree/develop/examples/DQN_variant) diff --git a/examples/DQN/cartpole.jpg b/examples/DQN/cartpole.jpg new file mode 100644 index 0000000000000000000000000000000000000000..978a074468950a36bd385b3f7cb691efb636829b Binary files /dev/null and b/examples/DQN/cartpole.jpg differ diff --git a/examples/DQN/cartpole_agent.py b/examples/DQN/cartpole_agent.py new file mode 100755 index 0000000000000000000000000000000000000000..d98f2ba7cdbd426754e3103ebd4068d5e9fb9871 --- /dev/null +++ b/examples/DQN/cartpole_agent.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class CartpoleAgent(parl.Agent): + def __init__(self, + algorithm, + obs_dim, + act_dim, + e_greed=0.1, + e_greed_decrement=0): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(CartpoleAgent, self).__init__(algorithm) + + self.global_step = 0 + self.update_target_steps = 200 + + self.e_greed = e_greed + self.e_greed_decrement = e_greed_decrement + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) + + def sample(self, obs): + sample = np.random.rand() + if sample < self.e_greed: + act = np.random.randint(self.act_dim) + else: + act = self.predict(obs) + self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement) + return act + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + if self.global_step % self.update_target_steps == 0: + self.alg.sync_target() + self.global_step += 1 + + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal, + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost diff --git a/examples/LiftSim_baseline/rl_benchmark/model.py b/examples/DQN/cartpole_model.py old mode 100644 new mode 100755 similarity index 61% rename from examples/LiftSim_baseline/rl_benchmark/model.py rename to examples/DQN/cartpole_model.py index 3b2364df90565565f5d4e3286b6662c134cb4c08..9218fdfca6555551dc90d025777d45d2acb4b27d --- a/examples/LiftSim_baseline/rl_benchmark/model.py +++ b/examples/DQN/cartpole_model.py @@ -12,24 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
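A quick arithmetic sketch of the epsilon-greedy schedule implemented in `CartpoleAgent.sample` above; the constants are the defaults wired up in `train.py` later in this diff (`e_greed=0.1`, `e_greed_decrement=1e-6`), and the helper function here is purely illustrative:
```
# Illustrative only: how epsilon decays across calls to CartpoleAgent.sample,
# assuming the defaults used in train.py (e_greed=0.1, e_greed_decrement=1e-6).
E_GREED = 0.1
E_GREED_DECREMENT = 1e-6

def epsilon_after(num_sample_calls):
    # sample() subtracts e_greed_decrement once per call and floors at 0.01
    return max(0.01, E_GREED - E_GREED_DECREMENT * num_sample_calls)

print(epsilon_after(0))      # 0.1   -> 10% random actions at the start
print(epsilon_after(50000))  # ~0.05
print(epsilon_after(90000))  # ~0.01 -> floor reached; exploration stays at 1%
```
With these defaults the agent keeps at least 1% exploration throughout training, reaching the floor after roughly 90k sampled steps.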
-import os import paddle.fluid as fluid -from parl import layers -import numpy as np import parl +from parl import layers -class RLDispatcherModel(parl.Model): +class CartpoleModel(parl.Model): def __init__(self, act_dim): - self._act_dim = act_dim - self._fc_1 = layers.fc(size=512, act='relu') - self._fc_2 = layers.fc(size=256, act='relu') - self._fc_3 = layers.fc(size=128, act='tanh') - self._output = layers.fc(size=act_dim) + hid1_size = 128 + hid2_size = 128 + self.fc1 = layers.fc(size=hid1_size, act='relu') + self.fc2 = layers.fc(size=hid2_size, act='relu') + self.fc3 = layers.fc(size=act_dim, act=None) def value(self, obs): - _h_1 = self._fc_1(obs) - _h_2 = self._fc_2(_h_1) - _h_3 = self._fc_3(_h_2) - self._pred = self._output(_h_3) - return self._pred + h1 = self.fc1(obs) + h2 = self.fc2(h1) + Q = self.fc3(h2) + return Q diff --git a/examples/DQN/replay_memory.py b/examples/DQN/replay_memory.py old mode 100644 new mode 100755 index ea8c6565155ddacae568e901566f9b390ee3a8b8..c9474a0dce8d3cc9f5d5610cafbd1df5b1a03586 --- a/examples/DQN/replay_memory.py +++ b/examples/DQN/replay_memory.py @@ -12,103 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import copy -from collections import deque, namedtuple +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +import random +import collections +import numpy as np class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): - self.max_size = int(max_size) - self.state_shape = state_shape - self.context_len = int(context_len) - - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') - self.action = np.zeros((self.max_size, ), dtype='int32') - self.reward = np.zeros((self.max_size, ), dtype='float32') - self.isOver = np.zeros((self.max_size, ), dtype='bool') - - self._curr_size = 0 - self._curr_pos = 0 - self._context = deque(maxlen=context_len - 1) + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) def append(self, exp): - """append a new experience into replay memory - """ - if self._curr_size < self.max_size: - self._assign(self._curr_pos, exp) - self._curr_size += 1 - else: - self._assign(self._curr_pos, exp) - self._curr_pos = (self._curr_pos + 1) % self.max_size - if exp.isOver: - self._context.clear() - else: - self._context.append(exp) - - def recent_state(self): - """ maintain recent state for training""" - lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ - (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + self.buffer.append(exp) - def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state - """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] - # confirm that no frame was generated from last episode - has_last_episode = False - for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] - if self.isOver[to_check_idx]: - has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = 
self.state[state_idx] - break + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) - if not has_last_episode: - state = self.state[state_idx] - - real_idx = (idx + self.context_len - 1) % self._curr_size - action = self.action[real_idx] - reward = self.reward[real_idx] - isOver = self.isOver[real_idx] - return state, reward, action, isOver + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') def __len__(self): - return self._curr_size - - def size(self): - return self._curr_size - - def _assign(self, pos, exp): - self.state[pos] = exp.state - self.reward[pos] = exp.reward - self.action[pos] = exp.action - self.isOver[pos] = exp.isOver - - def sample_batch(self, batch_size): - """sample a batch from replay memory for training - """ - batch_idx = np.random.randint( - self._curr_size - self.context_len - 1, size=batch_size) - batch_idx = (self._curr_pos + batch_idx) % self._curr_size - batch_exp = [self.sample(i) for i in batch_idx] - return self._process_batch(batch_exp) - - def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') - reward = np.asarray([e[1] for e in batch_exp], dtype='float32') - action = np.asarray([e[2] for e in batch_exp], dtype='int8') - isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return len(self.buffer) diff --git a/examples/DQN/train.py b/examples/DQN/train.py old mode 100644 new mode 100755 index 3149e6b81a34e81aff038a12994e7eb4e91eac22..b634b122eff4abc7177ff830387560c95fb2aa2b --- a/examples/DQN/train.py +++ b/examples/DQN/train.py @@ -12,160 +12,100 @@ # See the License for the specific language governing permissions and # limitations under the License. 
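For reference, a minimal usage sketch of the deque-based `ReplayMemory` added above; the import path and the random CartPole-sized transitions are assumptions made for illustration, not part of this change:
```
import numpy as np
from replay_memory import ReplayMemory  # the deque-based buffer defined above

rpm = ReplayMemory(max_size=200)

# Store (obs, action, reward, next_obs, done) tuples, as run_episode does.
for _ in range(100):
    obs, next_obs = np.random.randn(4), np.random.randn(4)  # CartPole-sized observations
    rpm.append((obs, 0, 1.0, next_obs, False))

# Once len(rpm) passes the warm-up threshold, sample a float32 training batch.
obs_b, act_b, rew_b, next_obs_b, done_b = rpm.sample(batch_size=32)
print(obs_b.shape, obs_b.dtype)  # (32, 4) float32
```
Compared with the array-based, frame-stacking buffer it replaces, this version keeps only what the CartPole demo needs, which is why the training script below stays short.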
-import argparse import gym -import paddle.fluid as fluid import numpy as np -import os import parl -from atari_agent import AtariAgent -from atari_model import AtariModel -from datetime import datetime -from replay_memory import ReplayMemory, Experience -from parl.utils import tensorboard, logger -from tqdm import tqdm -from utils import get_player - -MEMORY_SIZE = 1e6 -MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 -IMAGE_SIZE = (84, 84) -CONTEXT_LEN = 4 -FRAME_SKIP = 4 -UPDATE_FREQ = 4 -GAMMA = 0.99 -LEARNING_RATE = 3e-4 - - -def run_train_episode(env, agent, rpm): - total_reward = 0 - all_cost = [] - state = env.reset() - steps = 0 - while True: - steps += 1 - context = rpm.recent_state() - context.append(state) - context = np.stack(context, axis=0) - action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) - # start training - if rpm.size() > MEMORY_WARMUP_SIZE: - if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( - args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) - all_cost.append(float(cost)) - total_reward += reward - state = next_state - if isOver: - break - if all_cost: - logger.info('[Train]total_reward: {}, mean_cost: {}'.format( - total_reward, np.mean(all_cost))) - return total_reward, steps, np.mean(all_cost) +from parl.utils import logger +from cartpole_model import CartpoleModel +from cartpole_agent import CartpoleAgent -def run_evaluate_episode(env, agent): - state = env.reset() +from replay_memory import ReplayMemory + +LEARN_FREQ = 5 # update parameters every 5 steps +MEMORY_SIZE = 20000 # replay memory size +MEMORY_WARMUP_SIZE = 200 # store some experiences in the replay memory in advance +BATCH_SIZE = 32 +LEARNING_RATE = 0.0005 +GAMMA = 0.99 # discount factor of reward + + +def run_episode(agent, env, rpm): total_reward = 0 + obs = env.reset() + step = 0 while True: - action = agent.predict(state) - state, reward, isOver, info = env.step(action) + step += 1 + action = agent.sample(obs) + next_obs, reward, isOver, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, isOver)) + + # train model + if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_isOver) = rpm.sample(BATCH_SIZE) + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) + total_reward += reward + obs = next_obs if isOver: break return total_reward +def evaluate(agent, env, render=False): + # test part, run 5 episodes and average + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + isOver = False + while not isOver: + action = agent.predict(obs) + if render: + env.render() + obs, reward, isOver, _ = env.step(action) + episode_reward += reward + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + def main(): - env = get_player( - args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP) - test_env = get_player( - args.rom, - image_size=IMAGE_SIZE, - frame_skip=FRAME_SKIP, - context_len=CONTEXT_LEN) - rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN) - act_dim = env.action_space.n - - model = AtariModel(act_dim, args.algo) - if args.algo == 'Double': - algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA) 
- elif args.algo in ['DQN', 'Dueling']: - algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA) - agent = AtariAgent( + env = gym.make('CartPole-v0') + action_dim = env.action_space.n + obs_shape = env.observation_space.shape + + rpm = ReplayMemory(MEMORY_SIZE) + + model = CartpoleModel(act_dim=action_dim) + algorithm = parl.algorithms.DQN( + model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) + agent = CartpoleAgent( algorithm, - act_dim=act_dim, - start_lr=LEARNING_RATE, - total_step=args.train_total_steps, - update_freq=UPDATE_FREQ) - - with tqdm( - total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar: - while rpm.size() < MEMORY_WARMUP_SIZE: - total_reward, steps, _ = run_train_episode(env, agent, rpm) - pbar.update(steps) - - # train - test_flag = 0 - pbar = tqdm(total=args.train_total_steps) - total_steps = 0 - max_reward = None - while total_steps < args.train_total_steps: - # start epoch - total_reward, steps, loss = run_train_episode(env, agent, rpm) - total_steps += steps - pbar.set_description('[train]exploration:{}'.format(agent.exploration)) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, - total_steps) # mean of total loss - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) - pbar.update(steps) - - if total_steps // args.test_every_steps >= test_flag: - while total_steps // args.test_every_steps >= test_flag: - test_flag += 1 - pbar.write("testing") - eval_rewards = [] - for _ in tqdm(range(3), desc='eval agent'): - eval_reward = run_evaluate_episode(test_env, agent) - eval_rewards.append(eval_reward) - logger.info( - "eval_agent done, (steps, eval_reward): ({}, {})".format( - total_steps, np.mean(eval_rewards))) - eval_test = np.mean(eval_rewards) - tensorboard.add_scalar('dqn/eval', eval_test, total_steps) - - pbar.close() + obs_dim=obs_shape[0], + act_dim=action_dim, + e_greed=0.1, # explore + e_greed_decrement=1e-6 + ) # probability of exploring is decreasing during training + + while len(rpm) < MEMORY_WARMUP_SIZE: # warm up replay memory + run_episode(agent, env, rpm) + + max_episode = 2000 + + # start train + episode = 0 + while episode < max_episode: + # train part + for i in range(0, 50): + total_reward = run_episode(agent, env, rpm) + episode += 1 + + eval_reward = evaluate(agent, env) + logger.info('episode:{} test_reward:{}'.format( + episode, eval_reward)) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--rom', help='path of the rom of the atari game', required=True) - parser.add_argument( - '--batch_size', type=int, default=64, help='batch size for training') - parser.add_argument( - '--algo', - default='DQN', - help= - 'DQN/DDQN/Dueling, represent DQN, double DQN, and dueling DQN respectively', - ) - parser.add_argument( - '--train_total_steps', - type=int, - default=int(1e7), - help='maximum environmental steps of games') - parser.add_argument( - '--test_every_steps', - type=int, - default=100000, - help='the step interval between two consecutive evaluations') - - args = parser.parse_args() main() diff --git a/examples/DQN/.benchmark/merge.png b/examples/DQN_variant/.benchmark/merge.png similarity index 100% rename from examples/DQN/.benchmark/merge.png rename to examples/DQN_variant/.benchmark/merge.png diff --git a/examples/DQN/.benchmark/table.png b/examples/DQN_variant/.benchmark/table.png similarity index 100% rename from examples/DQN/.benchmark/table.png rename to 
examples/DQN_variant/.benchmark/table.png diff --git a/examples/DQN_variant/README.md b/examples/DQN_variant/README.md new file mode 100644 index 0000000000000000000000000000000000000000..351e44754ad82125eec4e1346fd6301e8c1555b7 --- /dev/null +++ b/examples/DQN_variant/README.md @@ -0,0 +1,37 @@ +## Reproduce DQN with PARL +Based on PARL, the DQN algorithm of deep reinforcement learning has been reproduced, reaching the same level of indicators as the paper in Atari benchmarks. + ++ DQN in +[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) + +### Atari games introduction +Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games. + +### Benchmark result + +Mean episode rewards for 10 million training steps. + +pong + +Performance of DQN on various environments + +

+result +

+ +## How to use +### Dependencies: ++ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) ++ [parl](https://github.com/PaddlePaddle/PARL) ++ gym ++ tqdm ++ atari-py ++ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment) + + +### Start Training: +``` +# To train a model for Pong game +python train.py --rom ./rom_files/pong.bin +``` +> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms). diff --git a/examples/DQN/atari.py b/examples/DQN_variant/atari.py similarity index 100% rename from examples/DQN/atari.py rename to examples/DQN_variant/atari.py diff --git a/examples/DQN/atari_agent.py b/examples/DQN_variant/atari_agent.py similarity index 99% rename from examples/DQN/atari_agent.py rename to examples/DQN_variant/atari_agent.py index 4af4478048bc582f5951446920a1686ed497b3b4..8a33ac4369f4d9f0c55d12c82b6fded63eedbc77 100644 --- a/examples/DQN/atari_agent.py +++ b/examples/DQN_variant/atari_agent.py @@ -106,7 +106,7 @@ class AtariAgent(parl.Agent): 'reward': reward, 'next_obs': next_obs.astype('float32'), 'terminal': terminal, - 'lr': lr + 'lr': np.float32(lr) } cost = self.fluid_executor.run( self.learn_program, feed=feed, fetch_list=[self.cost])[0] diff --git a/examples/DQN/atari_model.py b/examples/DQN_variant/atari_model.py similarity index 100% rename from examples/DQN/atari_model.py rename to examples/DQN_variant/atari_model.py diff --git a/examples/DQN/atari_wrapper.py b/examples/DQN_variant/atari_wrapper.py similarity index 100% rename from examples/DQN/atari_wrapper.py rename to examples/DQN_variant/atari_wrapper.py diff --git a/examples/DQN_variant/replay_memory.py b/examples/DQN_variant/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 --- /dev/null +++ b/examples/DQN_variant/replay_memory.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import copy +from collections import deque, namedtuple + +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) + + +class ReplayMemory(object): + def __init__(self, max_size, obs_shape, context_len): + self.max_size = int(max_size) + self.obs_shape = obs_shape + self.context_len = int(context_len) + + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') + self.action = np.zeros((self.max_size, ), dtype='int32') + self.reward = np.zeros((self.max_size, ), dtype='float32') + self.isOver = np.zeros((self.max_size, ), dtype='bool') + + self._curr_size = 0 + self._curr_pos = 0 + self._context = deque(maxlen=context_len - 1) + + def append(self, exp): + """append a new experience into replay memory + """ + if self._curr_size < self.max_size: + self._assign(self._curr_pos, exp) + self._curr_size += 1 + else: + self._assign(self._curr_pos, exp) + self._curr_pos = (self._curr_pos + 1) % self.max_size + if exp.isOver: + self._context.clear() + else: + self._context.append(exp) + + def recent_obs(self): + """ maintain recent obs for training""" + lst = list(self._context) + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ + (self._context.maxlen - len(lst)) + obs.extend([k.obs for k in lst]) + return obs + + def sample(self, idx): + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs + """ + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size + + # confirm that no frame was generated from last episode + has_last_episode = False + for k in range(self.context_len - 2, -1, -1): + to_check_idx = obs_idx[k] + if self.isOver[to_check_idx]: + has_last_episode = True + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] + break + + if not has_last_episode: + obs = self.obs[obs_idx] + + real_idx = (idx + self.context_len - 1) % self._curr_size + action = self.action[real_idx] + reward = self.reward[real_idx] + isOver = self.isOver[real_idx] + return obs, reward, action, isOver + + def __len__(self): + return self._curr_size + + def size(self): + return self._curr_size + + def _assign(self, pos, exp): + self.obs[pos] = exp.obs + self.reward[pos] = exp.reward + self.action[pos] = exp.action + self.isOver[pos] = exp.isOver + + def sample_batch(self, batch_size): + """sample a batch from replay memory for training + """ + batch_idx = np.random.randint( + self._curr_size - self.context_len - 1, size=batch_size) + batch_idx = (self._curr_pos + batch_idx) % self._curr_size + batch_exp = [self.sample(i) for i in batch_idx] + return self._process_batch(batch_exp) + + def _process_batch(self, batch_exp): + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') + reward = np.asarray([e[1] for e in batch_exp], dtype='float32') + action = np.asarray([e[2] for e in batch_exp], dtype='int8') + isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') + return [obs, action, reward, isOver] diff --git a/examples/DQN/rom_files/breakout.bin b/examples/DQN_variant/rom_files/breakout.bin similarity index 100% rename from examples/DQN/rom_files/breakout.bin rename to examples/DQN_variant/rom_files/breakout.bin diff --git a/examples/DQN/rom_files/pong.bin b/examples/DQN_variant/rom_files/pong.bin similarity index 100% rename from examples/DQN/rom_files/pong.bin rename to examples/DQN_variant/rom_files/pong.bin diff --git a/examples/DQN_variant/train.py 
b/examples/DQN_variant/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca16df135c346a08f87efc6a694e1e289b8192c --- /dev/null +++ b/examples/DQN_variant/train.py @@ -0,0 +1,169 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import gym +import paddle.fluid as fluid +import numpy as np +import os +import parl +from atari_agent import AtariAgent +from atari_model import AtariModel +from datetime import datetime +from replay_memory import ReplayMemory, Experience +from parl.utils import summary, logger +from tqdm import tqdm +from utils import get_player + +MEMORY_SIZE = 1e6 +MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 +IMAGE_SIZE = (84, 84) +CONTEXT_LEN = 4 +FRAME_SKIP = 4 +UPDATE_FREQ = 4 +GAMMA = 0.99 +LEARNING_RATE = 3e-4 + + +def run_train_episode(env, agent, rpm): + total_reward = 0 + all_cost = [] + obs = env.reset() + steps = 0 + while True: + steps += 1 + context = rpm.recent_obs() + context.append(obs) + context = np.stack(context, axis=0) + action = agent.sample(context) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + # start training + if rpm.size() > MEMORY_WARMUP_SIZE: + if steps % UPDATE_FREQ == 0: + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + args.batch_size) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) + all_cost.append(float(cost)) + total_reward += reward + obs = next_obs + if isOver: + break + if all_cost: + logger.info('[Train]total_reward: {}, mean_cost: {}'.format( + total_reward, np.mean(all_cost))) + return total_reward, steps, np.mean(all_cost) + + +def run_evaluate_episode(env, agent): + obs = env.reset() + total_reward = 0 + while True: + action = agent.predict(obs) + obs, reward, isOver, info = env.step(action) + total_reward += reward + if isOver: + break + return total_reward + + +def main(): + env = get_player( + args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP) + test_env = get_player( + args.rom, + image_size=IMAGE_SIZE, + frame_skip=FRAME_SKIP, + context_len=CONTEXT_LEN) + rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN) + act_dim = env.action_space.n + + model = AtariModel(act_dim, args.algo) + if args.algo == 'Double': + algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA) + elif args.algo in ['DQN', 'Dueling']: + algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA) + agent = AtariAgent( + algorithm, + act_dim=act_dim, + start_lr=LEARNING_RATE, + total_step=args.train_total_steps, + update_freq=UPDATE_FREQ) + + with tqdm( + total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar: + while rpm.size() < MEMORY_WARMUP_SIZE: + total_reward, steps, _ = run_train_episode(env, agent, rpm) + pbar.update(steps) + + # train + test_flag = 0 + pbar = 
tqdm(total=args.train_total_steps) + total_steps = 0 + max_reward = None + while total_steps < args.train_total_steps: + # start epoch + total_reward, steps, loss = run_train_episode(env, agent, rpm) + total_steps += steps + pbar.set_description('[train]exploration:{}'.format(agent.exploration)) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) # mean of total loss + summary.add_scalar('dqn/exploration', agent.exploration, total_steps) + pbar.update(steps) + + if total_steps // args.test_every_steps >= test_flag: + while total_steps // args.test_every_steps >= test_flag: + test_flag += 1 + pbar.write("testing") + eval_rewards = [] + for _ in tqdm(range(3), desc='eval agent'): + eval_reward = run_evaluate_episode(test_env, agent) + eval_rewards.append(eval_reward) + logger.info( + "eval_agent done, (steps, eval_reward): ({}, {})".format( + total_steps, np.mean(eval_rewards))) + eval_test = np.mean(eval_rewards) + summary.add_scalar('dqn/eval', eval_test, total_steps) + + pbar.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--rom', help='path of the rom of the atari game', required=True) + parser.add_argument( + '--batch_size', type=int, default=64, help='batch size for training') + parser.add_argument( + '--algo', + default='DQN', + help= + 'DQN/DDQN/Dueling, represent DQN, double DQN, and dueling DQN respectively', + ) + parser.add_argument( + '--train_total_steps', + type=int, + default=int(1e7), + help='maximum environmental steps of games') + parser.add_argument( + '--test_every_steps', + type=int, + default=100000, + help='the step interval between two consecutive evaluations') + + args = parser.parse_args() + main() diff --git a/examples/DQN/utils.py b/examples/DQN_variant/utils.py similarity index 100% rename from examples/DQN/utils.py rename to examples/DQN_variant/utils.py diff --git a/examples/ES/README.md b/examples/ES/README.md index 207ae2dafa68c5f7d2eb30f956355b07c1bd5d61..d868202753fa34c0799c8c58975c958aa1ffe001 100644 --- a/examples/ES/README.md +++ b/examples/ES/README.md @@ -34,7 +34,7 @@ Then we can start the distributed training by running: python train.py ``` -Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data. +Training result will be saved in `train_log` with training curve. 
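Since several training scripts in this change switch from `parl.utils.tensorboard` to `parl.utils.summary`, here is a minimal sketch of the renamed call site; the metric name and values are made up for illustration, but the `add_scalar(tag, value, step)` pattern is the one used throughout this diff:
```
from parl.utils import summary

# Same add_scalar(tag, value, step) interface the training scripts use after the rename.
for step, reward in [(1000, 12.0), (2000, 35.5), (3000, 87.0)]:
    summary.add_scalar('train/episode_reward', reward, step)
```
The README wording above drops the explicit tensorboard mention for the same reason.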
### Reference + [Ray](https://github.com/ray-project/ray) diff --git a/examples/ES/train.py b/examples/ES/train.py index be2c7d703eeba39931312491274f554ee9a76562..eadf26ea6e7d736abe45e7c08d25a5c7ae8dda2e 100644 --- a/examples/ES/train.py +++ b/examples/ES/train.py @@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from noise import SharedNoiseTable -from parl.utils import logger, tensorboard +from parl.utils import logger, summary from parl.utils.window_stat import WindowStat from six.moves import queue from actor import Actor @@ -202,7 +202,7 @@ class Learner(object): logger.info(metrics) for k, v in metrics.items(): if v is not None: - tensorboard.add_scalar(k, v, self.sample_total_steps) + summary.add_scalar(k, v, self.sample_total_steps) if __name__ == '__main__': diff --git a/examples/GA3C/train.py b/examples/GA3C/train.py index edc7f33344bc484fff640700dfd80bfc35987843..30f3a415b77cfa83d8868606498379b528ad1c31 100755 --- a/examples/GA3C/train.py +++ b/examples/GA3C/train.py @@ -24,7 +24,7 @@ from atari_model import AtariModel from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -313,7 +313,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/IMPALA/atari_agent.py b/examples/IMPALA/atari_agent.py index 98d4a4c4fd3ea611f60f2d8da850265025541b4b..0746f951f6920a70b0af87430af51879b635ada7 100755 --- a/examples/IMPALA/atari_agent.py +++ b/examples/IMPALA/atari_agent.py @@ -58,7 +58,10 @@ class AtariAgent(parl.Agent): lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) entropy_coeff = layers.data( - name='entropy_coeff', shape=[], dtype='float32') + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) self.learn_reader = fluid.layers.create_py_reader_by_data( capacity=32, diff --git a/examples/IMPALA/train.py b/examples/IMPALA/train.py index 8440ee78cec30f5de568ea277769fe1df938ed9f..9f2a3e65a7962d0aed103318c4a1979520004f8f 100755 --- a/examples/IMPALA/train.py +++ b/examples/IMPALA/train.py @@ -22,7 +22,7 @@ import parl from atari_model import AtariModel from atari_agent import AtariAgent from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, tensorboard, get_gpu_count +from parl.utils import logger, summary, get_gpu_count from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -121,7 +121,9 @@ class Learner(object): yield [ obs_np, actions_np, behaviour_logits_np, rewards_np, - dones_np, self.lr, self.entropy_coeff + dones_np, + np.float32(self.lr), + np.array([self.entropy_coeff], dtype='float32') ] def run_learn(self): @@ -219,7 +221,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': 
min_episode_rewards, @@ -242,7 +244,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/LiftSim_baseline/README.md b/examples/LiftSim_baseline/README.md deleted file mode 100644 index bfc903402d2665fb00e518ae1df77a1b8c88dae5..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# LiftSim基线 - -## 简介 - -基于PARL库实现Deep Q-network算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。 - -## 依赖库 - -- paddlepaddle >= 1.5.1 -- parl >= 1.1.2 -- rlschool >= 0.0.1 - -Windows版本仅支持Python3.5及以上版本。 - -## 运行 - -```python -python demo.py -``` - -## Benchmark - - - -Accumulated Reward:每3600 steps内reward的总和,可体现电梯调度在单位时间(模拟环境0.5小时)内的效率。 - -[rlschool]: https://github.com/PaddlePaddle/RLSchool -[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim diff --git a/examples/LiftSim_baseline/demo.py b/examples/LiftSim_baseline/demo.py deleted file mode 100644 index cecbf6c1a34d9060dac90abf6e4d648aa0f9a870..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/demo.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from rlschool import LiftSim -from wrapper import Wrapper, ActionWrapper, ObservationWrapper -from rl_benchmark.dispatcher import RL_dispatcher -import sys -import argparse - - -# run main program with args -def run_main(args): - - parser = argparse.ArgumentParser(description='demo configuration') - parser.add_argument( - '--iterations', - type=int, - default=100000000, - help='total number of iterations') - args = parser.parse_args(args) - print('iterations:', args.iterations) - - mansion_env = LiftSim() - # mansion_env.seed(1988) - - mansion_env = Wrapper(mansion_env) - mansion_env = ActionWrapper(mansion_env) - mansion_env = ObservationWrapper(mansion_env) - - dispatcher = RL_dispatcher(mansion_env, args.iterations) - dispatcher.run_episode() - - return 0 - - -if __name__ == "__main__": - run_main(sys.argv[1:]) diff --git a/examples/LiftSim_baseline/rl_10.png b/examples/LiftSim_baseline/rl_10.png deleted file mode 100644 index b8f9eef1d10c0a617d8dd462f1d66e5d26484622..0000000000000000000000000000000000000000 Binary files a/examples/LiftSim_baseline/rl_10.png and /dev/null differ diff --git a/examples/LiftSim_baseline/rl_benchmark/__init__.py b/examples/LiftSim_baseline/rl_benchmark/__init__.py deleted file mode 100644 index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/examples/LiftSim_baseline/rl_benchmark/agent.py b/examples/LiftSim_baseline/rl_benchmark/agent.py deleted file mode 100644 index 846bcf318090916141a4216abb3a889d2548d2ff..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/agent.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -import numpy.random as random -import paddle.fluid as fluid -from parl import layers -from parl import Agent -from parl.utils import get_gpu_count, machine_info - - -class ElevatorAgent(Agent): - def __init__(self, algorithm, obs_dim, action_dim): - self._action_dim = action_dim - self._obs_dim = obs_dim - self._update_target_steps = 1000 - - self._global_step = 0 - self.exploration_ratio = 0.9 - self.exploration_decre = 1e-7 - self.exploration_min = 0.1 - super(ElevatorAgent, self).__init__(algorithm) - - use_cuda = machine_info.is_gpu_available() - if self.gpu_id >= 0: - assert get_gpu_count() == 1, 'Only support training in single GPU,\ - Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .' 
- - else: - os.environ['CPU_NUM'] = str(1) - - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 1 - exec_strategy.num_iteration_per_drop_scope = 10 - build_strategy = fluid.BuildStrategy() - build_strategy.remove_unnecessary_lock = False - - self.learn_pe = fluid.ParallelExecutor( - use_cuda=use_cuda, - main_program=self.learn_program, - build_strategy=build_strategy, - exec_strategy=exec_strategy, - ) - - def build_program(self): - self.pred_program = fluid.Program() - self.learn_program = fluid.Program() - - with fluid.program_guard(self.pred_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - self._value = self.alg.define_predict(obs) - - with fluid.program_guard(self.learn_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - action = layers.data(name='act', shape=[1], dtype='int32') - reward = layers.data(name='reward', shape=[], dtype='float32') - next_obs = layers.data( - name='next_obs', shape=[self._obs_dim], dtype='float32') - terminal = layers.data(name='terminal', shape=[], dtype='bool') - self._cost = self.alg.define_learn(obs, action, reward, next_obs, - terminal) - - def sample(self, obs): - if self.exploration_ratio > self.exploration_min: - self.exploration_ratio -= self.exploration_decre - q_values = self.predict(obs) - - ret_actions = list() - for i in range(len(q_values)): # number of elevators - if (random.random() < self.exploration_ratio): - action = random.randint(0, self._action_dim) - else: - action = np.argmax(q_values[i]) - ret_actions.append(int(action)) - return ret_actions - - def predict(self, obs): - pred_Q = self.fluid_executor.run( - self.pred_program, - feed={'obs': obs.astype('float32')}, - fetch_list=[self._value]) - return pred_Q[0] - - def learn(self, obs, act, reward, next_obs, terminal): - self._global_step += 1 - if self._global_step % self._update_target_steps == 0: - self.alg.sync_target(self.gpu_id) - - feed = { - 'obs': obs.astype('float32'), - 'act': act.astype('int32'), - 'reward': reward, - 'next_obs': next_obs.astype('float32'), - 'terminal': terminal - } - cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0] - return cost diff --git a/examples/LiftSim_baseline/rl_benchmark/dispatcher.py b/examples/LiftSim_baseline/rl_benchmark/dispatcher.py deleted file mode 100644 index a2561ee6d3d9f2c6b8f39c886ef4a1f2f01fb1ea..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/dispatcher.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import parl -import numpy as np -import numpy.random as random - -from copy import deepcopy -from collections import deque - -from rlschool import EPSILON, HUGE -from rl_benchmark.model import RLDispatcherModel -from rl_benchmark.agent import ElevatorAgent -from parl.algorithms import DQN -from parl.utils import ReplayMemory - -MEMORY_SIZE = 1000000 -BATCH_SIZE = 64 - - -class RL_dispatcher(): - """ - An RL benchmark for elevator system - """ - - def __init__(self, env, max_episode): - self.env = env - - self._obs_dim = env.observation_space - self._act_dim = env.action_space - self._global_step = 0 - self.max_episode = max_episode - self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1) - self._model = RLDispatcherModel(self._act_dim) - hyperparas = { - 'action_dim': self._act_dim, - 'lr': 5.0e-4, - 'gamma': 0.998 - } - - self._algorithm = DQN(self._model, hyperparas) - self._agent = ElevatorAgent(self._algorithm, self._obs_dim, - self._act_dim) - self._warm_up_size = 2000 - self._statistic_freq = 1000 - self._loss_queue = deque() - - def run_episode(self): - self.env.reset() - acc_reward = 0.0 - - while self._global_step < self.max_episode: - # self.env.render() - state = self.env.state - action = self._agent.sample(state) - state_, reward, done, info = self.env.step(action) - output_info = self.learn_step(state, action, reward) - acc_reward += reward - if (isinstance(output_info, dict) and len(output_info) > 0): - self.env.log_notice("%s", output_info) - if (self._global_step % 3600 == 0): - self.env.log_notice( - "Accumulated Reward: %f, Mansion Status: %s", acc_reward, - self.env.statistics) - acc_reward = 0.0 - - self._agent.save('./model.ckpt') - - def learn_step(self, state, action, r): - self._global_step += 1 - if (self._global_step > self._warm_up_size): - for i in range(self.env.elevator_num): - self._rpm.append(self._last_observation_array[i], - self._last_action[i], self._last_reward, - deepcopy(state[i]), False) - self._last_observation_array = deepcopy(state) - self._last_action = deepcopy(action) - self._last_reward = r - - ret_dict = {} - if self._rpm.size() > self._warm_up_size: - batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \ - self._rpm.sample_batch(BATCH_SIZE) - cost = self._agent.learn(batch_obs, batch_action, batch_reward, - batch_next_obs, batch_terminal) - self._loss_queue.appendleft(cost) - if (len(self._loss_queue) > self._statistic_freq): - self._loss_queue.pop() - if (self._global_step % self._statistic_freq == 0): - ret_dict["Temporal Difference Error(Average)"] = \ - float(sum(self._loss_queue)) / float(len(self._loss_queue)) - - return ret_dict diff --git a/examples/LiftSim_baseline/wrapper.py b/examples/LiftSim_baseline/wrapper.py deleted file mode 100644 index 55d525deaeecb76df4f2ba9183ed5ea6c119e5d8..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/wrapper.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# wrapper part modified from -# https://github.com/openai/gym/blob/master/gym/core.py - -from rlschool import LiftSim -from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing -from wrapper_utils import action_idx_to_action - - -class Wrapper(LiftSim): - def __init__(self, env): - self.env = env - self._mansion = env._mansion - self.mansion_attr = self._mansion.attribute - self.elevator_num = self.mansion_attr.ElevatorNumber - self.observation_space = obs_dim(self.mansion_attr) - self.action_space = act_dim(self.mansion_attr) - self.viewer = env.viewer - - def __getattr__(self, name): - if name.startswith('_'): - raise AttributeError( - "attempted to get missing private attribute '{}'".format(name)) - return getattr(self.env, name) - - def seed(self, seed=None): - return self.env.seed(seed) - - def step(self, action): - return self.env.step(action) - - def reset(self): - return self.env.reset() - - def render(self): - return self.env.render() - - def close(self): - return self.env.close() - - -class RewardWrapper(Wrapper): - pass - - -class ActionWrapper(Wrapper): - def reset(self): - return self.env.reset() - - def step(self, action): - act = [] - for a in action: - act.extend(self.action(a, self.action_space)) - return self.env.step(act) - - def action(self, action, action_space): - return action_idx_to_action(action, action_space) - - -class ObservationWrapper(Wrapper): - def reset(self): - self.env.reset() - return self.observation(self._mansion.state) - - def step(self, action): - observation, reward, done, info = self.env.step(action) - return (self.observation(observation), reward, done, info) - - def observation(self, observation): - return mansion_state_preprocessing(observation) - - @property - def state(self): - return self.observation(self._mansion.state) diff --git a/examples/LiftSim_baseline/wrapper_utils.py b/examples/LiftSim_baseline/wrapper_utils.py deleted file mode 100644 index 45afcefbf9ebdbacc2841bd54b1756a1213be5bf..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/wrapper_utils.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import random -import numpy as np -from rlschool import ElevatorState, ElevatorAction -from rlschool import MansionAttribute, MansionState -from rlschool import EPSILON, HUGE -from rlschool import MansionConfig -from rlschool import MansionManager - - -def discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimension 1-hot representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - Args: - value: the value that needs to be discretized into 1-hot format - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 1, active_pos) - active_pos = max(0, active_pos) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - return ret_array - - -def linear_discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimensional representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0 - if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8] - Args: - value: the value that needs to be discretized - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 2, active_pos) - active_pos = max(0, active_pos) - anchor_pt = active_pos * delta + min_val - if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta): - anchor_pt -= delta - active_pos -= 1 - weight = (value - anchor_pt) / delta - weight = min(1.0, max(0.0, weight)) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - weight - ret_array[active_pos + 1] = weight - return ret_array - - -def ele_state_preprocessing(ele_state): - """Process elevator state, make it usable for network - Args: - ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - ele_feature: list of elevator state - """ - ele_feature = [] - - # add floor information - ele_feature.extend( - linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0, - ele_state.MaximumFloor)) - - # add velocity information - ele_feature.extend( - linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed, - ele_state.MaximumSpeed)) - - # add door information - ele_feature.append(ele_state.DoorState) - ele_feature.append(float(ele_state.DoorIsOpening)) - ele_feature.append(float(ele_state.DoorIsClosing)) - - # add direction information - ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1)) - - # add load weight information - ele_feature.extend( - linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0, - 1.0)) - - # add other information - target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)] - for target_floor in ele_state.ReservedTargetFloors: - target_floor_binaries[target_floor - 1] = 1.0 - ele_feature.extend(target_floor_binaries) - - dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)] - 
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0 - ele_feature.extend(dispatch_floor_binaries) - ele_feature.append(ele_state.DispatchTargetDirection) - - return ele_feature - - -def obs_dim(mansion_attr): - """Calculate the observation dimension - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - observation dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - ele_dim = mansion_attr.NumberOfFloor * 3 + 34 - obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \ - mansion_attr.NumberOfFloor * 2 - return obs_dim - - -def act_dim(mansion_attr): - """Calculate the action dimension, which is number of floor times 2 plus 2. - The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0, - the original dispatch_target does not change if dispatch_target is -1. See implementation in - method action_idx_to_action below. - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - action dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - return mansion_attr.NumberOfFloor * 2 + 2 - - -def mansion_state_preprocessing(mansion_state): - """Process mansion_state to make it usable for networks, convert it into a numpy array - Args: - mansion_state: namedtuple of mansion state, - defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - the converted numpy array - """ - ele_features = list() - for ele_state in mansion_state.ElevatorStates: - ele_features.append(ele_state_preprocessing(ele_state)) - max_floor = ele_state.MaximumFloor - - target_floor_binaries_up = [0.0 for i in range(max_floor)] - target_floor_binaries_down = [0.0 for i in range(max_floor)] - for floor in mansion_state.RequiringUpwardFloors: - target_floor_binaries_up[floor - 1] = 1.0 - for floor in mansion_state.RequiringDownwardFloors: - target_floor_binaries_down[floor - 1] = 1.0 - target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down - - idx = 0 - man_features = list() - for idx in range(len(mansion_state.ElevatorStates)): - elevator_id_vec = discretize(idx + 1, - len(mansion_state.ElevatorStates), 1, - len(mansion_state.ElevatorStates)) - idx_array = list(range(len(mansion_state.ElevatorStates))) - idx_array.remove(idx) - # random.shuffle(idx_array) - man_features.append(ele_features[idx]) - for left_idx in idx_array: - man_features[idx] = man_features[idx] + ele_features[left_idx] - man_features[idx] = man_features[idx] + \ - elevator_id_vec + target_floor_binaries - return np.asarray(man_features, dtype='float32') - - -def action_idx_to_action(action_idx, act_dim): - """Convert action_inx to action - Args: - action_idx: the index needed to be converted - act_dim: action dimension - Returns: - the converted namedtuple - """ - assert isinstance(action_idx, int) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action_idx == realdim): - return ElevatorAction(0, 1) - elif (action_idx == realdim + 1): - return ElevatorAction(-1, 1) - action = action_idx - if (action_idx < realdim / 2): - direction = 1 - action += 1 - else: - direction = -1 - action -= int(realdim / 2) - action += 1 - return [action, direction] - - -def action_to_action_idx(action, act_dim): - """Convert action to number according to act_dim. 
- Args: - action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py - act_dim: action dimension - Returns: - action_idx: the result index - """ - assert isinstance(action, ElevatorAction) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action.TargetFloor == 0): - return realdim - elif (action.TargetFloor < 0): - return realdim + 1 - action_idx = 0 - if (action.DirectionIndicator < 0): - action_idx += int(realdim / 2) - action_idx += action.TargetFloor - 1 - return action_idx diff --git a/examples/MADDPG/README.md b/examples/MADDPG/README.md index 55d191474b62f5099d91da51bd80443abd6b87d8..0bf3a599e76a3ecc127385baa0f4d81e47e3662b 100644 --- a/examples/MADDPG/README.md +++ b/examples/MADDPG/README.md @@ -98,7 +98,7 @@ simple_world_comm
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) + [parl](https://github.com/PaddlePaddle/PARL) + [multiagent-particle-envs](https://github.com/openai/multiagent-particle-envs) -+ gym ++ gym==0.10.5 ### Start Training: ``` diff --git a/examples/MADDPG/train.py b/examples/MADDPG/train.py index d0e20dcdb4fd35638432fb1666b76b30c2a388d8..8454a73ee209707c65340897ce9b090d482c6751 100644 --- a/examples/MADDPG/train.py +++ b/examples/MADDPG/train.py @@ -20,7 +20,7 @@ from simple_model import MAModel from simple_agent import MAAgent import parl from parl.env.multiagent_simple_env import MAenv -from parl.utils import logger, tensorboard +from parl.utils import logger, summary def run_episode(env, agents): @@ -62,8 +62,8 @@ def run_episode(env, agents): # learn policy for i, agent in enumerate(agents): critic_loss = agent.learn(agents) - tensorboard.add_scalar('critic_loss_%d' % i, critic_loss, - agent.global_train_step) + summary.add_scalar('critic_loss_%d' % i, critic_loss, + agent.global_train_step) return total_reward, agents_reward, steps @@ -155,12 +155,12 @@ def train_agent(): format(total_steps, total_episodes, mean_episode_reward, use_time)) t_start = time.time() - tensorboard.add_scalar('mean_episode_reward/episode', - mean_episode_reward, total_episodes) - tensorboard.add_scalar('mean_episode_reward/steps', - mean_episode_reward, total_steps) - tensorboard.add_scalar('use_time/1000episode', use_time, - total_episodes) + summary.add_scalar('mean_episode_reward/episode', + mean_episode_reward, total_episodes) + summary.add_scalar('mean_episode_reward/steps', + mean_episode_reward, total_steps) + summary.add_scalar('use_time/1000episode', use_time, + total_episodes) # save model if not args.restore: diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py index e784dae00f9ffdc5528a4c7dafda2916e5d4c456..e3a8066d79128ed9e969bb7d4c1c8cce3bee3775 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py index b37fb369a15c8a28a3911dbb9a864cf28d1da8b7..cf14f1e0306c69c8f134cf6c81c279ac982b52d0 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info @@ -97,7 +97,7 @@ class Learner(object): # add lock between training and predicting self.model_lock = threading.Lock() - # add lock when appending data to rpm or writing scalars to tensorboard + # add lock when appending data to rpm or writing scalars to summary self.memory_lock = threading.Lock() 
self.ready_actor_queue = queue.Queue() @@ -246,24 +246,24 @@ class Learner(object): episode_env_reward) if self.env_reward_stat.count > 500: - tensorboard.add_scalar('recent_env_reward', - self.env_reward_stat.mean, - self.total_steps) - tensorboard.add_scalar('recent_shaping_reward', - self.shaping_reward_stat.mean, - self.total_steps) - if self.critic_loss_stat.count > 500: - tensorboard.add_scalar('recent_critic_loss', - self.critic_loss_stat.mean, - self.total_steps) - tensorboard.add_scalar('episode_length', n, self.total_steps) - tensorboard.add_scalar('max_env_reward', self.max_env_reward, + summary.add_scalar('recent_env_reward', + self.env_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('ready_actor_num', - self.ready_actor_queue.qsize(), + summary.add_scalar('recent_shaping_reward', + self.shaping_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('episode_time', episode_time, + if self.critic_loss_stat.count > 500: + summary.add_scalar('recent_critic_loss', + self.critic_loss_stat.mean, self.total_steps) + summary.add_scalar('episode_length', n, self.total_steps) + summary.add_scalar('max_env_reward', self.max_env_reward, + self.total_steps) + summary.add_scalar('ready_actor_num', + self.ready_actor_queue.qsize(), + self.total_steps) + summary.add_scalar('episode_time', episode_time, + self.total_steps) self.noiselevel = self.noiselevel * NOISE_DECAY diff --git a/examples/SAC/train.py b/examples/SAC/train.py index a88260245880a39738f931573dd0b183487722df..3e2b7140e9ab5694c38bd86ded04a5e977da9d3a 100644 --- a/examples/SAC/train.py +++ b/examples/SAC/train.py @@ -21,7 +21,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import ActorModel, CriticModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory ACTOR_LR = 1e-3 CRITIC_LR = 1e-3 @@ -111,8 +111,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -120,8 +119,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/TD3/train.py b/examples/TD3/train.py index 4cb74d9c01ab73dcb8cb20385b36262cb7c4aeba..8115a41ba1129e00dda1f2a7ca1b0ad3b9d64c71 100644 --- a/examples/TD3/train.py +++ b/examples/TD3/train.py @@ -19,7 +19,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory MAX_EPISODES = 5000 ACTOR_LR = 3e-4 @@ -117,8 +117,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if 
total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -126,8 +125,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/offline-Q-learning/atari.py b/examples/offline-Q-learning/atari.py index 11909eba8307ef781337b20ca2fe200ed967cc45..e0e1b3cc097be221483d0a8712951b9d38f5da54 120000 --- a/examples/offline-Q-learning/atari.py +++ b/examples/offline-Q-learning/atari.py @@ -1 +1 @@ -../DQN/atari.py \ No newline at end of file +../DQN_variant/atari.py \ No newline at end of file diff --git a/examples/offline-Q-learning/atari_wrapper.py b/examples/offline-Q-learning/atari_wrapper.py index e58186a870b13dc7fff25c52cbdd1d009a18f4ac..2904fb39b7934d104209d0085ca814d5c132fe90 120000 --- a/examples/offline-Q-learning/atari_wrapper.py +++ b/examples/offline-Q-learning/atari_wrapper.py @@ -1 +1 @@ -../DQN/atari_wrapper.py \ No newline at end of file +../DQN_variant/atari_wrapper.py \ No newline at end of file diff --git a/examples/offline-Q-learning/dqn.py b/examples/offline-Q-learning/dqn.py index feedf7d21797c052ab716412ceb9cb7c2db78350..d761d2f75f27b3d26e1de046b86400e35aebcbf1 100644 --- a/examples/offline-Q-learning/dqn.py +++ b/examples/offline-Q-learning/dqn.py @@ -19,23 +19,16 @@ import copy import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['DQN'] class DQN(Algorithm): - def __init__(self, - model, - hyperparas=None, - act_dim=None, - gamma=None, - lr=None): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ DQN algorithm Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. 
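With the deprecated `hyperparas` dict gone, this offline-Q-learning copy of `DQN` is configured only through explicit keyword arguments (all three are now asserted). A minimal construction sketch, assuming it is run from `examples/offline-Q-learning/`; the `QModel` class here is a placeholder invented for illustration, not part of this patch:

```python
import parl
from parl import layers
from dqn import DQN   # the refactored class above


class QModel(parl.Model):          # placeholder Q-network, illustration only
    def __init__(self, act_dim):
        self.fc1 = layers.fc(size=128, act='relu')
        self.fc2 = layers.fc(size=act_dim, act=None)

    def value(self, obs):
        return self.fc2(self.fc1(obs))


# act_dim, gamma and lr are now mandatory keyword arguments.
algorithm = DQN(QModel(act_dim=4), act_dim=4, gamma=0.99, lr=1e-3)
```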
@@ -43,20 +36,12 @@ class DQN(Algorithm): self.model = model self.target_model = copy.deepcopy(model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['action_dim'] - self.gamma = hyperparas['gamma'] - else: - assert isinstance(act_dim, int) - assert isinstance(gamma, float) - assert isinstance(lr, float) - self.act_dim = act_dim - self.gamma = gamma - self.lr = lr + assert isinstance(act_dim, int) + assert isinstance(gamma, float) + assert isinstance(lr, float) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr def predict(self, obs): """ use value model self.model to predict the action value @@ -100,12 +85,7 @@ class DQN(Algorithm): cost = layers.reduce_mean(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py index 3416f8cd6708d75ce0884584a43b66c674d8c699..d7da430e83de46be82a935bc01ce35ca6bd83c6e 100644 --- a/examples/offline-Q-learning/parallel_run.py +++ b/examples/offline-Q-learning/parallel_run.py @@ -22,7 +22,7 @@ from tqdm import tqdm import parl import paddle.fluid as fluid from parl.utils import get_gpu_count -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from dqn import DQN # slight changes from parl.algorithms.DQN from atari_agent import AtariAgent @@ -45,21 +45,21 @@ gpu_num = get_gpu_count() def run_train_step(agent, rpm): for step in range(args.train_total_steps): # use the first 80% data to train - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size * gpu_num) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) if step % 100 == 0: # use the last 20% data to evaluate - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - eval_cost = agent.supervised_eval(batch_state, batch_action, - batch_reward, batch_next_state, + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + eval_cost = agent.supervised_eval(batch_obs, batch_action, + batch_reward, batch_next_obs, batch_isOver) logger.info( "train step {}, train costs are {}, eval cost is {}.".format( @@ -67,17 +67,17 @@ def run_train_step(agent, rpm): def collect_exp(env, rpm, agent): - state = env.reset() + obs = env.reset() # collect data to fulfill replay memory for i in tqdm(range(MEMORY_SIZE)): - 
context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) - state = next_state + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + obs = next_obs def main(): diff --git a/examples/offline-Q-learning/replay_memory.py b/examples/offline-Q-learning/replay_memory.py index 2296ea906ee47a53f697777b6885dad6365460e8..94a43c25d32ac9c9107dfa90a33d1280a5bebd16 100644 --- a/examples/offline-Q-learning/replay_memory.py +++ b/examples/offline-Q-learning/replay_memory.py @@ -18,18 +18,18 @@ import os from collections import deque, namedtuple from parl.utils import logger -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): def __init__(self, max_size, - state_shape, + obs_shape, context_len, load_file=False, file_path=None): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) self.file_path = file_path @@ -38,8 +38,7 @@ class ReplayMemory(object): self.load_memory() logger.info("memory size is {}".format(self._curr_size)) else: - self.state = np.zeros( - (self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -62,42 +61,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def 
__len__(self): return self._curr_size @@ -106,7 +104,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -129,15 +127,15 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] def save_memory(self): save_data = [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] np.savez(self.file_path, *save_data) @@ -145,7 +143,7 @@ class ReplayMemory(object): def load_memory(self): container = np.load(self.file_path, allow_pickle=True) [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] = [container[key] for key in container] self._curr_size = self._curr_size.astype(int) diff --git a/examples/offline-Q-learning/rom_files b/examples/offline-Q-learning/rom_files index 966a8940cbb2d928de9f816d41efada9aa3c9b6e..c1c50b9a99991f7f5dd34d7f243e999a636ba926 120000 --- a/examples/offline-Q-learning/rom_files +++ b/examples/offline-Q-learning/rom_files @@ -1 +1 @@ -../DQN/rom_files/ \ No newline at end of file +../DQN_variant/rom_files \ No newline at end of file diff --git a/examples/offline-Q-learning/utils.py b/examples/offline-Q-learning/utils.py index 721338d52451903eb1599e2396c9699a410a188d..04c590ec46f98b6cfa6d1ec833112730900fb840 120000 --- a/examples/offline-Q-learning/utils.py +++ b/examples/offline-Q-learning/utils.py @@ -1 +1 @@ -../DQN/utils.py \ No newline at end of file +../DQN_variant/utils.py \ No newline at end of file diff --git a/examples/others/deepes.py b/examples/others/deepes.py new file mode 100644 index 0000000000000000000000000000000000000000..07be65326274a27892f3e0eff8067081e101b11f --- /dev/null +++ b/examples/others/deepes.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
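The new `examples/others/deepes.py` below trains a linear CartPole policy with a simple evolution-strategies-style update: each iteration it evaluates several Gaussian perturbations of the flattened weights, min-max normalizes the episode returns, and nudges the weights along the reward-weighted average of the noise. A toy numpy sketch of that estimate (the sizes, seed and returns are made up purely for illustration):

```python
import numpy as np

np.random.seed(0)
noises = np.random.randn(3, 2)        # one Gaussian perturbation per rollout (3 rollouts, 2 weights)
returns = np.array([1.0, 0.0, 0.5])   # episode returns of the perturbed policies
normalized = (returns - returns.min()) / (returns.max() - returns.min()) - 0.5
gradient = np.dot(normalized, noises) / normalized.size   # reward-weighted average of the noise
theta = np.zeros(2)
theta += 0.1 * gradient               # gradient-ascent step on the flat weight vector
print(theta)
```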
+import gym +import numpy as np + + +class CartpoleAgent(object): + def __init__(self, obs_dim, act_dim, learning_rate): + self.learning_rate = learning_rate + # init weights + self.w = np.random.random((act_dim, obs_dim)) * 0.1 + self.b = np.zeros(act_dim) + self.weights_total_size = self.w.size + self.b.size + + def predict(self, obs): + out = np.dot(self.w, obs) + self.b + action = np.argmax(out) + return action + + def learn(self, rewards, noises): + gradient = np.dot( + np.asarray(rewards, dtype=np.float32), + np.asarray(noises, dtype=np.float32)) + gradient /= rewards.size + + flat_weights = self.get_flat_weights() + # Compute the new weights. + new_weights = flat_weights + self.learning_rate * gradient + self.set_flat_weights(new_weights) + + def set_flat_weights(self, flat_weights): + self.w = flat_weights[:self.w.size].reshape(self.w.shape) + self.b = flat_weights[self.w.size:] + + def get_flat_weights(self): + flat_weights = np.concatenate(([self.w.ravel(), self.b]), axis=0) + return flat_weights + + +def evaluate(env, agent): + ep_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) + obs, reward, done, _ = env.step(action) + ep_reward += reward + if done: + break + return ep_reward + + +def reward_normalize(reward): + reward = np.asarray(reward) + max_r = np.max(reward) + min_r = np.min(reward) + if max_r == min_r: + reward = np.zeros(reward.shape) + else: + reward = (reward - min_r) / (max_r - min_r) + reward -= 0.5 + return reward + + +if __name__ == '__main__': + env = gym.make('CartPole-v0') + agent = CartpoleAgent(obs_dim=4, act_dim=2, learning_rate=0.1) + + for epcho in range(100): + rewards = [] + noises = [] + lastest_flat_weights = agent.get_flat_weights() + + for episode in range(10): + noise = np.random.randn(agent.weights_total_size) + perturbation = noise * 0.05 + + agent.set_flat_weights(lastest_flat_weights + perturbation) + ep_reward = evaluate(env, agent) + + noises.append(noise) + rewards.append(ep_reward) + + normalized_rewards = reward_normalize(rewards) + agent.set_flat_weights(lastest_flat_weights) + agent.learn(normalized_rewards, noises) + # evaluate + if (epcho % 10) == 0: + ep_reward = evaluate(env, agent) + print('Epcho {}, Test reward {}'.format(epcho, ep_reward)) diff --git a/examples/tutorials/README.md b/examples/tutorials/README.md new file mode 100644 index 0000000000000000000000000000000000000000..676fbc5f2d9cc2a909dd9152b2ab0cc8549e9b82 --- /dev/null +++ b/examples/tutorials/README.md @@ -0,0 +1,37 @@ +## 《PARL强化学习入门实践》课程示例 + +针对强化学习初学者,PARL提供了[入门课程](https://aistudio.baidu.com/aistudio/course/introduce/1335),展示最基础的5个强化学习算法代码示例。 + +## 课程大纲 ++ 一、强化学习(RL)初印象 + + RL概述、入门路线 + + 实践:环境搭建([lesson1](lesson1/gridworld.py) 的代码提供了格子环境世界的渲染封装) ++ 二、基于表格型方法求解RL + + MDP、状态价值、Q表格 + + 实践: [Sarsa](lesson2/sarsa)、[Q-learning](lesson2/q_learning) ++ 三、基于神经网络方法求解RL + + 函数逼近方法 + + 实践:[DQN](lesson3/dqn) ++ 四、基于策略梯度求解RL + + 策略近似、策略梯度 + + 实践:[Policy Gradient](lesson4/policy_gradient) ++ 五、连续动作空间上求解RL + + 实战:[DDPG](lesson5/ddpg) + + + +## 使用说明 + +### 安装依赖 + ++ [paddlepaddle==1.6.3](https://github.com/PaddlePaddle/Paddle) ++ [parl==1.3.1](https://github.com/PaddlePaddle/PARL) ++ gym + + +### 运行示例 + +进入每个示例对应的代码文件夹中,运行 +``` +python train.py +``` diff --git a/examples/tutorials/lesson1/gridworld.py b/examples/tutorials/lesson1/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..7af6e6aebadf04941e3ee744af35244dbedb31ad --- /dev/null +++ b/examples/tutorials/lesson1/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + 
self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/q_learning/agent.py b/examples/tutorials/lesson2/q_learning/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..7d72f9cae03c935431f58043fdb505cec526cb6b --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/agent.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
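The `QLearningAgent` added below keeps a table `Q[obs, act]` and updates it off-policy: the bootstrap term always uses the greedy value of the next state, regardless of which action the behaviour policy actually takes next, and the target reduces to the plain reward once `done` is true:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \right]$$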
+ +# -*- coding: utf-8 -*- + +import numpy as np + + +class QLearningAgent(object): + def __init__(self, + obs_n, + act_n, + learning_rate=0.01, + gamma=0.9, + e_greed=0.1): + self.act_n = act_n # 动作维度,有几个动作可选 + self.lr = learning_rate # 学习率 + self.gamma = gamma # reward的衰减率 + self.epsilon = e_greed # 按一定概率随机选动作 + self.Q = np.zeros((obs_n, act_n)) + + # 根据输入观察值,采样输出的动作值,带探索 + def sample(self, obs): + if np.random.uniform(0, 1) < (1.0 - self.epsilon): #根据table的Q值选动作 + action = self.predict(obs) + else: + action = np.random.choice(self.act_n) #有一定概率随机探索选取一个动作 + return action + + # 根据输入观察值,预测输出的动作值 + def predict(self, obs): + Q_list = self.Q[obs, :] + maxQ = np.max(Q_list) + action_list = np.where(Q_list == maxQ)[0] # maxQ可能对应多个action + action = np.random.choice(action_list) + return action + + # 学习方法,也就是更新Q-table的方法 + def learn(self, obs, action, reward, next_obs, done): + """ off-policy + obs: 交互前的obs, s_t + action: 本次交互选择的action, a_t + reward: 本次动作获得的奖励r + next_obs: 本次交互后的obs, s_t+1 + done: episode是否结束 + """ + predict_Q = self.Q[obs, action] + if done: + target_Q = reward # 没有下一个状态了 + else: + target_Q = reward + self.gamma * np.max( + self.Q[next_obs, :]) # Q-learning + self.Q[obs, action] += self.lr * (target_Q - predict_Q) # 修正q + + # 把 Q表格 的数据保存到文件中 + def save(self): + npy_file = './q_table.npy' + np.save(npy_file, self.Q) + print(npy_file + ' saved.') + + # 从文件中读取数据到 Q表格 + def restore(self, npy_file='./q_table.npy'): + self.Q = np.load(npy_file) + print(npy_file + ' loaded.') diff --git a/examples/tutorials/lesson2/q_learning/gridworld.py b/examples/tutorials/lesson2/q_learning/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48 --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
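Because `self.Q` starts as all zeros, a single `learn()` call is easy to verify by hand; a minimal check, assuming it is run from `examples/tutorials/lesson2/q_learning/` so that `agent.py` is importable:

```python
from agent import QLearningAgent

agent = QLearningAgent(obs_n=4, act_n=2, learning_rate=0.1, gamma=0.9)
agent.learn(obs=0, action=1, reward=1.0, next_obs=2, done=False)
# target_Q = 1.0 + 0.9 * max(Q[2, :]) = 1.0, so Q[0, 1] moves by 0.1 * (1.0 - 0.0)
print(agent.Q[0, 1])   # 0.1
```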
+ +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + 
self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + # env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/q_learning/train.py b/examples/tutorials/lesson2/q_learning/train.py new file mode 100644 index 0000000000000000000000000000000000000000..2e780605117e873091fd8e2ac9ece9a41645b51a --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/train.py @@ -0,0 +1,90 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -*- coding: utf-8 -*- + +import gym +from gridworld import CliffWalkingWapper, FrozenLakeWapper +from agent import QLearningAgent +import time + + +def run_episode(env, agent, render=False): + total_steps = 0 # 记录每个episode走了多少step + total_reward = 0 + + obs = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) + + while True: + action = agent.sample(obs) # 根据算法选择一个动作 + next_obs, reward, done, _ = env.step(action) # 与环境进行一个交互 + # 训练 Q-learning算法 + agent.learn(obs, action, reward, next_obs, done) + + obs = next_obs # 存储上一个观察值 + total_reward += reward + total_steps += 1 # 计算step数 + if render: + env.render() #渲染新的一帧图形 + if done: + break + return total_reward, total_steps + + +def test_episode(env, agent): + total_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) # greedy + next_obs, reward, done, _ = env.step(action) + total_reward += reward + obs = next_obs + time.sleep(0.5) + env.render() + if done: + print('test reward = %.1f' % (total_reward)) + break + + +def main(): + # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up + # env = FrozenLakeWapper(env) + + env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + env = CliffWalkingWapper(env) + + agent = QLearningAgent( + obs_n=env.observation_space.n, + act_n=env.action_space.n, + learning_rate=0.1, + gamma=0.9, + e_greed=0.1) + + is_render = False + for episode in range(500): + ep_reward, ep_steps = run_episode(env, agent, is_render) + print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, + ep_reward)) + + # 每隔20个episode渲染一下看看效果 + if episode % 20 == 0: + is_render = True + else: + is_render = False + # 训练结束,查看算法效果 + test_episode(env, agent) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/lesson2/sarsa/agent.py b/examples/tutorials/lesson2/sarsa/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..964230c88bef164dc8f22d5a3eb5e99f242097d3 --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/agent.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
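The `SarsaAgent` below is identical to the Q-learning agent except for the target in `learn()`: being on-policy, it bootstraps with the Q-value of the action that will actually be executed next (`next_action`) instead of the greedy maximum:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_t + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \right]$$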
+ +# -*- coding: utf-8 -*- + +import numpy as np + + +class SarsaAgent(object): + def __init__(self, + obs_n, + act_n, + learning_rate=0.01, + gamma=0.9, + e_greed=0.1): + self.act_n = act_n # 动作维度,有几个动作可选 + self.lr = learning_rate # 学习率 + self.gamma = gamma # reward的衰减率 + self.epsilon = e_greed # 按一定概率随机选动作 + self.Q = np.zeros((obs_n, act_n)) + + # 根据输入观察值,采样输出的动作值,带探索 + def sample(self, obs): + if np.random.uniform(0, 1) < (1.0 - self.epsilon): #根据table的Q值选动作 + action = self.predict(obs) + else: + action = np.random.choice(self.act_n) #有一定概率随机探索选取一个动作 + return action + + # 根据输入观察值,预测输出的动作值 + def predict(self, obs): + Q_list = self.Q[obs, :] + maxQ = np.max(Q_list) + action_list = np.where(Q_list == maxQ)[0] # maxQ可能对应多个action + action = np.random.choice(action_list) + return action + + # 学习方法,也就是更新Q-table的方法 + def learn(self, obs, action, reward, next_obs, next_action, done): + """ on-policy + obs: 交互前的obs, s_t + action: 本次交互选择的action, a_t + reward: 本次动作获得的奖励r + next_obs: 本次交互后的obs, s_t+1 + next_action: 根据当前Q表格, 针对next_obs会选择的动作, a_t+1 + done: episode是否结束 + """ + predict_Q = self.Q[obs, action] + if done: + target_Q = reward # 没有下一个状态了 + else: + target_Q = reward + self.gamma * self.Q[next_obs, + next_action] # Sarsa + self.Q[obs, action] += self.lr * (target_Q - predict_Q) # 修正q + + def save(self): + npy_file = './q_table.npy' + np.save(npy_file, self.Q) + print(npy_file + ' saved.') + + def restore(self, npy_file='./q_table.npy'): + self.Q = np.load(npy_file) + print(npy_file + ' loaded.') diff --git a/examples/tutorials/lesson2/sarsa/gridworld.py b/examples/tutorials/lesson2/sarsa/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48 --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
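A practical consequence of the on-policy target shows up in the interaction loop: `next_action` must be sampled before `learn()` is called, and that same action is the one executed on the following step. The full loop appears in `lesson2/sarsa/train.py` later in this patch; a single step looks roughly like this (imports assume `lesson2/sarsa/` is the working directory):

```python
import gym
from agent import SarsaAgent
from gridworld import CliffWalkingWapper

env = CliffWalkingWapper(gym.make("CliffWalking-v0"))
agent = SarsaAgent(obs_n=env.observation_space.n, act_n=env.action_space.n)

obs = env.reset()
action = agent.sample(obs)                      # a_t
next_obs, reward, done, _ = env.step(action)
next_action = agent.sample(next_obs)            # a_{t+1}, chosen *before* the update
agent.learn(obs, action, reward, next_obs, next_action, done)
obs, action = next_obs, next_action             # a_{t+1} is what actually gets executed next
```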
+ +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + 
self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + # env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/sarsa/train.py b/examples/tutorials/lesson2/sarsa/train.py new file mode 100644 index 0000000000000000000000000000000000000000..648ca7b30dd0a4b93a7134cbb209a8ae6558409b --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/train.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -*- coding: utf-8 -*- + +import gym +from gridworld import CliffWalkingWapper, FrozenLakeWapper +from agent import SarsaAgent +import time + + +def run_episode(env, agent, render=False): + total_steps = 0 # 记录每个episode走了多少step + total_reward = 0 + + obs = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) + action = agent.sample(obs) # 根据算法选择一个动作 + + while True: + next_obs, reward, done, _ = env.step(action) # 与环境进行一个交互 + next_action = agent.sample(next_obs) # 根据算法选择一个动作 + # 训练 Sarsa 算法 + agent.learn(obs, action, reward, next_obs, next_action, done) + + action = next_action + obs = next_obs # 存储上一个观察值 + total_reward += reward + total_steps += 1 # 计算step数 + if render: + env.render() #渲染新的一帧图形 + if done: + break + return total_reward, total_steps + + +def test_episode(env, agent): + total_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) # greedy + next_obs, reward, done, _ = env.step(action) + total_reward += reward + obs = next_obs + time.sleep(0.5) + env.render() + if done: + print('test reward = %.1f' % (total_reward)) + break + + +def main(): + # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up + # env = FrozenLakeWapper(env) + + env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + env = CliffWalkingWapper(env) + + agent = SarsaAgent( + obs_n=env.observation_space.n, + act_n=env.action_space.n, + learning_rate=0.1, + gamma=0.9, + e_greed=0.1) + + is_render = False + for episode in range(500): + ep_reward, ep_steps = run_episode(env, agent, is_render) + print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, + ep_reward)) + + # 每隔20个episode渲染一下看看效果 + if episode % 20 == 0: + is_render = True + else: + is_render = False + # 训练结束,查看算法效果 + test_episode(env, agent) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/lesson3/dqn/agent.py b/examples/tutorials/lesson3/dqn/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e14a737f16b62256ee0eb0efcfe3290222209f51 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/agent.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
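The DQN lesson splits the code across PARL's three layers: `model.py` defines the Q-network, `algorithm.py` holds the loss and target-network logic, and the `Agent` below owns the fluid programs plus the epsilon-greedy exploration schedule. A rough sketch of how the pieces are wired together (hyperparameters are placeholders, and the imports assume `lesson3/dqn/` is the working directory):

```python
from model import Model
from algorithm import DQN
from agent import Agent

OBS_DIM, ACT_DIM = 4, 2        # e.g. CartPole-v0 observations/actions
model = Model(act_dim=ACT_DIM)
algorithm = DQN(model, act_dim=ACT_DIM, gamma=0.99, lr=0.001)
agent = Agent(
    algorithm,
    obs_dim=OBS_DIM,
    act_dim=ACT_DIM,
    e_greed=0.1,               # initial exploration probability
    e_greed_decrement=1e-6)    # decays towards the 0.01 floor as sample() is called
```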
+ +#-*- coding: utf-8 -*- + +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class Agent(parl.Agent): + def __init__(self, + algorithm, + obs_dim, + act_dim, + e_greed=0.1, + e_greed_decrement=0): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + self.global_step = 0 + self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 + + self.e_greed = e_greed # 有一定概率随机选取动作,探索 + self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) + + def sample(self, obs): + sample = np.random.rand() # 产生0~1之间的小数 + if sample < self.e_greed: + act = np.random.randint(self.act_dim) # 探索:每个动作都有概率被选择 + else: + act = self.predict(obs) # 选择最优动作 + self.e_greed = max( + 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 + return act + + def predict(self, obs): # 选择最优动作 + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 + return act + + def learn(self, obs, act, reward, next_obs, terminal): + # 每隔200个training steps同步一次model和target_model的参数 + if self.global_step % self.update_target_steps == 0: + self.alg.sync_target() + self.global_step += 1 + + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 + return cost diff --git a/examples/tutorials/lesson3/dqn/algorithm.py b/examples/tutorials/lesson3/dqn/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..0f27d9c71b1f13f2d4f5f90b8f8e0608e04b4bb0 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/algorithm.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
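Since `sample()` above lowers `e_greed` by `e_greed_decrement` on every call but never below 0.01, the exploration schedule is easy to reason about; with the illustrative values from the wiring sketch earlier (0.1 and 1e-6, both assumed, not taken from this patch):

```python
# Linear decay: number of sample() calls until epsilon reaches its 0.01 floor.
steps_to_floor = (0.1 - 0.01) / 1e-6
print(int(steps_to_floor))   # 90000
```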
+ +#-*- coding: utf-8 -*- + +import copy +import paddle.fluid as fluid +import parl +from parl import layers + + +class DQN(parl.Algorithm): + def __init__(self, model, act_dim=None, gamma=None, lr=None): + """ DQN algorithm + + Args: + model (parl.Model): 定义Q函数的前向网络结构 + act_dim (int): action空间的维度,即有几个action + gamma (float): reward的衰减因子 + lr (float): learning_rate,学习率. + """ + self.model = model + self.target_model = copy.deepcopy(model) + + assert isinstance(act_dim, int) + assert isinstance(gamma, float) + assert isinstance(lr, float) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr + + def predict(self, obs): + """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] + """ + return self.model.value(obs) + + def learn(self, obs, action, reward, next_obs, terminal): + """ 使用DQN算法更新self.model的value网络 + """ + + # 从target_model中获取 max Q' 的值,用于计算target_Q + next_pred_value = self.target_model.value(next_obs) + best_v = layers.reduce_max(next_pred_value, dim=1) + best_v.stop_gradient = True # 阻止梯度传递 + terminal = layers.cast(terminal, dtype='float32') + target = reward + (1.0 - terminal) * self.gamma * best_v + + pred_value = self.model.value(obs) # 获取Q预测值 + # 将action转onehot向量,比如:3 => [0,0,0,1,0] + action_onehot = layers.one_hot(action, self.act_dim) + action_onehot = layers.cast(action_onehot, dtype='float32') + # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) + # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] + # ==> pred_action_value = [[3.9]] + pred_action_value = layers.reduce_sum( + layers.elementwise_mul(action_onehot, pred_value), dim=1) + + # 计算 Q(s,a) 与 target_Q的均方差,得到loss + cost = layers.square_error_cost(pred_action_value, target) + cost = layers.reduce_mean(cost) + optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 + optimizer.minimize(cost) + return cost + + def sync_target(self): + """ 把 self.model 的模型参数值同步到 self.target_model + """ + self.model.sync_weights_to(self.target_model) diff --git a/examples/tutorials/lesson3/dqn/model.py b/examples/tutorials/lesson3/dqn/model.py new file mode 100644 index 0000000000000000000000000000000000000000..17c7a8d93a532884187abf0a8cb44d3823018e56 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/model.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
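A plain-NumPy illustration of the two steps inside DQN.learn above, reusing the toy numbers from the inline comments; it is independent of the fluid graph and only meant to make the Bellman target and the one-hot masking concrete.

import numpy as np

# Bellman target for a non-terminal transition with reward 1 and gamma 0.99,
# where the target network predicts Q(s', .) = [2.0, 3.0, 0.5]
gamma, reward, terminal = 0.99, 1.0, 0.0
next_q = np.array([2.0, 3.0, 0.5])
target = reward + (1.0 - terminal) * gamma * next_q.max()
print(target)  # 3.97

# Picking out Q(s, a) with a one-hot mask, as in the comment above
pred_value = np.array([2.3, 5.7, 1.2, 3.9, 1.4])
action_onehot = np.array([0.0, 0.0, 0.0, 1.0, 0.0])
print((pred_value * action_onehot).sum())  # 3.9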
+ +#-*- coding: utf-8 -*- + +import parl +from parl import layers # 封装了 paddle.fluid.layers 的API + + +class Model(parl.Model): + def __init__(self, act_dim): + hid1_size = 128 + hid2_size = 128 + # 3层全连接网络 + self.fc1 = layers.fc(size=hid1_size, act='relu') + self.fc2 = layers.fc(size=hid2_size, act='relu') + self.fc3 = layers.fc(size=act_dim, act=None) + + def value(self, obs): + h1 = self.fc1(obs) + h2 = self.fc2(h1) + Q = self.fc3(h2) + return Q diff --git a/examples/tutorials/lesson3/dqn/replay_memory.py b/examples/tutorials/lesson3/dqn/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/replay_memory.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py + +import random +import collections +import numpy as np + + +class ReplayMemory(object): + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) + + def append(self, exp): + self.buffer.append(exp) + + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] + + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) + + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') + + def __len__(self): + return len(self.buffer) diff --git a/examples/tutorials/lesson3/dqn/train.py b/examples/tutorials/lesson3/dqn/train.py new file mode 100644 index 0000000000000000000000000000000000000000..13dfde79636f79ba2e96201f6579360d9c450898 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/train.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
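A minimal usage sketch for the ReplayMemory added above; the CartPole-sized shapes are assumptions chosen only for illustration.

import numpy as np
from replay_memory import ReplayMemory

rpm = ReplayMemory(max_size=10000)
for _ in range(64):
    obs = np.random.randn(4).astype('float32')        # CartPole-like observation
    next_obs = np.random.randn(4).astype('float32')
    rpm.append((obs, 0, 1.0, next_obs, False))         # (s, a, r, s', done)

obs_b, act_b, rew_b, next_obs_b, done_b = rpm.sample(batch_size=32)
print(obs_b.shape, act_b.shape, done_b.dtype)          # (32, 4) (32,) float32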
+ +#-*- coding: utf-8 -*- + +import os +import gym +import numpy as np +import parl +from parl.utils import logger # 日志打印工具 + +from model import Model +from algorithm import DQN # from parl.algorithms import DQN # parl >= 1.3.1 +from agent import Agent + +from replay_memory import ReplayMemory + +LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 +MEMORY_SIZE = 20000 # replay memory的大小,越大越占用内存 +MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再从里面sample一个batch的经验让agent去learn +BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 +LEARNING_RATE = 0.001 # 学习率 +GAMMA = 0.99 # reward 的衰减因子,一般取 0.9 到 0.999 不等 + + +# 训练一个episode +def run_episode(env, agent, rpm): + total_reward = 0 + obs = env.reset() + step = 0 + while True: + step += 1 + action = agent.sample(obs) # 采样动作,所有动作都有概率被尝试到 + next_obs, reward, done, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, done)) + + # train model + if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) = rpm.sample(BATCH_SIZE) + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, + batch_done) # s,a,r,s',done + + total_reward += reward + obs = next_obs + if done: + break + return total_reward + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + while True: + action = agent.predict(obs) # 预测动作,只选最优动作 + obs, reward, done, _ = env.step(action) + episode_reward += reward + if render: + env.render() + if done: + break + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + +def main(): + env = gym.make( + 'CartPole-v0' + ) # CartPole-v0: expected reward > 180 MountainCar-v0 : expected reward > -120 + action_dim = env.action_space.n # CartPole-v0: 2 + obs_shape = env.observation_space.shape # CartPole-v0: (4,) + + rpm = ReplayMemory(MEMORY_SIZE) # DQN的经验回放池 + + # 根据parl框架构建agent + model = Model(act_dim=action_dim) + algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) + agent = Agent( + algorithm, + obs_dim=obs_shape[0], + act_dim=action_dim, + e_greed=0.1, # 有一定概率随机选取动作,探索 + e_greed_decrement=1e-6) # 随着训练逐步收敛,探索的程度慢慢降低 + + # 加载模型 + # save_path = './dqn_model.ckpt' + # agent.restore(save_path) + + # 先往经验池里存一些数据,避免最开始训练的时候样本丰富度不够 + while len(rpm) < MEMORY_WARMUP_SIZE: + run_episode(env, agent, rpm) + + max_episode = 2000 + + # start train + episode = 0 + while episode < max_episode: # 训练max_episode个回合,test部分不计算入episode数量 + # train part + for i in range(0, 50): + total_reward = run_episode(env, agent, rpm) + episode += 1 + + # test part + eval_reward = evaluate(env, agent, render=True) # render=True 查看显示效果 + logger.info('episode:{} e_greed:{} Test reward:{}'.format( + episode, agent.e_greed, eval_reward)) + + # 训练结束,保存模型 + save_path = './dqn_model.ckpt' + agent.save(save_path) + + +if __name__ == '__main__': + main() diff --git a/examples/tutorials/lesson4/policy_gradient/agent.py b/examples/tutorials/lesson4/policy_gradient/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..fad9528a1d1f4035aece21fb0aec753cf6519ae9 --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/agent.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
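The script above saves ./dqn_model.ckpt at the end and keeps the restore call commented out. A hedged sketch of an evaluation-only run, assuming it is appended to (or run inside) train.py so that Model, DQN, Agent, evaluate and the hyperparameter constants are in scope, and that a checkpoint from a previous run exists:

# Hypothetical evaluation-only usage; dqn_model.ckpt is assumed to come from
# a previous run of main() above.
env = gym.make('CartPole-v0')
act_dim = env.action_space.n
model = Model(act_dim=act_dim)
algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(algorithm, obs_dim=env.observation_space.shape[0],
              act_dim=act_dim, e_greed=0.0)   # no exploration at test time
agent.restore('./dqn_model.ckpt')
print('mean reward over 5 episodes:', evaluate(env, agent, render=False))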
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class Agent(parl.Agent): + def __init__(self, algorithm, obs_dim, act_dim): + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.act_prob = self.alg.predict(obs) + + with fluid.program_guard( + self.learn_program): # 搭建计算图用于 更新policy网络,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='int64') + reward = layers.data(name='reward', shape=[], dtype='float32') + self.cost = self.alg.learn(obs, act, reward) + + def sample(self, obs): + obs = np.expand_dims(obs, axis=0) # 增加一维维度 + act_prob = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.act_prob])[0] + act_prob = np.squeeze(act_prob, axis=0) # 减少一维维度 + act = np.random.choice(range(self.act_dim), p=act_prob) # 根据动作概率选取动作 + return act + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act_prob = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.act_prob])[0] + act_prob = np.squeeze(act_prob, axis=0) + act = np.argmax(act_prob) # 根据动作概率选择概率最高的动作 + return act + + def learn(self, obs, act, reward): + act = np.expand_dims(act, axis=-1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int64'), + 'reward': reward.astype('float32') + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost diff --git a/examples/tutorials/lesson4/policy_gradient/algorithm.py b/examples/tutorials/lesson4/policy_gradient/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..e48de8f5407f7bd7ff339bcd155e71364ee8e8c6 --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/algorithm.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +import parl +from parl import layers + + +class PolicyGradient(parl.Algorithm): + def __init__(self, model, lr=None): + """ Policy Gradient algorithm + + Args: + model (parl.Model): policy的前向网络. + lr (float): 学习率. 
+ """ + + self.model = model + assert isinstance(lr, float) + self.lr = lr + + def predict(self, obs): + """ 使用policy model预测输出的动作概率 + """ + return self.model(obs) + + def learn(self, obs, action, reward): + """ 用policy gradient 算法更新policy model + """ + act_prob = self.model(obs) # 获取输出动作概率 + # log_prob = layers.cross_entropy(act_prob, action) # 交叉熵 + log_prob = layers.reduce_sum( + -1.0 * layers.log(act_prob) * layers.one_hot( + action, act_prob.shape[1]), + dim=1) + cost = log_prob * reward + cost = layers.reduce_mean(cost) + + optimizer = fluid.optimizer.Adam(self.lr) + optimizer.minimize(cost) + return cost diff --git a/parl/framework/policy_distribution.py b/examples/tutorials/lesson4/policy_gradient/model.py similarity index 51% rename from parl/framework/policy_distribution.py rename to examples/tutorials/lesson4/policy_gradient/model.py index 60bd6dd4e246875e3d684a25491ca0a5b80e8590..0273afd2f7ca4915b5f04d264dc0146248bea54d 100644 --- a/parl/framework/policy_distribution.py +++ b/examples/tutorials/lesson4/policy_gradient/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings +#-*- coding: utf-8 -*- -warnings.simplefilter('default') +import parl +from parl import layers -warnings.warn( - "module `parl.framework.policy_distribution` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.policy_distribution` instead.", - DeprecationWarning, - stacklevel=2) -from parl.core.fluid.policy_distribution import * +class Model(parl.Model): + def __init__(self, act_dim): + act_dim = act_dim + hid1_size = act_dim * 10 + + self.fc1 = layers.fc(size=hid1_size, act='tanh') + self.fc2 = layers.fc(size=act_dim, act='softmax') + + def forward(self, obs): # 可直接用 model = Model(5); model(obs)调用 + out = self.fc1(obs) + out = self.fc2(out) + return out diff --git a/examples/tutorials/lesson4/policy_gradient/train.py b/examples/tutorials/lesson4/policy_gradient/train.py new file mode 100644 index 0000000000000000000000000000000000000000..306c22526f76a2ecfc1793dcca083856dc51c45b --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/train.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#-*- coding: utf-8 -*- + +import os +import gym +import numpy as np +import parl + +from agent import Agent +from model import Model +from algorithm import PolicyGradient # from parl.algorithms import PolicyGradient + +from parl.utils import logger + +LEARNING_RATE = 1e-3 + + +# 训练一个episode +def run_episode(env, agent): + obs_list, action_list, reward_list = [], [], [] + obs = env.reset() + while True: + obs_list.append(obs) + action = agent.sample(obs) + action_list.append(action) + + obs, reward, done, info = env.step(action) + reward_list.append(reward) + + if done: + break + return obs_list, action_list, reward_list + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + while True: + action = agent.predict(obs) + obs, reward, isOver, _ = env.step(action) + episode_reward += reward + if render: + env.render() + if isOver: + break + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + +def calc_reward_to_go(reward_list, gamma=1.0): + for i in range(len(reward_list) - 2, -1, -1): + # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1 + reward_list[i] += gamma * reward_list[i + 1] # Gt + return np.array(reward_list) + + +def main(): + env = gym.make('CartPole-v0') + # env = env.unwrapped # Cancel the minimum score limit + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.n + logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim)) + + # 根据parl框架构建agent + model = Model(act_dim=act_dim) + alg = PolicyGradient(model, lr=LEARNING_RATE) + agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim) + + # 加载模型 + # if os.path.exists('./model.ckpt'): + # agent.restore('./model.ckpt') + # run_episode(env, agent, train_or_test='test', render=True) + # exit() + + for i in range(1000): + obs_list, action_list, reward_list = run_episode(env, agent) + if i % 10 == 0: + logger.info("Episode {}, Reward Sum {}.".format( + i, sum(reward_list))) + + batch_obs = np.array(obs_list) + batch_action = np.array(action_list) + batch_reward = calc_reward_to_go(reward_list) + + agent.learn(batch_obs, batch_action, batch_reward) + if (i + 1) % 100 == 0: + total_reward = evaluate(env, agent, render=True) + logger.info('Test reward: {}'.format(total_reward)) + + # save the parameters to ./model.ckpt + agent.save('./model.ckpt') + + +if __name__ == '__main__': + main() diff --git a/examples/tutorials/lesson5/ddpg/agent.py b/examples/tutorials/lesson5/ddpg/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5a6ab55bfba9ab819a9abecb677e9a05605248db --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/agent.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
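A worked example of calc_reward_to_go from the script above (it mutates reward_list in place and returns it as an array); this assumes it is run in train.py's namespace or with the function copied alongside.

# gamma = 1.0: the return is just the suffix sum, [1, 1, 1] -> [3, 2, 1]
print(calc_reward_to_go([1.0, 1.0, 1.0], gamma=1.0))   # [3. 2. 1.]
# gamma = 0.9: G_2 = 1, G_1 = 1 + 0.9*1 = 1.9, G_0 = 1 + 0.9*1.9 = 2.71
print(calc_reward_to_go([1.0, 1.0, 1.0], gamma=0.9))   # [2.71 1.9  1.  ]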
+ +#-*- coding: utf-8 -*- + +import numpy as np +import parl +from parl import layers +from paddle import fluid + + +class Agent(parl.Agent): + def __init__(self, algorithm, obs_dim, act_dim): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + # 注意:最开始先同步self.model和self.target_model的参数. + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = layers.data( + name='act', shape=[self.act_dim], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs, + terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + act = np.squeeze(act) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0] + self.alg.sync_target() + return critic_cost diff --git a/examples/tutorials/lesson5/ddpg/algorithm.py b/examples/tutorials/lesson5/ddpg/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..b77beaedc7452fe305684d09cbcc9ca0061d27e1 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/algorithm.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import parl +from parl import layers +from copy import deepcopy +from paddle import fluid + + +class DDPG(parl.Algorithm): + def __init__(self, + model, + gamma=None, + tau=None, + actor_lr=None, + critic_lr=None): + """ DDPG algorithm + + Args: + model (parl.Model): actor and critic 的前向网络. + model 必须实现 get_actor_params() 方法. + gamma (float): reward的衰减因子. 
+ tau (float): self.target_model 跟 self.model 同步参数 的 软更新参数 + actor_lr (float): actor 的学习率 + critic_lr (float): critic 的学习率 + """ + assert isinstance(gamma, float) + assert isinstance(tau, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + self.gamma = gamma + self.tau = tau + self.actor_lr = actor_lr + self.critic_lr = critic_lr + + self.model = model + self.target_model = deepcopy(model) + + def predict(self, obs): + """ 使用 self.model 的 actor model 来预测动作 + """ + return self.model.policy(obs) + + def learn(self, obs, action, reward, next_obs, terminal): + """ 用DDPG算法更新 actor 和 critic + """ + actor_cost = self._actor_learn(obs) + critic_cost = self._critic_learn(obs, action, reward, next_obs, + terminal) + return actor_cost, critic_cost + + def _actor_learn(self, obs): + action = self.model.policy(obs) + Q = self.model.value(obs, action) + cost = layers.reduce_mean(-1.0 * Q) + optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr) + optimizer.minimize(cost, parameter_list=self.model.get_actor_params()) + return cost + + def _critic_learn(self, obs, action, reward, next_obs, terminal): + next_action = self.target_model.policy(next_obs) + next_Q = self.target_model.value(next_obs, next_action) + + terminal = layers.cast(terminal, dtype='float32') + target_Q = reward + (1.0 - terminal) * self.gamma * next_Q + target_Q.stop_gradient = True + + Q = self.model.value(obs, action) + cost = layers.square_error_cost(Q, target_Q) + cost = layers.reduce_mean(cost) + optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr) + optimizer.minimize(cost) + return cost + + def sync_target(self, decay=None, share_vars_parallel_executor=None): + """ self.target_model从self.model复制参数过来,若decay不为None,则是软更新 + """ + if decay is None: + decay = 1.0 - self.tau + self.model.sync_weights_to( + self.target_model, + decay=decay, + share_vars_parallel_executor=share_vars_parallel_executor) diff --git a/examples/tutorials/lesson5/ddpg/env.py b/examples/tutorials/lesson5/ddpg/env.py new file mode 100644 index 0000000000000000000000000000000000000000..c3e1e54518b15a08f7a0316b5470b47721a1f288 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/env.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#-*- coding: utf-8 -*- +""" +Classic cart-pole system implemented by Rich Sutton et al. 
+Copied from http://incompleteideas.net/sutton/book/code/pole.c +permalink: https://perma.cc/C9ZM-652R + +Continuous version by Ian Danforth +""" + +import math +import gym +from gym import spaces, logger +from gym.utils import seeding +import numpy as np + + +class ContinuousCartPoleEnv(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 50 + } + + def __init__(self): + self.gravity = 9.8 + self.masscart = 1.0 + self.masspole = 0.1 + self.total_mass = (self.masspole + self.masscart) + self.length = 0.5 # actually half the pole's length + self.polemass_length = (self.masspole * self.length) + self.force_mag = 30.0 + self.tau = 0.02 # seconds between state updates + self.min_action = -1.0 + self.max_action = 1.0 + + # Angle at which to fail the episode + self.theta_threshold_radians = 12 * 2 * math.pi / 360 + self.x_threshold = 2.4 + + # Angle limit set to 2 * theta_threshold_radians so failing observation + # is still within bounds + high = np.array([ + self.x_threshold * 2, + np.finfo(np.float32).max, self.theta_threshold_radians * 2, + np.finfo(np.float32).max + ]) + + self.action_space = spaces.Box( + low=self.min_action, high=self.max_action, shape=(1, )) + self.observation_space = spaces.Box(-high, high) + + self.seed() + self.viewer = None + self.state = None + + self.steps_beyond_done = None + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def stepPhysics(self, force): + x, x_dot, theta, theta_dot = self.state + costheta = math.cos(theta) + sintheta = math.sin(theta) + temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta + ) / self.total_mass + thetaacc = (self.gravity * sintheta - costheta * temp) / \ + (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass)) + xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass + x = x + self.tau * x_dot + x_dot = x_dot + self.tau * xacc + theta = theta + self.tau * theta_dot + theta_dot = theta_dot + self.tau * thetaacc + return (x, x_dot, theta, theta_dot) + + def step(self, action): + action = np.expand_dims(action, 0) + assert self.action_space.contains(action), \ + "%r (%s) invalid" % (action, type(action)) + # Cast action to float to strip np trappings + force = self.force_mag * float(action) + self.state = self.stepPhysics(force) + x, x_dot, theta, theta_dot = self.state + done = x < -self.x_threshold \ + or x > self.x_threshold \ + or theta < -self.theta_threshold_radians \ + or theta > self.theta_threshold_radians + done = bool(done) + + if not done: + reward = 1.0 + elif self.steps_beyond_done is None: + # Pole just fell! + self.steps_beyond_done = 0 + reward = 1.0 + else: + if self.steps_beyond_done == 0: + logger.warn(""" +You are calling 'step()' even though this environment has already returned +done = True. You should always call 'reset()' once you receive 'done = True' +Any further steps are undefined behavior. 
+ """) + self.steps_beyond_done += 1 + reward = 0.0 + + return np.array(self.state), reward, done, {} + + def reset(self): + self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, )) + self.steps_beyond_done = None + return np.array(self.state) + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.x_threshold * 2 + scale = screen_width / world_width + carty = 100 # TOP OF CART + polewidth = 10.0 + polelen = scale * 1.0 + cartwidth = 50.0 + cartheight = 30.0 + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 + axleoffset = cartheight / 4.0 + cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + self.carttrans = rendering.Transform() + cart.add_attr(self.carttrans) + self.viewer.add_geom(cart) + l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 + pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + pole.set_color(.8, .6, .4) + self.poletrans = rendering.Transform(translation=(0, axleoffset)) + pole.add_attr(self.poletrans) + pole.add_attr(self.carttrans) + self.viewer.add_geom(pole) + self.axle = rendering.make_circle(polewidth / 2) + self.axle.add_attr(self.poletrans) + self.axle.add_attr(self.carttrans) + self.axle.set_color(.5, .5, .8) + self.viewer.add_geom(self.axle) + self.track = rendering.Line((0, carty), (screen_width, carty)) + self.track.set_color(0, 0, 0) + self.viewer.add_geom(self.track) + + if self.state is None: + return None + + x = self.state + cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART + self.carttrans.set_translation(cartx, carty) + self.poletrans.set_rotation(-x[2]) + + return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) + + def close(self): + if self.viewer: + self.viewer.close() diff --git a/examples/tutorials/lesson5/ddpg/model.py b/examples/tutorials/lesson5/ddpg/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c195cd96171fa5d329e06c61882fd6977a8ea77c --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/model.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +import parl +from parl import layers + + +class Model(parl.Model): + def __init__(self, act_dim): + self.actor_model = ActorModel(act_dim) + self.critic_model = CriticModel() + + def policy(self, obs): + return self.actor_model.policy(obs) + + def value(self, obs, act): + return self.critic_model.value(obs, act) + + def get_actor_params(self): + return self.actor_model.parameters() + + +class ActorModel(parl.Model): + def __init__(self, act_dim): + hid_size = 100 + + self.fc1 = layers.fc(size=hid_size, act='relu') + self.fc2 = layers.fc(size=act_dim, act='tanh') + + def policy(self, obs): + hid = self.fc1(obs) + means = self.fc2(hid) + return means + + +class CriticModel(parl.Model): + def __init__(self): + hid_size = 100 + + self.fc1 = layers.fc(size=hid_size, act='relu') + self.fc2 = layers.fc(size=1, act=None) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + hid = self.fc1(concat) + Q = self.fc2(hid) + Q = layers.squeeze(Q, axes=[1]) + return Q diff --git a/examples/tutorials/lesson5/ddpg/replay_memory.py b/examples/tutorials/lesson5/ddpg/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/replay_memory.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py + +import random +import collections +import numpy as np + + +class ReplayMemory(object): + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) + + def append(self, exp): + self.buffer.append(exp) + + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] + + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) + + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') + + def __len__(self): + return len(self.buffer) diff --git a/examples/tutorials/lesson5/ddpg/train.py b/examples/tutorials/lesson5/ddpg/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5158fef0c13d900835fe8b0751b82a91c4662c4c --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/train.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
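The DDPG.sync_target added earlier in this diff passes decay = 1 - tau to model.sync_weights_to. Assuming PARL's blend is target = decay * target + (1 - decay) * source (which matches the decay=0 hard copy done in the DDPG Agent's __init__), a toy scalar shows how slowly the target network tracks the online network with TAU = 0.001:

TAU = 0.001
decay = 1.0 - TAU                 # value DDPG.sync_target hands to sync_weights_to

w_online, w_target = 1.0, 0.0     # toy parameters
for n in range(1, 1001):
    w_target = decay * w_target + (1.0 - decay) * w_online
    if n in (1, 10, 100, 1000):
        # closed form after n updates: 1 - (1 - TAU)**n
        print(n, round(w_target, 6), round(1.0 - (1.0 - TAU)**n, 6))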
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import gym +import numpy as np +import parl +from parl.utils import logger + +from agent import Agent +from model import Model +from algorithm import DDPG # from parl.algorithms import DDPG +from env import ContinuousCartPoleEnv +from replay_memory import ReplayMemory + +ACTOR_LR = 1e-3 # Actor网络的 learning rate +CRITIC_LR = 1e-3 # Critic网络的 learning rate +GAMMA = 0.99 # reward 的衰减因子 +TAU = 0.001 # 软更新的系数 +MEMORY_SIZE = int(1e6) # 经验池大小 +MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 # 预存一部分经验之后再开始训练 +BATCH_SIZE = 128 +REWARD_SCALE = 0.1 # reward 缩放系数 +NOISE = 0.05 # 动作噪声方差 +TRAIN_EPISODE = 6e3 # 训练的总episode数 + + +# 训练一个episode +def run_episode(agent, env, rpm): + obs = env.reset() + total_reward = 0 + steps = 0 + while True: + steps += 1 + batch_obs = np.expand_dims(obs, axis=0) + action = agent.predict(batch_obs.astype('float32')) + + # 增加探索扰动, 输出限制在 [-1.0, 1.0] 范围内 + action = np.clip(np.random.normal(action, NOISE), -1.0, 1.0) + + next_obs, reward, done, info = env.step(action) + + action = [action] # 方便存入replaymemory + rpm.append((obs, action, REWARD_SCALE * reward, next_obs, done)) + + if len(rpm) > MEMORY_WARMUP_SIZE and (steps % 5) == 0: + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) = rpm.sample(BATCH_SIZE) + agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) + + obs = next_obs + total_reward += reward + + if done or steps >= 200: + break + return total_reward + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + total_reward = 0 + steps = 0 + while True: + batch_obs = np.expand_dims(obs, axis=0) + action = agent.predict(batch_obs.astype('float32')) + action = np.clip(action, -1.0, 1.0) + + steps += 1 + next_obs, reward, done, info = env.step(action) + + obs = next_obs + total_reward += reward + + if render: + env.render() + if done or steps >= 200: + break + eval_reward.append(total_reward) + return np.mean(eval_reward) + + +def main(): + env = ContinuousCartPoleEnv() + + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + + # 使用PARL框架创建agent + model = Model(act_dim) + algorithm = DDPG( + model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) + agent = Agent(algorithm, obs_dim, act_dim) + + # 创建经验池 + rpm = ReplayMemory(MEMORY_SIZE) + # 往经验池中预存数据 + while len(rpm) < MEMORY_WARMUP_SIZE: + run_episode(agent, env, rpm) + + episode = 0 + while episode < TRAIN_EPISODE: + for i in range(50): + total_reward = run_episode(agent, env, rpm) + episode += 1 + + eval_reward = evaluate(env, agent, render=False) + logger.info('episode:{} Test reward:{}'.format( + episode, eval_reward)) + + +if __name__ == '__main__': + main() diff --git a/papers/AAAI_2020.md b/papers/AAAI_2020.md new file mode 100644 index 0000000000000000000000000000000000000000..9e5564c8ede8d1e6009e0fe28c5aa32e2c4f88cb --- /dev/null +++ b/papers/AAAI_2020.md @@ -0,0 +1,32 @@ +### papers relative to improved RL algorithms +1. **Proximal Distilled Evolutionary Reinforcement Learning** AAAI2020. 
[paper](https://arxiv.org/pdf/1906.09807.pdf) + + *Cristian Bodnar, Ben Day, Pietro Liò* + +2. **Uncertainty-Aware Action Advising for Deep Reinforcement Learning Agents** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-SilvaF.2159.pdf) + + *Felipe Leno da Silva (University of Sao Paulo); Pablo Hernandez-Leal (Borealis AI); Bilal Kartal (Borealis AI); Matthew Taylor (Borealis AI)* + +3. **Partner Selection for the Emergence of Cooperation in Multi-Agent Systems Using Reinforcement Learning** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-AnastassacosN.1598.pdf) + + *Nicolas Anastassacos, Stephen Hailes, Mirco Musolesi* + +4. **Reinforcement Learning with Perturbed Reward** AAAI2020. [paper](https://www.aaai.org/Papers/AAAI/2020GB/AAAI-WangJK.4139.pdf) + + *Jingkang Wang, Yang Liu, Bo Li* + +5. **Deep Model-Based Reinforcement Learning via Estimated Uncertainty and Conservative Policy Optimization** AAAI2020. [paper](https://arxiv.org/pdf/1911.12574.pdf) + + *Qi Zhou, HouQiang Li, Jie Wang* + +6. **Reinforcement Learning of Risk-Constrained Policies in Markov Decision Processes** AAAI2020. [paper](https://www.fi.muni.cz/~xnovot18/aaai20.pdf) + + *Tomáš Brázdil, Krishnendu Chatterjee, Petr Novotný, Jiří Vahala* + +7. **Exploratory Combinatorial Optimization with Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.04063.pdf) + + *Thomas D. Barrett, William R. Clements, Jakob N. Foerster, Alex I. Lvovsky* + +8. **Fixed-Horizon Temporal Difference Methods for Stable Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.03906.pdf) + + *Kristopher De Asis, Alan Chan, Silviu Pitis, Richard S. Sutton, Daniel Graves* diff --git a/parl/__init__.py b/parl/__init__.py index 7d3c26a00c4671f6aef2810a78e1f92bccaf35ed..cd4975bee788bccf82417f0a1e88e2dca67356a0 100644 --- a/parl/__init__.py +++ b/parl/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.2.1" +__version__ = "1.3.1" """ generates new PARL python API """ diff --git a/parl/algorithms/fluid/a3c.py b/parl/algorithms/fluid/a3c.py index 9b9f57e8eb5bfd59e3f79c1fc42e4d1374618f23..2786eb640a8d9cc0a4b117c28727c5c23a32fec6 100644 --- a/parl/algorithms/fluid/a3c.py +++ b/parl/algorithms/fluid/a3c.py @@ -24,25 +24,17 @@ __all__ = ['A3C'] class A3C(Algorithm): - def __init__(self, model, hyperparas=None, vf_loss_coeff=None): + def __init__(self, model, vf_loss_coeff=None): """ A3C/A2C algorithm Args: model (parl.Model): forward network of policy and value - hyperparas (dict): (deprecated) dict of hyper parameters.
vf_loss_coeff (float): coefficient of the value function loss """ self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.A3C` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.vf_loss_coeff = hyperparas['vf_loss_coeff'] - else: - assert isinstance(vf_loss_coeff, (int, float)) - self.vf_loss_coeff = vf_loss_coeff + assert isinstance(vf_loss_coeff, (int, float)) + self.vf_loss_coeff = vf_loss_coeff def learn(self, obs, actions, advantages, target_values, learning_rate, entropy_coeff): diff --git a/parl/algorithms/fluid/ddpg.py b/parl/algorithms/fluid/ddpg.py index c127109c7d92f3f5b6e42d4eac25a796ae0c89ae..70992ee204449c297ff2a605951162d939f270d4 100644 --- a/parl/algorithms/fluid/ddpg.py +++ b/parl/algorithms/fluid/ddpg.py @@ -19,7 +19,6 @@ from parl.core.fluid import layers from copy import deepcopy from paddle import fluid from parl.core.fluid.algorithm import Algorithm -from parl.utils.deprecation import deprecated __all__ = ['DDPG'] @@ -27,7 +26,6 @@ __all__ = ['DDPG'] class DDPG(Algorithm): def __init__(self, model, - hyperparas=None, gamma=None, tau=None, actor_lr=None, @@ -37,53 +35,28 @@ class DDPG(Algorithm): Args: model (parl.Model): forward network of actor and critic. The function get_actor_params() of model should be implemented. - hyperparas (dict): (deprecated) dict of hyper parameters. gamma (float): discounted factor for reward computation. tau (float): decay coefficient when updating the weights of self.target_model with self.model actor_lr (float): learning rate of the actor model critic_lr (float): learning rate of the critic model """ - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.gamma = hyperparas['gamma'] - self.tau = hyperparas['tau'] - self.actor_lr = hyperparas['actor_lr'] - self.critic_lr = hyperparas['critic_lr'] - else: - assert isinstance(gamma, float) - assert isinstance(tau, float) - assert isinstance(actor_lr, float) - assert isinstance(critic_lr, float) - self.gamma = gamma - self.tau = tau - self.actor_lr = actor_lr - self.critic_lr = critic_lr + assert isinstance(gamma, float) + assert isinstance(tau, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + self.gamma = gamma + self.tau = tau + self.actor_lr = actor_lr + self.critic_lr = critic_lr self.model = model self.target_model = deepcopy(model) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use actor model of self.model to predict the action - """ - return self.predict(obs) - def predict(self, obs): """ use actor model of self.model to predict the action """ return self.model.policy(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward, next_obs, terminal): - """ update actor and critic model with DDPG algorithm - """ - return self.learn(obs, action, reward, next_obs, terminal) - def learn(self, obs, action, reward, next_obs, terminal): """ update actor and critic model with DDPG algorithm """ @@ -115,15 +88,7 @@ class DDPG(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, - gpu_id=None, - decay=None, - share_vars_parallel_executor=None): - if gpu_id is not None: - 
warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) + def sync_target(self, decay=None, share_vars_parallel_executor=None): if decay is None: decay = 1.0 - self.tau self.model.sync_weights_to( diff --git a/parl/algorithms/fluid/ddqn.py b/parl/algorithms/fluid/ddqn.py index 5ccd4aaafe78d6b698fb04711cdc6b7df48faac8..03c0ced5019abcef00151a68eca944b32caa8469 100644 --- a/parl/algorithms/fluid/ddqn.py +++ b/parl/algorithms/fluid/ddqn.py @@ -21,19 +21,17 @@ import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers +__all__ = ['DDQN'] + class DDQN(Algorithm): - def __init__( - self, - model, - act_dim=None, - gamma=None, - ): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ Double DQN algorithm - Args: - model (parl.Model): model defining forward network of Q function. + model (parl.Model): model defining forward network of Q function + act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. + lr (float): learning rate. """ self.model = model self.target_model = copy.deepcopy(model) @@ -43,11 +41,29 @@ class DDQN(Algorithm): self.act_dim = act_dim self.gamma = gamma + self.lr = lr def predict(self, obs): + """ use value model self.model to predict the action value + """ return self.model.value(obs) - def learn(self, obs, action, reward, next_obs, terminal, learning_rate): + def learn(self, + obs, + action, + reward, + next_obs, + terminal, + learning_rate=None): + """ update value model self.model with DQN algorithm + """ + # Support the modification of learning_rate + if learning_rate is None: + assert isinstance( + self.lr, + float), "Please set the learning rate of DQN in initializaion." + learning_rate = self.lr + pred_value = self.model.value(obs) action_onehot = layers.one_hot(action, self.act_dim) action_onehot = layers.cast(action_onehot, dtype='float32') @@ -85,12 +101,7 @@ class DDQN(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/parl/algorithms/fluid/dqn.py b/parl/algorithms/fluid/dqn.py index e6e97577d041f77b1899ce460582c29f5bf480a8..56d05e0a67cf5d6653bba2e350a71bb08977733a 100644 --- a/parl/algorithms/fluid/dqn.py +++ b/parl/algorithms/fluid/dqn.py @@ -19,18 +19,16 @@ import copy import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['DQN'] class DQN(Algorithm): - def __init__(self, model, hyperparas=None, act_dim=None, gamma=None): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ DQN algorithm Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. 
@@ -38,41 +36,33 @@ class DQN(Algorithm): self.model = model self.target_model = copy.deepcopy(model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['action_dim'] - self.gamma = hyperparas['gamma'] - else: - assert isinstance(act_dim, int) - assert isinstance(gamma, float) - self.act_dim = act_dim - self.gamma = gamma + assert isinstance(act_dim, int) + assert isinstance(gamma, float) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use value model self.model to predict the action value - """ - return self.predict(obs) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr def predict(self, obs): """ use value model self.model to predict the action value """ return self.model.value(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward, next_obs, terminal, - learning_rate): - return self.learn(obs, action, reward, next_obs, terminal, - learning_rate) - - def learn(self, obs, action, reward, next_obs, terminal, learning_rate): + def learn(self, + obs, + action, + reward, + next_obs, + terminal, + learning_rate=None): """ update value model self.model with DQN algorithm """ + # Support the modification of learning_rate + if learning_rate is None: + assert isinstance( + self.lr, + float), "Please set the learning rate of DQN in initializaion." + learning_rate = self.lr pred_value = self.model.value(obs) next_pred_value = self.target_model.value(next_obs) @@ -92,12 +82,7 @@ class DQN(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/parl/algorithms/fluid/impala/impala.py b/parl/algorithms/fluid/impala/impala.py index 025f96f2650e3351552d6525c910d2f29406dbaa..a7adf56ee28f3ec14f304a9a8b163aae31805fda 100644 --- a/parl/algorithms/fluid/impala/impala.py +++ b/parl/algorithms/fluid/impala/impala.py @@ -85,44 +85,31 @@ class VTraceLoss(object): class IMPALA(Algorithm): def __init__(self, model, - hyperparas=None, sample_batch_steps=None, gamma=None, vf_loss_coeff=None, clip_rho_threshold=None, clip_pg_rho_threshold=None): - """ IMPALA algorithm + r""" IMPALA algorithm Args: model (parl.Model): forward network of policy and value - hyperparas (dict): (deprecated) dict of hyper parameters. sample_batch_steps (int): steps of each environment sampling. gamma (float): discounted factor for reward computation. vf_loss_coeff (float): coefficient of the value function loss. clip_rho_threshold (float): clipping threshold for importance weights (rho). clip_pg_rho_threshold (float): clipping threshold on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). 
""" - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.IMPALA` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.sample_batch_steps = hyperparas['sample_batch_steps'] - self.gamma = hyperparas['gamma'] - self.vf_loss_coeff = hyperparas['vf_loss_coeff'] - self.clip_rho_threshold = hyperparas['clip_rho_threshold'] - self.clip_pg_rho_threshold = hyperparas['clip_pg_rho_threshold'] - else: - assert isinstance(sample_batch_steps, int) - assert isinstance(gamma, float) - assert isinstance(vf_loss_coeff, float) - assert isinstance(clip_rho_threshold, float) - assert isinstance(clip_pg_rho_threshold, float) - self.sample_batch_steps = sample_batch_steps - self.gamma = gamma - self.vf_loss_coeff = vf_loss_coeff - self.clip_rho_threshold = clip_rho_threshold - self.clip_pg_rho_threshold = clip_pg_rho_threshold + assert isinstance(sample_batch_steps, int) + assert isinstance(gamma, float) + assert isinstance(vf_loss_coeff, float) + assert isinstance(clip_rho_threshold, float) + assert isinstance(clip_pg_rho_threshold, float) + self.sample_batch_steps = sample_batch_steps + self.gamma = gamma + self.vf_loss_coeff = vf_loss_coeff + self.clip_rho_threshold = clip_rho_threshold + self.clip_pg_rho_threshold = clip_pg_rho_threshold self.model = model diff --git a/parl/algorithms/fluid/impala/vtrace.py b/parl/algorithms/fluid/impala/vtrace.py index 9eb75957b60271fd9a5221c67593359efc57614d..99840bbe2c8c89d37a00cfb673b8ed19f7e82346 100644 --- a/parl/algorithms/fluid/impala/vtrace.py +++ b/parl/algorithms/fluid/impala/vtrace.py @@ -146,7 +146,7 @@ def from_importance_weights(behaviour_actions_log_probs, def recursively_scan(discounts, cs, deltas): - """ Recursively calculate vs_minus_v_xs according to following equation: + r""" Recursively calculate vs_minus_v_xs according to following equation: vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1) Args: diff --git a/parl/algorithms/fluid/maddpg.py b/parl/algorithms/fluid/maddpg.py index 4bf799413165d81d00238a7c156511a03619ba5d..36b14709aaf5e5e0a2cacc97bd94b1097caf2404 100644 --- a/parl/algorithms/fluid/maddpg.py +++ b/parl/algorithms/fluid/maddpg.py @@ -27,10 +27,11 @@ from parl.core.fluid.policy_distribution import SoftMultiCategoricalDistribution def SoftPDistribution(logits, act_space): - """input: + """Args: logits: the output of policy model act_space: action space, must be gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete - output: + + Return: instance of SoftCategoricalDistribution or SoftMultiCategoricalDistribution """ # is instance of gym.spaces.Discrete diff --git a/parl/algorithms/fluid/policy_gradient.py b/parl/algorithms/fluid/policy_gradient.py index b1b901ff1cfce1458f72899ba13cfa95a80d6265..d37083fba91d12e6774a139011efd6f281e1c205 100644 --- a/parl/algorithms/fluid/policy_gradient.py +++ b/parl/algorithms/fluid/policy_gradient.py @@ -18,51 +18,28 @@ warnings.simplefilter('default') import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['PolicyGradient'] class PolicyGradient(Algorithm): - def __init__(self, model, hyperparas=None, lr=None): + def __init__(self, model, lr=None): """ Policy Gradient algorithm Args: model (parl.Model): forward network of the policy. - hyperparas (dict): (deprecated) dict of hyper parameters. lr (float): learning rate of the policy model. 
""" self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.PolicyGradient` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.lr = hyperparas['lr'] - else: - assert isinstance(lr, float) - self.lr = lr - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use policy model self.model to predict the action probability - """ - return self.predict(obs) + assert isinstance(lr, float) + self.lr = lr def predict(self, obs): """ use policy model self.model to predict the action probability """ return self.model(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward): - """ update policy model self.model with policy gradient algorithm - """ - return self.learn(obs, action, reward) - def learn(self, obs, action, reward): """ update policy model self.model with policy gradient algorithm """ diff --git a/parl/algorithms/fluid/ppo.py b/parl/algorithms/fluid/ppo.py index 002ab273833a1fef8deecb08543a2a17f92d4d40..2cd88f46e837d385ab0a7977a1f7123674d8cbaf 100644 --- a/parl/algorithms/fluid/ppo.py +++ b/parl/algorithms/fluid/ppo.py @@ -20,7 +20,6 @@ from copy import deepcopy from paddle import fluid from parl.core.fluid import layers from parl.core.fluid.algorithm import Algorithm -from parl.utils.deprecation import deprecated __all__ = ['PPO'] @@ -28,7 +27,6 @@ __all__ = ['PPO'] class PPO(Algorithm): def __init__(self, model, - hyperparas=None, act_dim=None, policy_lr=None, value_lr=None, @@ -37,7 +35,6 @@ class PPO(Algorithm): Args: model (parl.Model): model defining forward network of policy and value. - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (float): dimension of the action space. policy_lr (float): learning rate of the policy model. value_lr (float): learning rate of the value model. 
@@ -47,27 +44,14 @@ class PPO(Algorithm): # Used to calculate probability of action in old policy self.old_policy_model = deepcopy(model.policy_model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['act_dim'] - self.policy_lr = hyperparas['policy_lr'] - self.value_lr = hyperparas['value_lr'] - if 'epsilon' in hyperparas: - self.epsilon = hyperparas['epsilon'] - else: - self.epsilon = 0.2 # default - else: - assert isinstance(act_dim, int) - assert isinstance(policy_lr, float) - assert isinstance(value_lr, float) - assert isinstance(epsilon, float) - self.act_dim = act_dim - self.policy_lr = policy_lr - self.value_lr = value_lr - self.epsilon = epsilon + assert isinstance(act_dim, int) + assert isinstance(policy_lr, float) + assert isinstance(value_lr, float) + assert isinstance(epsilon, float) + self.act_dim = act_dim + self.policy_lr = policy_lr + self.value_lr = value_lr + self.epsilon = epsilon def _calc_logprob(self, actions, means, logvars): """ Calculate log probabilities of actions, when given means and logvars @@ -111,49 +95,18 @@ class PPO(Algorithm): log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim) return kl - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ Use policy model of self.model to predict means and logvars of actions - """ - return self.predict(obs) - def predict(self, obs): """ Use the policy model of self.model to predict means and logvars of actions """ means, logvars = self.model.policy(obs) return means - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='sample') - def define_sample(self, obs): - """ Use the policy model of self.model to sample actions - """ - return self.sample(obs) - def sample(self, obs): """ Use the policy model of self.model to sample actions """ sampled_act = self.model.policy_sample(obs) return sampled_act - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='policy_learn') - def define_policy_learn(self, obs, actions, advantages, beta=None): - """ Learn policy model with: - 1. CLIP loss: Clipped Surrogate Objective - 2. KLPEN loss: Adaptive KL Penalty Objective - See: https://arxiv.org/pdf/1707.02286.pdf - - Args: - obs: Tensor, (batch_size, obs_dim) - actions: Tensor, (batch_size, act_dim) - advantages: Tensor (batch_size, ) - beta: Tensor (1) or None - if None, use CLIP Loss; else, use KLPEN loss. - """ - return self.policy_learn(obs, actions, advantages, beta) - def policy_learn(self, obs, actions, advantages, beta=None): """ Learn policy model with: 1. 
CLIP loss: Clipped Surrogate Objective @@ -196,27 +149,11 @@ class PPO(Algorithm): optimizer.minimize(loss) return loss, kl - @deprecated( - deprecated_in='1.2', - removed_in='1.3', - replace_function='value_predict') - def define_value_predict(self, obs): - """ Use value model of self.model to predict value of obs - """ - return self.value_predict(obs) - def value_predict(self, obs): """ Use value model of self.model to predict value of obs """ return self.model.value(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='value_learn') - def define_value_learn(self, obs, val): - """ Learn value model with square error cost - """ - return self.value_learn(obs, val) - def value_learn(self, obs, val): """ Learn the value model with square error cost """ @@ -227,12 +164,7 @@ class PPO(Algorithm): optimizer.minimize(loss) return loss - def sync_old_policy(self, gpu_id=None): + def sync_old_policy(self): """ Synchronize weights of self.model.policy_model to self.old_policy_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_old_policy` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.policy_model.sync_weights_to(self.old_policy_model) diff --git a/parl/algorithms/fluid/sac.py b/parl/algorithms/fluid/sac.py index cec92c98568905af7bce64252e9f3ff0531da039..32d7b1edfca1498fb40ece392025d310e162dd50 100644 --- a/parl/algorithms/fluid/sac.py +++ b/parl/algorithms/fluid/sac.py @@ -102,11 +102,11 @@ class SAC(Algorithm): return cost def critic_learn(self, obs, action, reward, next_obs, terminal): - next_state_action, next_state_log_pi = self.sample(next_obs) + next_obs_action, next_obs_log_pi = self.sample(next_obs) qf1_next_target, qf2_next_target = self.target_critic.value( - next_obs, next_state_action) + next_obs, next_obs_action) min_qf_next_target = layers.elementwise_min( - qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha + qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha terminal = layers.cast(terminal, dtype='float32') target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target diff --git a/parl/algorithms/fluid/tests/algs_test.py b/parl/algorithms/fluid/tests/algs_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6d272b8f58b59f1e5d0167adfe305ab7ceb51679 --- /dev/null +++ b/parl/algorithms/fluid/tests/algs_test.py @@ -0,0 +1,699 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class DQNModel(parl.Model): + def __init__(self): + self.fc1 = layers.fc(size=32, act='relu') + self.fc2 = layers.fc(size=2) + + def value(self, obs): + x = self.fc1(obs) + act = self.fc2(x) + return act + + +class DQNAgent(parl.Agent): + def __init__(self, algorithm): + super(DQNAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + lr = layers.data( + name='lr', shape=[1], dtype='float32', append_batch_size=False) + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal, + lr) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + lr = 3e-4 + + obs = np.expand_dims(obs, axis=0) + next_obs = np.expand_dims(next_obs, axis=0) + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal, + 'lr': np.float32(lr) + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost + + +class A3CModel(parl.Model): + def __init__(self): + self.fc = layers.fc(size=32, act='relu') + + self.policy_fc = layers.fc(size=2) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + + return policy_logits + + def value(self, obs): + x = self.fc(obs) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return values + + def policy_and_value(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return policy_logits, values + + +class A3CAgent(parl.Agent): + def __init__(self, algorithm): + super(A3CAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.predict_program = fluid.Program() + self.value_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.predict_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.predict_actions = self.alg.predict(obs) + + with fluid.program_guard(self.value_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.values = self.alg.value(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + actions = layers.data(name='actions', shape=[], dtype='int64') + advantages = layers.data( + name='advantages', shape=[], dtype='float32') + target_values = layers.data( + name='target_values', shape=[], dtype='float32') + lr = layers.data( + name='lr', 
shape=[1], dtype='float32', append_batch_size=False) + entropy_coeff = layers.data( + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) + + total_loss, pi_loss, vf_loss, entropy = self.alg.learn( + obs, actions, advantages, target_values, lr, entropy_coeff) + self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy] + + def predict(self, obs_np): + obs_np = obs_np.astype('float32') + + predict_actions = self.fluid_executor.run( + self.predict_program, + feed={'obs': obs_np}, + fetch_list=[self.predict_actions])[0] + return predict_actions + + def value(self, obs_np): + obs_np = obs_np.astype('float32') + + values = self.fluid_executor.run( + self.value_program, feed={'obs': obs_np}, + fetch_list=[self.values])[0] + return values + + def learn(self, obs_np, actions_np, advantages_np, target_values_np): + obs_np = obs_np.astype('float32') + actions_np = actions_np.astype('int64') + advantages_np = advantages_np.astype('float32') + target_values_np = target_values_np.astype('float32') + + lr = 3e-4 + entropy_coeff = 0. + + total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run( + self.learn_program, + feed={ + 'obs': obs_np, + 'actions': actions_np, + 'advantages': advantages_np, + 'target_values': target_values_np, + 'lr': np.array([lr], dtype='float32'), + 'entropy_coeff': np.array([entropy_coeff], dtype='float32') + }, + fetch_list=self.learn_outputs) + return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff + + +class IMPALAModel(parl.Model): + def __init__(self): + self.fc = layers.fc(size=32, act='relu') + + self.policy_fc = layers.fc(size=2) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + + return policy_logits + + def value(self, obs): + x = self.fc(obs) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return values + + +class IMPALAAgent(parl.Agent): + def __init__(self, algorithm): + super(IMPALAAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.predict_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.predict_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.predict_actions = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + actions = layers.data(name='actions', shape=[], dtype='int64') + behaviour_logits = layers.data( + name='behaviour_logits', shape=[2], dtype='float32') + rewards = layers.data(name='rewards', shape=[], dtype='float32') + dones = layers.data(name='dones', shape=[], dtype='float32') + lr = layers.data( + name='lr', shape=[1], dtype='float32', append_batch_size=False) + entropy_coeff = layers.data( + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) + + vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits, + rewards, dones, lr, entropy_coeff) + self.learn_outputs = [ + vtrace_loss.total_loss, vtrace_loss.pi_loss, + vtrace_loss.vf_loss, vtrace_loss.entropy, kl + ] + + def predict(self, obs_np): + obs_np = obs_np.astype('float32') + + predict_actions = self.fluid_executor.run( + self.predict_program, + feed={'obs': obs_np}, + fetch_list=[self.predict_actions])[0] + return predict_actions + + def learn(self, obs, actions, behaviour_logits, rewards, dones, lr, + entropy_coeff): + total_loss, pi_loss, vf_loss, entropy, kl = self.fluid_executor.run( + self.learn_program, + feed={ + 
'obs': obs, + 'actions': actions, + 'behaviour_logits': behaviour_logits, + 'rewards': rewards, + 'dones': dones, + 'lr': np.array([lr], dtype='float32'), + 'entropy_coeff': np.array([entropy_coeff], dtype='float32') + }, + fetch_list=self.learn_outputs) + return total_loss, pi_loss, vf_loss, entropy, kl + + +class SACActor(parl.Model): + def __init__(self): + self.mean_linear = layers.fc(size=1) + self.log_std_linear = layers.fc(size=1) + + def policy(self, obs): + means = self.mean_linear(obs) + log_std = self.log_std_linear(obs) + + return means, log_std + + +class SACCritic(parl.Model): + def __init__(self): + self.fc1 = layers.fc(size=1) + self.fc2 = layers.fc(size=1) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.fc1(concat) + Q2 = self.fc2(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + Q2 = layers.squeeze(Q2, axes=[1]) + return Q1, Q2 + + +class SACAgent(parl.Agent): + def __init__(self, algorithm): + super(SACAgent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.sample_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.sample_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.sample_act, _ = self.alg.sample(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.critic_cost, self.actor_cost = self.alg.learn( + obs, act, reward, next_obs, terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def sample(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.sample_program, + feed={'obs': obs}, + fetch_list=[self.sample_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + [critic_cost, actor_cost] = self.fluid_executor.run( + self.learn_program, + feed=feed, + fetch_list=[self.critic_cost, self.actor_cost]) + return critic_cost[0], actor_cost[0] + + +class DDPGModel(parl.Model): + def __init__(self): + self.policy_fc = layers.fc(size=1) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + act = self.policy_fc(obs) + return act + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q = self.value_fc(concat) + Q = layers.squeeze(Q, axes=[1]) + return Q + + def get_actor_params(self): + return self.parameters()[:2] + + +class DDPGAgent(parl.Agent): + def __init__(self, algorithm): + super(DDPGAgent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = 
layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs, + terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0] + self.alg.sync_target() + return critic_cost + + +class TD3Model(parl.Model): + def __init__(self): + self.actor_fc = layers.fc(size=1) + self.q1 = layers.fc(size=1) + self.q2 = layers.fc(size=1) + + def policy(self, obs): + return self.actor_fc(obs) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.q1(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + Q2 = self.q2(concat) + Q2 = layers.squeeze(Q2, axes=[1]) + return Q1, Q2 + + def Q1(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.q1(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + return Q1 + + def get_actor_params(self): + return self.parameters()[:2] + + +class TD3Agent(parl.Agent): + def __init__(self, algorithm): + super(TD3Agent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.actor_learn_program = fluid.Program() + self.critic_learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.actor_learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.actor_cost = self.alg.actor_learn(obs) + + with fluid.program_guard(self.critic_learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.critic_cost = self.alg.critic_learn(obs, act, reward, + next_obs, terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.critic_learn_program, + feed=feed, + fetch_list=[self.critic_cost])[0] + + actor_cost = self.fluid_executor.run( + self.actor_learn_program, + feed={'obs': obs}, + fetch_list=[self.actor_cost])[0] + self.alg.sync_target() + return actor_cost, critic_cost + + +class PARLtest(unittest.TestCase): + def setUp(self): + # set up DQN test + DQN_model = DQNModel() + DQN_alg = parl.algorithms.DQN(DQN_model, act_dim=2, gamma=0.9) + self.DQN_agent = DQNAgent(DQN_alg) + + # set up A3C test + A3C_model = A3CModel() + A3C_alg = parl.algorithms.A3C(A3C_model, 
vf_loss_coeff=0.) + self.A3C_agent = A3CAgent(A3C_alg) + + # set up IMPALA test + IMPALA_model = IMPALAModel() + IMPALA_alg = parl.algorithms.IMPALA( + IMPALA_model, + sample_batch_steps=4, + gamma=0.9, + vf_loss_coeff=0., + clip_rho_threshold=1., + clip_pg_rho_threshold=1.) + self.IMPALA_agent = IMPALAAgent(IMPALA_alg) + + # set up SAC test + SAC_actor = SACActor() + SAC_critic = SACCritic() + SAC_alg = parl.algorithms.SAC( + SAC_actor, + SAC_critic, + max_action=1., + gamma=0.99, + tau=0.005, + actor_lr=1e-3, + critic_lr=1e-3) + self.SAC_agent = SACAgent(SAC_alg) + + # set up DDPG test + DDPG_model = DDPGModel() + DDPG_alg = parl.algorithms.DDPG( + DDPG_model, gamma=0.99, tau=0.001, actor_lr=3e-4, critic_lr=3e-4) + self.DDPG_agent = DDPGAgent(DDPG_alg) + + # set up TD3 test + TD3_model = TD3Model() + TD3_alg = parl.algorithms.TD3( + TD3_model, + 1., + gamma=0.99, + tau=0.005, + actor_lr=3e-4, + critic_lr=3e-4) + self.TD3_agent = TD3Agent(TD3_alg) + + def test_DQN_predict(self): + """Test APIs in PARL DQN predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + + act = self.DQN_agent.predict(obs) + + def test_DQN_learn(self): + """Test APIs in PARL DQN learn + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + next_obs = np.array([-0.02332638, -0.16414229, 0.01142936, 0.29949173]) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([0]).astype('int32') + + cost = self.DQN_agent.learn(obs, act, reward, next_obs, terminal) + + def test_A3C_predict(self): + """Test APIs in PARL A3C predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + obs = np.expand_dims(obs, axis=0) + + logits = self.A3C_agent.predict(obs) + + def test_A3C_value(self): + """Test APIs in PARL A3C predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + obs = np.expand_dims(obs, axis=0) + + values = self.A3C_agent.value(obs) + + def test_A3C_learn(self): + """Test APIs in PARL A3C learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]]) + action = np.array([0]) + advantages = np.array([-0.02332638]) + target_values = np.array([1.]) + + self.A3C_agent.learn(obs, action, advantages, target_values) + + def test_IMPALA_predict(self): + """Test APIs in PARL IMPALA predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]]) + + policy = self.IMPALA_agent.predict(obs) + + def test_IMPALA_learn(self): + """Test APIs in PARL IMPALA learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype('float32') + actions = np.array([1, 1, 1, 1]).astype('int32') + behaviour_logits = np.array([[-1, 1], [-1, 1], [-1, 1], + [-1, 1]]).astype('float32') + rewards = np.array([0, 0, 0, 0]).astype('float32') + dones = np.array([False, False, False, False]).astype('float32') + lr = 3e-4 + entropy_coeff = 0. 
+ + total_loss, pi_loss, vf_loss, entropy, kl = self.IMPALA_agent.learn( + obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff) + + def test_SAC_predict(self): + """Test APIs in PARL SAC predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.SAC_agent.predict(obs) + + def test_SAC_sample(self): + """Test APIs in PARL SAC sample + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.SAC_agent.sample(obs) + + def test_SAC_learn(self): + """Test APIs in PARL SAC learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost, actor_cost = self.SAC_agent.learn(obs, act, reward, + next_obs, terminal) + + def test_DDPG_predict(self): + """Test APIs in PARL DDPG predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.DDPG_agent.predict(obs) + + def test_DDPG_learn(self): + """Test APIs in PARL DDPG learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost = self.DDPG_agent.learn(obs, act, reward, + next_obs, terminal) + + def test_TD3_predict(self): + """Test APIs in PARL TD3 predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.TD3_agent.predict(obs) + + def test_TD3_learn(self): + """Test APIs in PARL TD3 learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost, actor_cost = self.TD3_agent.learn(obs, act, reward, + next_obs, terminal) + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/algorithms/torch/__init__.py b/parl/algorithms/torch/__init__.py index 97826766b2581762d1080c05eb979bc4d0e4b03f..9de7afbdd57305b1280b024556e0b1730bcbc494 100644 --- a/parl/algorithms/torch/__init__.py +++ b/parl/algorithms/torch/__init__.py @@ -16,5 +16,5 @@ from parl.algorithms.torch.ddqn import * from parl.algorithms.torch.dqn import * from parl.algorithms.torch.a2c import * from parl.algorithms.torch.td3 import * -from parl.algorithms.torch.coma import * +from parl.algorithms.torch.ppo import * from parl.algorithms.torch.policy_gradient import * diff --git a/parl/algorithms/torch/a2c.py b/parl/algorithms/torch/a2c.py index 3d78ce75938c583e15e4f7321ad836d869ef25b1..43e373907db821fedb4759138de064bd8dda9afa 100644 --- a/parl/algorithms/torch/a2c.py +++ b/parl/algorithms/torch/a2c.py @@ -27,7 +27,7 @@ __all__ = ['A2C'] class A2C(parl.Algorithm): - def __init__(self, model, config, hyperparas=None): + def __init__(self, model, config): assert isinstance(config['vf_loss_coeff'], (int, float)) self.model = model self.vf_loss_coeff = config['vf_loss_coeff'] diff --git
a/parl/algorithms/torch/ppo.py b/parl/algorithms/torch/ppo.py new file mode 100644 index 0000000000000000000000000000000000000000..7c838e896e26b35fa078d1db1323476fb776993f --- /dev/null +++ b/parl/algorithms/torch/ppo.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import parl +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.distributions import Normal + +__all__ = ['PPO'] + + +class PPO(parl.Algorithm): + def __init__(self, + model, + clip_param, + value_loss_coef, + entropy_coef, + initial_lr, + eps=None, + max_grad_norm=None, + use_clipped_value_loss=True): + self.model = model + + self.clip_param = clip_param + + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + self.optimizer = optim.Adam(model.parameters(), lr=initial_lr, eps=eps) + + def learn(self, obs_batch, actions_batch, value_preds_batch, return_batch, + old_action_log_probs_batch, adv_targ): + values = self.model.value(obs_batch) + mean, log_std = self.model.policy(obs_batch) + dist = Normal(mean, log_std.exp()) + + action_log_probs = dist.log_prob(actions_batch).sum(-1, keepdim=True) + dist_entropy = dist.entropy().sum(-1).mean() + + ratio = torch.exp(action_log_probs - old_action_log_probs_batch) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, + 1.0 + self.clip_param) * adv_targ + action_loss = -torch.min(surr1, surr2).mean() + + if self.use_clipped_value_loss: + value_pred_clipped = value_preds_batch + \ + (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) + value_losses = (values - return_batch).pow(2) + value_losses_clipped = (value_pred_clipped - return_batch).pow(2) + value_loss = 0.5 * torch.max(value_losses, + value_losses_clipped).mean() + else: + value_loss = 0.5 * (return_batch - values).pow(2).mean() + + self.optimizer.zero_grad() + (value_loss * self.value_loss_coef + action_loss - + dist_entropy * self.entropy_coef).backward() + nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) + self.optimizer.step() + + return value_loss.item(), action_loss.item(), dist_entropy.item() + + def sample(self, obs): + value = self.model.value(obs) + mean, log_std = self.model.policy(obs) + dist = Normal(mean, log_std.exp()) + action = dist.sample() + action_log_probs = dist.log_prob(action).sum(-1, keepdim=True) + + return value, action, action_log_probs + + def predict(self, obs): + mean, _ = self.model.policy(obs) + return mean + + def value(self, obs): + return self.model.value(obs) diff --git a/parl/core/fluid/agent.py b/parl/core/fluid/agent.py index 8972443c453e75e022751cee707d9bbaeda649df..a3e196358d5775bf15e7730d44e4b6ee2706f668 100644 --- a/parl/core/fluid/agent.py +++ b/parl/core/fluid/agent.py @@ -15,9 +15,9 @@ import warnings warnings.simplefilter('default') +import os import paddle.fluid 
as fluid from parl.core.fluid import layers -from parl.utils.deprecation import deprecated from parl.core.agent_base import AgentBase from parl.core.fluid.algorithm import Algorithm from parl.utils import machine_info @@ -46,7 +46,6 @@ class Agent(AgentBase): This class will initialize the neural network parameters automatically, and provides an executor for users to run the programs (self.fluid_executor). Attributes: - gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU. fluid_executor (fluid.Executor): executor for running programs of the agent. alg (parl.algorithm): algorithm of this agent. @@ -65,18 +64,12 @@ class Agent(AgentBase): """ - def __init__(self, algorithm, gpu_id=None): + def __init__(self, algorithm): """Build programs by calling the method ``self.build_program()`` and run initialization function of ``fluid.default_startup_program()``. Args: algorithm (parl.Algorithm): an instance of `parl.Algorithm`. This algorithm is then passed to `self.alg`. - gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU. """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `__init__` function in `parl.Agent` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) assert isinstance(algorithm, Algorithm) super(Agent, self).__init__(algorithm) @@ -119,26 +112,6 @@ class Agent(AgentBase): """ raise NotImplementedError - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Returns a Python dictionary containing the whole parameters of self.alg. - - Returns: - a Python List containing the parameters of self.alg. - """ - return self.algorithm.get_params() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params): - """Copy parameters from ``get_params()`` into this agent. - - Args: - params(dict): a Python List containing the parameters of self.alg. - """ - self.algorithm.set_params(params) - def learn(self, *args, **kwargs): """The training interface for ``Agent``. This function feeds the training data into the learn_program defined in ``build_program()``. 
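Note: since ``gpu_id`` and the ``get_params``/``set_params`` wrappers are removed from the fluid ``Agent``, call sites reduce to the sketch below; ``MyAgent`` is a hypothetical subclass, and device placement is assumed to be controlled by the environment (e.g. CUDA_VISIBLE_DEVICES) rather than a constructor argument:

    agent = MyAgent(algorithm)      # was: MyAgent(algorithm, gpu_id=0)
    weights = agent.get_weights()   # replaces the removed get_params()
    agent.set_weights(weights)      # replaces the removed set_params(params)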
@@ -180,8 +153,8 @@ class Agent(AgentBase): """ if program is None: program = self.learn_program - dirname = '/'.join(save_path.split('/')[:-1]) - filename = save_path.split('/')[-1] + dirname = os.sep.join(save_path.split(os.sep)[:-1]) + filename = save_path.split(os.sep)[-1] fluid.io.save_params( executor=self.fluid_executor, dirname=dirname, @@ -214,8 +187,8 @@ class Agent(AgentBase): program = self.learn_program if type(program) is fluid.compiler.CompiledProgram: program = program._init_program - dirname = '/'.join(save_path.split('/')[:-1]) - filename = save_path.split('/')[-1] + dirname = os.sep.join(save_path.split(os.sep)[:-1]) + filename = save_path.split(os.sep)[-1] fluid.io.load_params( executor=self.fluid_executor, dirname=dirname, diff --git a/parl/core/fluid/algorithm.py b/parl/core/fluid/algorithm.py index 1a05a9991a658e13282f847f2cf4772dc19b2572..2267e3b6d8191d3f3b6028f9e188e7ad5394c863 100644 --- a/parl/core/fluid/algorithm.py +++ b/parl/core/fluid/algorithm.py @@ -17,7 +17,6 @@ warnings.simplefilter('default') from parl.core.algorithm_base import AlgorithmBase from parl.core.fluid.model import Model -from parl.utils.deprecation import deprecated __all__ = ['Algorithm'] @@ -57,47 +56,13 @@ class Algorithm(AlgorithmBase): """ - def __init__(self, model=None, hyperparas=None): + def __init__(self, model=None): """ Args: model(``parl.Model``): a neural network that represents a policy or a Q-value function. - hyperparas(dict): a dict storing the hyper-parameters relative to training. """ - if model is not None: - warnings.warn( - "the `model` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - - assert isinstance(model, Model) - self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - - self.hp = hyperparas - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Get parameters of self.model. - - Returns: - params(dict): a Python List containing the parameters of self.model. - """ - return self.model.get_params() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params): - """ Set parameters from ``get_params`` to the model. - - Args: - params(dict ): a Python List containing the parameters of self.model. - """ - self.model.set_params(params) + assert isinstance(model, Model) + self.model = model def learn(self, *args, **kwargs): """ Define the loss function and create an optimizer to minize the loss. 
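Note: ``save`` and ``restore`` now derive ``dirname``/``filename`` by splitting on ``os.sep``, so checkpoint paths should be built with ``os.path.join`` (as the updated tests later in this patch do) instead of hard-coding '/'. A small usage sketch, assuming ``agent`` is an initialized ``parl.Agent``:

    import os

    save_path = os.path.join('my_model', 'model.ckpt')
    agent.save(save_path)       # parameters written under ./my_model/
    agent.restore(save_path)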
diff --git a/parl/core/fluid/layers/tests/param_sharing_test.py b/parl/core/fluid/layers/tests/param_sharing_test.py index f457f4571375c70c8ed0333e6727ce97991996bb..d26048b08daae9ae341e88b402898f40ecd22ca4 100644 --- a/parl/core/fluid/layers/tests/param_sharing_test.py +++ b/parl/core/fluid/layers/tests/param_sharing_test.py @@ -45,7 +45,7 @@ class TestParamSharing(unittest.TestCase): dict_size = 100 input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32") input_x = np.random.randint( - dict_size, size=(batch_size, 1)).astype("int") + dict_size, size=(batch_size, 1)).astype("int64") ################################# main_program1 = fluid.Program() @@ -59,7 +59,7 @@ class TestParamSharing(unittest.TestCase): main_program2 = fluid.Program() with fluid.program_guard(main_program2): - x_ = layers.data(name='x', shape=[1], dtype="int") + x_ = layers.data(name='x', shape=[1], dtype="int64") cx_ = layers.cast( x=layers.one_hot(input=x_, depth=dict_size), dtype="float32") y1_ = net.fc1(input=cx_) diff --git a/parl/core/fluid/model.py b/parl/core/fluid/model.py index 38d653ad20275d281a1bca4cf63d1198475a8696..bf7069a68c53748d870c1d9d21c2ec971fee05fe 100644 --- a/parl/core/fluid/model.py +++ b/parl/core/fluid/model.py @@ -17,7 +17,6 @@ import paddle.fluid as fluid from parl.core.fluid.layers.layer_wrappers import LayerFunc from parl.core.fluid.plutils import * from parl.core.model_base import ModelBase -from parl.utils.deprecation import deprecated from parl.utils import machine_info __all__ = ['Model'] @@ -67,30 +66,6 @@ class Model(ModelBase): """ - @deprecated( - deprecated_in='1.2', - removed_in='1.3', - replace_function='sync_weights_to') - def sync_params_to(self, - target_net, - gpu_id=None, - decay=0.0, - share_vars_parallel_executor=None): - """Synchronize parameters in the model to another model (target_net). - - target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights - - Args: - target_model (`parl.Model`): an instance of ``Model`` that has the same neural network architecture as the current model. - decay (float): the rate of decline in copying parameters. 0 if no parameters decay when synchronizing the parameters. - share_vars_parallel_executor (fluid.ParallelExecutor): Optional. If not None, will use fluid.ParallelExecutor - to run program instead of fluid.Executor - """ - self.sync_weights_to( - target_model=target_net, - decay=decay, - share_vars_parallel_executor=share_vars_parallel_executor) - def sync_weights_to(self, target_model, decay=0.0, @@ -181,21 +156,6 @@ class Model(ModelBase): else: self._cached_fluid_executor.run(fetch_list=[]) - @property - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='parameters') - def parameter_names(self): - """Get names of all parameters in this ``Model``. - - Only parameters created by ``parl.layers`` are included. - The order of parameter names is consistent among - different instances of the same `Model`. - - Returns: - param_names(list): list of string containing parameter names of all parameters. - """ - return self.parameters() - def parameters(self): """Get names of all parameters in this ``Model``. @@ -223,26 +183,6 @@ class Model(ModelBase): self._parameter_names = self._get_parameter_names(self) return self._parameter_names - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Return a Python list containing parameters of current model. 
- - Returns: - parameters: a Python list containing parameters of the current model. - """ - return self.get_weights() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params, gpu_id=None): - """Set parameters in the model with params. - - Args: - params (List): List of numpy array . - """ - self.set_weights(weights=params) - def get_weights(self): """Returns a Python list containing parameters of current model. diff --git a/parl/core/fluid/tests/agent_base_test_.py b/parl/core/fluid/tests/agent_base_test.py similarity index 94% rename from parl/core/fluid/tests/agent_base_test_.py rename to parl/core/fluid/tests/agent_base_test.py index cd8ca7d06f72ae99c51f12e639d9b0de1080ba7f..5a7f8ac12aaeee0d57daf92b118d1307a65a1cc0 100644 --- a/parl/core/fluid/tests/agent_base_test_.py +++ b/parl/core/fluid/tests/agent_base_test.py @@ -46,8 +46,8 @@ class TestAlgorithm(parl.Algorithm): class TestAgent(parl.Agent): - def __init__(self, algorithm, gpu_id=None): - super(TestAgent, self).__init__(algorithm, gpu_id) + def __init__(self, algorithm): + super(TestAgent, self).__init__(algorithm) def build_program(self): self.predict_program = fluid.Program() @@ -92,8 +92,8 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.algorithm) obs = np.random.random([3, 10]).astype('float32') output_np = agent.predict(obs) - save_path1 = './model.ckpt' - save_path2 = './my_model/model-2.ckpt' + save_path1 = 'model.ckpt' + save_path2 = os.path.join('my_model', 'model-2.ckpt') agent.save(save_path1) agent.save(save_path2) self.assertTrue(os.path.exists(save_path1)) @@ -103,7 +103,7 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.algorithm) obs = np.random.random([3, 10]).astype('float32') output_np = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' previous_output = agent.predict(obs) agent.save(save_path1) agent.restore(save_path1) @@ -121,7 +121,7 @@ class AgentBaseTest(unittest.TestCase): agent.learn_program = parl.compile(agent.learn_program) obs = np.random.random([3, 10]).astype('float32') previous_output = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' agent.save(save_path1) agent.restore(save_path1) diff --git a/parl/core/fluid/tests/model_base_test_.py b/parl/core/fluid/tests/model_base_test.py similarity index 94% rename from parl/core/fluid/tests/model_base_test_.py rename to parl/core/fluid/tests/model_base_test.py index 1656366a2fd97daf019b2cfb42f1ab7be640a65a..faa13684c90d6eabcc0c7561fb386a035f63c4ac 100644 --- a/parl/core/fluid/tests/model_base_test_.py +++ b/parl/core/fluid/tests/model_base_test.py @@ -690,6 +690,43 @@ class ModelBaseTest(unittest.TestCase): self.executor.run( pred_program, feed={'obs': x}, fetch_list=[model_output]) + def test_get_weights_set_weights_with_create_parameter(self): + model1 = TestModel2() + model2 = TestModel2() + + pred_program = fluid.Program() + with fluid.program_guard(pred_program): + obs = layers.data(name='obs', shape=[100], dtype='float32') + model1_output = model1.predict(obs) + model2_output = model2.predict(obs) + + self.executor.run(fluid.default_startup_program()) + + N = 10 + random_obs = np.random.random(size=(N, 100)).astype('float32') + for i in range(N): + x = np.expand_dims(random_obs[i], axis=0) + outputs = self.executor.run( + pred_program, + feed={'obs': x}, + fetch_list=[model1_output, model2_output]) + self.assertNotEqual( + np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten())) + + # pass 
parameters of self.model to model2 + params = model1.get_weights() + model2.set_weights(params) + + random_obs = np.random.random(size=(N, 100)).astype('float32') + for i in range(N): + x = np.expand_dims(random_obs[i], axis=0) + outputs = self.executor.run( + pred_program, + feed={'obs': x}, + fetch_list=[model1_output, model2_output]) + self.assertEqual( + np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten())) + if __name__ == '__main__': unittest.main() diff --git a/parl/core/fluid/tests/policy_distribution_test_.py b/parl/core/fluid/tests/policy_distribution_test.py similarity index 100% rename from parl/core/fluid/tests/policy_distribution_test_.py rename to parl/core/fluid/tests/policy_distribution_test.py diff --git a/parl/core/torch/agent.py b/parl/core/torch/agent.py index 5d8bb2195dc0fdff48c2e9a3d5f477b793af06eb..7cc87fad2f7d294a707ac5dfd6dad4eedd566b46 100644 --- a/parl/core/torch/agent.py +++ b/parl/core/torch/agent.py @@ -113,8 +113,9 @@ class Agent(AgentBase): """ if model is None: model = self.algorithm.model - dirname = '/'.join(save_path.split('/')[:-1]) - if not os.path.exists(dirname): + sep = os.sep + dirname = sep.join(save_path.split(sep)[:-1]) + if dirname != '' and not os.path.exists(dirname): os.makedirs(dirname) torch.save(model.state_dict(), save_path) diff --git a/parl/core/torch/tests/agent_base_test_torch.py b/parl/core/torch/tests/agent_base_test_torch.py index 96caf7532c38bafea6ba33d41ecb173361c525ac..7bf468db86a7106d62c809bca52c32f8cb55f39a 100644 --- a/parl/core/torch/tests/agent_base_test_torch.py +++ b/parl/core/torch/tests/agent_base_test_torch.py @@ -77,8 +77,8 @@ class AgentBaseTest(unittest.TestCase): def test_save(self): agent = TestAgent(self.alg) obs = torch.randn(3, 10) - save_path1 = './model.ckpt' - save_path2 = './my_model/model-2.ckpt' + save_path1 = 'model.ckpt' + save_path2 = os.path.join('my_model', 'model-2.ckpt') agent.save(save_path1) agent.save(save_path2) self.assertTrue(os.path.exists(save_path1)) @@ -88,7 +88,7 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.alg) obs = torch.randn(3, 10) output = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' previous_output = agent.predict(obs).detach().cpu().numpy() agent.save(save_path1) agent.restore(save_path1) diff --git a/parl/framework/__init__.py b/parl/framework/__init__.py deleted file mode 100644 index 4e48085338abbc3731935722515c5591333922d8..0000000000000000000000000000000000000000 --- a/parl/framework/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import warnings - -warnings.simplefilter('default') - -warnings.warn( - "import way `import parl.framework` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - -from parl.core.fluid.model import * -from parl.core.fluid.algorithm import * -from parl.core.fluid.agent import * diff --git a/parl/layers/__init__.py b/parl/layers/__init__.py deleted file mode 100644 index 3283927adcb620094c0df4dea0a0ccf8533e3766..0000000000000000000000000000000000000000 --- a/parl/layers/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -warnings.simplefilter('default') - -warnings.warn( - "import way `import parl.layers` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import layers` or `import parl; parl.layers` instead.", - DeprecationWarning, - stacklevel=2) - -from parl.core.fluid.layers import * diff --git a/parl/plutils/__init__.py b/parl/plutils/__init__.py deleted file mode 100644 index 8bac1d7d3677b82f03e8c64066ef6748fa03d577..0000000000000000000000000000000000000000 --- a/parl/plutils/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -print( - "import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead." -) - -from parl.core.fluid.plutils.common import * diff --git a/parl/plutils/common.py b/parl/plutils/common.py deleted file mode 100644 index 8bac1d7d3677b82f03e8c64066ef6748fa03d577..0000000000000000000000000000000000000000 --- a/parl/plutils/common.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -print( - "import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead." -) - -from parl.core.fluid.plutils.common import * diff --git a/parl/remote/client.py b/parl/remote/client.py index 0c06effc6aa6554dc72c39ec75739bc82c11453f..379459c5768914a012cf89182724f1233cbf1329 100644 --- a/parl/remote/client.py +++ b/parl/remote/client.py @@ -59,6 +59,7 @@ class Client(object): self.heartbeat_socket_initialized = threading.Event() self.master_is_alive = True self.client_is_alive = True + self.log_monitor_url = None self.executable_path = self.get_executable_path() @@ -105,9 +106,19 @@ class Client(object): for file in distributed_files: assert os.path.exists(file) + assert not os.path.isabs( + file + ), "[XPARL] Please do not distribute a file with absolute path." with open(file, 'rb') as f: content = f.read() pyfiles['other_files'][file] = content + # append entry file to code list + main_file = sys.argv[0] + with open(main_file, 'rb') as code_file: + code = code_file.read() + # parl/remote/remote_decorator.py -> remote_decorator.py + file_name = main_file.split(os.sep)[-1] + pyfiles['python_files'][file_name] = code except AssertionError as e: raise Exception( 'Failed to create the client, the file {} does not exist.'. @@ -132,14 +143,19 @@ class Client(object): thread.start() self.heartbeat_socket_initialized.wait() + self.client_id = self.reply_master_heartbeat_address.replace(':', '_') + \ + '_' + str(int(time.time())) + # check if the master is connected properly try: self.submit_job_socket.send_multipart([ remote_constants.CLIENT_CONNECT_TAG, - to_byte(self.heartbeat_master_address), - to_byte(socket.gethostname()) + to_byte(self.reply_master_heartbeat_address), + to_byte(socket.gethostname()), + to_byte(self.client_id), ]) - _ = self.submit_job_socket.recv_multipart() + message = self.submit_job_socket.recv_multipart() + self.log_monitor_url = to_str(message[1]) except zmq.error.Again as e: logger.warning("[Client] Can not connect to the master, please " "check if master is started and ensure the input " @@ -150,17 +166,18 @@ class Client(object): "address {} is correct.".format(master_address)) def _reply_heartbeat(self): - """Reply heartbeat signals to the specific node.""" + """Reply heartbeat signals to the master node.""" socket = self.ctx.socket(zmq.REP) socket.linger = 0 socket.setsockopt(zmq.RCVTIMEO, remote_constants.HEARTBEAT_RCVTIMEO_S * 1000) - heartbeat_master_port =\ + reply_master_heartbeat_port =\ socket.bind_to_random_port(addr="tcp://*") - self.heartbeat_master_address = "{}:{}".format(get_ip_address(), - heartbeat_master_port) + self.reply_master_heartbeat_address = "{}:{}".format( + get_ip_address(), reply_master_heartbeat_port) self.heartbeat_socket_initialized.set() + connected = False while self.client_is_alive and self.master_is_alive: try: message = socket.recv_multipart() @@ -170,11 +187,18 @@ class Client(object): remote_constants.HEARTBEAT_TAG, to_byte(self.executable_path), to_byte(str(self.actor_num)), - to_byte(str(elapsed_time)) - ]) + to_byte(str(elapsed_time)), + to_byte(str(self.log_monitor_url)), + ]) # TODO: remove additional information except zmq.error.Again as e: - logger.warning("[Client] Cannot connect to the master." - "Please check if it is still alive.") + if connected: + logger.warning("[Client] Cannot connect to the master." 
+ "Please check if it is still alive.") + else: + logger.warning( + "[Client] Cannot connect to the master." + "Please check the firewall between client and master.(e.g., ping the master IP)" + ) self.master_is_alive = False socket.close(0) logger.warning("Client exit replying heartbeat for master.") @@ -182,7 +206,7 @@ class Client(object): def _check_and_monitor_job(self, job_heartbeat_address, ping_heartbeat_address, max_memory): """ Sometimes the client may receive a job that is dead, thus - we have to check if this job is still alive before sending it to the actor. + we have to check if this job is still alive before adding it to the `actor_num`. """ # job_heartbeat_socket: sends heartbeat signal to job job_heartbeat_socket = self.ctx.socket(zmq.REQ) @@ -271,7 +295,8 @@ class Client(object): self.lock.acquire() self.submit_job_socket.send_multipart([ remote_constants.CLIENT_SUBMIT_TAG, - to_byte(self.heartbeat_master_address) + to_byte(self.reply_master_heartbeat_address), + to_byte(self.client_id), ]) message = self.submit_job_socket.recv_multipart() self.lock.release() @@ -326,9 +351,10 @@ def connect(master_address, distributed_files=[]): Exception: An exception is raised if the master node is not started. """ - assert len(master_address.split(":")) == 2, "please input address in " +\ + assert len(master_address.split(":")) == 2, "Please input address in " +\ "{ip}:{port} format" global GLOBAL_CLIENT + addr = master_address.split(":")[0] cur_process_id = os.getpid() if GLOBAL_CLIENT is None: GLOBAL_CLIENT = Client(master_address, cur_process_id, @@ -337,6 +363,8 @@ def connect(master_address, distributed_files=[]): if GLOBAL_CLIENT.process_id != cur_process_id: GLOBAL_CLIENT = Client(master_address, cur_process_id, distributed_files) + logger.info("Remote actors log url: {}".format( + GLOBAL_CLIENT.log_monitor_url)) def get_global_client(): @@ -366,5 +394,5 @@ def disconnect(): GLOBAL_CLIENT = None else: logger.info( - "No client to be released. Please make sure that you have call `parl.connect`" + "No client to be released. Please make sure that you have called `parl.connect`" ) diff --git a/parl/remote/cluster_monitor.py b/parl/remote/cluster_monitor.py index 99bc2beac9e4e60565c31213ec05bdb96c7678a2..889f91586161f94cddb2a16670360cc8b9d4aca0 100644 --- a/parl/remote/cluster_monitor.py +++ b/parl/remote/cluster_monitor.py @@ -28,7 +28,8 @@ class ClusterMonitor(object): def __init__(self): self.status = { 'workers': defaultdict(dict), - 'clients': defaultdict(dict) + 'clients': defaultdict(dict), + 'client_jobs': defaultdict(dict), } self.lock = threading.Lock() @@ -46,6 +47,11 @@ class ClusterMonitor(object): worker_status['hostname'] = hostname self.lock.release() + def add_client_job(self, client_id, job_info): + self.lock.acquire() + self.status['client_jobs'][client_id].update(job_info) + self.lock.release() + def update_client_status(self, client_status, client_address, client_hostname): """Update client status with message send from client heartbeat. @@ -61,7 +67,8 @@ class ClusterMonitor(object): 'client_address': client_hostname, 'file_path': to_str(client_status[1]), 'actor_num': int(to_str(client_status[2])), - 'time': to_str(client_status[3]) + 'time': to_str(client_status[3]), + 'log_monitor_url': to_str(client_status[4]), } self.lock.release() @@ -96,14 +103,15 @@ class ClusterMonitor(object): self.status['workers'].pop(worker_address) self.lock.release() - def drop_cluster_status(self, client_address): - """Drop cluster status when it exits. 
+ def drop_client_status(self, client_address): + """Drop client status when it exits. Args: - cluster_address (str): IP address of the exited client. + client_address (str): IP address of the exited client. """ self.lock.acquire() - self.status['clients'].pop(client_address) + if client_address in self.status['clients']: + self.status['clients'].pop(client_address) self.lock.release() def get_status_info(self): diff --git a/parl/framework/algorithm_base.py b/parl/remote/compatible_trick.py similarity index 53% rename from parl/framework/algorithm_base.py rename to parl/remote/compatible_trick.py index 2499c639077107d4c16387a1941b8252dd6a84fa..e61ade0c50af1dd51160bfbb149672c55ca20a29 100644 --- a/parl/framework/algorithm_base.py +++ b/parl/remote/compatible_trick.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +This file is used to fix the problem that cloudpickle cannot load some packages normally on Mac OS. +We work around the problem by trying to load these packages in the main module in advance. -import warnings +Template: -warnings.simplefilter('default') +try: + import [PACKAGE1] +except ImportError: + pass -warnings.warn( - "module `parl.framework.algorithm_base.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Algorithm` instead.", - DeprecationWarning, - stacklevel=2) +try: + import [PACKAGE2] +except ImportError: + pass -from parl.core.fluid.algorithm import * +""" +from parl.utils import _IS_MAC + +if _IS_MAC: + try: + import rlschool + except ImportError: + pass diff --git a/parl/remote/job.py b/parl/remote/job.py index 00840c088ace82ad22f87d5e8e1433691c2143e4..d835e5389aa447bb69567b61f6f1c60b9cf99d58 100644 --- a/parl/remote/job.py +++ b/parl/remote/job.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Fix known cloudpickle compatibility problems.
+import compatible_trick + import os os.environ['CUDA_VISIBLE_DEVICES'] = '' os.environ['XPARL'] = 'True' @@ -33,6 +36,7 @@ from parl.utils.communication import loads_argument, loads_return,\ from parl.remote import remote_constants from parl.utils.exceptions import SerializeError, DeserializeError from parl.remote.message import InitializedJob +from parl.remote.utils import load_remote_class, redirect_stdout_to_file class Job(object): @@ -44,7 +48,7 @@ class Job(object): """ - def __init__(self, worker_address): + def __init__(self, worker_address, log_server_address): """ Args: worker_address(str): worker_address for sending job information(e.g, pid) @@ -56,16 +60,21 @@ class Job(object): self.max_memory = None self.job_address_receiver, job_address_sender = Pipe() + self.job_id_receiver, job_id_sender = Pipe() self.worker_address = worker_address + self.log_server_address = log_server_address self.job_ip = get_ip_address() self.pid = os.getpid() - self.lock = threading.Lock() self.run_job_process = Process( - target=self.run, args=(job_address_sender, )) + target=self.run, args=(job_address_sender, job_id_sender)) self.run_job_process.start() - + """ + NOTE: + In Windows, it will raise errors when creating threading.Lock before starting multiprocess.Process. + """ + self.lock = threading.Lock() self._create_sockets() process = psutil.Process(self.pid) @@ -81,7 +90,7 @@ class Job(object): _ = self.kill_job_socket.recv_multipart() except zmq.error.Again as e: pass - os._exit(1) + os._exit(0) def _create_sockets(self): """Create five sockets for each job in main process. @@ -95,6 +104,7 @@ class Job(object): """ # wait for another process to create reply socket self.job_address = self.job_address_receiver.recv() + self.job_id = self.job_id_receiver.recv() self.ctx = zmq.Context() # create the job_socket @@ -128,7 +138,8 @@ class Job(object): # sends job information to the worker initialized_job = InitializedJob( self.job_address, worker_heartbeat_address, - client_heartbeat_address, ping_heartbeat_address, None, self.pid) + client_heartbeat_address, ping_heartbeat_address, None, self.pid, + self.job_id, self.log_server_address) self.job_socket.send_multipart( [remote_constants.NORMAL_TAG, cloudpickle.dumps(initialized_job)]) @@ -237,7 +248,7 @@ class Job(object): the python files to the job. Later, the job will save these files to a temporary directory and add the temporary diretory to Python's working directory. - + Args: reply_socket (sockert): main socket to accept commands of remote object. job_address (String): address of reply_socket. @@ -262,12 +273,15 @@ class Job(object): # create directory (i.e. 
./rom_files/) if '/' in file: try: - os.makedirs(os.path.join(*file.rsplit('/')[:-1])) + sep = os.sep + recursive_dirs = os.path.join(*(file.split(sep)[:-1])) + recursive_dirs = os.path.join(envdir, recursive_dirs) + os.makedirs(recursive_dirs) except OSError as e: pass + file = os.path.join(envdir, file) with open(file, 'wb') as f: f.write(content) - logger.info('[job] reply') reply_socket.send_multipart([remote_constants.NORMAL_TAG]) return envdir else: @@ -295,9 +309,15 @@ class Job(object): if tag == remote_constants.INIT_OBJECT_TAG: try: - cls = cloudpickle.loads(message[1]) + file_name, class_name, end_of_file = cloudpickle.loads( + message[1]) + #/home/nlp-ol/Firework/baidu/nlp/evokit/python_api/es_agent -> es_agent + file_name = file_name.split(os.sep)[-1] + cls = load_remote_class(file_name, class_name, end_of_file) args, kwargs = cloudpickle.loads(message[2]) - obj = cls(*args, **kwargs) + logfile_path = os.path.join(self.log_dir, 'stdout.log') + with redirect_stdout_to_file(logfile_path): + obj = cls(*args, **kwargs) except Exception as e: traceback_str = str(traceback.format_exc()) error_str = str(e) @@ -318,7 +338,7 @@ class Job(object): return obj - def run(self, job_address_sender): + def run(self, job_address_sender, job_id_sender): """An infinite loop waiting for a new task. Args: @@ -333,19 +353,28 @@ class Job(object): job_ip = get_ip_address() job_address = "{}:{}".format(job_ip, job_port) + job_id = job_address.replace(':', '_') + '_' + str(int(time.time())) + self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id)) + logger.set_dir(self.log_dir) + logger.info( + "[Job] Job {} initialized. Reply heartbeat socket Address: {}.". + format(job_id, job_address)) + job_address_sender.send(job_address) + job_id_sender.send(job_id) try: # receive source code from the actor and append them to the environment variables. envdir = self.wait_for_files(reply_socket, job_address) - sys.path.append(envdir) + sys.path.insert(0, envdir) + os.chdir(envdir) obj = self.wait_for_connection(reply_socket) assert obj is not None self.single_task(obj, reply_socket, job_address) except Exception as e: logger.error( - "Error occurs when running a single task. We will reset this job. Reason:{}" + "Error occurs when running a single task. We will reset this job. 
\nReason:{}" .format(e)) traceback_str = str(traceback.format_exc()) logger.error("traceback:\n{}".format(traceback_str)) @@ -376,7 +405,12 @@ class Job(object): function_name = to_str(message[1]) data = message[2] args, kwargs = loads_argument(data) - ret = getattr(obj, function_name)(*args, **kwargs) + + # Redirect stdout to stdout.log temporarily + logfile_path = os.path.join(self.log_dir, 'stdout.log') + with redirect_stdout_to_file(logfile_path): + ret = getattr(obj, function_name)(*args, **kwargs) + ret = dumps_return(ret) reply_socket.send_multipart( @@ -435,5 +469,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--worker_address", required=True, type=str, help="worker_address") + parser.add_argument( + "--log_server_address", + required=True, + type=str, + help="log_server_address, address of the log web server on worker") args = parser.parse_args() - job = Job(args.worker_address) + job = Job(args.worker_address, args.log_server_address) diff --git a/parl/remote/log_server.py b/parl/remote/log_server.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ad1cf882311b9bdffcd990e7b33ddff0711bc2 --- /dev/null +++ b/parl/remote/log_server.py @@ -0,0 +1,102 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
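+#
+# Usage sketch (illustrative only, not part of the server): a client can query
+# the two endpoints defined below over HTTP. Host, port and job_id here are
+# example values.
+#
+#     import requests
+#     r = requests.get("http://localhost:8000/get-log",
+#                      params={"job_id": "172.18.0.2_8001_1577000000"})
+#     print(r.json()["log"])        # newest LINE_NUM lines of stdout.log
+#     r = requests.get("http://localhost:8000/download-log",
+#                      params={"job_id": "172.18.0.2_8001_1577000000"})
+#     open("job_stdout.log", "wb").write(r.content)   # full log file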
+ +import argparse +import linecache +import os + +from flask import Flask, current_app, jsonify, make_response, request, send_file +from flask_cors import CORS + +app = Flask(__name__) +CORS(app) + + +@app.route( + "/get-log", methods=[ + 'GET', + ]) +def get_log(): + ''' + args: + job_id: id of the remote job + response: + log: newest `LINE_NUM` lines of the log file + ''' + try: + job_id = request.args['job_id'] + except: + return make_response( + jsonify(message="No job_id provided, please check your request."), + 400) + + log_dir = current_app.config.get('LOG_DIR') + log_dir = os.path.expanduser(log_dir) + log_file_path = os.path.join(log_dir, job_id, 'stdout.log') + if not os.path.isfile(log_file_path): + return make_response( + jsonify(message="Log not exsits, please check your job_id"), 400) + else: + line_num = current_app.config.get('LINE_NUM') + linecache.checkcache(log_file_path) + log_content = ''.join(linecache.getlines(log_file_path)[-line_num:]) + return make_response( + jsonify(message="Log exsits, content in log", log=log_content), + 200) + + +@app.route( + '/download-log', methods=[ + 'GET', + ]) +def download_log(): + ''' + args: + job_id: the id of the remote job + response: + log: log file + ''' + try: + job_id = request.args['job_id'] + except: + return make_response( + jsonify(message="No job_id provided, please check your request."), + 400) + log_dir = current_app.config.get('LOG_DIR') + log_dir = os.path.expanduser(log_dir) + log_file_path = os.path.join(log_dir, job_id, 'stdout.log') + if not os.path.isfile(log_file_path): + return make_response( + jsonify(message="Log not exsits, please check your job_id"), 400) + else: + return send_file(log_file_path, as_attachment=True) + + +if __name__ == "__main__": + import logging + log = logging.getLogger('werkzeug') + log.disabled = True + + parser = argparse.ArgumentParser() + parser.add_argument('--port', required=True, type=int) + parser.add_argument('--log_dir', required=True, type=str) + parser.add_argument('--line_num', required=True, type=int) + args = parser.parse_args() + + app.config.from_mapping( + LOG_DIR=args.log_dir, + LINE_NUM=args.line_num, + ) + + app.run(host="0.0.0.0", port=args.port) diff --git a/parl/remote/master.py b/parl/remote/master.py index a5d09adb59b280dcb04bf8cbd6f99b8f6e5e7845..8cca0290a7ad68407026f2e24c4613da83af56a3 100644 --- a/parl/remote/master.py +++ b/parl/remote/master.py @@ -57,11 +57,12 @@ class Master(object): port: The ip port that the master node binds to. """ - def __init__(self, port): + def __init__(self, port, monitor_port=None): self.ctx = zmq.Context() self.master_ip = get_ip_address() + self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port) logger.set_dir( - os.path.expanduser('~/.parl_data/master/{}:{}'.format( + os.path.expanduser('~/.parl_data/master/{}_{}'.format( self.master_ip, port))) self.client_socket = self.ctx.socket(zmq.REP) self.client_socket.bind("tcp://*:{}".format(port)) @@ -135,7 +136,7 @@ class Master(object): except zmq.error.Again as e: client_is_alive = False - self.cluster_monitor.drop_cluster_status( + self.cluster_monitor.drop_client_status( client_heartbeat_address) logger.warning("[Master] cannot connect to the client " + "{}. 
".format(client_heartbeat_address) + @@ -205,8 +206,11 @@ class Master(object): # a client connects to the master elif tag == remote_constants.CLIENT_CONNECT_TAG: + # `client_heartbeat_address` is the + # `reply_master_heartbeat_address` of the client client_heartbeat_address = to_str(message[1]) client_hostname = to_str(message[2]) + client_id = to_str(message[3]) self.client_hostname[client_heartbeat_address] = client_hostname logger.info( "Client {} is connected.".format(client_heartbeat_address)) @@ -215,11 +219,14 @@ class Master(object): target=self._create_client_monitor, args=(client_heartbeat_address, )) thread.start() - self.client_socket.send_multipart([remote_constants.NORMAL_TAG]) + log_monitor_address = "{}/logs?client_id={}".format( + self.monitor_url, client_id) + self.client_socket.send_multipart( + [remote_constants.NORMAL_TAG, + to_byte(log_monitor_address)]) # a client submits a job to the master elif tag == remote_constants.CLIENT_SUBMIT_TAG: - # check available CPU resources if self.cpu_num: logger.info("Submitting job...") @@ -230,6 +237,9 @@ class Master(object): to_byte(job.client_heartbeat_address), to_byte(job.ping_heartbeat_address), ]) + client_id = to_str(message[2]) + job_info = {job.job_id: job.log_server_address} + self.cluster_monitor.add_client_job(client_id, job_info) self._print_workers() else: self.client_socket.send_multipart([remote_constants.CPU_TAG]) diff --git a/parl/remote/message.py b/parl/remote/message.py index 8be8d4657110011c34cca8702290a9942d225e36..97e5482f9e5a25fe52b6919494f4dde1b21e7d5b 100644 --- a/parl/remote/message.py +++ b/parl/remote/message.py @@ -14,9 +14,15 @@ class InitializedJob(object): - def __init__(self, job_address, worker_heartbeat_address, - client_heartbeat_address, ping_heartbeat_address, - worker_address, pid): + def __init__(self, + job_address, + worker_heartbeat_address, + client_heartbeat_address, + ping_heartbeat_address, + worker_address, + pid, + job_id=None, + log_server_address=None): """ Args: job_address(str): Job address to which the new task connect. 
@@ -35,6 +41,8 @@ class InitializedJob(object): self.worker_address = worker_address self.pid = pid self.is_alive = True + self.job_id = job_id + self.log_server_address = log_server_address class InitializedWorker(object): diff --git a/parl/remote/monitor.py b/parl/remote/monitor.py index 8f5c1d5f1d4b4919b4230f27a7a656a1417f6d23..452888940c4eb8de94632f5adc55a097255e94c0 100644 --- a/parl/remote/monitor.py +++ b/parl/remote/monitor.py @@ -19,7 +19,7 @@ import time import zmq import threading -from flask import Flask, render_template, jsonify +from flask import Flask, render_template, jsonify, request app = Flask(__name__) @@ -42,7 +42,7 @@ class ClusterMonitor(object): def __init__(self, master_address): ctx = zmq.Context() self.socket = ctx.socket(zmq.REQ) - self.socket.setsockopt(zmq.RCVTIMEO, 10000) + self.socket.setsockopt(zmq.RCVTIMEO, 30000) self.socket.connect('tcp://{}'.format(master_address)) self.data = None @@ -81,6 +81,7 @@ class ClusterMonitor(object): data['total_vacant_cpus'] = total_vacant_cpus data['total_cpus'] = total_used_cpus + total_vacant_cpus data['clients'] = list(status['clients'].values()) + data['client_jobs'] = status['client_jobs'] self.data = data time.sleep(10) @@ -99,7 +100,44 @@ def cluster(): return jsonify(data) +@app.route( + '/logs', methods=[ + 'GET', + ]) +def logs(): + client_id = request.args.get('client_id') + return render_template('jobs.html', client_id=client_id) + + +@app.route( + '/get-jobs', methods=[ + 'GET', + ]) +def get_jobs(): + client_id = request.args.get('client_id') + jobs = CLUSTER_MONITOR.get_data()['client_jobs'].get(client_id) + data = [] + if jobs: + for idx, job_id in enumerate(jobs): + monitor_url = jobs[job_id] + data.append({ + "id": + idx, + "job_id": + job_id, + "log_url": + "http://{}/get-log?job_id={}".format(monitor_url, job_id), + "download_url": + "http://{}/download-log?job_id={}".format(monitor_url, job_id), + }) + return jsonify(data) + + if __name__ == "__main__": + import logging + log = logging.getLogger('werkzeug') + log.disabled = True + parser = argparse.ArgumentParser() parser.add_argument('--monitor_port', default=1234, type=int) parser.add_argument('--address', default='localhost:8010', type=str) diff --git a/parl/remote/remote_decorator.py b/parl/remote/remote_decorator.py index 32a463f85b82acb7483ec5a082f12136565326e2..a066abc40832fdce00fd00d1784aa75c60925e00 100644 --- a/parl/remote/remote_decorator.py +++ b/parl/remote/remote_decorator.py @@ -18,6 +18,7 @@ import threading import time import zmq import numpy as np +import inspect from parl.utils import get_ip_address, logger, to_str, to_byte from parl.utils.communication import loads_argument, loads_return,\ @@ -55,7 +56,7 @@ def remote_class(*args, **kwargs): actor = Actor() actor.step() - # Set maximum memory usage to 300 MB for each object. + # Set maximum memory usage to 300 MB for each object. @parl.remote_class(max_memory=300) class LimitedActor(object): ... @@ -74,6 +75,12 @@ def remote_class(*args, **kwargs): """ def decorator(cls): + # we are not going to create a remote actor in job.py + if 'XPARL' in os.environ and os.environ['XPARL'] == 'True': + logger.warning( + "Note: this object will be runnning as a local object") + return cls + class RemoteWrapper(object): """ Wrapper for remote class in client side. 
@@ -113,10 +120,13 @@ def remote_class(*args, **kwargs): self.job_shutdown = False self.send_file(self.job_socket) - + file_name = inspect.getfile(cls)[:-3] + cls_source = inspect.getsourcelines(cls) + end_of_file = cls_source[1] + len(cls_source[0]) + class_name = cls.__name__ self.job_socket.send_multipart([ remote_constants.INIT_OBJECT_TAG, - cloudpickle.dumps(cls), + cloudpickle.dumps([file_name, class_name, end_of_file]), cloudpickle.dumps([args, kwargs]), ]) message = self.job_socket.recv_multipart() @@ -128,6 +138,10 @@ def remote_class(*args, **kwargs): def __del__(self): """Delete the remote class object and release remote resources.""" + try: + self.job_socket.setsockopt(zmq.RCVTIMEO, 1 * 1000) + except AttributeError: + pass if not self.job_shutdown: try: self.job_socket.send_multipart( @@ -138,6 +152,8 @@ def remote_class(*args, **kwargs): pass except zmq.error.ZMQError: pass + except TypeError: + pass def send_file(self, socket): try: @@ -212,6 +228,7 @@ def remote_class(*args, **kwargs): return wrapper + RemoteWrapper._original = cls return RemoteWrapper max_memory = kwargs.get('max_memory') diff --git a/parl/remote/scripts.py b/parl/remote/scripts.py index 71677d692878eef63f65b0ff1054cb6233b0d7a5..51cf3cabf7cb3a3dba0cc3d0d00a7ad55406b4f8 100644 --- a/parl/remote/scripts.py +++ b/parl/remote/scripts.py @@ -18,15 +18,18 @@ import multiprocessing import os import random import re -import socket +import requests import subprocess import sys import time import threading +import tempfile import warnings import zmq from multiprocessing import Process -from parl.utils import get_ip_address, to_str +from parl.utils import (_IS_WINDOWS, get_free_tcp_port, get_ip_address, + get_port_from_range, is_port_available, kill_process, + to_str) from parl.remote.remote_constants import STATUS_TAG # A flag to mark if parl is started from a command line @@ -34,33 +37,18 @@ os.environ['XPARL'] = 'True' # Solve `Click will abort further execution because Python 3 was configured # to use ASCII as encoding for the environment` error. -locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + +if not _IS_WINDOWS: + try: + locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + except: + pass #TODO: this line will cause error in python2/macOS if sys.version_info.major == 3: warnings.simplefilter("ignore", ResourceWarning) -def get_free_tcp_port(): - tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tcp.bind(('', 0)) - addr, port = tcp.getsockname() - tcp.close() - return str(port) - - -def is_port_available(port): - """ Check if a port is used. - - True if the port is available for connection. 
- """ - port = int(port) - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - available = sock.connect_ex(('localhost', port)) - sock.close() - return available - - def is_master_started(address): ctx = zmq.Context() socket = ctx.socket(zmq.REQ) @@ -77,6 +65,33 @@ def is_master_started(address): return False +def parse_port_range(log_server_port_range): + try: + re.match(r'\d*[-]\d*', log_server_port_range).span() + except: + raise Exception( + "The input log_server_port_range should be `start-end` format.") + start, end = map(int, log_server_port_range.split('-')) + if start > end: + raise Exception( + "Start port number must be smaller than the end port number.") + + return start, end + + +def is_log_server_started(ip_address, port): + started = False + for _ in range(3): + try: + r = requests.get("http://{}:{}/get-log".format(ip_address, port)) + if r.status_code == 400: + started = True + break + except: + time.sleep(3) + return started + + @click.group() def cli(): pass @@ -95,7 +110,15 @@ def cli(): "cpus of this machine.") @click.option( "--monitor_port", help="The port to start a cluster monitor.", type=str) -def start_master(port, cpu_num, monitor_port, debug): +@click.option( + "--log_server_port_range", + help=''' + Port range (start-end) of the log server on the worker. Default: 8000-9000. + The worker will pick a random avaliable port in [start, end] for the log server. + ''', + default="8000-9000", + type=str) +def start_master(port, cpu_num, monitor_port, debug, log_server_port_range): if debug: os.environ['DEBUG'] = 'True' @@ -112,19 +135,33 @@ def start_master(port, cpu_num, monitor_port, debug): cpu_num) if cpu_num is not None else multiprocessing.cpu_count() start_file = __file__.replace('scripts.pyc', 'start.py') start_file = start_file.replace('scripts.py', 'start.py') + monitor_file = __file__.replace('scripts.pyc', 'monitor.py') + monitor_file = monitor_file.replace('scripts.py', 'monitor.py') + monitor_port = monitor_port if monitor_port else get_free_tcp_port() + start, end = parse_port_range(log_server_port_range) + log_server_port = get_port_from_range(start, end) + while log_server_port == monitor_port or log_server_port == port: + log_server_port = get_port_from_range(start, end) master_command = [ - sys.executable, start_file, "--name", "master", "--port", port + sys.executable, + start_file, + "--name", + "master", + "--port", + port, + "--monitor_port", + monitor_port, ] worker_command = [ sys.executable, start_file, "--name", "worker", "--address", "localhost:" + str(port), "--cpu_num", - str(cpu_num) + str(cpu_num), '--log_server_port', + str(log_server_port) ] monitor_command = [ - sys.executable, '{}/monitor.py'.format(__file__[:__file__.rfind('/')]), - "--monitor_port", + sys.executable, monitor_file, "--monitor_port", str(monitor_port), "--address", "localhost:" + str(port) ] @@ -133,11 +170,21 @@ def start_master(port, cpu_num, monitor_port, debug): # Redirect the output to DEVNULL to solve the warning log. 
_ = subprocess.Popen( master_command, stdout=FNULL, stderr=subprocess.STDOUT) + if cpu_num > 0: + # Sleep 1s for master ready + time.sleep(1) _ = subprocess.Popen( worker_command, stdout=FNULL, stderr=subprocess.STDOUT) - _ = subprocess.Popen( - monitor_command, stdout=FNULL, stderr=subprocess.STDOUT) + + if _IS_WINDOWS: + # TODO(@zenghsh3) redirecting stdout of monitor subprocess to FNULL will cause occasional failure + tmp_file = tempfile.TemporaryFile() + _ = subprocess.Popen(monitor_command, stdout=tmp_file) + tmp_file.close() + else: + _ = subprocess.Popen( + monitor_command, stdout=FNULL, stderr=subprocess.STDOUT) FNULL.close() if cpu_num > 0: @@ -158,16 +205,20 @@ def start_master(port, cpu_num, monitor_port, debug): click.echo(monitor_info) # check if monitor is started - cmd = r'ps -ef | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format( - monitor_port, port) - monitor_is_started = False + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%remote\\monitor.py --monitor_port {} --address localhost:{}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format( + monitor_port, port) + else: + cmd = r'ps -ef | grep -v grep | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format( + monitor_port, port) for i in range(3): - check_monitor_is_started = os.popen(cmd).read().strip().split('\n') - if len(check_monitor_is_started) == 2: + check_monitor_is_started = os.popen(cmd).read() + if len(check_monitor_is_started) > 0: monitor_is_started = True break time.sleep(3) + master_ip = get_ip_address() if monitor_is_started: start_info = """ @@ -194,6 +245,9 @@ def start_master(port, cpu_num, monitor_port, debug): """.format(start_info, master_ip, port) click.echo(monitor_info) + if not is_log_server_started(master_ip, log_server_port): + click.echo("# Fail to start the log server.") + @click.command("connect", short_help="Start a worker node.") @click.option( @@ -203,36 +257,53 @@ def start_master(port, cpu_num, monitor_port, debug): type=int, help="Set number of cpu manually. If not set, it will use all " "cpus of this machine.") -def start_worker(address, cpu_num): +@click.option( + "--log_server_port_range", + help=''' + Port range (start-end) of the log server on the worker. Default: 8000-9000. + The worker will pick a random avaliable port in [start, end] for the log server. 
+ ''', + default="8000-9000", + type=str) +def start_worker(address, cpu_num, log_server_port_range): + start, end = parse_port_range(log_server_port_range) + log_server_port = get_port_from_range(start, end) + if not is_master_started(address): raise Exception("Worker can not connect to the master node, " + "please check if the input address {} ".format( address) + "is correct.") cpu_num = str(cpu_num) if cpu_num else '' + start_file = __file__.replace('scripts.pyc', 'start.py') + start_file = start_file.replace('scripts.py', 'start.py') + command = [ - sys.executable, "{}/start.py".format(__file__[:-11]), "--name", - "worker", "--address", address, "--cpu_num", - str(cpu_num) + sys.executable, start_file, "--name", "worker", "--address", address, + "--cpu_num", + str(cpu_num), "--log_server_port", + str(log_server_port) ] p = subprocess.Popen(command) + if not is_log_server_started(get_ip_address(), log_server_port): + click.echo("# Fail to start the log server.") + @click.command("stop", help="Exit the cluster.") def stop(): - command = ( - "ps aux | grep remote/start.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) - command = ( - "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) - command = ( - "ps aux | grep remote/monitor.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) + kill_process('remote/start.py') + kill_process('remote/job.py') + kill_process('remote/monitor.py') + kill_process('remote/log_server.py') @click.command("status") def status(): - cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address' + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%remote\\start.py --name worker --address%'" get commandline /format:list | findstr /V wmic | findstr CommandLine=''' + else: + cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address' + content = os.popen(cmd).read().strip() pattern = re.compile('--address (.*?) 
--cpu') clusters = set(pattern.findall(content)) @@ -242,7 +313,11 @@ def status(): ctx = zmq.Context() status = [] for cluster in clusters: - cmd = r'ps -ef | grep address\ {}'.format(cluster) + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%address {}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format( + cluster) + else: + cmd = r'ps -ef | grep address\ {}'.format(cluster) content = os.popen(cmd).read() pattern = re.compile('--monitor_port (.*?)\n', re.S) monitors = pattern.findall(content) diff --git a/parl/remote/start.py b/parl/remote/start.py index d9aa231db65a04ee410d7df6660d9eaa75150828..83c8dca86a726b8c77307b0e080829ae67bc186a 100644 --- a/parl/remote/start.py +++ b/parl/remote/start.py @@ -28,13 +28,15 @@ def main(args): if args.name == 'master': port = args.port - master = Master(port) + monitor_port = args.monitor_port + master = Master(port, monitor_port) master.run() elif args.name == 'worker': address = args.address + log_server_port = args.log_server_port cpu_num = int(args.cpu_num) if args.cpu_num else None - worker = Worker(address, cpu_num) + worker = Worker(address, cpu_num, log_server_port) worker.run() else: @@ -48,5 +50,7 @@ if __name__ == "__main__": parser.add_argument('--port', default='1234', type=str) parser.add_argument('--address', default='localhost:1234', type=str) parser.add_argument('--cpu_num', default='', type=str) + parser.add_argument('--monitor_port', default='', type=str) + parser.add_argument('--log_server_port', default='', type=str) args = parser.parse_args() main(args) diff --git a/parl/remote/static/js/ansi_up.js b/parl/remote/static/js/ansi_up.js new file mode 100644 index 0000000000000000000000000000000000000000..b207399e24887a4d2b13e03482f98f16b2137cf0 --- /dev/null +++ b/parl/remote/static/js/ansi_up.js @@ -0,0 +1,421 @@ +/* ansi_up.js + * author : Dru Nelson + * license : MIT + * http://github.com/drudru/ansi_up + */ +(function (root, factory) { + if (typeof define === 'function' && define.amd) { + // AMD. Register as an anonymous module. 
+ define(['exports'], factory); + } else if (typeof exports === 'object' && typeof exports.nodeName !== 'string') { + // CommonJS + factory(exports); + } else { + // Browser globals + var exp = {}; + factory(exp); + root.AnsiUp = exp.default; + } +}(this, function (exports) { +"use strict"; +var __makeTemplateObject = (this && this.__makeTemplateObject) || function (cooked, raw) { + if (Object.defineProperty) { Object.defineProperty(cooked, "raw", { value: raw }); } else { cooked.raw = raw; } + return cooked; +}; +var PacketKind; +(function (PacketKind) { + PacketKind[PacketKind["EOS"] = 0] = "EOS"; + PacketKind[PacketKind["Text"] = 1] = "Text"; + PacketKind[PacketKind["Incomplete"] = 2] = "Incomplete"; + PacketKind[PacketKind["ESC"] = 3] = "ESC"; + PacketKind[PacketKind["Unknown"] = 4] = "Unknown"; + PacketKind[PacketKind["SGR"] = 5] = "SGR"; + PacketKind[PacketKind["OSCURL"] = 6] = "OSCURL"; +})(PacketKind || (PacketKind = {})); +var AnsiUp = (function () { + function AnsiUp() { + this.VERSION = "4.0.3"; + this.setup_palettes(); + this._use_classes = false; + this._escape_for_html = true; + this.bold = false; + this.fg = this.bg = null; + this._buffer = ''; + this._url_whitelist = { 'http': 1, 'https': 1 }; + } + Object.defineProperty(AnsiUp.prototype, "use_classes", { + get: function () { + return this._use_classes; + }, + set: function (arg) { + this._use_classes = arg; + }, + enumerable: true, + configurable: true + }); + Object.defineProperty(AnsiUp.prototype, "escape_for_html", { + get: function () { + return this._escape_for_html; + }, + set: function (arg) { + this._escape_for_html = arg; + }, + enumerable: true, + configurable: true + }); + Object.defineProperty(AnsiUp.prototype, "url_whitelist", { + get: function () { + return this._url_whitelist; + }, + set: function (arg) { + this._url_whitelist = arg; + }, + enumerable: true, + configurable: true + }); + AnsiUp.prototype.setup_palettes = function () { + var _this = this; + this.ansi_colors = + [ + [ + { rgb: [0, 0, 0], class_name: "ansi-black" }, + { rgb: [187, 0, 0], class_name: "ansi-red" }, + { rgb: [0, 187, 0], class_name: "ansi-green" }, + { rgb: [187, 187, 0], class_name: "ansi-yellow" }, + { rgb: [0, 0, 187], class_name: "ansi-blue" }, + { rgb: [187, 0, 187], class_name: "ansi-magenta" }, + { rgb: [0, 187, 187], class_name: "ansi-cyan" }, + { rgb: [255, 255, 255], class_name: "ansi-white" } + ], + [ + { rgb: [85, 85, 85], class_name: "ansi-bright-black" }, + { rgb: [255, 85, 85], class_name: "ansi-bright-red" }, + { rgb: [0, 255, 0], class_name: "ansi-bright-green" }, + { rgb: [255, 255, 85], class_name: "ansi-bright-yellow" }, + { rgb: [85, 85, 255], class_name: "ansi-bright-blue" }, + { rgb: [255, 85, 255], class_name: "ansi-bright-magenta" }, + { rgb: [85, 255, 255], class_name: "ansi-bright-cyan" }, + { rgb: [255, 255, 255], class_name: "ansi-bright-white" } + ] + ]; + this.palette_256 = []; + this.ansi_colors.forEach(function (palette) { + palette.forEach(function (rec) { + _this.palette_256.push(rec); + }); + }); + var levels = [0, 95, 135, 175, 215, 255]; + for (var r = 0; r < 6; ++r) { + for (var g = 0; g < 6; ++g) { + for (var b = 0; b < 6; ++b) { + var col = { rgb: [levels[r], levels[g], levels[b]], class_name: 'truecolor' }; + this.palette_256.push(col); + } + } + } + var grey_level = 8; + for (var i = 0; i < 24; ++i, grey_level += 10) { + var gry = { rgb: [grey_level, grey_level, grey_level], class_name: 'truecolor' }; + this.palette_256.push(gry); + } + }; + AnsiUp.prototype.escape_txt_for_html = function 
(txt) { + return txt.replace(/[&<>]/gm, function (str) { + if (str === "&") + return "&"; + if (str === "<") + return "<"; + if (str === ">") + return ">"; + }); + }; + AnsiUp.prototype.append_buffer = function (txt) { + var str = this._buffer + txt; + this._buffer = str; + }; + AnsiUp.prototype.get_next_packet = function () { + var pkt = { + kind: PacketKind.EOS, + text: '', + url: '' + }; + var len = this._buffer.length; + if (len == 0) + return pkt; + var pos = this._buffer.indexOf("\x1B"); + if (pos == -1) { + pkt.kind = PacketKind.Text; + pkt.text = this._buffer; + this._buffer = ''; + return pkt; + } + if (pos > 0) { + pkt.kind = PacketKind.Text; + pkt.text = this._buffer.slice(0, pos); + this._buffer = this._buffer.slice(pos); + return pkt; + } + if (pos == 0) { + if (len == 1) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + var next_char = this._buffer.charAt(1); + if ((next_char != '[') && (next_char != ']')) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if (next_char == '[') { + if (!this._csi_regex) { + this._csi_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \u001B[ # CSI\n ([<-?]?) # private-mode char\n ([d;]*) # any digits or semicolons\n ([ -/]? # an intermediate modifier\n [@-~]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \u001B[ # CSI\n [ -~]* # anything legal\n ([\0-\u001F:]) # anything illegal\n )\n "], ["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \\x1b\\[ # CSI\n ([\\x3c-\\x3f]?) # private-mode char\n ([\\d;]*) # any digits or semicolons\n ([\\x20-\\x2f]? # an intermediate modifier\n [\\x40-\\x7e]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \\x1b\\[ # CSI\n [\\x20-\\x7e]* # anything legal\n ([\\x00-\\x1f:]) # anything illegal\n )\n "])); + } + var match = this._buffer.match(this._csi_regex); + if (match === null) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if (match[4]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if ((match[1] != '') || (match[3] != 'm')) + pkt.kind = PacketKind.Unknown; + else + pkt.kind = PacketKind.SGR; + pkt.text = match[2]; + var rpos = match[0].length; + this._buffer = this._buffer.slice(rpos); + return pkt; + } + if (next_char == ']') { + if (len < 4) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if ((this._buffer.charAt(2) != '8') + || (this._buffer.charAt(3) != ';')) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if (!this._osc_st) { + this._osc_st = rgxG(__makeTemplateObject(["\n (?: # legal sequence\n (\u001B\\) # ESC | # alternate\n (\u0007) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\0-\u0006] # anything illegal\n | # alternate\n [\b-\u001A] # anything illegal\n | # alternate\n [\u001C-\u001F] # anything illegal\n )\n "], ["\n (?: # legal sequence\n (\\x1b\\\\) # ESC \\\n | # alternate\n (\\x07) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\\x00-\\x06] # anything illegal\n | # alternate\n [\\x08-\\x1a] # anything illegal\n | # alternate\n [\\x1c-\\x1f] # anything illegal\n )\n "])); + } + this._osc_st.lastIndex = 0; + { + var match_1 = this._osc_st.exec(this._buffer); + if (match_1 === null) { + pkt.kind = 
PacketKind.Incomplete; + return pkt; + } + if (match_1[3]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + } + { + var match_2 = this._osc_st.exec(this._buffer); + if (match_2 === null) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if (match_2[3]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + } + if (!this._osc_regex) { + this._osc_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n \u001B]8; # OSC Hyperlink\n [ -:<-~]* # params (excluding ;)\n ; # end of params\n ([!-~]{0,512}) # URL capture\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n ([!-~]+) # TEXT capture\n \u001B]8;; # OSC Hyperlink End\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n "], ["\n ^ # beginning of line\n #\n \\x1b\\]8; # OSC Hyperlink\n [\\x20-\\x3a\\x3c-\\x7e]* # params (excluding ;)\n ; # end of params\n ([\\x21-\\x7e]{0,512}) # URL capture\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n ([\\x21-\\x7e]+) # TEXT capture\n \\x1b\\]8;; # OSC Hyperlink End\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n "])); + } + var match = this._buffer.match(this._osc_regex); + if (match === null) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + pkt.kind = PacketKind.OSCURL; + pkt.url = match[1]; + pkt.text = match[2]; + var rpos = match[0].length; + this._buffer = this._buffer.slice(rpos); + return pkt; + } + } + }; + AnsiUp.prototype.ansi_to_html = function (txt) { + this.append_buffer(txt); + var blocks = []; + while (true) { + var packet = this.get_next_packet(); + if ((packet.kind == PacketKind.EOS) + || (packet.kind == PacketKind.Incomplete)) + break; + if ((packet.kind == PacketKind.ESC) + || (packet.kind == PacketKind.Unknown)) + continue; + if (packet.kind == PacketKind.Text) + blocks.push(this.transform_to_html(this.with_state(packet))); + else if (packet.kind == PacketKind.SGR) + this.process_ansi(packet); + else if (packet.kind == PacketKind.OSCURL) + blocks.push(this.process_hyperlink(packet)); + } + return blocks.join(""); + }; + AnsiUp.prototype.with_state = function (pkt) { + return { bold: this.bold, fg: this.fg, bg: this.bg, text: pkt.text }; + }; + AnsiUp.prototype.process_ansi = function (pkt) { + var sgr_cmds = pkt.text.split(';'); + while (sgr_cmds.length > 0) { + var sgr_cmd_str = sgr_cmds.shift(); + var num = parseInt(sgr_cmd_str, 10); + if (isNaN(num) || num === 0) { + this.fg = this.bg = null; + this.bold = false; + } + else if (num === 1) { + this.bold = true; + } + else if (num === 22) { + this.bold = false; + } + else if (num === 39) { + this.fg = null; + } + else if (num === 49) { + this.bg = null; + } + else if ((num >= 30) && (num < 38)) { + this.fg = this.ansi_colors[0][(num - 30)]; + } + else if ((num >= 40) && (num < 48)) { + this.bg = this.ansi_colors[0][(num - 40)]; + } + else if ((num >= 90) && (num < 98)) { + this.fg = this.ansi_colors[1][(num - 90)]; + } + else if ((num >= 100) && (num < 108)) { + this.bg = this.ansi_colors[1][(num - 100)]; + } + else if (num === 38 || num === 48) { + if (sgr_cmds.length > 0) { + var is_foreground = (num === 38); + var mode_cmd = sgr_cmds.shift(); + if (mode_cmd === '5' && sgr_cmds.length > 0) { + var palette_index = 
parseInt(sgr_cmds.shift(), 10); + if (palette_index >= 0 && palette_index <= 255) { + if (is_foreground) + this.fg = this.palette_256[palette_index]; + else + this.bg = this.palette_256[palette_index]; + } + } + if (mode_cmd === '2' && sgr_cmds.length > 2) { + var r = parseInt(sgr_cmds.shift(), 10); + var g = parseInt(sgr_cmds.shift(), 10); + var b = parseInt(sgr_cmds.shift(), 10); + if ((r >= 0 && r <= 255) && (g >= 0 && g <= 255) && (b >= 0 && b <= 255)) { + var c = { rgb: [r, g, b], class_name: 'truecolor' }; + if (is_foreground) + this.fg = c; + else + this.bg = c; + } + } + } + } + } + }; + AnsiUp.prototype.transform_to_html = function (fragment) { + var txt = fragment.text; + if (txt.length === 0) + return txt; + if (this._escape_for_html) + txt = this.escape_txt_for_html(txt); + if (!fragment.bold && fragment.fg === null && fragment.bg === null) + return txt; + var styles = []; + var classes = []; + var fg = fragment.fg; + var bg = fragment.bg; + if (fragment.bold) + styles.push('font-weight:bold'); + if (!this._use_classes) { + if (fg) + styles.push("color:rgb(" + fg.rgb.join(',') + ")"); + if (bg) + styles.push("background-color:rgb(" + bg.rgb + ")"); + } + else { + if (fg) { + if (fg.class_name !== 'truecolor') { + classes.push(fg.class_name + "-fg"); + } + else { + styles.push("color:rgb(" + fg.rgb.join(',') + ")"); + } + } + if (bg) { + if (bg.class_name !== 'truecolor') { + classes.push(bg.class_name + "-bg"); + } + else { + styles.push("background-color:rgb(" + bg.rgb.join(',') + ")"); + } + } + } + var class_string = ''; + var style_string = ''; + if (classes.length) + class_string = " class=\"" + classes.join(' ') + "\""; + if (styles.length) + style_string = " style=\"" + styles.join(';') + "\""; + return "" + txt + ""; + }; + ; + AnsiUp.prototype.process_hyperlink = function (pkt) { + var parts = pkt.url.split(':'); + if (parts.length < 1) + return ''; + if (!this._url_whitelist[parts[0]]) + return ''; + var result = "" + this.escape_txt_for_html(pkt.text) + ""; + return result; + }; + return AnsiUp; +}()); +function rgx(tmplObj) { + var subst = []; + for (var _i = 1; _i < arguments.length; _i++) { + subst[_i - 1] = arguments[_i]; + } + var regexText = tmplObj.raw[0]; + var wsrgx = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm; + var txt2 = regexText.replace(wsrgx, ''); + return new RegExp(txt2); +} +function rgxG(tmplObj) { + var subst = []; + for (var _i = 1; _i < arguments.length; _i++) { + subst[_i - 1] = arguments[_i]; + } + var regexText = tmplObj.raw[0]; + var wsrgx = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm; + var txt2 = regexText.replace(wsrgx, ''); + return new RegExp(txt2, 'g'); +} +//# sourceMappingURL=ansi_up.js.map + Object.defineProperty(exports, "__esModule", { value: true }); + exports.default = AnsiUp; +})); diff --git a/parl/remote/static/js/jquery.ajax-cross-origin.min.js b/parl/remote/static/js/jquery.ajax-cross-origin.min.js new file mode 100644 index 0000000000000000000000000000000000000000..cd57dcff2843d0d298fa36851aefedb7984ebe1e --- /dev/null +++ b/parl/remote/static/js/jquery.ajax-cross-origin.min.js @@ -0,0 +1,57 @@ +/* + jQuery AJAX Cross Origin v1.3 (http://www.ajax-cross-origin.com) + jQuery plugin to bypass Same-origin_policy using Google Apps Script. + + references: + http://en.wikipedia.org/wiki/Same-origin_policy + http://www.google.com/script/start/ + + (c) 2014, Writen by Erez Ninio. site: www.dealhotelbook.com + + Licensed under the Creative Commons Attribution 3.0 Unported License. + For details, see http://creativecommons.org/licenses/by/3.0/. 
+*/ + +var proxyJsonp = + "https://script.google.com/macros/s/AKfycbwmqG55tt2d2FcT_WQ3WjCSKmtyFpkOcdprSITn45-4UgVJnzp9/exec"; +jQuery.ajaxOrig = jQuery.ajax; +jQuery.ajax = function (a, b) { + function d(a) { + a = encodeURI(a).replace(/&/g, "%26"); + return proxyJsonp + "?url=" + a + "&callback=?"; + } + var c = "object" === typeof a ? a : b || {}; + c.url = c.url || ("string" === typeof a ? a : ""); + var c = jQuery.ajaxSetup({}, c), + e = (function (a, c) { + var b = document.createElement("a"); + b.href = a; + return ( + c.crossOrigin && + "http" == a.substr(0, 4).toLowerCase() && + "localhost" != b.hostname && + "127.0.0.1" != b.hostname && + b.hostname != window.location.hostname + ); + })(c.url, c); + c.proxy && + 0 < c.proxy.length && + ((proxyJsonp = c.proxy), + "object" === typeof a + ? (a.crossDomain = !0) + : "object" === typeof b && (b.crossDomain = !0)); + e && + ("object" === typeof a + ? a.url && + ((a.url = d(a.url)), + a.charset && (a.url += "&charset=" + a.charset), + (a.dataType = "json")) + : "string" === typeof a && + "object" === typeof b && + ((a = d(a)), + b.charset && (a += "&charset=" + b.charset), + (b.dataType = "json"))); + return jQuery.ajaxOrig.apply(this, arguments); +}; +jQuery.ajax.prototype = new jQuery.ajaxOrig(); +jQuery.ajax.prototype.constructor = jQuery.ajax; diff --git a/parl/remote/static/js/parl.js b/parl/remote/static/js/parl.js index 117e2d5542e69213a0b4ae7e04d5b6c6533006c8..e158e69917f969c62b6be2daf4b43176f0674ba7 100644 --- a/parl/remote/static/js/parl.js +++ b/parl/remote/static/js/parl.js @@ -185,7 +185,8 @@ function autoTable(res) { var s3 = `${res.clients[i].client_address}`; var s4 = `${res.clients[i].actor_num}`; var s5 = `${res.clients[i].time}`; - tr.innerHTML = s1 + s2 + s3 + s4 + s5; + var s6 = `link`; + tr.innerHTML = s1 + s2 + s3 + s4 + s5 + s6; table.appendChild(tr); } }; diff --git a/parl/remote/templates/clients.html b/parl/remote/templates/clients.html index b87962f11d1a41d649ec953d426d418be0b2baf1..e0089b6422bb4a5af43372d3962adbb9303218af 100644 --- a/parl/remote/templates/clients.html +++ b/parl/remote/templates/clients.html @@ -43,10 +43,11 @@ Hostname Actor Num Time (min) + Log - Loading Data... + Loading Data... diff --git a/parl/remote/templates/jobs.html b/parl/remote/templates/jobs.html new file mode 100644 index 0000000000000000000000000000000000000000..56e8a775a5dd6eb86e17bfc61fedf660e97d7ff0 --- /dev/null +++ b/parl/remote/templates/jobs.html @@ -0,0 +1,192 @@ + + + + + Parl Cluster + + + + + + + + + + +
+<!-- jobs.html template (markup not reproduced): renders a "Jobs Monitor"
+     page containing a "Remote Job Log" section and the line
+     "Client ID: {{ client_id }}". -->
+ + + + + + + diff --git a/parl/remote/tests/actor_max_memory_test.py b/parl/remote/tests/actor_max_memory_test.py index ebe7f35d5c2c3a978bb8c257596797c96503ee35..1619651521b9c2c3fd7ece441ad9296d2ab1e852 100644 --- a/parl/remote/tests/actor_max_memory_test.py +++ b/parl/remote/tests/actor_max_memory_test.py @@ -45,7 +45,10 @@ class TestMaxMemory(unittest.TestCase): def tearDown(self): disconnect() - def actor(self): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def actor(cluster_addr): + parl.connect(cluster_addr) actor1 = Actor() time.sleep(10) actor1.add_500mb() @@ -56,16 +59,17 @@ class TestMaxMemory(unittest.TestCase): th = threading.Thread(target=master.run) th.start() time.sleep(5) - worker = Worker('localhost:{}'.format(port), 1) - cluster_monitor = ClusterMonitor('localhost:{}'.format(port)) + cluster_addr = 'localhost:{}'.format(port) + worker = Worker(cluster_addr, 1) + cluster_monitor = ClusterMonitor(cluster_addr) time.sleep(5) - parl.connect('localhost:{}'.format(port)) + parl.connect(cluster_addr) actor = Actor() time.sleep(20) self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num']) del actor time.sleep(10) - p = Process(target=self.actor) + p = Process(target=self.actor, args=(cluster_addr, )) p.start() for _ in range(6): diff --git a/parl/remote/tests/cluster_monitor_2_test.py b/parl/remote/tests/cluster_monitor_2_test.py index 16dd24bd0471700b8a391050f6c51eae7c34dfdf..f27ee587b908093365426d7154f23d3adeccd721 100644 --- a/parl/remote/tests/cluster_monitor_2_test.py +++ b/parl/remote/tests/cluster_monitor_2_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_monitor_3_test.py b/parl/remote/tests/cluster_monitor_3_test.py index f141bf68268878a61b2562a66592af4efba961d6..6570746a04d35651353960882538f17203610708 100644 --- a/parl/remote/tests/cluster_monitor_3_test.py +++ b/parl/remote/tests/cluster_monitor_3_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_monitor_test.py b/parl/remote/tests/cluster_monitor_test.py index abf5ea651654e44eeff49817bd993721cd3b21f0..94341700c00ffc8b5e26b818fa9d394d02b0c60a 100644 --- a/parl/remote/tests/cluster_monitor_test.py +++ b/parl/remote/tests/cluster_monitor_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_test.py b/parl/remote/tests/cluster_test.py index 9025b7b6f295c0ab75b019f03e20b94bf04f3d52..0ac9d0ba4e9b6e023528a91d3ba496aa010755f9 100644 --- a/parl/remote/tests/cluster_test.py +++ b/parl/remote/tests/cluster_test.py @@ -21,8 +21,8 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess +from parl.utils import logger @parl.remote_class @@ -63,20 +63,24 @@ class TestCluster(unittest.TestCase): disconnect() def test_actor_exception(self): - master = Master(port=1235) + logger.info("running:test_actor_exception") + master = Master(port=8235) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1235', 1) + worker1 = 
Worker('localhost:8235', 1) for _ in range(3): if master.cpu_num == 1: break time.sleep(10) self.assertEqual(1, master.cpu_num) - parl.connect('localhost:1235') + logger.info("running:test_actor_exception: 0") + parl.connect('localhost:8235') + logger.info("running:test_actor_exception: 1") with self.assertRaises(exceptions.RemoteError): actor = Actor(abcd='a bug') + logger.info("running:test_actor_exception: 2") actor2 = Actor() for _ in range(3): @@ -89,15 +93,15 @@ class TestCluster(unittest.TestCase): master.exit() worker1.exit() - @timeout_decorator.timeout(seconds=800) - def test_actor_exception(self): - master = Master(port=1236) + def test_actor_exception_2(self): + logger.info("running: test_actor_exception_2") + master = Master(port=8236) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1236', 1) + worker1 = Worker('localhost:8236', 1) self.assertEqual(1, master.cpu_num) - parl.connect('localhost:1236') + parl.connect('localhost:8236') actor = Actor() try: actor.will_raise_exception_func() @@ -116,14 +120,15 @@ class TestCluster(unittest.TestCase): master.exit() def test_reset_actor(self): + logger.info("running: test_reset_actor") # start the master - master = Master(port=1237) + master = Master(port=8237) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1237', 4) - parl.connect('localhost:1237') + worker1 = Worker('localhost:8237', 4) + parl.connect('localhost:8237') for _ in range(10): actor = Actor() ret = actor.add_one(1) @@ -140,19 +145,20 @@ class TestCluster(unittest.TestCase): master.exit() def test_add_worker(self): - master = Master(port=1234) + logger.info("running: test_add_worker") + master = Master(port=8234) th = threading.Thread(target=master.run) th.start() time.sleep(1) - worker1 = Worker('localhost:1234', 4) + worker1 = Worker('localhost:8234', 4) for _ in range(3): if master.cpu_num == 4: break time.sleep(10) self.assertEqual(master.cpu_num, 4) - worker2 = Worker('localhost:1234', 4) + worker2 = Worker('localhost:8234', 4) for _ in range(3): if master.cpu_num == 8: break diff --git a/parl/framework/agent_base.py b/parl/remote/tests/local_actor_test.py similarity index 50% rename from parl/framework/agent_base.py rename to parl/remote/tests/local_actor_test.py index 331f93b3730be0ae6c17d19ba24e8dd1f03d9c05..0435ed233153ec9efee548e012eb70ead11e2dd5 100644 --- a/parl/framework/agent_base.py +++ b/parl/remote/tests/local_actor_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os +os.environ['XPARL'] = 'True' +import parl +import unittest -import warnings -warnings.simplefilter('default') +@parl.remote_class(max_memory=350) +class Actor(object): + def __init__(self, x=10): + self.x = x + self.data = [] -warnings.warn( - "module `parl.framework.agent_base.Agent` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Agent` instead.", - DeprecationWarning, - stacklevel=2) + def add_500mb(self): + self.data.append(os.urandom(500 * 1024**2)) + self.x += 1 + return self.x -from parl.core.fluid.agent import * + +class TestLocalActor(unittest.TestCase): + def test_create_actors_without_pre_connection(self): + actor = Actor() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/log_server_test.py b/parl/remote/tests/log_server_test.py new file mode 100644 index 0000000000000000000000000000000000000000..931fc29538df1bc1c960c57e2f97a54e4bb8e0aa --- /dev/null +++ b/parl/remote/tests/log_server_test.py @@ -0,0 +1,186 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import multiprocessing +import os +import pickle +import subprocess +import sys +import tempfile +import threading +import time +import unittest + +import requests + +import parl +from parl.remote.client import disconnect, get_global_client +from parl.remote.master import Master +from parl.remote.worker import Worker +from parl.utils import _IS_WINDOWS + + +@parl.remote_class +class Actor(object): + def __init__(self, number=None, arg1=None, arg2=None): + self.number = number + self.arg1 = arg1 + self.arg2 = arg2 + print("Init actor...") + self.init_output = "Init actor...\n" + + def sim_output(self, start, end): + output = "" + print(self.number) + output += str(self.number) + output += "\n" + for i in range(start, end): + print(i) + output += str(i) + output += "\n" + return self.init_output + output + + +class TestLogServer(unittest.TestCase): + def tearDown(self): + disconnect() + + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. 
+ @staticmethod + def _connect_and_create_actor(cluster_addr): + parl.connect(cluster_addr) + outputs = [] + for i in range(2): + actor = Actor(number=i) + ret = actor.sim_output(1, 4) + assert ret != "" + outputs.append(ret) + return outputs + + def test_log_server(self): + master_port = 8401 + # start the master + master = Master(port=master_port) + th = threading.Thread(target=master.run) + th.start() + time.sleep(1) + + cluster_addr = 'localhost:{}'.format(master_port) + log_server_port = 8402 + worker = Worker(cluster_addr, 4, log_server_port=log_server_port) + outputs = self._connect_and_create_actor(cluster_addr) + + # Get status + status = master._get_status() + client_jobs = pickle.loads(status).get('client_jobs') + self.assertIsNotNone(client_jobs) + + # Get job id + client = get_global_client() + jobs = client_jobs.get(client.client_id) + self.assertIsNotNone(jobs) + + for job_id, log_server_addr in jobs.items(): + log_url = "http://{}/get-log".format(log_server_addr) + # Test response without job_id + r = requests.get(log_url) + self.assertEqual(r.status_code, 400) + # Test normal response + r = requests.get(log_url, params={'job_id': job_id}) + self.assertEqual(r.status_code, 200) + log_content = json.loads(r.text).get('log') + self.assertIsNotNone(log_content) + log_content = log_content.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Test download + download_url = "http://{}/download-log".format(log_server_addr) + r = requests.get(download_url, params={'job_id': job_id}) + self.assertEqual(r.status_code, 200) + log_content = r.text.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + disconnect() + worker.exit() + master.exit() + + def test_monitor_query_log_server(self): + master_port = 8403 + monitor_port = 8404 + # start the master + master = Master(port=master_port, monitor_port=monitor_port) + th = threading.Thread(target=master.run) + th.start() + time.sleep(1) + # start the cluster monitor + monitor_file = __file__.replace( + os.path.join('tests', 'log_server_test.pyc'), 'monitor.py') + monitor_file = monitor_file.replace( + os.path.join('tests', 'log_server_test.py'), 'monitor.py') + command = [ + sys.executable, monitor_file, "--monitor_port", + str(monitor_port), "--address", "localhost:" + str(master_port) + ] + if _IS_WINDOWS: + FNULL = tempfile.TemporaryFile() + else: + FNULL = open(os.devnull, 'w') + monitor_proc = subprocess.Popen( + command, + stdout=FNULL, + stderr=subprocess.STDOUT, + ) + + # Start worker + cluster_addr = 'localhost:{}'.format(master_port) + log_server_port = 8405 + worker = Worker(cluster_addr, 4, log_server_port=log_server_port) + + # Test monitor API + outputs = self._connect_and_create_actor(cluster_addr) + time.sleep(5) # Wait for the status update + client = get_global_client() + jobs_url = "{}/get-jobs?client_id={}".format(master.monitor_url, + client.client_id) + r = requests.get(jobs_url) + self.assertEqual(r.status_code, 200) + data = json.loads(r.text) + for job in data: + log_url = job.get('log_url') + self.assertIsNotNone(log_url) + r = requests.get(log_url) + self.assertEqual(r.status_code, 200) + log_content = json.loads(r.text).get('log') + self.assertIsNotNone(log_content) + log_content = log_content.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Test download + download_url = job.get('download_url') + r = requests.get(download_url) + self.assertEqual(r.status_code, 200) + log_content = r.text.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Clean context + 
monitor_proc.kill() + monitor_proc.wait() + disconnect() + worker.exit() + master.exit() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py index 5dd7d9fca6737324ea187258f2f32364e72034b2..c87afad15d5e0e92e44bda1a97259eaaf6b32256 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py @@ -16,7 +16,6 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master @@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _connect_and_create_actor(cluster_addr): parl.connect(cluster_addr) for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 disconnect() def _create_actor(self): @@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ret = actor.add_one(1) self.assertEqual(ret, 2) - @timeout_decorator.timeout(seconds=300) def test_connect_and_create_actor_in_multiprocessing_with_connected_in_main_process( self): # start the master diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py index 8f4458912dbed9a7bcd408fed4b0937271b21b40..09b8b95002bdd5a45d132b04c39811bf5d32cdd8 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py @@ -16,7 +16,6 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master @@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _connect_and_create_actor(cluster_addr): parl.connect(cluster_addr) for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 disconnect() def _create_actor(self): @@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ret = actor.add_one(1) self.assertEqual(ret, 2) - @timeout_decorator.timeout(seconds=300) def test_connect_and_create_actor_in_multiprocessing_without_connected_in_main_process( self): # start the master diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py index 22625c0e959fdc7511eb1b4cb25cd6eaea31780a..3cdffd71aaf238c460fd31ae3c757d11e3fec18c 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py @@ -4,8 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 -# +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
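The test changes above and below replace bound-method Process targets with staticmethod helpers because, as the in-code comments note, multiprocessing.Process cannot run a method of a class on Windows. A minimal standalone sketch of that pattern (class and address names are illustrative):

    import multiprocessing


    class ClusterTestHelper(object):
        # Windows uses the "spawn" start method, which has to serialize the
        # Process target; a staticmethod taking explicit arguments avoids
        # shipping the whole test-case instance to the child process.
        @staticmethod
        def create_actor(cluster_addr):
            print("would connect to", cluster_addr)


    if __name__ == "__main__":
        p = multiprocessing.Process(
            target=ClusterTestHelper.create_actor, args=("localhost:8240", ))
        p.start()
        p.join()
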
@@ -16,12 +15,12 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect +from parl.utils import _IS_WINDOWS @parl.remote_class @@ -39,21 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): - parl.connect(cluster_addr) - for _ in range(2): - actor = Actor() - ret = actor.add_one(1) - self.assertEqual(ret, 2) - disconnect() - - def _create_actor(self): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _create_actor(): for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 - @timeout_decorator.timeout(seconds=300) def test_create_actor_in_multiprocessing(self): # start the master master = Master(port=8240) @@ -64,14 +56,15 @@ class TestCluster(unittest.TestCase): worker1 = Worker('localhost:8240', 4) parl.connect('localhost:8240') - proc1 = multiprocessing.Process(target=self._create_actor) - proc2 = multiprocessing.Process(target=self._create_actor) - proc1.start() - proc2.start() + if not _IS_WINDOWS: # In windows, fork process cannot access client created in main process. + proc1 = multiprocessing.Process(target=self._create_actor) + proc2 = multiprocessing.Process(target=self._create_actor) + proc1.start() + proc2.start() - proc1.join() - proc2.join() - print("[test_create_actor_in_multiprocessing] Join") + proc1.join() + proc2.join() + print("[test_create_actor_in_multiprocessing] Join") # make sure that the client of the main process still works self._create_actor() diff --git a/parl/remote/tests/recursive_actor_test.py b/parl/remote/tests/recursive_actor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9613b6be23ae64bf2fba10df793cfa57dbeea1 --- /dev/null +++ b/parl/remote/tests/recursive_actor_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +from parl.utils import logger +import parl +from parl.remote.client import disconnect +from parl.remote.master import Master +from parl.remote.worker import Worker +import time +import threading + +c = 10 +port = 3002 +if __name__ == '__main__': + master = Master(port=port) + th = threading.Thread(target=master.run) + th.setDaemon(True) + th.start() +time.sleep(5) +cluster_addr = 'localhost:{}'.format(port) +parl.connect(cluster_addr) +worker = Worker(cluster_addr, 1) + + +@parl.remote_class +class Actor(object): + def add(self, a, b): + return a + b + c + + +actor = Actor() + + +class TestRecursive_actor(unittest.TestCase): + def tearDown(self): + disconnect() + + def test_global_running(self): + self.assertEqual(actor.add(1, 2), 13) + master.exit() + worker.exit() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/reset_job_test.py b/parl/remote/tests/reset_job_test.py index 85f07184a0b55e7d3dcf285a707e1d7862ec08bf..478da6506821306d997ad788240178d4739c1bf6 100644 --- a/parl/remote/tests/reset_job_test.py +++ b/parl/remote/tests/reset_job_test.py @@ -23,7 +23,6 @@ import time import threading import subprocess import sys -import timeout_decorator @parl.remote_class @@ -63,7 +62,6 @@ class TestJob(unittest.TestCase): def tearDown(self): disconnect() - @timeout_decorator.timeout(seconds=600) def test_acor_exit_exceptionally(self): port = 1337 master = Master(port) diff --git a/parl/remote/tests/reset_job_test_alone.py b/parl/remote/tests/reset_job_test_alone.py index 81cc2fe77a102521c0dc0633d215821a2a5d991c..425f02ad4ee8acca93560d6a4de5e4836112ffb9 100644 --- a/parl/remote/tests/reset_job_test_alone.py +++ b/parl/remote/tests/reset_job_test_alone.py @@ -16,7 +16,8 @@ import parl from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect -from parl.utils import logger +from parl.utils import logger, _IS_WINDOWS +import os import threading import time import subprocess @@ -70,9 +71,14 @@ class TestJobAlone(unittest.TestCase): time.sleep(1) self.assertEqual(master.cpu_num, 4) print("We are going to kill all the jobs.") - command = ( - "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) + if _IS_WINDOWS: + command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%remote\\job.py%'" get processid^,status /format:csv') do taskkill /F /T /pid %a''' + print(os.popen(command).read()) + else: + command = ( + "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9" + ) + subprocess.call([command], shell=True) parl.connect('localhost:1334') actor = Actor() self.assertEqual(actor.add_one(1), 2) diff --git a/parl/remote/tests/send_job_test.py b/parl/remote/tests/send_job_test.py index 77ea421fde09e042c6620da8087a683fb4710acf..8ea2d4083c45f66184e8a2287c77e3f0ca840257 100644 --- a/parl/remote/tests/send_job_test.py +++ b/parl/remote/tests/send_job_test.py @@ -21,6 +21,7 @@ import threading from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect +from parl.utils import _IS_WINDOWS @parl.remote_class @@ -44,12 +45,15 @@ class TestSendFile(unittest.TestCase): worker = Worker('localhost:{}'.format(port), 1) time.sleep(2) - os.system('mkdir ./rom_files') - os.system('touch ./rom_files/pong.bin') - assert os.path.exists('./rom_files/pong.bin') - parl.connect( - 'localhost:{}'.format(port), - distributed_files=['./rom_files/pong.bin']) + tmp_dir = 
'rom_files' + tmp_file = os.path.join(tmp_dir, 'pong.bin') + os.system('mkdir {}'.format(tmp_dir)) + if _IS_WINDOWS: + os.system('type NUL >> {}'.format(tmp_file)) + else: + os.system('touch {}'.format(tmp_file)) + assert os.path.exists(tmp_file) + parl.connect('localhost:{}'.format(port), distributed_files=[tmp_file]) time.sleep(5) actor = Actor() for _ in range(10): @@ -70,8 +74,9 @@ class TestSendFile(unittest.TestCase): worker = Worker('localhost:{}'.format(port), 1) time.sleep(2) + tmp_file = os.path.join('rom_files', 'no_pong.bin') self.assertRaises(Exception, parl.connect, 'localhost:{}'.format(port), - ['./rom_files/no_pong.bin']) + [tmp_file]) worker.exit() master.exit() diff --git a/parl/remote/tests/sync_config_file_test.py b/parl/remote/tests/sync_config_file_test.py index a4d131d5e13111a1c7faaa209aa2acb114e7c7c7..c8be19443e446e1d90819a63c2a64b471fb23e6d 100644 --- a/parl/remote/tests/sync_config_file_test.py +++ b/parl/remote/tests/sync_config_file_test.py @@ -17,12 +17,10 @@ import parl from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect - +import os import time import threading - import sys - import numpy as np import json @@ -65,7 +63,8 @@ class TestConfigfile(unittest.TestCase): parl.connect('localhost:1335', ['random.npy', 'config.json']) actor = Actor('random.npy', 'config.json') time.sleep(5) - + os.remove('./random.npy') + os.remove('./config.json') remote_sum = actor.random_sum() self.assertEqual(remote_sum, random_sum) time.sleep(10) diff --git a/parl/remote/utils.py b/parl/remote/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2ece8686ff7de73c8164565f34281e412aa4ee --- /dev/null +++ b/parl/remote/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from contextlib import contextmanager + +__all__ = ['load_remote_class', 'redirect_stdout_to_file'] + + +def simplify_code(code, end_of_file): + """ + @parl.remote_actor has to use this function to simplify the code. + To create a remote object, PARL has to import the module that contains the decorated class. + It may run some unnecessary code when importing the module, and this is why we use this function + to simplify the code. + + For example. + @parl.remote_actor + class A(object): + def add(self, a, b): + return a + b + def data_process(): + XXXX + ------------------> + The last two lines of the above code block will be removed as they are not class related. + """ + to_write_lines = [] + for i, line in enumerate(code): + if line.startswith('parl.connect'): + continue + if i < end_of_file - 1: + to_write_lines.append(line) + else: + break + return to_write_lines + + +def load_remote_class(file_name, class_name, end_of_file): + """ + load a class given its file_name and class_name. 
+ + Args: + file_name: specify the file to load the class + class_name: specify the class to be loaded + end_of_file: line ID to indicate the last line that defines the class. + + Return: + cls: the class to load + """ + with open(file_name + '.py') as t_file: + code = t_file.readlines() + code = simplify_code(code, end_of_file) + module_name = 'xparl_' + file_name + tmp_file_name = 'xparl_' + file_name + '.py' + with open(tmp_file_name, 'w') as t_file: + for line in code: + t_file.write(line) + mod = __import__(module_name) + cls = getattr(mod, class_name) + return cls + + +@contextmanager +def redirect_stdout_to_file(file_path): + """Redirect stdout (e.g., `print`) to specified file. + + Example: + >>> print('test') + test + >>> with redirect_stdout_to_file('test.log'): + ... print('test') # Output nothing, `test` is printed to `test.log`. + >>> print('test') + test + + Args: + file_path: Path of the file to output the stdout. + + """ + tmp = sys.stdout + f = open(file_path, 'a') + sys.stdout = f + try: + yield + finally: + sys.stdout = tmp + f.close() diff --git a/parl/remote/worker.py b/parl/remote/worker.py index fae9cd8306463b4d463d90292dbeabbd79b6b492..eec5598c6d081ca054541657c61670ecffc70cee 100644 --- a/parl/remote/worker.py +++ b/parl/remote/worker.py @@ -20,13 +20,14 @@ import signal import socket import subprocess import sys +import tempfile import time import threading import warnings import zmq from datetime import datetime -from parl.utils import get_ip_address, to_byte, to_str, logger +from parl.utils import get_ip_address, to_byte, to_str, logger, _IS_WINDOWS, kill_process from parl.remote import remote_constants from parl.remote.message import InitializedWorker from parl.remote.status import WorkerStatus @@ -63,7 +64,7 @@ class Worker(object): cpu_num (int): Number of cpu to be used on the worker. """ - def __init__(self, master_address, cpu_num=None): + def __init__(self, master_address, cpu_num=None, log_server_port=None): self.lock = threading.Lock() self.heartbeat_socket_initialized = threading.Event() self.ctx = zmq.Context.instance() @@ -75,9 +76,13 @@ class Worker(object): self._set_cpu_num(cpu_num) self.job_buffer = queue.Queue(maxsize=self.cpu_num) self._create_sockets() + # create log server + self.log_server_proc, self.log_server_address = self._create_log_server( + port=log_server_port) # create a thread that waits commands from the job to kill the job. self.kill_job_thread = threading.Thread(target=self._reply_kill_job) + self.kill_job_thread.setDaemon(True) self.kill_job_thread.start() self._create_jobs() @@ -169,6 +174,7 @@ class Worker(object): def _fill_job_buffer(self): """An endless loop that adds initialized job into the job buffer""" + initialized_jobs = [] while self.worker_is_alive: if self.job_buffer.full() is False: job_num = self.cpu_num - self.job_buffer.qsize() @@ -178,13 +184,7 @@ class Worker(object): self.job_buffer.put(job) time.sleep(0.02) - - # release jobs if the worker is not alive - for job in initialized_jobs: - try: - os.kill(job.pid, signal.SIGTERM) - except OSError: - pass + self.exit() def _init_jobs(self, job_num): """Create jobs. 
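The redirect_stdout_to_file helper added above uses the classic save/swap/restore pattern on sys.stdout, presumably kept as a custom helper for Python 2 compatibility. For comparison, on Python 3.4+ the standard library offers contextlib.redirect_stdout, which achieves the same effect when the file handle is managed explicitly; a small sketch (the file name is illustrative):

import contextlib

# Everything printed inside the block is appended to job.log; sys.stdout is
# restored automatically when the block exits, even on exceptions.
with open('job.log', 'a') as f, contextlib.redirect_stdout(f):
    print('this line goes to job.log, not to the terminal')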
@@ -196,7 +196,8 @@ class Worker(object): job_file = job_file.replace('worker.py', 'job.py') command = [ sys.executable, job_file, "--worker_address", - self.reply_job_address + self.reply_job_address, "--log_server_address", + self.log_server_address ] if sys.version_info.major == 3: @@ -223,6 +224,7 @@ class Worker(object): # a thread for sending heartbeat signals to job thread = threading.Thread( target=self._create_job_monitor, args=(initialized_job, )) + thread.setDaemon(True) thread.start() self.lock.release() assert len(new_jobs) > 0, "init jobs failed" @@ -311,7 +313,10 @@ class Worker(object): total_memory = round(virtual_memory[0] / (1024**3), 2) used_memory = round(virtual_memory[3] / (1024**3), 2) vacant_memory = round(total_memory - used_memory, 2) - load_average = round(os.getloadavg()[0], 2) + if _IS_WINDOWS: + load_average = round(psutil.getloadavg()[0], 2) + else: + load_average = round(os.getloadavg()[0], 2) return (vacant_memory, used_memory, now, load_average) def _reply_heartbeat(self, target): @@ -329,7 +334,7 @@ class Worker(object): logger.set_dir( os.path.expanduser('~/.parl_data/worker/{}'.format( - self.master_heartbeat_address))) + self.master_heartbeat_address.replace(':', '_')))) self.heartbeat_socket_initialized.set() logger.info("[Worker] Connect to the master node successfully. " @@ -351,15 +356,47 @@ class Worker(object): break socket.close(0) logger.warning( - "[Worker] lost connection with the master, will exit replying heartbeat for master." + "[Worker] lost connection with the master, will exit reply heartbeat for master." ) self.worker_status.clear() + self.log_server_proc.kill() + self.log_server_proc.wait() # exit the worker self.worker_is_alive = False + self.exit() + + def _create_log_server(self, port): + log_server_file = __file__.replace('worker.pyc', 'log_server.py') + log_server_file = log_server_file.replace('worker.py', 'log_server.py') + + if port is None: + port = "0" # `0` means using a random port in flask + command = [ + sys.executable, log_server_file, "--port", + str(port), "--log_dir", "~/.parl_data/job/", "--line_num", "500" + ] + + if sys.version_info.major == 3: + warnings.simplefilter("ignore", ResourceWarning) + + if _IS_WINDOWS: + FNULL = tempfile.TemporaryFile() + else: + FNULL = open(os.devnull, 'w') + log_server_proc = subprocess.Popen( + command, + stdout=FNULL, + stderr=subprocess.STDOUT, + ) + FNULL.close() + + log_server_address = "{}:{}".format(self.worker_ip, port) + return log_server_proc, log_server_address def exit(self): """close the worker""" self.worker_is_alive = False + kill_process('remote/job.py.*{}'.format(self.reply_job_address)) def run(self): """Keep running until it lost connection with the master. 
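The worker above launches the log server as a detached child process and discards its output by opening os.devnull (or a TemporaryFile on Windows) by hand. On Python 3 the same effect is available through subprocess.DEVNULL without managing a handle; a hedged sketch, with the child command purely illustrative:

import subprocess
import sys

# Discard the child's stdout and stderr on both Windows and POSIX;
# subprocess.DEVNULL is available from Python 3.3 onwards.
proc = subprocess.Popen(
    [sys.executable, '-c', 'print("hello from the child")'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)
proc.wait()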
diff --git a/parl/utils/communication.py b/parl/utils/communication.py index ea201bae16e571ab429ef8f194228fc5b7fa4432..c13c28e93df006fd464b880e4196e22cbacc97cb 100644 --- a/parl/utils/communication.py +++ b/parl/utils/communication.py @@ -14,6 +14,8 @@ import cloudpickle import pyarrow +import subprocess +import os from parl.utils import SerializeError, DeserializeError __all__ = ['dumps_argument', 'loads_argument', 'dumps_return', 'loads_return'] diff --git a/parl/utils/machine_info.py b/parl/utils/machine_info.py index 3ab8e404a442d8ac4eec20e5cbb3bf6c07b0f541..f69319ad02ea0673ef135b1cd914a70fc6a2fea3 100644 --- a/parl/utils/machine_info.py +++ b/parl/utils/machine_info.py @@ -14,40 +14,40 @@ import os import platform +import random +import socket import subprocess -from parl.utils import logger -from parl.utils import utils +from parl.utils import logger, _HAS_FLUID, _IS_WINDOWS -__all__ = ['get_gpu_count', 'get_ip_address', 'is_gpu_available'] +__all__ = [ + 'get_gpu_count', 'get_ip_address', 'is_gpu_available', 'get_free_tcp_port', + 'is_port_available', 'get_port_from_range' +] def get_ip_address(): """ get the IP address of the host. """ - platform_sys = platform.system() - # Only support Linux and MacOS - if platform_sys != 'Linux' and platform_sys != 'Darwin': - logger.warning( - 'get_ip_address only support Linux and MacOS, please set ip address manually.' - ) - return None - - local_ip = None - import socket - try: - # First way, tested in Ubuntu and MacOS - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - local_ip = s.getsockname()[0] - s.close() - except: - # Second way, tested in CentOS + # Windows + if _IS_WINDOWS: + local_ip = socket.gethostbyname(socket.gethostname()) + else: + # Linux and MacOS + local_ip = None try: - local_ip = socket.gethostbyname(socket.gethostname()) + # First way, tested in Ubuntu and MacOS + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + local_ip = s.getsockname()[0] + s.close() except: - pass + # Second way, tested in CentOS + try: + local_ip = socket.gethostbyname(socket.gethostname()) + except: + pass if local_ip == None or local_ip == '127.0.0.1' or local_ip == '127.0.1.1': logger.warning( @@ -97,10 +97,40 @@ def is_gpu_available(): True if a gpu device can be found. """ ret = get_gpu_count() > 0 - if utils._HAS_FLUID: + if _HAS_FLUID: from paddle import fluid if ret is True and not fluid.is_compiled_with_cuda(): logger.warning("Found non-empty CUDA_VISIBLE_DEVICES. \ - But PARL found that Paddle was not complied with CUDA, which may cause issues." - ) + But PARL found that Paddle was not complied with CUDA, which may cause issues. \ + Thus PARL will not use GPU.") + return False return ret + + +def get_free_tcp_port(): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + tcp.bind(('', 0)) + addr, port = tcp.getsockname() + tcp.close() + return str(port) + + +def is_port_available(port): + """ Check if a port is used. + + True if the port is available for connection. 
+ """ + port = int(port) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + available = sock.connect_ex(('localhost', port)) + sock.close() + return available + + +def get_port_from_range(start, end): + while True: + port = random.randint(start, end) + if is_port_available(port): + break + + return port diff --git a/examples/LiftSim_baseline/__init__.py b/parl/utils/summary.py similarity index 86% rename from examples/LiftSim_baseline/__init__.py rename to parl/utils/summary.py index eca2dce114b069bf9b455d77ce670d73b5047fd2..bc3578ef384222a4e55b7b9af90f36d9a7fccb4c 100644 --- a/examples/LiftSim_baseline/__init__.py +++ b/parl/utils/summary.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +try: + from parl.utils.visualdl import * +except: + from parl.utils.tensorboard import * diff --git a/parl/utils/tensorboard.py b/parl/utils/tensorboard.py index 575fc6b9976906e43dddeb7da2ea1ef32d4644c1..3fef518196216986f33f187c215b8aa4834003d5 100644 --- a/parl/utils/tensorboard.py +++ b/parl/utils/tensorboard.py @@ -14,6 +14,7 @@ from tensorboardX import SummaryWriter from parl.utils import logger +from parl.utils.machine_info import get_ip_address __all__ = [] @@ -29,8 +30,8 @@ def create_file_after_first_call(func_name): if logdir is None: logdir = logger.auto_set_dir(action='d') logger.warning( - "[tensorboard] logdir is None, will save tensorboard files to {}" - .format(logdir)) + "[tensorboard] logdir is None, will save tensorboard files to {}\nView the data using: tensorboard --logdir=./{} --host={}" + .format(logdir, logdir, get_ip_address())) _writer = SummaryWriter(logdir=logger.get_dir()) func = getattr(_writer, func_name) func(*args, **kwargs) diff --git a/parl/utils/tests/tensorboard_test.py b/parl/utils/tests/summary_test.py similarity index 74% rename from parl/utils/tests/tensorboard_test.py rename to parl/utils/tests/summary_test.py index 65fcb82404adfe461395e594dcf112ea41fd330e..401051c5debd3ed69d3cba54bcdda1c9ef75f12c 100644 --- a/parl/utils/tests/tensorboard_test.py +++ b/parl/utils/tests/summary_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import unittest -from parl.utils import tensorboard +from parl.utils import summary import numpy as np from parl.utils import logger import os @@ -20,18 +20,21 @@ import os class TestUtils(unittest.TestCase): def tearDown(self): - tensorboard.flush() + if hasattr(summary, 'flush'): + summary.flush() def test_add_scalar(self): x = range(100) for i in x: - tensorboard.add_scalar('y=2x', i * 2, i) - self.assertTrue(os.path.exists('./train_log/tensorboard_test')) + summary.add_scalar('y=2x', i * 2, i) + self.assertTrue(os.path.exists('./train_log/summary_test')) def test_add_histogram(self): + if not hasattr(summary, 'add_histogram'): + return for i in range(10): x = np.random.random(1000) - tensorboard.add_histogram('distribution centers', x + i, i) + summary.add_histogram('distribution centers', x + i, i) if __name__ == '__main__': diff --git a/parl/utils/utils.py b/parl/utils/utils.py index cb95b4d18c2c7aa38b2822a12b2995380609535d..a29a8c825017f9241ea59f76f5fe5e58de4f7b80 100644 --- a/parl/utils/utils.py +++ b/parl/utils/utils.py @@ -13,10 +13,14 @@ # limitations under the License. 
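# Note on the port helpers above: socket.connect_ex() returns 0 when
# something is already listening on the port and a non-zero error code when
# nothing is, so the truthy return value of is_port_available() means the
# port is free, which is exactly what get_port_from_range() relies on.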
import sys +import os +import subprocess +import numpy as np __all__ = [ 'has_func', 'action_mapping', 'to_str', 'to_byte', 'is_PY2', 'is_PY3', - 'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH' + 'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH', '_IS_WINDOWS', '_IS_MAC', + 'kill_process' ] @@ -45,9 +49,12 @@ def action_mapping(model_output_act, low_bound, high_bound): Returns: action: np.array, which value is in [low_bound, high_bound] """ + assert np.all(((model_output_act<=1.0), (model_output_act>=-1.0))), \ + 'the action should be in range [-1.0, 1.0]' assert high_bound > low_bound action = low_bound + (model_output_act - (-1.0)) * ( (high_bound - low_bound) / 2.0) + action = np.clip(action, low_bound, high_bound) return action @@ -82,7 +89,7 @@ MAX_INT32 = 0x7fffffff try: from paddle import fluid fluid_version = get_fluid_version() - assert fluid_version >= 151, "PARL requires paddle>=1.5.1" + assert fluid_version >= 161 or fluid_version == 0, "PARL requires paddle>=1.6.1" _HAS_FLUID = True except ImportError: _HAS_FLUID = False @@ -92,3 +99,26 @@ try: _HAS_TORCH = True except ImportError: _HAS_TORCH = False + +_IS_WINDOWS = (sys.platform == 'win32') +_IS_MAC = (sys.platform == 'darwin') + + +def kill_process(regex_pattern): + """kill process whose execution commnad is matched by regex pattern + + Args: + regex_pattern(string): regex pattern used to filter the process to be killed + + NOTE: + In windows, we will replace sep `/` with `\\\\` + """ + if _IS_WINDOWS: + regex_pattern = regex_pattern.replace('/', '\\\\') + command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%{}%'" get processid^,status /format:csv') do taskkill /F /T /pid %a'''.format( + regex_pattern) + os.popen(command).read() + else: + command = "ps aux | grep {} | awk '{{print $2}}' | xargs kill -9".format( + regex_pattern) + subprocess.call([command], shell=True) diff --git a/parl/utils/visualdl.py b/parl/utils/visualdl.py new file mode 100644 index 0000000000000000000000000000000000000000..bbf1aa08e313440a47a73936d1be61ca9701f166 --- /dev/null +++ b/parl/utils/visualdl.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
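# Worked example of the action_mapping change above (numbers illustrative):
# with low_bound=-2.0 and high_bound=2.0, a model output of 0.5 maps to
# -2.0 + (0.5 - (-1.0)) * ((2.0 - (-2.0)) / 2.0) = 1.0, and the added
# np.clip keeps any numerical overshoot inside [-2.0, 2.0].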
+ +from visualdl import LogWriter +from parl.utils import logger +from parl.utils.machine_info import get_ip_address + +__all__ = [] + +_writer = None +_WRITTER_METHOD = ['add_scalar'] + + +def create_file_after_first_call(func_name): + def call(*args, **kwargs): + global _writer + if _writer is None: + logdir = logger.get_dir() + if logdir is None: + logdir = logger.auto_set_dir(action='d') + logger.warning( + "[VisualDL] logdir is None, will save VisualDL files to {}\nView the data using: visualdl --logdir=./{} --host={}" + .format(logdir, logdir, get_ip_address())) + _writer = LogWriter(logdir=logger.get_dir()) + func = getattr(_writer, func_name) + func(*args, **kwargs) + _writer.flush() + + return call + + +# export writter functions +for func_name in _WRITTER_METHOD: + locals()[func_name] = create_file_after_first_call(func_name) + __all__.append(func_name) diff --git a/setup.py b/setup.py index 18f7ac96a8b075a3db3a12bb19f8f0c0905e33c2..56af19b09fe0f7bf658557ee5b8ab4f7e25498ab 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,12 @@ def _find_packages(prefix=''): prefix = prefix for root, _, files in os.walk(path): if '__init__.py' in files: - packages.append(re.sub('^[^A-z0-9_]', '', root.replace('/', '.'))) + if sys.platform == 'win32': + packages.append( + re.sub('^[^A-z0-9_]', '', root.replace('\\', '.'))) + else: + packages.append( + re.sub('^[^A-z0-9_]', '', root.replace('/', '.'))) return packages @@ -72,9 +77,11 @@ setup( "cloudpickle==1.2.1", "tensorboardX==1.8", "tb-nightly==1.15.0a20190801", - "flask==1.0.4", + "flask>=1.0.4", "click", - "psutil", + "psutil>=5.6.2", + "flask_cors", + "visualdl>=2.0.0b;python_version>='3' and platform_system=='Linux'", ], classifiers=[ 'Intended Audience :: Developers',