diff --git a/ding/league/__init__.py b/ding/league/__init__.py index 372c1376eec0c9212866e15ea98c8494862e15f8..d87ef83193038970b8658b056ea90b83a304da8d 100644 --- a/ding/league/__init__.py +++ b/ding/league/__init__.py @@ -3,3 +3,4 @@ from .one_vs_one_league import OneVsOneLeague from .player import Player, ActivePlayer, HistoricalPlayer, create_player from .starcraft_player import MainPlayer, MainExploiter, LeagueExploiter from .shared_payoff import create_payoff +from .metric import get_elo, get_elo_array, LeagueMetricEnv diff --git a/ding/league/base_league.py b/ding/league/base_league.py index 8575deec91cc64a9699284f1e5b4742ef5043d27..2e27e00e1737a91eef17467e2e116cde43bdfd8a 100644 --- a/ding/league/base_league.py +++ b/ding/league/base_league.py @@ -7,7 +7,9 @@ import os.path as osp from ding.league.player import ActivePlayer, HistoricalPlayer, create_player from ding.league.shared_payoff import create_payoff -from ding.utils import import_module, read_file, save_file, LockContext, LockContextType, LEAGUE_REGISTRY +from ding.utils import import_module, read_file, save_file, LockContext, LockContextType, LEAGUE_REGISTRY, \ + deep_merge_dicts +from .metric import LeagueMetricEnv class BaseLeague: @@ -27,6 +29,41 @@ class BaseLeague: cfg.cfg_type = cls.__name__ + 'Dict' return cfg + config = dict( + league_type='base', + import_names=["ding.league.base_league"], + # ---player---- + # "player_category" is just a name. Depends on the env. + # For example, in StarCraft, this can be ['zerg', 'terran', 'protoss']. + player_category=['default'], + # Support different types of active players for solo and battle league. + # For solo league, supports ['solo_active_player']. + # For battle league, supports ['battle_active_player', 'main_player', 'main_exploiter', 'league_exploiter']. + # active_players=dict(), + # "use_pretrain" means whether to use pretrain model to initialize active player. + use_pretrain=False, + # "use_pretrain_init_historical" means whether to use pretrain model to initialize historical player. + # "pretrain_checkpoint_path" is the pretrain checkpoint path used in "use_pretrain" and + # "use_pretrain_init_historical". If both are False, "pretrain_checkpoint_path" can be omitted as well. + # Otherwise, "pretrain_checkpoint_path" should list paths of all player categories. + use_pretrain_init_historical=False, + pretrain_checkpoint_path=dict(default='default_cate_pretrain.pth', ), + # ---payoff--- + payoff=dict( + # Supports ['battle'] + type='battle', + decay=0.99, + min_win_rate_games=8, + ), + metric=dict( + mu=0, + sigma=25 / 3, + beta=25 / 3 / 2, + tau=0.0, + draw_probability=0.02, + ), + ) + def __init__(self, cfg: EasyDict) -> None: """ Overview: @@ -34,16 +71,19 @@ class BaseLeague: Arguments: - cfg (:obj:`EasyDict`): League config. 
""" - self.cfg = cfg + self.cfg = deep_merge_dicts(self.default_config(), cfg) self.path_policy = cfg.path_policy if not osp.exists(self.path_policy): os.mkdir(self.path_policy) self.league_uid = str(uuid.uuid1()) + # TODO dict players self.active_players = [] self.historical_players = [] self.player_path = "./league" self.payoff = create_payoff(self.cfg.payoff) + metric_cfg = self.cfg.metric + self.metric_env = LeagueMetricEnv(metric_cfg.mu, metric_cfg.sigma, metric_cfg.tau, metric_cfg.draw_probability) self._active_players_lock = LockContext(type_=LockContextType.THREAD_LOCK) self._init_players() @@ -58,7 +98,9 @@ class BaseLeague: for i in range(n): # This type's active player number name = '{}_{}_{}'.format(k, cate, i) ckpt_path = osp.join(self.path_policy, '{}_ckpt.pth'.format(name)) - player = create_player(self.cfg, k, self.cfg[k], cate, self.payoff, ckpt_path, name, 0) + player = create_player( + self.cfg, k, self.cfg[k], cate, self.payoff, ckpt_path, name, 0, self.metric_env.create_rating() + ) if self.cfg.use_pretrain: self.save_checkpoint(self.cfg.pretrain_checkpoint_path[cate], ckpt_path) self.active_players.append(player) @@ -79,6 +121,7 @@ class BaseLeague: self.cfg.pretrain_checkpoint_path[cate], name, 0, + self.metric_env.create_rating(), parent_id=parent_name ) self.historical_players.append(hp) @@ -140,7 +183,7 @@ class BaseLeague: player = self.active_players[idx] if force or player.is_trained_enough(): # Snapshot - hp = player.snapshot() + hp = player.snapshot(self.metric_env) self.save_checkpoint(player.checkpoint_path, hp.checkpoint_path) self.historical_players.append(hp) self.payoff.add_player(hp) @@ -197,6 +240,21 @@ class BaseLeague: """ # TODO(nyz) more fine-grained job info self.payoff.update(job_info) + if 'eval_flag' in job_info and job_info['eval_flag']: + home_id, away_id = job_info['player_id'] + home_player, away_player = self.get_player_by_id(home_id), self.get_player_by_id(away_id) + job_info_result = job_info['result'] + if isinstance(job_info_result[0], list): + job_info_result = sum(job_info_result, []) + home_player.rating, away_player.rating = self.metric_env.rate_1vs1( + home_player.rating, away_player.rating, result=job_info_result + ) + + def get_player_by_id(self, player_id: str) -> 'Player': # noqa + if 'historical' in player_id: + return [p for p in self.historical_players if p.player_id == player_id][0] + else: + return [p for p in self.active_players if p.player_id == player_id][0] @staticmethod def save_checkpoint(src_checkpoint, dst_checkpoint) -> None: diff --git a/ding/league/metric.py b/ding/league/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..41c587cd02d3d05fc54296f85ecc72290ba07f9c --- /dev/null +++ b/ding/league/metric.py @@ -0,0 +1,127 @@ +from typing import Tuple, Union, List +import math +import numpy as np +from trueskill import TrueSkill, Rating, rate_1vs1 + + +class EloCalculator(object): + score = { + 1: 1.0, # win + 0: 0.5, # draw + -1: 0.0, # lose + } + + @classmethod + def get_new_rating(cls, + rating_a: int, + rating_b: int, + result: int, + k_factor: int = 32, + beta: int = 200) -> Tuple[int, int]: + assert result in [1, 0, -1] + expect_a = 1. / (1. + math.pow(10, (rating_b - rating_a) / (2. * beta))) + expect_b = 1. / (1. + math.pow(10, (rating_a - rating_b) / (2. 
* beta))) + new_rating_a = rating_a + k_factor * (EloCalculator.score[result] - expect_a) + new_rating_b = rating_b + k_factor * (1 - EloCalculator.score[result] - expect_b) + return round(new_rating_a), round(new_rating_b) + + @classmethod + def get_new_rating_array( + cls, + rating: np.ndarray, + result: np.ndarray, + game_count: np.ndarray, + k_factor: int = 32, + beta: int = 200 + ) -> np.ndarray: + """ + Shapes: + rating: :math:`(N, )`, N is the number of players + result: :math:`(N, N)` + game_count: :math:`(N, N)` + """ + rating_diff = np.expand_dims(rating, 0) - np.expand_dims(rating, 1) + expect = 1. / (1. + np.power(10, rating_diff / (2. * beta))) * game_count + delta = ((result + 1.) / 2 - expect) * (game_count > 0) + delta = delta.sum(axis=1) + return np.round(rating + k_factor * delta).astype(np.int64) + + +class PlayerRating(Rating): + + def __init__(self, mu: float = None, sigma: float = None, elo_init: int = None) -> None: + super(PlayerRating, self).__init__(mu, sigma) + self.elo = elo_init + + def __repr__(self) -> str: + c = type(self) + args = ('.'.join([c.__module__, c.__name__]), self.mu, self.sigma, self.exposure, self.elo) + return '%s(mu=%.3f, sigma=%.3f, exposure=%.3f, elo=%d)' % args + + +class LeagueMetricEnv(TrueSkill): + """ + Overview: + TrueSkill rating system among game players, for more details please refer to ``https://trueskill.org/`` + """ + + def __init__(self, *args, elo_init: int = 1200, **kwargs) -> None: + super(LeagueMetricEnv, self).__init__(*args, **kwargs) + self.elo_init = elo_init + + def create_rating(self, mu: float = None, sigma: float = None, elo_init: int = None) -> PlayerRating: + if mu is None: + mu = self.mu + if sigma is None: + sigma = self.sigma + if elo_init is None: + elo_init = self.elo_init + return PlayerRating(mu, sigma, elo_init) + + @staticmethod + def _rate_1vs1(t1, t2, **kwargs): + t1_elo, t2_elo = t1.elo, t2.elo + t1, t2 = rate_1vs1(t1, t2, **kwargs) + if 'drawn' in kwargs: + result = 0 + else: + result = 1 + t1_elo, t2_elo = EloCalculator.get_new_rating(t1_elo, t2_elo, result) + t1 = PlayerRating(t1.mu, t1.sigma, t1_elo) + t2 = PlayerRating(t2.mu, t2.sigma, t2_elo) + return t1, t2 + + def rate_1vs1(self, + team1: PlayerRating, + team2: PlayerRating, + result: List[str] = None, + **kwargs) -> Tuple[PlayerRating, PlayerRating]: + if result is None: + return self._rate_1vs1(team1, team2, **kwargs) + else: + for r in result: + if r == 'wins': + team1, team2 = self._rate_1vs1(team1, team2) + elif r == 'draws': + team1, team2 = self._rate_1vs1(team1, team2, drawn=True) + elif r == 'losses': + team2, team1 = self._rate_1vs1(team2, team1) + else: + raise RuntimeError("invalid result: {}".format(r)) + return team1, team2 + + def rate_1vsC(self, team1: PlayerRating, team2: PlayerRating, result: List[str]) -> PlayerRating: + for r in result: + if r == 'wins': + team1, _ = self._rate_1vs1(team1, team2) + elif r == 'draws': + team1, _ = self._rate_1vs1(team1, team2, drawn=True) + elif r == 'losses': + _, team1 = self._rate_1vs1(team2, team1) + else: + raise RuntimeError("invalid result: {}".format(r)) + return team1 + + +get_elo = EloCalculator.get_new_rating +get_elo_array = EloCalculator.get_new_rating_array diff --git a/ding/league/one_vs_one_league.py b/ding/league/one_vs_one_league.py index 6a13cad5e70f9dad1352b825b11610f6cc4f7980..2555aa1d9130f5d871cd3014b40f3324c0291509 100644 --- a/ding/league/one_vs_one_league.py +++ b/ding/league/one_vs_one_league.py @@ -53,6 +53,13 @@ class OneVsOneLeague(BaseLeague): decay=0.99,
min_win_rate_games=8, ), + metric=dict( + mu=0, + sigma=25 / 3, + beta=25 / 3 / 2, + tau=0.0, + draw_probability=0.02, + ), ) # override diff --git a/ding/league/player.py b/ding/league/player.py index 0a1b5df0a4d66858b95674fbb4656371fc915749..e253c0bdadfe460818ebbe241ba494c354e204bb 100644 --- a/ding/league/player.py +++ b/ding/league/player.py @@ -25,7 +25,8 @@ class Player: init_payoff: 'BattleSharedPayoff', # noqa checkpoint_path: str, player_id: str, - total_agent_step: int + total_agent_step: int, + rating: 'PlayerRating', # noqa ) -> None: """ Overview: @@ -39,6 +40,7 @@ class Player: - player_id (:obj:`str`): Player id in string format. - total_agent_step (:obj:`int`): For active player, it should be 0; \ For historical player, it should be parent player's ``_total_agent_step`` when ``snapshot``. + - rating (:obj:`PlayerRating`): player rating information in total league """ self._cfg = cfg self._category = category @@ -48,6 +50,7 @@ class Player: self._player_id = player_id assert isinstance(total_agent_step, int), (total_agent_step, type(total_agent_step)) self._total_agent_step = total_agent_step + self._rating = rating @property def category(self) -> str: @@ -73,6 +76,14 @@ class Player: def total_agent_step(self, step: int) -> None: self._total_agent_step = step + @property + def rating(self) -> 'PlayerRating': # noqa + return self._rating + + @rating.setter + def rating(self, _rating: 'PlayerRating') -> None: # noqa + self._rating = _rating + @PLAYER_REGISTRY.register('historical_player') class HistoricalPlayer(Player): @@ -168,10 +179,12 @@ class ActivePlayer(Player): else: return False - def snapshot(self) -> HistoricalPlayer: + def snapshot(self, metric_env: 'LeagueMetricEnv') -> HistoricalPlayer: # noqa """ Overview: Generate a snapshot historical player from the current player, called in league's ``_snapshot``. 
+ Arguments: + - metric_env (:obj:`LeagueMetricEnv`): player rating environment; one league holds one such env Returns: - snapshot_player (:obj:`HistoricalPlayer`): new instantiated historical player @@ -187,6 +200,7 @@ class ActivePlayer(Player): path, self.player_id + '_{}_historical'.format(int(self._total_agent_step)), self._total_agent_step, + metric_env.create_rating(mu=self.rating.mu), parent_id=self.player_id ) diff --git a/ding/league/starcraft_player.py b/ding/league/starcraft_player.py index 3ab1d0b6ed118a1561b32c259ea4c0ae84c74c24..81d53e73bd2e0c54b353e1d5f5421745a2de92b5 100644 --- a/ding/league/starcraft_player.py +++ b/ding/league/starcraft_player.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union import numpy as np from ding.utils import PLAYER_REGISTRY @@ -105,7 +105,7 @@ class MainPlayer(ActivePlayer): Overview: MainPlayer does not mutate """ - return None + pass @PLAYER_REGISTRY.register('main_exploiter') @@ -168,9 +168,9 @@ class MainExploiter(ActivePlayer): Overview: Main exploiter is sure to mutate(reset) to the supervised learning player Returns: - - ckpt_path (:obj:`str`): the pretrained model's ckpt path + - mutate_ckpt_path (:obj:`str`): mutation target checkpoint path """ - return info['pretrain_checkpoint_path'] + return info['reset_checkpoint_path'] @PLAYER_REGISTRY.register('league_exploiter') @@ -220,15 +220,15 @@ class LeagueExploiter(ActivePlayer): return super().is_trained_enough(select_fn=lambda p: isinstance(p, HistoricalPlayer)) # override - def mutate(self, info) -> Optional[str]: + def mutate(self, info) -> Union[str, None]: """ Overview: League exploiter can mutate to the supervised learning player with 0.25 prob Returns: - - ckpt_path (:obj:`str`): with ``mutate_prob`` prob returns the pretrained model's ckpt path, \ + - ckpt_path (:obj:`Union[str, None]`): with ``mutate_prob`` prob returns the pretrained model's ckpt path, \ with left 1 - ``mutate_prob`` prob returns None, which means no mutation """ p = np.random.uniform() if p < self.mutate_prob: - return info['pretrain_checkpoint_path'] + return info['reset_checkpoint_path'] return None diff --git a/ding/league/tests/test_league_metric.py b/ding/league/tests/test_league_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..41da70e7086d212b9a5b3aa593db2fa019a00711 --- /dev/null +++ b/ding/league/tests/test_league_metric.py @@ -0,0 +1,56 @@ +import pytest +import numpy as np +from ding.league import get_elo, get_elo_array, LeagueMetricEnv + + +@pytest.mark.unittest +def test_elo_calculator(): + game_count = np.array([[0, 1, 2], [1, 0, 0], [2, 0, 0]]) + rating = np.array([1613, 1573, 1601]) + result = np.array([[0, -1, -1 + 1], [1, 0, 0], [1 + (-1), 0, 0]]) + new_rating0, new_rating1 = get_elo(rating[0], rating[1], result[0][1]) + assert new_rating0 == 1595 + assert new_rating1 == 1591 + + old_rating = np.copy(rating) + new_rating = get_elo_array(rating, result, game_count) + assert (rating == old_rating).all() # no inplace modification + assert new_rating.dtype == np.int64 + assert new_rating[0] == 1578 + assert new_rating[1] == 1591 + assert new_rating[2] == 1586 + + +@pytest.mark.unittest +def test_league_metric(): + sigma = 25 / 3 + env = LeagueMetricEnv(mu=0, sigma=sigma, beta=sigma / 2, tau=0.0, draw_probability=0.02, elo_init=1000) + r1 = env.create_rating(elo_init=1613) + r2 = env.create_rating(elo_init=1573) + assert r1.mu == 0 + assert r2.mu == 0 + assert r1.sigma == sigma + assert r2.sigma == sigma + assert r1.elo == 1613 + assert r2.elo ==
1573 + # r1 draw r2 + r1, r2 = env.rate_1vs1(r1, r2, drawn=True) + assert r1.mu == r2.mu + assert r1.elo == 1611 + assert r2.elo == 1575 + # r1 win r2 + new_r1, new_r2 = env.rate_1vs1(r1, r2) + assert new_r1.mu > r1.mu + assert new_r2.mu < r2.mu + assert new_r1.mu + new_r2.mu == 0 + assert pytest.approx(new_r1.mu, 3.230) + assert pytest.approx(new_r2.mu, -3.230) + assert new_r1.elo == 1625 + assert new_r2.elo == 1561 + # multi result + new_r1, new_r2 = env.rate_1vs1(r1, r2, result=['wins', 'wins', 'losses']) + assert new_r1.elo > 1611 + # 1vsConstant + new_r1 = env.rate_1vsC(r1, env.create_rating(elo_init=1800), result=['losses', 'losses']) + assert new_r1.elo < 1611 + print('final rating is: ', new_r1) diff --git a/ding/league/tests/test_payoff.py b/ding/league/tests/test_payoff.py index 379855605c27234fc97bb2698f845478db6247c3..b358d88578031e1b377387c51123146f101de758 100644 --- a/ding/league/tests/test_payoff.py +++ b/ding/league/tests/test_payoff.py @@ -8,6 +8,9 @@ from easydict import EasyDict from ding.league.player import Player from ding.league.shared_payoff import BattleRecordDict, create_payoff +from ding.league.metric import LeagueMetricEnv + +env = LeagueMetricEnv() @pytest.mark.unittest @@ -42,7 +45,8 @@ def get_shared_payoff_player(payoff): init_payoff=payoff, checkpoint_path='sp_ckpt_{}.pth'.format(sp_player_count), player_id='sp_player_{}'.format(sp_player_count), - total_agent_step=0 + total_agent_step=0, + rating=env.create_rating(), ) sp_player_count += 1 return player diff --git a/ding/league/tests/test_player.py b/ding/league/tests/test_player.py index 85b6906925efb6b612def6f9d481c78b41283b3c..6cfb03dd6f5469cd72491067fe27c5cc9a6344b8 100644 --- a/ding/league/tests/test_player.py +++ b/ding/league/tests/test_player.py @@ -8,8 +8,10 @@ from ding.league.player import Player, HistoricalPlayer, ActivePlayer, create_pl from ding.league.shared_payoff import create_payoff from ding.league.starcraft_player import MainPlayer, MainExploiter, LeagueExploiter from ding.league.tests.league_test_default_config import league_test_config +from ding.league.metric import LeagueMetricEnv ONE_PHASE_STEP = 2000 +env = LeagueMetricEnv() @pytest.fixture(scope='function') @@ -27,7 +29,7 @@ def setup_league(setup_payoff): players.append( create_player( league_test_config.league, 'main_player', league_test_config.league.main_player, category, setup_payoff, - 'ckpt_{}.pth'.format(main_player_name), main_player_name, 0 + 'ckpt_{}.pth'.format(main_player_name), main_player_name, 0, env.create_rating() ) ) # main_exloiter @@ -35,7 +37,7 @@ def setup_league(setup_payoff): players.append( create_player( league_test_config.league, 'main_exploiter', league_test_config.league.main_exploiter, category, - setup_payoff, 'ckpt_{}.pth'.format(main_exploiter_name), main_exploiter_name, 0 + setup_payoff, 'ckpt_{}.pth'.format(main_exploiter_name), main_exploiter_name, 0, env.create_rating() ) ) # league_exploiter @@ -51,6 +53,7 @@ def setup_league(setup_payoff): 'ckpt_{}.pth'.format(league_exploiter_name), league_exploiter_name, 0, + env.create_rating(), ) ) # historical player: sl player is used as initial HistoricalPlayer @@ -65,7 +68,8 @@ def setup_league(setup_payoff): 'ckpt_sl_{}'.format(sl_hp_name), sl_hp_name, 0, - parent_id=main_player_name + env.create_rating(), + parent_id=main_player_name, ) ) for p in players: @@ -94,7 +98,7 @@ class TestMainPlayer: for p in setup_league: if isinstance(p, ActivePlayer): p.total_agent_step = 2 * ONE_PHASE_STEP - hp = p.snapshot() + hp = p.snapshot(env) 
hp_list.append(hp) setup_payoff.add_player(hp) setup_league += hp_list # 12+3 + 12 @@ -122,7 +126,7 @@ class TestMainPlayer: for p in setup_league: for i in range(N): if isinstance(p, ActivePlayer): - hp = p.snapshot() + hp = p.snapshot(env) assert isinstance(hp, HistoricalPlayer) assert id(hp.payoff) == id(p.payoff) assert hp.parent_id == p.player_id @@ -146,7 +150,7 @@ class TestMainPlayer: hp_list = [] for p in setup_league: if isinstance(p, MainPlayer): - hp = p.snapshot() + hp = p.snapshot(env) setup_payoff.add_player(hp) hp_list.append(hp) setup_league += hp_list @@ -243,7 +247,7 @@ class TestMainExploiter: for p in setup_league: if isinstance(p, MainPlayer): p.total_agent_step = (i + 1) * 2 * ONE_PHASE_STEP - hp = p.snapshot() + hp = p.snapshot(env) setup_payoff.add_player(hp) hp_list.append(hp) setup_league += hp_list @@ -272,9 +276,9 @@ class TestMainExploiter: def test_mutate(self, setup_league): assert isinstance(setup_league[1], MainExploiter) - info = {'pretrain_checkpoint_path': 'pretrain_checkpoint.pth'} + info = {'reset_checkpoint_path': 'pretrain_checkpoint.pth'} for _ in range(10): - assert setup_league[1].mutate(info) == info['pretrain_checkpoint_path'] + assert setup_league[1].mutate(info) == info['reset_checkpoint_path'] @pytest.mark.unittest @@ -296,7 +300,7 @@ class TestLeagueExploiter: def test_mutate(self, setup_league): assert isinstance(setup_league[2], LeagueExploiter) - info = {'pretrain_checkpoint_path': 'pretrain_checkpoint.pth'} + info = {'reset_checkpoint_path': 'pretrain_checkpoint.pth'} results = [] for _ in range(1000): results.append(setup_league[2].mutate(info)) diff --git a/ding/worker/collector/one_vs_one_serial_evaluator.py b/ding/worker/collector/one_vs_one_serial_evaluator.py index 03cd8831d902a0aaaac41f7aabc535ed1ef56c63..5e1ea54b74380592ff4837ef40bc44e0c2cdfb8b 100644 --- a/ding/worker/collector/one_vs_one_serial_evaluator.py +++ b/ding/worker/collector/one_vs_one_serial_evaluator.py @@ -171,7 +171,7 @@ class OnevOneEvaluator(object): train_iter: int = -1, envstep: int = -1, n_episode: Optional[int] = None - ) -> Tuple[bool, float]: + ) -> Tuple[bool, float, list]: ''' Overview: Evaluate policy and store the best policy based on whether it reaches the highest historical reward. @@ -183,12 +183,14 @@ class OnevOneEvaluator(object): Returns: - stop_flag (:obj:`bool`): Whether this training program can be ended. - eval_reward (:obj:`float`): Current eval_reward. + - return_info (:obj:`list`): Environment information of each finished episode ''' if n_episode is None: n_episode = self._default_n_episode assert n_episode is not None, "please indicate eval n_episode" envstep_count = 0 info = {} + return_info = [[] for _ in range(2)] eval_monitor = VectorEvalMonitor(self._env.env_num, n_episode) self._env.reset() for p in self._policy: @@ -219,6 +221,8 @@ class OnevOneEvaluator(object): if 'episode_info' in t.info[0]: eval_monitor.update_info(env_id, t.info[0]['episode_info']) eval_monitor.update_reward(env_id, reward) + for policy_id in range(2): + return_info[policy_id].append(t.info[policy_id]) self._logger.info( "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format( env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode() @@ -266,7 +270,7 @@ class OnevOneEvaluator(object): "Current eval_reward: {} is greater than stop_value: {}".format(eval_reward, self._stop_value) + ", so your RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details." 
) - return stop_flag, eval_reward + return stop_flag, eval_reward, return_info class VectorEvalMonitor(object): diff --git a/dizoo/league_demo/demo_league.py b/dizoo/league_demo/demo_league.py index 2df9407c7dc2bc20f85cf8af131ed7636c1f688b..07465efa3473eae53bd6c57b468e2846ca25a009 100644 --- a/dizoo/league_demo/demo_league.py +++ b/dizoo/league_demo/demo_league.py @@ -1,9 +1,15 @@ +import os import shutil from easydict import EasyDict from ding.league import BaseLeague, ActivePlayer class DemoLeague(BaseLeague): + + def __init__(self, cfg): + super(DemoLeague, self).__init__(cfg) + self.reset_checkpoint_path = os.path.join(self.path_policy, 'reset_ckpt.pth') + # override def _get_job_info(self, player: ActivePlayer, eval_flag: bool = False) -> dict: assert isinstance(player, ActivePlayer), player.__class__ @@ -18,7 +24,12 @@ class DemoLeague(BaseLeague): # override def _mutate_player(self, player: ActivePlayer): - pass + for p in self.active_players: + result = p.mutate({'reset_checkpoint_path': self.reset_checkpoint_path}) + if result is not None: + p.rating = self.metric_env.create_rating() + self.load_checkpoint(p.player_id, result) # load_checkpoint is set by the caller of league + self.save_checkpoint(result, p.checkpoint_path) # override def _update_player(self, player: ActivePlayer, player_info: dict) -> None: diff --git a/dizoo/league_demo/game_env.py b/dizoo/league_demo/game_env.py index 8626862c09ff15d0e00d7b552472350eef4df65c..7a76987f13abd2561b04467ee618e21ec9dc0f64 100644 --- a/dizoo/league_demo/game_env.py +++ b/dizoo/league_demo/game_env.py @@ -7,6 +7,10 @@ class GameEnv(BaseEnv): def __init__(self, game_type='prisoner_dilemma'): self.game_type = game_type assert self.game_type in ['zero_sum', 'prisoner_dilemma'] + if self.game_type == 'prisoner_dilemma': + self.optimal_policy = [0, 1] + elif self.game_type == 'zero_sum': + self.optimal_policy = [0.375, 0.625] def seed(self, seed, dynamic_seed=False): pass @@ -36,10 +40,10 @@ class GameEnv(BaseEnv): results = "draws", "draws" elif actions == [0, 1]: rewards = -20, 0 - results = "wins", "losses" + results = "losses", "wins" elif actions == [1, 0]: rewards = 0, -20 - results = "losses", "wins" + results = "wins", "losses" elif actions == [1, 1]: rewards = -10, -10 results = 'draws', 'draws' diff --git a/dizoo/league_demo/league_demo_ppo_config.py b/dizoo/league_demo/league_demo_ppo_config.py index 1209400bbaf81e52ee3eb60817a48d5bd0a9564d..41df4fa1110f2ec4c9bd5fa7273ff29b3a152033 100644 --- a/dizoo/league_demo/league_demo_ppo_config.py +++ b/dizoo/league_demo/league_demo_ppo_config.py @@ -6,7 +6,8 @@ league_demo_ppo_config = dict( collector_env_num=8, evaluator_env_num=10, n_evaluator_episode=100, - stop_value=[-0.01, -5], + env_type='prisoner_dilemma', # ['zero_sum', 'prisoner_dilemma'] + stop_value=[-10.1, -5.05], # prisoner_dilemma manager=dict(shared_memory=False, ), ), policy=dict( @@ -58,7 +59,7 @@ league_demo_ppo_config = dict( one_phase_step=200, branch_probs=dict(pfsp=1.0, ), strong_win_rate=0.7, - mutate_prob=0.0, + mutate_prob=0.5, ), use_pretrain=False, use_pretrain_init_historical=False, @@ -66,7 +67,14 @@ league_demo_ppo_config = dict( type='battle', decay=0.99, min_win_rate_games=8, - ) + ), + metric=dict( + mu=0, + sigma=25 / 3, + beta=25 / 3 / 2, + tau=0.0, + draw_probability=0.02, + ), ), ), ), diff --git a/dizoo/league_demo/league_demo_ppo_main.py b/dizoo/league_demo/league_demo_ppo_main.py index 625bbb7ea16115b2a1d5aeebb65461d683869042..1a520700354616eeda08d3d50223de108d896f76 100644 --- 
a/dizoo/league_demo/league_demo_ppo_main.py +++ b/dizoo/league_demo/league_demo_ppo_main.py @@ -18,8 +18,17 @@ from dizoo.league_demo.league_demo_ppo_config import league_demo_ppo_config class EvalPolicy1: + def __init__(self, optimal_policy: list) -> None: + assert len(optimal_policy) == 2 + self.optimal_policy = optimal_policy + def forward(self, data: dict) -> dict: - return {env_id: {'action': torch.zeros(1)} for env_id in data.keys()} + return { + env_id: { + 'action': torch.from_numpy(np.random.choice([0, 1], p=self.optimal_policy, size=(1, ))) + } + for env_id in data.keys() + } def reset(self, data_id: list = []) -> None: pass @@ -50,17 +59,26 @@ def main(cfg, seed=0, max_iterations=int(1e10)): NaiveReplayBuffer, save_cfg=True ) + env_type = cfg.env.env_type collector_env_num, evaluator_env_num = cfg.env.collector_env_num, cfg.env.evaluator_env_num - evaluator_env1 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager) - evaluator_env2 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager) + evaluator_env1 = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env2 = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env3 = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager + ) evaluator_env1.seed(seed, dynamic_seed=False) evaluator_env2.seed(seed, dynamic_seed=False) + evaluator_env3.seed(seed, dynamic_seed=False) set_pkg_seed(seed, use_cuda=cfg.policy.cuda) tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) league = DemoLeague(cfg.policy.other.league) - eval_policy1 = EvalPolicy1() + eval_policy1 = EvalPolicy1(evaluator_env1._env_ref.optimal_policy) eval_policy2 = EvalPolicy2() policies = {} learners = {} @@ -70,7 +88,9 @@ def main(cfg, seed=0, max_iterations=int(1e10)): model = VAC(**cfg.policy.model) policy = PPOPolicy(cfg.policy, model=model) policies[player_id] = policy - collector_env = BaseEnvManager(env_fn=[GameEnv for _ in range(collector_env_num)], cfg=cfg.env.manager) + collector_env = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(collector_env_num)], cfg=cfg.env.manager + ) collector_env.seed(seed) learners[player_id] = BaseLearner( @@ -90,8 +110,11 @@ def main(cfg, seed=0, max_iterations=int(1e10)): model = VAC(**cfg.policy.model) policy = PPOPolicy(cfg.policy, model=model) policies['historical'] = policy + # use initial policy as another eval_policy + eval_policy3 = PPOPolicy(cfg.policy, model=copy.deepcopy(model)).collect_mode main_key = [k for k in learners.keys() if k.startswith('main_player')][0] + main_player = league.get_player_by_id(main_key) main_learner = learners[main_key] main_collector = collectors[main_key] # collect_mode ppo use multimonial sample for selecting action @@ -113,25 +136,68 @@ def main(cfg, seed=0, max_iterations=int(1e10)): exp_name=cfg.exp_name, instance_name='uniform_evaluator' ) + evaluator3_cfg = copy.deepcopy(cfg.policy.eval.evaluator) + evaluator3_cfg.stop_value = 99999999 # stop_value of evaluator3 is a placeholder + evaluator3 = OnevOneEvaluator( + evaluator3_cfg, + evaluator_env3, [policies[main_key].collect_mode, eval_policy3], + tb_logger, + exp_name=cfg.exp_name, + instance_name='init_evaluator' + ) + + def load_checkpoint_fn(player_id: str, ckpt_path: str): + state_dict = torch.load(ckpt_path) + 
policies[player_id].learn_mode.load_state_dict(state_dict) + torch.save(policies['historical'].learn_mode.state_dict(), league.reset_checkpoint_path) + league.load_checkpoint = load_checkpoint_fn for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts): torch.save(policies[player_id].collect_mode.state_dict(), player_ckpt_path) league.judge_snapshot(player_id, force=True) + init_main_player_rating = league.metric_env.create_rating(mu=0) for run_iter in range(max_iterations): if evaluator1.should_eval(main_learner.train_iter): - stop_flag1, reward = evaluator1.eval( + stop_flag1, reward, episode_info = evaluator1.eval( main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep ) + win_loss_result = [e['result'] for e in episode_info[0]] + # set fixed NE policy trueskill(exposure) equal 10 + main_player.rating = league.metric_env.rate_1vsC( + main_player.rating, league.metric_env.create_rating(mu=10, sigma=1e-8), win_loss_result + ) tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, main_collector.envstep) if evaluator2.should_eval(main_learner.train_iter): - stop_flag2, reward = evaluator2.eval( + stop_flag2, reward, episode_info = evaluator2.eval( main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep ) + win_loss_result = [e['result'] for e in episode_info[0]] + # set random(uniform) policy trueskill(exposure) equal 0 + main_player.rating = league.metric_env.rate_1vsC( + main_player.rating, league.metric_env.create_rating(mu=0, sigma=1e-8), win_loss_result + ) tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, main_collector.envstep) + if evaluator3.should_eval(main_learner.train_iter): + _, reward, episode_info = evaluator3.eval( + main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep + ) + win_loss_result = [e['result'] for e in episode_info[0]] + # use init main player as another evaluator metric + main_player.rating, init_main_player_rating = league.metric_env.rate_1vs1( + main_player.rating, init_main_player_rating, win_loss_result + ) + tb_logger.add_scalar('init_evaluator_step/reward_mean', reward, main_collector.envstep) + tb_logger.add_scalar( + 'league/init_main_player_trueskill', init_main_player_rating.exposure, main_collector.envstep + ) if stop_flag1 and stop_flag2: break for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts): + tb_logger.add_scalar( + 'league/{}_trueskill'.format(player_id), + league.get_player_by_id(player_id).rating.exposure, main_collector.envstep + ) collector, learner = collectors[player_id], learners[player_id] job = league.get_job_info(player_id) opponent_player_id = job['player_id'][1] @@ -144,7 +210,7 @@ def main(cfg, seed=0, max_iterations=int(1e10)): opponent_policy = policies[opponent_player_id].collect_mode collector.reset_policy([policies[player_id].collect_mode, opponent_policy]) train_data, episode_info = collector.collect(train_iter=learner.train_iter) - train_data, episode_info = train_data[0], episode_info[0] # only use launer player data for training + train_data, episode_info = train_data[0], episode_info[0] # only use launch player data for training for d in train_data: d['adv'] = d['reward'] @@ -156,7 +222,9 @@ def main(cfg, seed=0, max_iterations=int(1e10)): player_info['player_id'] = player_id league.update_active_player(player_info) league.judge_snapshot(player_id) + # set eval_flag=True to enable trueskill update job_finish_info = { + 'eval_flag': True, 
'launch_player': job['launch_player'], 'player_id': job['player_id'], 'result': [e['result'] for e in episode_info], diff --git a/dizoo/league_demo/selfplay_demo_ppo_main.py b/dizoo/league_demo/selfplay_demo_ppo_main.py index 529e5cc0a8f8bfafcf3523821c61e13959c9115a..83c27bb675ec237944a709415f7107cd9b576019 100644 --- a/dizoo/league_demo/selfplay_demo_ppo_main.py +++ b/dizoo/league_demo/selfplay_demo_ppo_main.py @@ -50,10 +50,17 @@ def main(cfg, seed=0, max_iterations=int(1e10)): NaiveReplayBuffer, save_cfg=True ) + env_type = cfg.env.env_type collector_env_num, evaluator_env_num = cfg.env.collector_env_num, cfg.env.evaluator_env_num - collector_env = BaseEnvManager(env_fn=[GameEnv for _ in range(collector_env_num)], cfg=cfg.env.manager) - evaluator_env1 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager) - evaluator_env2 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager) + collector_env = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(collector_env_num)], cfg=cfg.env.manager + ) + evaluator_env1 = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager + ) + evaluator_env2 = BaseEnvManager( + env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager + ) collector_env.seed(seed) evaluator_env1.seed(seed, dynamic_seed=False) @@ -102,10 +109,10 @@ def main(cfg, seed=0, max_iterations=int(1e10)): for _ in range(max_iterations): if evaluator1.should_eval(learner1.train_iter): - stop_flag1, reward = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep) + stop_flag1, reward, _ = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep) tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, collector.envstep) if evaluator2.should_eval(learner1.train_iter): - stop_flag2, reward = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep) + stop_flag2, reward, _ = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep) tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, collector.envstep) if stop_flag1 and stop_flag2: break diff --git a/setup.py b/setup.py index 2572e17d2c07016ed6186aa518dfcbcd468da8c0..052d7dde17a8311ad668efaf4a0011419902da79 100755 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ setup( 'opencv-python', # pypy incompatible 'enum_tools', 'scipy', + 'trueskill', ], extras_require={ 'test': [
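
For reference, a minimal usage sketch of the Elo helpers exported from `ding/league/metric.py` (not part of the patch itself); the single-game and batched calls and their expected values are taken from the added `test_elo_calculator` unit test:

```python
import numpy as np
from ding.league import get_elo, get_elo_array

# Single game: player A (1613) loses to player B (1573); the result is given
# from A's perspective (1 = win, 0 = draw, -1 = loss).
new_a, new_b = get_elo(1613, 1573, -1)
assert (new_a, new_b) == (1595, 1591)

# Batched form: result[i][j] is the summed outcome of all games between player i
# and player j, and game_count[i][j] is how many games they played.
rating = np.array([1613, 1573, 1601])
result = np.array([[0, -1, 0], [1, 0, 0], [0, 0, 0]])
game_count = np.array([[0, 1, 2], [1, 0, 0], [2, 0, 0]])
print(get_elo_array(rating, result, game_count))  # [1578 1591 1586]
```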
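A companion sketch, under the same assumptions, of how `LeagueMetricEnv` combines TrueSkill and Elo bookkeeping; the call pattern mirrors `test_league_metric` and the way `league_demo_ppo_main.py` rates the main player against fixed evaluator policies:

```python
from ding.league import LeagueMetricEnv

env = LeagueMetricEnv(mu=0, sigma=25 / 3, beta=25 / 3 / 2, tau=0.0, draw_probability=0.02, elo_init=1200)
main = env.create_rating()                     # rating of a training (active) player
anchor = env.create_rating(mu=10, sigma=1e-8)  # near-deterministic reference opponent

# rate_1vs1 accepts a per-episode result list written from the first player's perspective;
# both ratings come back updated.
main, opponent = env.rate_1vs1(main, env.create_rating(), result=['wins', 'draws', 'losses'])

# rate_1vsC only returns an updated rating for the first player, which is how the demo
# keeps the NE / uniform evaluator policies pinned at a constant skill level.
main = env.rate_1vsC(main, anchor, result=['wins', 'wins'])
print(main.exposure, main.elo)  # exposure is the scalar the demo logs to TensorBoard
```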