OpenDILab / DI-engine
Commit dd4472e4
Authored Sep 02, 2021 by niuyazhe

Merge branch 'main' of https://github.com/opendilab/DI-engine

Parents: da19fdbd, d24f1f3d

Showing 16 changed files with 419 additions and 45 deletions (+419 -45)
Changed files:

    ding/league/__init__.py                                 +1    -0
    ding/league/base_league.py                              +62   -4
    ding/league/metric.py                                   +127  -0
    ding/league/one_vs_one_league.py                        +7    -0
    ding/league/player.py                                   +16   -2
    ding/league/starcraft_player.py                         +7    -7
    ding/league/tests/test_league_metric.py                 +56   -0
    ding/league/tests/test_payoff.py                        +5    -1
    ding/league/tests/test_player.py                        +14   -10
    ding/worker/collector/one_vs_one_serial_evaluator.py    +6    -2
    dizoo/league_demo/demo_league.py                        +12   -1
    dizoo/league_demo/game_env.py                           +6    -2
    dizoo/league_demo/league_demo_ppo_config.py             +11   -3
    dizoo/league_demo/league_demo_ppo_main.py               +76   -8
    dizoo/league_demo/selfplay_demo_ppo_main.py             +12   -5
    setup.py                                                +1    -0
ding/league/__init__.py

```diff
@@ -3,3 +3,4 @@ from .one_vs_one_league import OneVsOneLeague
 from .player import Player, ActivePlayer, HistoricalPlayer, create_player
 from .starcraft_player import MainPlayer, MainExploiter, LeagueExploiter
 from .shared_payoff import create_payoff
+from .metric import get_elo, get_elo_array, LeagueMetricEnv
```
ding/league/base_league.py

```diff
@@ -7,7 +7,9 @@ import os.path as osp
 from ding.league.player import ActivePlayer, HistoricalPlayer, create_player
 from ding.league.shared_payoff import create_payoff
-from ding.utils import import_module, read_file, save_file, LockContext, LockContextType, LEAGUE_REGISTRY
+from ding.utils import import_module, read_file, save_file, LockContext, LockContextType, LEAGUE_REGISTRY, \
+    deep_merge_dicts
+from .metric import LeagueMetricEnv


 class BaseLeague:
@@ -27,6 +29,41 @@ class BaseLeague:
         cfg.cfg_type = cls.__name__ + 'Dict'
         return cfg

+    config = dict(
+        league_type='base',
+        import_names=["ding.league.base_league"],
+        # ---player----
+        # "player_category" is just a name. Depends on the env.
+        # For example, in StarCraft, this can be ['zerg', 'terran', 'protoss'].
+        player_category=['default'],
+        # Support different types of active players for solo and battle league.
+        # For solo league, supports ['solo_active_player'].
+        # For battle league, supports ['battle_active_player', 'main_player', 'main_exploiter', 'league_exploiter'].
+        # active_players=dict(),
+        # "use_pretrain" means whether to use pretrain model to initialize active player.
+        use_pretrain=False,
+        # "use_pretrain_init_historical" means whether to use pretrain model to initialize historical player.
+        # "pretrain_checkpoint_path" is the pretrain checkpoint path used in "use_pretrain" and
+        # "use_pretrain_init_historical". If both are False, "pretrain_checkpoint_path" can be omitted as well.
+        # Otherwise, "pretrain_checkpoint_path" should list paths of all player categories.
+        use_pretrain_init_historical=False,
+        pretrain_checkpoint_path=dict(default='default_cate_pretrain.pth', ),
+        # ---payoff---
+        payoff=dict(
+            # Supports ['battle']
+            type='battle',
+            decay=0.99,
+            min_win_rate_games=8,
+        ),
+        metric=dict(
+            mu=0,
+            sigma=25 / 3,
+            beta=25 / 3 / 2,
+            tau=0.0,
+            draw_probability=0.02,
+        ),
+    )
+
     def __init__(self, cfg: EasyDict) -> None:
         """
         Overview:
@@ -34,16 +71,19 @@ class BaseLeague:
         Arguments:
             - cfg (:obj:`EasyDict`): League config.
         """
-        self.cfg = cfg
+        self.cfg = deep_merge_dicts(self.default_config(), cfg)
         self.path_policy = cfg.path_policy
         if not osp.exists(self.path_policy):
             os.mkdir(self.path_policy)
         self.league_uid = str(uuid.uuid1())
         # TODO dict players
         self.active_players = []
         self.historical_players = []
         self.player_path = "./league"
         self.payoff = create_payoff(self.cfg.payoff)
+        metric_cfg = self.cfg.metric
+        self.metric_env = LeagueMetricEnv(metric_cfg.mu, metric_cfg.sigma, metric_cfg.tau, metric_cfg.draw_probability)
         self._active_players_lock = LockContext(type_=LockContextType.THREAD_LOCK)
         self._init_players()
@@ -58,7 +98,9 @@ class BaseLeague:
             for i in range(n):  # This type's active player number
                 name = '{}_{}_{}'.format(k, cate, i)
                 ckpt_path = osp.join(self.path_policy, '{}_ckpt.pth'.format(name))
-                player = create_player(self.cfg, k, self.cfg[k], cate, self.payoff, ckpt_path, name, 0)
+                player = create_player(
+                    self.cfg, k, self.cfg[k], cate, self.payoff, ckpt_path, name, 0, self.metric_env.create_rating()
+                )
                 if self.cfg.use_pretrain:
                     self.save_checkpoint(self.cfg.pretrain_checkpoint_path[cate], ckpt_path)
                 self.active_players.append(player)
@@ -79,6 +121,7 @@ class BaseLeague:
                 self.cfg.pretrain_checkpoint_path[cate],
                 name,
                 0,
+                self.metric_env.create_rating(),
                 parent_id=parent_name
             )
             self.historical_players.append(hp)
@@ -140,7 +183,7 @@ class BaseLeague:
             player = self.active_players[idx]
             if force or player.is_trained_enough():
                 # Snapshot
-                hp = player.snapshot()
+                hp = player.snapshot(self.metric_env)
                 self.save_checkpoint(player.checkpoint_path, hp.checkpoint_path)
                 self.historical_players.append(hp)
                 self.payoff.add_player(hp)
@@ -197,6 +240,21 @@ class BaseLeague:
         """
         # TODO(nyz) more fine-grained job info
         self.payoff.update(job_info)
+        if 'eval_flag' in job_info and job_info['eval_flag']:
+            home_id, away_id = job_info['player_id']
+            home_player, away_player = self.get_player_by_id(home_id), self.get_player_by_id(away_id)
+            job_info_result = job_info['result']
+            if isinstance(job_info_result[0], list):
+                job_info_result = sum(job_info_result, [])
+            home_player.rating, away_player.rating = self.metric_env.rate_1vs1(
+                home_player.rating, away_player.rating, result=job_info_result
+            )
+
+    def get_player_by_id(self, player_id: str) -> 'Player':  # noqa
+        if 'historical' in player_id:
+            return [p for p in self.historical_players if p.player_id == player_id][0]
+        else:
+            return [p for p in self.active_players if p.player_id == player_id][0]

     @staticmethod
     def save_checkpoint(src_checkpoint, dst_checkpoint) -> None:
```
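Since `BaseLeague` now merges its class-level `config` with the user-supplied `cfg` via `deep_merge_dicts`, callers only need to specify the keys they want to change. A minimal sketch of that merge behavior (the toy `merge` function below is illustrative, assuming `ding.utils.deep_merge_dicts` performs a recursive dict merge with user values taking precedence; it is not the actual implementation):

```python
def merge(default: dict, user: dict) -> dict:
    # Recursively overlay user values onto defaults; user wins on conflicts.
    out = dict(default)
    for k, v in user.items():
        if isinstance(v, dict) and isinstance(out.get(k), dict):
            out[k] = merge(out[k], v)
        else:
            out[k] = v
    return out


default_cfg = {'payoff': {'type': 'battle', 'decay': 0.99}, 'metric': {'mu': 0, 'sigma': 25 / 3}}
user_cfg = {'metric': {'mu': 5}}  # only override what you need
merged = merge(default_cfg, user_cfg)
assert merged['metric'] == {'mu': 5, 'sigma': 25 / 3}  # override applied, sibling default kept
assert merged['payoff'] == {'type': 'battle', 'decay': 0.99}  # untouched subtree preserved
```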
ding/league/metric.py (new file, 0 → 100644)

```python
from typing import Tuple, Union, List
import math
import numpy as np
from trueskill import TrueSkill, Rating, rate_1vs1


class EloCalculator(object):
    score = {
        1: 1.0,  # win
        0: 0.5,  # draw
        -1: 0.0,  # lose
    }

    @classmethod
    def get_new_rating(cls,
                       rating_a: int,
                       rating_b: int,
                       result: int,
                       k_factor: int = 32,
                       beta: int = 200) -> Tuple[int, int]:
        assert result in [1, 0, -1]
        expect_a = 1. / (1. + math.pow(10, (rating_b - rating_a) / (2. * beta)))
        expect_b = 1. / (1. + math.pow(10, (rating_a - rating_b) / (2. * beta)))
        new_rating_a = rating_a + k_factor * (EloCalculator.score[result] - expect_a)
        new_rating_b = rating_b + k_factor * (1 - EloCalculator.score[result] - expect_b)
        return round(new_rating_a), round(new_rating_b)

    @classmethod
    def get_new_rating_array(
            cls,
            rating: np.ndarray,
            result: np.ndarray,
            game_count: np.ndarray,
            k_factor: int = 32,
            beta: int = 200
    ) -> np.ndarray:
        """
        Shapes:
            rating: :math:`(N, )`, N is the number of player
            result: :math:`(N, N)`
            game_count: :math:`(N, N)`
        """
        rating_diff = np.expand_dims(rating, 0) - np.expand_dims(rating, 1)
        expect = 1. / (1. + np.power(10, rating_diff / (2. * beta))) * game_count
        delta = ((result + 1.) / 2 - expect) * (game_count > 0)
        delta = delta.sum(axis=1)
        return np.round(rating + k_factor * delta).astype(np.int64)


class PlayerRating(Rating):

    def __init__(self, mu: float = None, sigma: float = None, elo_init: int = None) -> None:
        super(PlayerRating, self).__init__(mu, sigma)
        self.elo = elo_init

    def __repr__(self) -> str:
        c = type(self)
        args = ('.'.join([c.__module__, c.__name__]), self.mu, self.sigma, self.exposure, self.elo)
        return '%s(mu=%.3f, sigma=%.3f, exposure=%.3f, elo=%d)' % args


class LeagueMetricEnv(TrueSkill):
    """
    Overview:
        TrueSkill rating system among game players, for more details please refer to ``https://trueskill.org/``
    """

    def __init__(self, *args, elo_init: int = 1200, **kwargs) -> None:
        super(LeagueMetricEnv, self).__init__(*args, **kwargs)
        self.elo_init = elo_init

    def create_rating(self, mu: float = None, sigma: float = None, elo_init: int = None) -> PlayerRating:
        if mu is None:
            mu = self.mu
        if sigma is None:
            sigma = self.sigma
        if elo_init is None:
            elo_init = self.elo_init
        return PlayerRating(mu, sigma, elo_init)

    @staticmethod
    def _rate_1vs1(t1, t2, **kwargs):
        t1_elo, t2_elo = t1.elo, t2.elo
        t1, t2 = rate_1vs1(t1, t2, **kwargs)
        if 'drawn' in kwargs:
            result = 0
        else:
            result = 1
        t1_elo, t2_elo = EloCalculator.get_new_rating(t1_elo, t2_elo, result)
        t1 = PlayerRating(t1.mu, t1.sigma, t1_elo)
        t2 = PlayerRating(t2.mu, t2.sigma, t2_elo)
        return t1, t2

    def rate_1vs1(self, team1: PlayerRating, team2: PlayerRating, result: List[str] = None,
                  **kwargs) -> Tuple[PlayerRating, PlayerRating]:
        if result is None:
            return self._rate_1vs1(team1, team2, **kwargs)
        else:
            for r in result:
                if r == 'wins':
                    team1, team2 = self._rate_1vs1(team1, team2)
                elif r == 'draws':
                    team1, team2 = self._rate_1vs1(team1, team2, drawn=True)
                elif r == 'losses':
                    team2, team1 = self._rate_1vs1(team2, team1)
                else:
                    raise RuntimeError("invalid result: {}".format(r))
            return team1, team2

    def rate_1vsC(self, team1: PlayerRating, team2: PlayerRating, result: List[str]) -> PlayerRating:
        for r in result:
            if r == 'wins':
                team1, _ = self._rate_1vs1(team1, team2)
            elif r == 'draws':
                team1, _ = self._rate_1vs1(team1, team2, drawn=True)
            elif r == 'losses':
                _, team1 = self._rate_1vs1(team2, team1)
            else:
                raise RuntimeError("invalid result: {}".format(r))
        return team1


get_elo = EloCalculator.get_new_rating
get_elo_array = EloCalculator.get_new_rating_array
```
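For reference, `EloCalculator` implements the standard Elo update: player A's expected score is `1 / (1 + 10 ** ((rating_b - rating_a) / (2 * beta)))`, and the new rating is `rating_a + k_factor * (score - expect_a)`. A short sketch that reproduces the numbers asserted in the new unit test further down in this diff (assuming this version of DI-engine is installed, so that `ding.league` exports `get_elo`):

```python
from ding.league import get_elo

# Player A (Elo 1613) loses to player B (Elo 1573); result is from A's perspective.
new_a, new_b = get_elo(1613, 1573, -1)  # defaults: k_factor=32, beta=200
print(new_a, new_b)  # 1595 1591: A drops ~18 points, B gains ~18
```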
ding/league/one_vs_one_league.py

```diff
@@ -53,6 +53,13 @@ class OneVsOneLeague(BaseLeague):
             decay=0.99,
             min_win_rate_games=8,
         ),
+        metric=dict(
+            mu=0,
+            sigma=25 / 3,
+            beta=25 / 3 / 2,
+            tau=0.0,
+            draw_probability=0.02,
+        ),
     )

     # override
```
ding/league/player.py

```diff
@@ -25,7 +25,8 @@ class Player:
             init_payoff: 'BattleSharedPayoff',  # noqa
             checkpoint_path: str,
             player_id: str,
-            total_agent_step: int
+            total_agent_step: int,
+            rating: 'PlayerRating',  # noqa
     ) -> None:
         """
         Overview:
@@ -39,6 +40,7 @@ class Player:
             - player_id (:obj:`str`): Player id in string format.
             - total_agent_step (:obj:`int`): For active player, it should be 0; \
                 For historical player, it should be parent player's ``_total_agent_step`` when ``snapshot``.
+            - rating (:obj:`PlayerRating`): player rating information in total league
         """
         self._cfg = cfg
         self._category = category
@@ -48,6 +50,7 @@ class Player:
         self._player_id = player_id
         assert isinstance(total_agent_step, int), (total_agent_step, type(total_agent_step))
         self._total_agent_step = total_agent_step
+        self._rating = rating

     @property
     def category(self) -> str:
@@ -73,6 +76,14 @@ class Player:
     def total_agent_step(self, step: int) -> None:
         self._total_agent_step = step

+    @property
+    def rating(self) -> 'PlayerRating':  # noqa
+        return self._rating
+
+    @rating.setter
+    def rating(self, _rating: 'PlayerRating') -> None:  # noqa
+        self._rating = _rating
+

 @PLAYER_REGISTRY.register('historical_player')
 class HistoricalPlayer(Player):
@@ -168,10 +179,12 @@ class ActivePlayer(Player):
         else:
             return False

-    def snapshot(self) -> HistoricalPlayer:
+    def snapshot(self, metric_env: 'LeagueMetricEnv') -> HistoricalPlayer:  # noqa
         """
         Overview:
             Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+        Argument:
+            - metric_env (:obj:`LeagueMetricEnv`): player rating environment, one league one env
         Returns:
             - snapshot_player (:obj:`HistoricalPlayer`): new instantiated historical player
@@ -187,6 +200,7 @@ class ActivePlayer(Player):
             path,
             self.player_id + '_{}_historical'.format(int(self._total_agent_step)),
             self._total_agent_step,
+            metric_env.create_rating(mu=self.rating.mu),
             parent_id=self.player_id
         )
```
ding/league/starcraft_player.py

```diff
-from typing import Optional
+from typing import Optional, Union
 import numpy as np

 from ding.utils import PLAYER_REGISTRY
@@ -105,7 +105,7 @@ class MainPlayer(ActivePlayer):
         Overview:
             MainPlayer does not mutate
         """
-        return None
+        pass


 @PLAYER_REGISTRY.register('main_exploiter')
@@ -168,9 +168,9 @@ class MainExploiter(ActivePlayer):
         Overview:
             Main exploiter is sure to mutate(reset) to the supervised learning player
         Returns:
-            - ckpt_path (:obj:`str`): the pretrained model's ckpt path
+            - mutate_ckpt_path (:obj:`str`): mutation target checkpoint path
         """
-        return info['pretrain_checkpoint_path']
+        return info['reset_checkpoint_path']


 @PLAYER_REGISTRY.register('league_exploiter')
@@ -220,15 +220,15 @@ class LeagueExploiter(ActivePlayer):
         return super().is_trained_enough(select_fn=lambda p: isinstance(p, HistoricalPlayer))

     # override
-    def mutate(self, info) -> Optional[str]:
+    def mutate(self, info) -> Union[str, None]:
         """
         Overview:
             League exploiter can mutate to the supervised learning player with 0.25 prob
         Returns:
-            - ckpt_path (:obj:`str`): with ``mutate_prob`` prob returns the pretrained model's ckpt path, \
+            - ckpt_path (:obj:`Union[str, None]`): with ``mutate_prob`` prob returns the pretrained model's ckpt path, \
                 with left 1 - ``mutate_prob`` prob returns None, which means no mutation
         """
         p = np.random.uniform()
         if p < self.mutate_prob:
-            return info['pretrain_checkpoint_path']
+            return info['reset_checkpoint_path']
         return None
```
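The league exploiter's mutation decision is a single Bernoulli draw: with probability `mutate_prob` (0.25 per the docstring above; the demo config later in this diff raises it to 0.5) it returns the reset checkpoint path, otherwise `None`. A standalone sketch of that logic (the `info` dict and path here are hypothetical):

```python
import numpy as np


def mutate_decision(info: dict, mutate_prob: float = 0.25):
    # With probability mutate_prob, reset to the caller-provided checkpoint;
    # otherwise return None, which the league interprets as "no mutation".
    if np.random.uniform() < mutate_prob:
        return info['reset_checkpoint_path']
    return None


info = {'reset_checkpoint_path': 'reset_ckpt.pth'}  # hypothetical path
print(mutate_decision(info, mutate_prob=0.5))  # 'reset_ckpt.pth' or None
```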
ding/league/tests/test_league_metric.py (new file, 0 → 100644)

```python
import pytest
import numpy as np
from ding.league import get_elo, get_elo_array, LeagueMetricEnv


@pytest.mark.unittest
def test_elo_calculator():
    game_count = np.array([[0, 1, 2], [1, 0, 0], [2, 0, 0]])
    rating = np.array([1613, 1573, 1601])
    result = np.array([[0, -1, -1 + 1], [1, 0, 0], [1 + (-1), 0, 0]])
    new_rating0, new_rating1 = get_elo(rating[0], rating[1], result[0][1])
    assert new_rating0 == 1595
    assert new_rating1 == 1591

    old_rating = np.copy(rating)
    new_rating = get_elo_array(rating, result, game_count)
    assert (rating == old_rating).all()  # no inplace modification
    assert new_rating.dtype == np.int64
    assert new_rating[0] == 1578
    assert new_rating[1] == 1591
    assert new_rating[2] == 1586


@pytest.mark.unittest
def test_league_metric():
    sigma = 25 / 3
    env = LeagueMetricEnv(mu=0, sigma=sigma, beta=sigma / 2, tau=0.0, draw_probability=0.02, elo_init=1000)
    r1 = env.create_rating(elo_init=1613)
    r2 = env.create_rating(elo_init=1573)
    assert r1.mu == 0
    assert r2.mu == 0
    assert r1.sigma == sigma
    assert r2.sigma == sigma
    assert r1.elo == 1613
    assert r2.elo == 1573
    # r1 draw r2
    r1, r2 = env.rate_1vs1(r1, r2, drawn=True)
    assert r1.mu == r2.mu
    assert r1.elo == 1611
    assert r2.elo == 1575
    # r1 win r2
    new_r1, new_r2 = env.rate_1vs1(r1, r2)
    assert new_r1.mu > r1.mu
    assert new_r2.mu < r2.mu
    assert new_r1.mu + new_r2.mu == 0
    assert pytest.approx(new_r1.mu, 3.230)
    assert pytest.approx(new_r2.mu, -3.230)
    assert new_r1.elo == 1625
    assert new_r2.elo == 1561
    # multi result
    new_r1, new_r2 = env.rate_1vs1(r1, r2, result=['wins', 'wins', 'losses'])
    assert new_r1.elo > 1611
    # 1vsConstant
    new_r1 = env.rate_1vsC(r1, env.create_rating(elo_init=1800), result=['losses', 'losses'])
    assert new_r1.elo < 1611
    print('final rating is: ', new_r1)
```
ding/league/tests/test_payoff.py

```diff
@@ -8,6 +8,9 @@ from easydict import EasyDict
 from ding.league.player import Player
 from ding.league.shared_payoff import BattleRecordDict, create_payoff
+from ding.league.metric import LeagueMetricEnv
+
+env = LeagueMetricEnv()


 @pytest.mark.unittest
@@ -42,7 +45,8 @@ def get_shared_payoff_player(payoff):
         init_payoff=payoff,
         checkpoint_path='sp_ckpt_{}.pth'.format(sp_player_count),
         player_id='sp_player_{}'.format(sp_player_count),
-        total_agent_step=0
+        total_agent_step=0,
+        rating=env.create_rating(),
     )
     sp_player_count += 1
     return player
```
ding/league/tests/test_player.py

```diff
@@ -8,8 +8,10 @@ from ding.league.player import Player, HistoricalPlayer, ActivePlayer, create_pl
 from ding.league.shared_payoff import create_payoff
 from ding.league.starcraft_player import MainPlayer, MainExploiter, LeagueExploiter
 from ding.league.tests.league_test_default_config import league_test_config
+from ding.league.metric import LeagueMetricEnv

 ONE_PHASE_STEP = 2000
+env = LeagueMetricEnv()


 @pytest.fixture(scope='function')
@@ -27,7 +29,7 @@ def setup_league(setup_payoff):
         players.append(
             create_player(
                 league_test_config.league, 'main_player', league_test_config.league.main_player, category, setup_payoff,
-                'ckpt_{}.pth'.format(main_player_name), main_player_name, 0
+                'ckpt_{}.pth'.format(main_player_name), main_player_name, 0, env.create_rating()
             )
         )
         # main_exploiter
@@ -35,7 +37,7 @@ def setup_league(setup_payoff):
         players.append(
             create_player(
                 league_test_config.league, 'main_exploiter', league_test_config.league.main_exploiter, category,
-                setup_payoff, 'ckpt_{}.pth'.format(main_exploiter_name), main_exploiter_name, 0
+                setup_payoff, 'ckpt_{}.pth'.format(main_exploiter_name), main_exploiter_name, 0, env.create_rating()
             )
         )
         # league_exploiter
@@ -51,6 +53,7 @@ def setup_league(setup_payoff):
                 'ckpt_{}.pth'.format(league_exploiter_name),
                 league_exploiter_name,
                 0,
+                env.create_rating(),
             )
         )
         # historical player: sl player is used as initial HistoricalPlayer
@@ -65,7 +68,8 @@ def setup_league(setup_payoff):
                 'ckpt_sl_{}'.format(sl_hp_name),
                 sl_hp_name,
                 0,
-                parent_id=main_player_name
+                env.create_rating(),
+                parent_id=main_player_name,
             )
         )
     for p in players:
@@ -94,7 +98,7 @@ class TestMainPlayer:
         for p in setup_league:
             if isinstance(p, ActivePlayer):
                 p.total_agent_step = 2 * ONE_PHASE_STEP
-                hp = p.snapshot()
+                hp = p.snapshot(env)
                 hp_list.append(hp)
                 setup_payoff.add_player(hp)
         setup_league += hp_list
         # 12+3 + 12
@@ -122,7 +126,7 @@ class TestMainPlayer:
         for p in setup_league:
             for i in range(N):
                 if isinstance(p, ActivePlayer):
-                    hp = p.snapshot()
+                    hp = p.snapshot(env)
                     assert isinstance(hp, HistoricalPlayer)
                     assert id(hp.payoff) == id(p.payoff)
                     assert hp.parent_id == p.player_id
@@ -146,7 +150,7 @@ class TestMainPlayer:
         hp_list = []
         for p in setup_league:
             if isinstance(p, MainPlayer):
-                hp = p.snapshot()
+                hp = p.snapshot(env)
                 setup_payoff.add_player(hp)
                 hp_list.append(hp)
         setup_league += hp_list
@@ -243,7 +247,7 @@ class TestMainExploiter:
             for p in setup_league:
                 if isinstance(p, MainPlayer):
                     p.total_agent_step = (i + 1) * 2 * ONE_PHASE_STEP
-                    hp = p.snapshot()
+                    hp = p.snapshot(env)
                     setup_payoff.add_player(hp)
                     hp_list.append(hp)
             setup_league += hp_list
@@ -272,9 +276,9 @@ class TestMainExploiter:
     def test_mutate(self, setup_league):
         assert isinstance(setup_league[1], MainExploiter)
-        info = {'pretrain_checkpoint_path': 'pretrain_checkpoint.pth'}
+        info = {'reset_checkpoint_path': 'pretrain_checkpoint.pth'}
         for _ in range(10):
-            assert setup_league[1].mutate(info) == info['pretrain_checkpoint_path']
+            assert setup_league[1].mutate(info) == info['reset_checkpoint_path']


 @pytest.mark.unittest
@@ -296,7 +300,7 @@ class TestLeagueExploiter:
     def test_mutate(self, setup_league):
         assert isinstance(setup_league[2], LeagueExploiter)
-        info = {'pretrain_checkpoint_path': 'pretrain_checkpoint.pth'}
+        info = {'reset_checkpoint_path': 'pretrain_checkpoint.pth'}
         results = []
         for _ in range(1000):
             results.append(setup_league[2].mutate(info))
```
ding/worker/collector/one_vs_one_serial_evaluator.py

```diff
@@ -171,7 +171,7 @@ class OnevOneEvaluator(object):
             train_iter: int = -1,
             envstep: int = -1,
             n_episode: Optional[int] = None
-    ) -> Tuple[bool, float]:
+    ) -> Tuple[bool, float, list]:
         '''
         Overview:
             Evaluate policy and store the best policy based on whether it reaches the highest historical reward.
@@ -183,12 +183,14 @@ class OnevOneEvaluator(object):
         Returns:
             - stop_flag (:obj:`bool`): Whether this training program can be ended.
             - eval_reward (:obj:`float`): Current eval_reward.
+            - return_info (:obj:`list`): Environment information of each finished episode
         '''
         if n_episode is None:
             n_episode = self._default_n_episode
         assert n_episode is not None, "please indicate eval n_episode"
         envstep_count = 0
         info = {}
+        return_info = [[] for _ in range(2)]
         eval_monitor = VectorEvalMonitor(self._env.env_num, n_episode)
         self._env.reset()
         for p in self._policy:
@@ -219,6 +221,8 @@ class OnevOneEvaluator(object):
                     if 'episode_info' in t.info[0]:
                         eval_monitor.update_info(env_id, t.info[0]['episode_info'])
                     eval_monitor.update_reward(env_id, reward)
+                    for policy_id in range(2):
+                        return_info[policy_id].append(t.info[policy_id])
                     self._logger.info(
                         "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}".format(
                             env_id, eval_monitor.get_latest_reward(env_id), eval_monitor.get_current_episode()
@@ -266,7 +270,7 @@ class OnevOneEvaluator(object):
                 "Current eval_reward: {} is greater than stop_value: {}".format(eval_reward, self._stop_value) +
                 ", so your RL agent is converged, you can refer to 'log/evaluator/evaluator_logger.txt' for details."
             )
-        return stop_flag, eval_reward
+        return stop_flag, eval_reward, return_info


 class VectorEvalMonitor(object):
```
dizoo/league_demo/demo_league.py

```diff
+import os
+import shutil
 from easydict import EasyDict
 from ding.league import BaseLeague, ActivePlayer


 class DemoLeague(BaseLeague):

+    def __init__(self, cfg):
+        super(DemoLeague, self).__init__(cfg)
+        self.reset_checkpoint_path = os.path.join(self.path_policy, 'reset_ckpt.pth')
+
     # override
     def _get_job_info(self, player: ActivePlayer, eval_flag: bool = False) -> dict:
         assert isinstance(player, ActivePlayer), player.__class__
@@ -18,7 +24,12 @@ class DemoLeague(BaseLeague):
     # override
     def _mutate_player(self, player: ActivePlayer):
-        pass
+        for p in self.active_players:
+            result = p.mutate({'reset_checkpoint_path': self.reset_checkpoint_path})
+            if result is not None:
+                p.rating = self.metric_env.create_rating()
+                self.load_checkpoint(p.player_id, result)  # load_checkpoint is set by the caller of league
+                self.save_checkpoint(result, p.checkpoint_path)

     # override
     def _update_player(self, player: ActivePlayer, player_info: dict) -> None:
```
dizoo/league_demo/game_env.py

```diff
@@ -7,6 +7,10 @@ class GameEnv(BaseEnv):
     def __init__(self, game_type='prisoner_dilemma'):
         self.game_type = game_type
         assert self.game_type in ['zero_sum', 'prisoner_dilemma']
+        if self.game_type == 'prisoner_dilemma':
+            self.optimal_policy = [0, 1]
+        elif self.game_type == 'zero_sum':
+            self.optimal_policy = [0.375, 0.625]

     def seed(self, seed, dynamic_seed=False):
         pass
@@ -36,10 +40,10 @@ class GameEnv(BaseEnv):
                 results = "draws", "draws"
             elif actions == [0, 1]:
                 rewards = -20, 0
-                results = "wins", "losses"
+                results = "losses", "wins"
             elif actions == [1, 0]:
                 rewards = 0, -20
-                results = "losses", "wins"
+                results = "wins", "losses"
             elif actions == [1, 1]:
                 rewards = -10, -10
                 results = 'draws', 'draws'
```
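For reference, the branches visible in this hunk give the following action-to-outcome mapping after the fix (the `[0, 0]` branch's rewards sit outside the hunk and are omitted here); the result labels now agree with the rewards, i.e. the player receiving 0 instead of -20 is the winner:

```python
# (home_action, away_action) -> ((home_reward, away_reward), (home_result, away_result))
outcomes = {
    (0, 1): ((-20, 0), ('losses', 'wins')),
    (1, 0): ((0, -20), ('wins', 'losses')),
    (1, 1): ((-10, -10), ('draws', 'draws')),
}
```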
dizoo/league_demo/league_demo_ppo_config.py

```diff
@@ -6,7 +6,8 @@ league_demo_ppo_config = dict(
         collector_env_num=8,
         evaluator_env_num=10,
         n_evaluator_episode=100,
-        stop_value=[-0.01, -5],
+        env_type='prisoner_dilemma',  # ['zero_sum', 'prisoner_dilemma']
+        stop_value=[-10.1, -5.05],  # prisoner_dilemma
         manager=dict(shared_memory=False, ),
     ),
     policy=dict(
@@ -58,7 +59,7 @@ league_demo_ppo_config = dict(
             one_phase_step=200,
             branch_probs=dict(pfsp=1.0, ),
             strong_win_rate=0.7,
-            mutate_prob=0.0,
+            mutate_prob=0.5,
         ),
         use_pretrain=False,
         use_pretrain_init_historical=False,
@@ -66,7 +67,14 @@ league_demo_ppo_config = dict(
             type='battle',
             decay=0.99,
             min_win_rate_games=8,
-        )
+        ),
+        metric=dict(
+            mu=0,
+            sigma=25 / 3,
+            beta=25 / 3 / 2,
+            tau=0.0,
+            draw_probability=0.02,
+        ),
     ),
 ),
```
dizoo/league_demo/league_demo_ppo_main.py

```diff
@@ -18,8 +18,17 @@ from dizoo.league_demo.league_demo_ppo_config import league_demo_ppo_config
 class EvalPolicy1:

+    def __init__(self, optimal_policy: list) -> None:
+        assert len(optimal_policy) == 2
+        self.optimal_policy = optimal_policy
+
     def forward(self, data: dict) -> dict:
-        return {env_id: {'action': torch.zeros(1)} for env_id in data.keys()}
+        return {
+            env_id: {'action': torch.from_numpy(np.random.choice([0, 1], p=self.optimal_policy, size=(1, )))}
+            for env_id in data.keys()
+        }

     def reset(self, data_id: list = []) -> None:
         pass
@@ -50,17 +59,26 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
         NaiveReplayBuffer,
         save_cfg=True
     )
+    env_type = cfg.env.env_type
     collector_env_num, evaluator_env_num = cfg.env.collector_env_num, cfg.env.evaluator_env_num
-    evaluator_env1 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager)
-    evaluator_env2 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager)
+    evaluator_env1 = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager
+    )
+    evaluator_env2 = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager
+    )
+    evaluator_env3 = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager
+    )

     evaluator_env1.seed(seed, dynamic_seed=False)
     evaluator_env2.seed(seed, dynamic_seed=False)
+    evaluator_env3.seed(seed, dynamic_seed=False)
     set_pkg_seed(seed, use_cuda=cfg.policy.cuda)

     tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial'))
     league = DemoLeague(cfg.policy.other.league)
-    eval_policy1 = EvalPolicy1()
+    eval_policy1 = EvalPolicy1(evaluator_env1._env_ref.optimal_policy)
     eval_policy2 = EvalPolicy2()
     policies = {}
     learners = {}
@@ -70,7 +88,9 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
             model = VAC(**cfg.policy.model)
             policy = PPOPolicy(cfg.policy, model=model)
             policies[player_id] = policy
-            collector_env = BaseEnvManager(env_fn=[GameEnv for _ in range(collector_env_num)], cfg=cfg.env.manager)
+            collector_env = BaseEnvManager(
+                env_fn=[lambda: GameEnv(env_type) for _ in range(collector_env_num)], cfg=cfg.env.manager
+            )
             collector_env.seed(seed)

             learners[player_id] = BaseLearner(
@@ -90,8 +110,11 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
     model = VAC(**cfg.policy.model)
     policy = PPOPolicy(cfg.policy, model=model)
     policies['historical'] = policy
+    # use initial policy as another eval_policy
+    eval_policy3 = PPOPolicy(cfg.policy, model=copy.deepcopy(model)).collect_mode

     main_key = [k for k in learners.keys() if k.startswith('main_player')][0]
+    main_player = league.get_player_by_id(main_key)
     main_learner = learners[main_key]
     main_collector = collectors[main_key]
     # collect_mode ppo use multimonial sample for selecting action
@@ -113,25 +136,68 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
         exp_name=cfg.exp_name,
         instance_name='uniform_evaluator'
     )
+    evaluator3_cfg = copy.deepcopy(cfg.policy.eval.evaluator)
+    evaluator3_cfg.stop_value = 99999999  # stop_value of evaluator3 is a placeholder
+    evaluator3 = OnevOneEvaluator(
+        evaluator3_cfg, evaluator_env3, [policies[main_key].collect_mode, eval_policy3],
+        tb_logger,
+        exp_name=cfg.exp_name,
+        instance_name='init_evaluator'
+    )

     def load_checkpoint_fn(player_id: str, ckpt_path: str):
         state_dict = torch.load(ckpt_path)
         policies[player_id].learn_mode.load_state_dict(state_dict)

+    torch.save(policies['historical'].learn_mode.state_dict(), league.reset_checkpoint_path)
     league.load_checkpoint = load_checkpoint_fn
     for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts):
         torch.save(policies[player_id].collect_mode.state_dict(), player_ckpt_path)
         league.judge_snapshot(player_id, force=True)
+    init_main_player_rating = league.metric_env.create_rating(mu=0)

     for run_iter in range(max_iterations):
         if evaluator1.should_eval(main_learner.train_iter):
-            stop_flag1, reward = evaluator1.eval(
+            stop_flag1, reward, episode_info = evaluator1.eval(
                 main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
             )
+            win_loss_result = [e['result'] for e in episode_info[0]]
+            # set fixed NE policy trueskill(exposure) equal 10
+            main_player.rating = league.metric_env.rate_1vsC(
+                main_player.rating, league.metric_env.create_rating(mu=10, sigma=1e-8), win_loss_result
+            )
             tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, main_collector.envstep)
         if evaluator2.should_eval(main_learner.train_iter):
-            stop_flag2, reward = evaluator2.eval(
+            stop_flag2, reward, episode_info = evaluator2.eval(
                 main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
             )
+            win_loss_result = [e['result'] for e in episode_info[0]]
+            # set random(uniform) policy trueskill(exposure) equal 0
+            main_player.rating = league.metric_env.rate_1vsC(
+                main_player.rating, league.metric_env.create_rating(mu=0, sigma=1e-8), win_loss_result
+            )
             tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, main_collector.envstep)
+        if evaluator3.should_eval(main_learner.train_iter):
+            _, reward, episode_info = evaluator3.eval(
+                main_learner.save_checkpoint, main_learner.train_iter, main_collector.envstep
+            )
+            win_loss_result = [e['result'] for e in episode_info[0]]
+            # use init main player as another evaluator metric
+            main_player.rating, init_main_player_rating = league.metric_env.rate_1vs1(
+                main_player.rating, init_main_player_rating, win_loss_result
+            )
+            tb_logger.add_scalar('init_evaluator_step/reward_mean', reward, main_collector.envstep)
+            tb_logger.add_scalar(
+                'league/init_main_player_trueskill', init_main_player_rating.exposure, main_collector.envstep
+            )
         if stop_flag1 and stop_flag2:
             break
         for player_id, player_ckpt_path in zip(league.active_players_ids, league.active_players_ckpts):
+            tb_logger.add_scalar(
+                'league/{}_trueskill'.format(player_id),
+                league.get_player_by_id(player_id).rating.exposure, main_collector.envstep
+            )
             collector, learner = collectors[player_id], learners[player_id]
             job = league.get_job_info(player_id)
             opponent_player_id = job['player_id'][1]
@@ -144,7 +210,7 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
             opponent_policy = policies[opponent_player_id].collect_mode
             collector.reset_policy([policies[player_id].collect_mode, opponent_policy])
             train_data, episode_info = collector.collect(train_iter=learner.train_iter)
-            train_data, episode_info = train_data[0], episode_info[0]  # only use launer player data for training
+            train_data, episode_info = train_data[0], episode_info[0]  # only use launch player data for training
             for d in train_data:
                 d['adv'] = d['reward']
@@ -156,7 +222,9 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
             player_info['player_id'] = player_id
             league.update_active_player(player_info)
             league.judge_snapshot(player_id)
+            # set eval_flag=True to enable trueskill update
             job_finish_info = {
+                'eval_flag': True,
                 'launch_player': job['launch_player'],
                 'player_id': job['player_id'],
                 'result': [e['result'] for e in episode_info],
```
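The evaluation loop above anchors the TrueSkill scale by rating the main player against fixed reference opponents whose sigma is nearly zero (mu=10 for the fixed NE policy, mu=0 for the uniform policy), so the reference skill barely moves while the main player's rating is calibrated against it. A small self-contained sketch of that anchoring trick using the `LeagueMetricEnv` added in this commit (assumes `trueskill` and this version of DI-engine are installed):

```python
from ding.league.metric import LeagueMetricEnv

sigma = 25 / 3
env = LeagueMetricEnv(mu=0, sigma=sigma, beta=sigma / 2, tau=0.0, draw_probability=0.02)
player = env.create_rating()
# Reference opponent pinned at mu=10 with a tiny sigma: rating against it
# moves the player's mu toward the anchor without disturbing the anchor itself.
anchor = env.create_rating(mu=10, sigma=1e-8)
player = env.rate_1vsC(player, anchor, result=['wins', 'wins', 'losses'])
print(player)  # PlayerRating(mu=..., sigma=..., exposure=..., elo=...)
```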
dizoo/league_demo/selfplay_demo_ppo_main.py

```diff
@@ -50,10 +50,17 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
         NaiveReplayBuffer,
         save_cfg=True
     )
+    env_type = cfg.env.env_type
     collector_env_num, evaluator_env_num = cfg.env.collector_env_num, cfg.env.evaluator_env_num
-    collector_env = BaseEnvManager(env_fn=[GameEnv for _ in range(collector_env_num)], cfg=cfg.env.manager)
-    evaluator_env1 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager)
-    evaluator_env2 = BaseEnvManager(env_fn=[GameEnv for _ in range(evaluator_env_num)], cfg=cfg.env.manager)
+    collector_env = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(collector_env_num)], cfg=cfg.env.manager
+    )
+    evaluator_env1 = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager
+    )
+    evaluator_env2 = BaseEnvManager(
+        env_fn=[lambda: GameEnv(env_type) for _ in range(evaluator_env_num)], cfg=cfg.env.manager
+    )

     collector_env.seed(seed)
     evaluator_env1.seed(seed, dynamic_seed=False)
@@ -102,10 +109,10 @@ def main(cfg, seed=0, max_iterations=int(1e10)):
     for _ in range(max_iterations):
         if evaluator1.should_eval(learner1.train_iter):
-            stop_flag1, reward = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
+            stop_flag1, reward, _ = evaluator1.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
             tb_logger.add_scalar('fixed_evaluator_step/reward_mean', reward, collector.envstep)
         if evaluator2.should_eval(learner1.train_iter):
-            stop_flag2, reward = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
+            stop_flag2, reward, _ = evaluator2.eval(learner1.save_checkpoint, learner1.train_iter, collector.envstep)
             tb_logger.add_scalar('uniform_evaluator_step/reward_mean', reward, collector.envstep)
         if stop_flag1 and stop_flag2:
             break
```
setup.py

```diff
@@ -68,6 +68,7 @@ setup(
         'opencv-python',  # pypy incompatible
         'enum_tools',
         'scipy',
+        'trueskill',
     ],
     extras_require={
         'test': [
```