Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PARL
提交
de21118e
P
PARL
项目概览
PaddlePaddle
/
PARL
通知
67
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
18
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PARL
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
18
Issue
18
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
de21118e
编写于
4月 30, 2020
作者:
L
LI Yunxiang
提交者:
GitHub
4月 30, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
state to obs (#256)
* state to obs * yapf & update softlink in offline-q-learning
上级
6fa2d081
变更
14
显示空白变更内容
内联
并排
Showing
14 changed file
with
160 addition
and
164 deletion
+160
-164
benchmark/torch/dqn/replay_memory.py
benchmark/torch/dqn/replay_memory.py
+23
-24
benchmark/torch/dqn/train.py
benchmark/torch/dqn/train.py
+28
-28
examples/DQN/cartpole_agent.py
examples/DQN/cartpole_agent.py
+6
-6
examples/DQN/replay_memory.py
examples/DQN/replay_memory.py
+5
-5
examples/DQN/train.py
examples/DQN/train.py
+13
-13
examples/DQN_variant/replay_memory.py
examples/DQN_variant/replay_memory.py
+23
-24
examples/DQN_variant/train.py
examples/DQN_variant/train.py
+14
-14
examples/offline-Q-learning/atari.py
examples/offline-Q-learning/atari.py
+1
-1
examples/offline-Q-learning/atari_wrapper.py
examples/offline-Q-learning/atari_wrapper.py
+1
-1
examples/offline-Q-learning/parallel_run.py
examples/offline-Q-learning/parallel_run.py
+16
-16
examples/offline-Q-learning/replay_memory.py
examples/offline-Q-learning/replay_memory.py
+25
-27
examples/offline-Q-learning/rom_files
examples/offline-Q-learning/rom_files
+1
-1
examples/offline-Q-learning/utils.py
examples/offline-Q-learning/utils.py
+1
-1
parl/algorithms/fluid/sac.py
parl/algorithms/fluid/sac.py
+3
-3
未找到文件。
benchmark/torch/dqn/replay_memory.py
浏览文件 @
de21118e
...
...
@@ -16,16 +16,16 @@ import numpy as np
import
copy
from
collections
import
deque
,
namedtuple
Experience
=
namedtuple
(
'Experience'
,
[
'
state
'
,
'action'
,
'reward'
,
'isOver'
])
Experience
=
namedtuple
(
'Experience'
,
[
'
obs
'
,
'action'
,
'reward'
,
'isOver'
])
class
ReplayMemory
(
object
):
def
__init__
(
self
,
max_size
,
state
_shape
,
context_len
):
def
__init__
(
self
,
max_size
,
obs
_shape
,
context_len
):
self
.
max_size
=
int
(
max_size
)
self
.
state_shape
=
state
_shape
self
.
obs_shape
=
obs
_shape
self
.
context_len
=
int
(
context_len
)
self
.
state
=
np
.
zeros
((
self
.
max_size
,
)
+
state
_shape
,
dtype
=
'uint8'
)
self
.
obs
=
np
.
zeros
((
self
.
max_size
,
)
+
obs
_shape
,
dtype
=
'uint8'
)
self
.
action
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'int32'
)
self
.
reward
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'float32'
)
self
.
isOver
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'bool'
)
...
...
@@ -48,42 +48,41 @@ class ReplayMemory(object):
else
:
self
.
_context
.
append
(
exp
)
def
recent_
state
(
self
):
""" maintain recent
state
for training"""
def
recent_
obs
(
self
):
""" maintain recent
obs
for training"""
lst
=
list
(
self
.
_context
)
states
=
[
np
.
zeros
(
self
.
state
_shape
,
dtype
=
'uint8'
)]
*
\
obs
=
[
np
.
zeros
(
self
.
obs
_shape
,
dtype
=
'uint8'
)]
*
\
(
self
.
_context
.
maxlen
-
len
(
lst
))
states
.
extend
([
k
.
state
for
k
in
lst
])
return
state
s
obs
.
extend
([
k
.
obs
for
k
in
lst
])
return
ob
s
def
sample
(
self
,
idx
):
""" return
state
, action, reward, isOver,
note that some frames in
state
may be generated from last episode,
they should be removed from
state
""" return
obs
, action, reward, isOver,
note that some frames in
obs
may be generated from last episode,
they should be removed from
obs
"""
state
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
state_shape
,
dtype
=
np
.
uint8
)
state_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
obs
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
obs_shape
,
dtype
=
np
.
uint8
)
obs_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
# confirm that no frame was generated from last episode
has_last_episode
=
False
for
k
in
range
(
self
.
context_len
-
2
,
-
1
,
-
1
):
to_check_idx
=
state
_idx
[
k
]
to_check_idx
=
obs
_idx
[
k
]
if
self
.
isOver
[
to_check_idx
]:
has_last_episode
=
True
state_idx
=
state
_idx
[
k
+
1
:]
state
[
k
+
1
:]
=
self
.
state
[
state
_idx
]
obs_idx
=
obs
_idx
[
k
+
1
:]
obs
[
k
+
1
:]
=
self
.
obs
[
obs
_idx
]
break
if
not
has_last_episode
:
state
=
self
.
state
[
state
_idx
]
obs
=
self
.
obs
[
obs
_idx
]
real_idx
=
(
idx
+
self
.
context_len
-
1
)
%
self
.
_curr_size
action
=
self
.
action
[
real_idx
]
reward
=
self
.
reward
[
real_idx
]
isOver
=
self
.
isOver
[
real_idx
]
return
state
,
reward
,
action
,
isOver
return
obs
,
reward
,
action
,
isOver
def
__len__
(
self
):
return
self
.
_curr_size
...
...
@@ -92,7 +91,7 @@ class ReplayMemory(object):
return
self
.
_curr_size
def
_assign
(
self
,
pos
,
exp
):
self
.
state
[
pos
]
=
exp
.
state
self
.
obs
[
pos
]
=
exp
.
obs
self
.
reward
[
pos
]
=
exp
.
reward
self
.
action
[
pos
]
=
exp
.
action
self
.
isOver
[
pos
]
=
exp
.
isOver
...
...
@@ -107,8 +106,8 @@ class ReplayMemory(object):
return
self
.
_process_batch
(
batch_exp
)
def
_process_batch
(
self
,
batch_exp
):
state
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
obs
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
reward
=
np
.
asarray
([
e
[
1
]
for
e
in
batch_exp
],
dtype
=
'float32'
)
action
=
np
.
asarray
([
e
[
2
]
for
e
in
batch_exp
],
dtype
=
'int8'
)
isOver
=
np
.
asarray
([
e
[
3
]
for
e
in
batch_exp
],
dtype
=
'bool'
)
return
[
state
,
action
,
reward
,
isOver
]
return
[
obs
,
action
,
reward
,
isOver
]
benchmark/torch/dqn/train.py
浏览文件 @
de21118e
...
...
@@ -26,7 +26,7 @@ from parl.utils import tensorboard, logger
from
parl.algorithms
import
DQN
,
DDQN
from
agent
import
AtariAgent
from
atari_wrapper
import
FireResetEnv
,
FrameStack
,
LimitLength
,
MapState
from
atari_wrapper
import
FireResetEnv
,
FrameStack
,
LimitLength
from
model
import
AtariModel
from
replay_memory
import
ReplayMemory
,
Experience
from
utils
import
get_player
...
...
@@ -43,57 +43,57 @@ GAMMA = 0.99
def
run_train_episode
(
env
,
agent
,
rpm
):
total_reward
=
0
all_cost
=
[]
state
=
env
.
reset
()
obs
=
env
.
reset
()
steps
=
0
while
True
:
steps
+=
1
context
=
rpm
.
recent_
state
()
context
.
append
(
state
)
context
=
rpm
.
recent_
obs
()
context
.
append
(
obs
)
context
=
np
.
stack
(
context
,
axis
=
0
)
action
=
agent
.
sample
(
context
)
next_
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
state
,
action
,
reward
,
isOver
))
next_
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
obs
,
action
,
reward
,
isOver
))
if
rpm
.
size
()
>
MEMORY_WARMUP_SIZE
:
if
steps
%
UPDATE_FREQ
==
0
:
batch_all_
state
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
batch_all_
obs
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
args
.
batch_size
)
batch_
state
=
batch_all_state
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
state
=
batch_all_state
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_
state
,
batch_isOver
)
batch_
obs
=
batch_all_obs
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
obs
=
batch_all_obs
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_
obs
,
batch_isOver
)
all_cost
.
append
(
cost
)
total_reward
+=
reward
state
=
next_state
obs
=
next_obs
if
isOver
:
mean_loss
=
np
.
mean
(
all_cost
)
if
all_cost
else
None
return
total_reward
,
steps
,
mean_loss
def
run_evaluate_episode
(
env
,
agent
):
state
=
env
.
reset
()
obs
=
env
.
reset
()
total_reward
=
0
while
True
:
pred_Q
=
agent
.
predict
(
state
)
pred_Q
=
agent
.
predict
(
obs
)
action
=
pred_Q
.
max
(
1
)[
1
].
item
()
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
total_reward
+=
reward
if
isOver
:
return
total_reward
def
get_fixed_
state
s
(
rpm
,
batch_size
):
state
s
=
[]
def
get_fixed_
ob
s
(
rpm
,
batch_size
):
ob
s
=
[]
for
_
in
range
(
3
):
batch_all_
state
=
rpm
.
sample_batch
(
batch_size
)[
0
]
batch_
state
=
batch_all_state
[:,
:
CONTEXT_LEN
,
:,
:]
states
.
append
(
batch_state
)
fixed_
states
=
np
.
concatenate
(
state
s
,
axis
=
0
)
return
fixed_
state
s
batch_all_
obs
=
rpm
.
sample_batch
(
batch_size
)[
0
]
batch_
obs
=
batch_all_obs
[:,
:
CONTEXT_LEN
,
:,
:]
obs
.
append
(
batch_obs
)
fixed_
obs
=
np
.
concatenate
(
ob
s
,
axis
=
0
)
return
fixed_
ob
s
def
evaluate_fixed_Q
(
agent
,
state
s
):
def
evaluate_fixed_Q
(
agent
,
ob
s
):
with
torch
.
no_grad
():
max_pred_Q
=
agent
.
alg
.
model
(
state
s
).
max
(
1
)[
0
].
mean
()
max_pred_Q
=
agent
.
alg
.
model
(
ob
s
).
max
(
1
)[
0
].
mean
()
return
max_pred_Q
.
item
()
...
...
@@ -131,9 +131,9 @@ def main():
total_reward
,
steps
,
_
=
run_train_episode
(
env
,
agent
,
rpm
)
pbar
.
update
(
steps
)
# Get fixed
state
s to check value function.
fixed_
states
=
get_fixed_state
s
(
rpm
,
args
.
batch_size
)
fixed_
states
=
torch
.
tensor
(
fixed_state
s
,
dtype
=
torch
.
float
,
device
=
device
)
# Get fixed
ob
s to check value function.
fixed_
obs
=
get_fixed_ob
s
(
rpm
,
args
.
batch_size
)
fixed_
obs
=
torch
.
tensor
(
fixed_ob
s
,
dtype
=
torch
.
float
,
device
=
device
)
# train
test_flag
=
0
...
...
@@ -159,7 +159,7 @@ def main():
tensorboard
.
add_scalar
(
'dqn/exploration'
,
agent
.
exploration
,
total_steps
)
tensorboard
.
add_scalar
(
'dqn/Q value'
,
evaluate_fixed_Q
(
agent
,
fixed_
state
s
),
evaluate_fixed_Q
(
agent
,
fixed_
ob
s
),
total_steps
)
tensorboard
.
add_scalar
(
'dqn/grad_norm'
,
get_grad_norm
(
agent
.
alg
.
model
),
...
...
examples/DQN/cartpole_agent.py
浏览文件 @
de21118e
...
...
@@ -21,13 +21,13 @@ from parl import layers
class
CartpoleAgent
(
parl
.
Agent
):
def
__init__
(
self
,
algorithm
,
state
_dim
,
obs
_dim
,
act_dim
,
e_greed
=
0.1
,
e_greed_decrement
=
0
):
assert
isinstance
(
state
_dim
,
int
)
assert
isinstance
(
obs
_dim
,
int
)
assert
isinstance
(
act_dim
,
int
)
self
.
state_dim
=
state
_dim
self
.
obs_dim
=
obs
_dim
self
.
act_dim
=
act_dim
super
(
CartpoleAgent
,
self
).
__init__
(
algorithm
)
...
...
@@ -43,16 +43,16 @@ class CartpoleAgent(parl.Agent):
with
fluid
.
program_guard
(
self
.
pred_program
):
obs
=
layers
.
data
(
name
=
'obs'
,
shape
=
[
self
.
state
_dim
],
dtype
=
'float32'
)
name
=
'obs'
,
shape
=
[
self
.
obs
_dim
],
dtype
=
'float32'
)
self
.
value
=
self
.
alg
.
predict
(
obs
)
with
fluid
.
program_guard
(
self
.
learn_program
):
obs
=
layers
.
data
(
name
=
'obs'
,
shape
=
[
self
.
state
_dim
],
dtype
=
'float32'
)
name
=
'obs'
,
shape
=
[
self
.
obs
_dim
],
dtype
=
'float32'
)
action
=
layers
.
data
(
name
=
'act'
,
shape
=
[
1
],
dtype
=
'int32'
)
reward
=
layers
.
data
(
name
=
'reward'
,
shape
=
[],
dtype
=
'float32'
)
next_obs
=
layers
.
data
(
name
=
'next_obs'
,
shape
=
[
self
.
state
_dim
],
dtype
=
'float32'
)
name
=
'next_obs'
,
shape
=
[
self
.
obs
_dim
],
dtype
=
'float32'
)
terminal
=
layers
.
data
(
name
=
'terminal'
,
shape
=
[],
dtype
=
'bool'
)
lr
=
layers
.
data
(
name
=
'lr'
,
shape
=
[
1
],
dtype
=
'float32'
,
append_batch_size
=
False
)
...
...
examples/DQN/replay_memory.py
浏览文件 @
de21118e
...
...
@@ -28,19 +28,19 @@ class ReplayMemory(object):
def
sample
(
self
,
batch_size
):
mini_batch
=
random
.
sample
(
self
.
buffer
,
batch_size
)
state_batch
,
action_batch
,
reward_batch
,
next_state
_batch
,
done_batch
=
[],
[],
[],
[],
[]
obs_batch
,
action_batch
,
reward_batch
,
next_obs
_batch
,
done_batch
=
[],
[],
[],
[],
[]
for
experience
in
mini_batch
:
s
,
a
,
r
,
s_p
,
done
=
experience
state
_batch
.
append
(
s
)
obs
_batch
.
append
(
s
)
action_batch
.
append
(
a
)
reward_batch
.
append
(
r
)
next_
state
_batch
.
append
(
s_p
)
next_
obs
_batch
.
append
(
s_p
)
done_batch
.
append
(
done
)
return
np
.
array
(
state
_batch
).
astype
(
'float32'
),
\
return
np
.
array
(
obs
_batch
).
astype
(
'float32'
),
\
np
.
array
(
action_batch
).
astype
(
'float32'
),
np
.
array
(
reward_batch
).
astype
(
'float32'
),
\
np
.
array
(
next_
state
_batch
).
astype
(
'float32'
),
np
.
array
(
done_batch
).
astype
(
'float32'
)
np
.
array
(
next_
obs
_batch
).
astype
(
'float32'
),
np
.
array
(
done_batch
).
astype
(
'float32'
)
def
__len__
(
self
):
return
len
(
self
.
buffer
)
examples/DQN/train.py
浏览文件 @
de21118e
...
...
@@ -32,24 +32,24 @@ GAMMA = 0.99 # discount factor of reward
def
run_episode
(
agent
,
env
,
rpm
):
total_reward
=
0
state
=
env
.
reset
()
obs
=
env
.
reset
()
step
=
0
while
True
:
step
+=
1
action
=
agent
.
sample
(
state
)
next_
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
((
state
,
action
,
reward
,
next_state
,
isOver
))
action
=
agent
.
sample
(
obs
)
next_
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
((
obs
,
action
,
reward
,
next_obs
,
isOver
))
# train model
if
(
len
(
rpm
)
>
MEMORY_WARMUP_SIZE
)
and
(
step
%
LEARN_FREQ
==
0
):
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_state
,
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_obs
,
batch_isOver
)
=
rpm
.
sample
(
BATCH_SIZE
)
train_loss
=
agent
.
learn
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_
state
,
batch_isOver
,
train_loss
=
agent
.
learn
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_
obs
,
batch_isOver
,
LEARNING_RATE
)
total_reward
+=
reward
state
=
next_state
obs
=
next_obs
if
isOver
:
break
return
total_reward
...
...
@@ -59,14 +59,14 @@ def evaluate(agent, env, render=False):
# test part, run 5 episodes and average
eval_reward
=
[]
for
i
in
range
(
5
):
state
=
env
.
reset
()
obs
=
env
.
reset
()
episode_reward
=
0
isOver
=
False
while
not
isOver
:
action
=
agent
.
predict
(
state
)
action
=
agent
.
predict
(
obs
)
if
render
:
env
.
render
()
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
episode_reward
+=
reward
eval_reward
.
append
(
episode_reward
)
return
np
.
mean
(
eval_reward
)
...
...
@@ -75,7 +75,7 @@ def evaluate(agent, env, render=False):
def
main
():
env
=
gym
.
make
(
'CartPole-v1'
)
action_dim
=
env
.
action_space
.
n
state
_shape
=
env
.
observation_space
.
shape
obs
_shape
=
env
.
observation_space
.
shape
rpm
=
ReplayMemory
(
MEMORY_SIZE
)
...
...
@@ -83,7 +83,7 @@ def main():
algorithm
=
parl
.
algorithms
.
DQN
(
model
,
act_dim
=
action_dim
,
gamma
=
GAMMA
)
agent
=
CartpoleAgent
(
algorithm
,
state_dim
=
state
_shape
[
0
],
obs_dim
=
obs
_shape
[
0
],
act_dim
=
action_dim
,
e_greed
=
0.1
,
# explore
e_greed_decrement
=
1e-6
...
...
examples/DQN_variant/replay_memory.py
浏览文件 @
de21118e
...
...
@@ -16,16 +16,16 @@ import numpy as np
import
copy
from
collections
import
deque
,
namedtuple
Experience
=
namedtuple
(
'Experience'
,
[
'
state
'
,
'action'
,
'reward'
,
'isOver'
])
Experience
=
namedtuple
(
'Experience'
,
[
'
obs
'
,
'action'
,
'reward'
,
'isOver'
])
class
ReplayMemory
(
object
):
def
__init__
(
self
,
max_size
,
state
_shape
,
context_len
):
def
__init__
(
self
,
max_size
,
obs
_shape
,
context_len
):
self
.
max_size
=
int
(
max_size
)
self
.
state_shape
=
state
_shape
self
.
obs_shape
=
obs
_shape
self
.
context_len
=
int
(
context_len
)
self
.
state
=
np
.
zeros
((
self
.
max_size
,
)
+
state
_shape
,
dtype
=
'uint8'
)
self
.
obs
=
np
.
zeros
((
self
.
max_size
,
)
+
obs
_shape
,
dtype
=
'uint8'
)
self
.
action
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'int32'
)
self
.
reward
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'float32'
)
self
.
isOver
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'bool'
)
...
...
@@ -48,42 +48,41 @@ class ReplayMemory(object):
else
:
self
.
_context
.
append
(
exp
)
def
recent_
state
(
self
):
""" maintain recent
state
for training"""
def
recent_
obs
(
self
):
""" maintain recent
obs
for training"""
lst
=
list
(
self
.
_context
)
states
=
[
np
.
zeros
(
self
.
state
_shape
,
dtype
=
'uint8'
)]
*
\
obs
=
[
np
.
zeros
(
self
.
obs
_shape
,
dtype
=
'uint8'
)]
*
\
(
self
.
_context
.
maxlen
-
len
(
lst
))
states
.
extend
([
k
.
state
for
k
in
lst
])
return
state
s
obs
.
extend
([
k
.
obs
for
k
in
lst
])
return
ob
s
def
sample
(
self
,
idx
):
""" return
state
, action, reward, isOver,
note that some frames in
state
may be generated from last episode,
they should be removed from
state
""" return
obs
, action, reward, isOver,
note that some frames in
obs
may be generated from last episode,
they should be removed from
obs
"""
state
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
state_shape
,
dtype
=
np
.
uint8
)
state_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
obs
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
obs_shape
,
dtype
=
np
.
uint8
)
obs_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
# confirm that no frame was generated from last episode
has_last_episode
=
False
for
k
in
range
(
self
.
context_len
-
2
,
-
1
,
-
1
):
to_check_idx
=
state
_idx
[
k
]
to_check_idx
=
obs
_idx
[
k
]
if
self
.
isOver
[
to_check_idx
]:
has_last_episode
=
True
state_idx
=
state
_idx
[
k
+
1
:]
state
[
k
+
1
:]
=
self
.
state
[
state
_idx
]
obs_idx
=
obs
_idx
[
k
+
1
:]
obs
[
k
+
1
:]
=
self
.
obs
[
obs
_idx
]
break
if
not
has_last_episode
:
state
=
self
.
state
[
state
_idx
]
obs
=
self
.
obs
[
obs
_idx
]
real_idx
=
(
idx
+
self
.
context_len
-
1
)
%
self
.
_curr_size
action
=
self
.
action
[
real_idx
]
reward
=
self
.
reward
[
real_idx
]
isOver
=
self
.
isOver
[
real_idx
]
return
state
,
reward
,
action
,
isOver
return
obs
,
reward
,
action
,
isOver
def
__len__
(
self
):
return
self
.
_curr_size
...
...
@@ -92,7 +91,7 @@ class ReplayMemory(object):
return
self
.
_curr_size
def
_assign
(
self
,
pos
,
exp
):
self
.
state
[
pos
]
=
exp
.
state
self
.
obs
[
pos
]
=
exp
.
obs
self
.
reward
[
pos
]
=
exp
.
reward
self
.
action
[
pos
]
=
exp
.
action
self
.
isOver
[
pos
]
=
exp
.
isOver
...
...
@@ -107,8 +106,8 @@ class ReplayMemory(object):
return
self
.
_process_batch
(
batch_exp
)
def
_process_batch
(
self
,
batch_exp
):
state
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
obs
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
reward
=
np
.
asarray
([
e
[
1
]
for
e
in
batch_exp
],
dtype
=
'float32'
)
action
=
np
.
asarray
([
e
[
2
]
for
e
in
batch_exp
],
dtype
=
'int8'
)
isOver
=
np
.
asarray
([
e
[
3
]
for
e
in
batch_exp
],
dtype
=
'bool'
)
return
[
state
,
action
,
reward
,
isOver
]
return
[
obs
,
action
,
reward
,
isOver
]
examples/DQN_variant/train.py
浏览文件 @
de21118e
...
...
@@ -39,28 +39,28 @@ LEARNING_RATE = 3e-4
def
run_train_episode
(
env
,
agent
,
rpm
):
total_reward
=
0
all_cost
=
[]
state
=
env
.
reset
()
obs
=
env
.
reset
()
steps
=
0
while
True
:
steps
+=
1
context
=
rpm
.
recent_
state
()
context
.
append
(
state
)
context
=
rpm
.
recent_
obs
()
context
.
append
(
obs
)
context
=
np
.
stack
(
context
,
axis
=
0
)
action
=
agent
.
sample
(
context
)
next_
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
state
,
action
,
reward
,
isOver
))
next_
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
obs
,
action
,
reward
,
isOver
))
# start training
if
rpm
.
size
()
>
MEMORY_WARMUP_SIZE
:
if
steps
%
UPDATE_FREQ
==
0
:
batch_all_
state
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
batch_all_
obs
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
args
.
batch_size
)
batch_
state
=
batch_all_state
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
state
=
batch_all_state
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_
state
,
batch_isOver
)
batch_
obs
=
batch_all_obs
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
obs
=
batch_all_obs
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_
obs
,
batch_isOver
)
all_cost
.
append
(
float
(
cost
))
total_reward
+=
reward
state
=
next_state
obs
=
next_obs
if
isOver
:
break
if
all_cost
:
...
...
@@ -70,11 +70,11 @@ def run_train_episode(env, agent, rpm):
def
run_evaluate_episode
(
env
,
agent
):
state
=
env
.
reset
()
obs
=
env
.
reset
()
total_reward
=
0
while
True
:
action
=
agent
.
predict
(
state
)
state
,
reward
,
isOver
,
info
=
env
.
step
(
action
)
action
=
agent
.
predict
(
obs
)
obs
,
reward
,
isOver
,
info
=
env
.
step
(
action
)
total_reward
+=
reward
if
isOver
:
break
...
...
examples/offline-Q-learning/atari.py
浏览文件 @
de21118e
..
/
DQN
/
atari
.
py
\ No newline at end of file
..
/
DQN_variant
/
atari
.
py
\ No newline at end of file
examples/offline-Q-learning/atari_wrapper.py
浏览文件 @
de21118e
..
/
DQN
/
atari_wrapper
.
py
\ No newline at end of file
..
/
DQN_variant
/
atari_wrapper
.
py
\ No newline at end of file
examples/offline-Q-learning/parallel_run.py
浏览文件 @
de21118e
...
...
@@ -45,21 +45,21 @@ gpu_num = get_gpu_count()
def
run_train_step
(
agent
,
rpm
):
for
step
in
range
(
args
.
train_total_steps
):
# use the first 80% data to train
batch_all_
state
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
batch_all_
obs
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_batch
(
args
.
batch_size
*
gpu_num
)
batch_
state
=
batch_all_state
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
state
=
batch_all_state
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_
state
,
batch_isOver
)
batch_
obs
=
batch_all_obs
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
obs
=
batch_all_obs
[:,
1
:,
:,
:]
cost
=
agent
.
learn
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_
obs
,
batch_isOver
)
if
step
%
100
==
0
:
# use the last 20% data to evaluate
batch_all_
state
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_test_batch
(
batch_all_
obs
,
batch_action
,
batch_reward
,
batch_isOver
=
rpm
.
sample_test_batch
(
args
.
batch_size
)
batch_
state
=
batch_all_state
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
state
=
batch_all_state
[:,
1
:,
:,
:]
eval_cost
=
agent
.
supervised_eval
(
batch_
state
,
batch_action
,
batch_reward
,
batch_next_
state
,
batch_
obs
=
batch_all_obs
[:,
:
CONTEXT_LEN
,
:,
:]
batch_next_
obs
=
batch_all_obs
[:,
1
:,
:,
:]
eval_cost
=
agent
.
supervised_eval
(
batch_
obs
,
batch_action
,
batch_reward
,
batch_next_
obs
,
batch_isOver
)
logger
.
info
(
"train step {}, train costs are {}, eval cost is {}."
.
format
(
...
...
@@ -67,17 +67,17 @@ def run_train_step(agent, rpm):
def
collect_exp
(
env
,
rpm
,
agent
):
state
=
env
.
reset
()
obs
=
env
.
reset
()
# collect data to fulfill replay memory
for
i
in
tqdm
(
range
(
MEMORY_SIZE
)):
context
=
rpm
.
recent_
state
()
context
.
append
(
state
)
context
=
rpm
.
recent_
obs
()
context
.
append
(
obs
)
context
=
np
.
stack
(
context
,
axis
=
0
)
action
=
agent
.
sample
(
context
)
next_
state
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
state
,
action
,
reward
,
isOver
))
state
=
next_state
next_
obs
,
reward
,
isOver
,
_
=
env
.
step
(
action
)
rpm
.
append
(
Experience
(
obs
,
action
,
reward
,
isOver
))
obs
=
next_obs
def
main
():
...
...
examples/offline-Q-learning/replay_memory.py
浏览文件 @
de21118e
...
...
@@ -18,18 +18,18 @@ import os
from
collections
import
deque
,
namedtuple
from
parl.utils
import
logger
Experience
=
namedtuple
(
'Experience'
,
[
'
state
'
,
'action'
,
'reward'
,
'isOver'
])
Experience
=
namedtuple
(
'Experience'
,
[
'
obs
'
,
'action'
,
'reward'
,
'isOver'
])
class
ReplayMemory
(
object
):
def
__init__
(
self
,
max_size
,
state
_shape
,
obs
_shape
,
context_len
,
load_file
=
False
,
file_path
=
None
):
self
.
max_size
=
int
(
max_size
)
self
.
state_shape
=
state
_shape
self
.
obs_shape
=
obs
_shape
self
.
context_len
=
int
(
context_len
)
self
.
file_path
=
file_path
...
...
@@ -38,8 +38,7 @@ class ReplayMemory(object):
self
.
load_memory
()
logger
.
info
(
"memory size is {}"
.
format
(
self
.
_curr_size
))
else
:
self
.
state
=
np
.
zeros
(
(
self
.
max_size
,
)
+
state_shape
,
dtype
=
'uint8'
)
self
.
obs
=
np
.
zeros
((
self
.
max_size
,
)
+
obs_shape
,
dtype
=
'uint8'
)
self
.
action
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'int32'
)
self
.
reward
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'float32'
)
self
.
isOver
=
np
.
zeros
((
self
.
max_size
,
),
dtype
=
'bool'
)
...
...
@@ -62,42 +61,41 @@ class ReplayMemory(object):
else
:
self
.
_context
.
append
(
exp
)
def
recent_
state
(
self
):
""" maintain recent
state
for training"""
def
recent_
obs
(
self
):
""" maintain recent
obs
for training"""
lst
=
list
(
self
.
_context
)
states
=
[
np
.
zeros
(
self
.
state
_shape
,
dtype
=
'uint8'
)]
*
\
obs
=
[
np
.
zeros
(
self
.
obs
_shape
,
dtype
=
'uint8'
)]
*
\
(
self
.
_context
.
maxlen
-
len
(
lst
))
states
.
extend
([
k
.
state
for
k
in
lst
])
return
state
s
obs
.
extend
([
k
.
obs
for
k
in
lst
])
return
ob
s
def
sample
(
self
,
idx
):
""" return
state
, action, reward, isOver,
note that some frames in
state
may be generated from last episode,
they should be removed from
state
""" return
obs
, action, reward, isOver,
note that some frames in
obs
may be generated from last episode,
they should be removed from
obs
"""
state
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
state_shape
,
dtype
=
np
.
uint8
)
state_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
obs
=
np
.
zeros
(
(
self
.
context_len
+
1
,
)
+
self
.
obs_shape
,
dtype
=
np
.
uint8
)
obs_idx
=
np
.
arange
(
idx
,
idx
+
self
.
context_len
+
1
)
%
self
.
_curr_size
# confirm that no frame was generated from last episode
has_last_episode
=
False
for
k
in
range
(
self
.
context_len
-
2
,
-
1
,
-
1
):
to_check_idx
=
state
_idx
[
k
]
to_check_idx
=
obs
_idx
[
k
]
if
self
.
isOver
[
to_check_idx
]:
has_last_episode
=
True
state_idx
=
state
_idx
[
k
+
1
:]
state
[
k
+
1
:]
=
self
.
state
[
state
_idx
]
obs_idx
=
obs
_idx
[
k
+
1
:]
obs
[
k
+
1
:]
=
self
.
obs
[
obs
_idx
]
break
if
not
has_last_episode
:
state
=
self
.
state
[
state
_idx
]
obs
=
self
.
obs
[
obs
_idx
]
real_idx
=
(
idx
+
self
.
context_len
-
1
)
%
self
.
_curr_size
action
=
self
.
action
[
real_idx
]
reward
=
self
.
reward
[
real_idx
]
isOver
=
self
.
isOver
[
real_idx
]
return
state
,
reward
,
action
,
isOver
return
obs
,
reward
,
action
,
isOver
def
__len__
(
self
):
return
self
.
_curr_size
...
...
@@ -106,7 +104,7 @@ class ReplayMemory(object):
return
self
.
_curr_size
def
_assign
(
self
,
pos
,
exp
):
self
.
state
[
pos
]
=
exp
.
state
self
.
obs
[
pos
]
=
exp
.
obs
self
.
reward
[
pos
]
=
exp
.
reward
self
.
action
[
pos
]
=
exp
.
action
self
.
isOver
[
pos
]
=
exp
.
isOver
...
...
@@ -129,15 +127,15 @@ class ReplayMemory(object):
return
self
.
_process_batch
(
batch_exp
)
def
_process_batch
(
self
,
batch_exp
):
state
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
obs
=
np
.
asarray
([
e
[
0
]
for
e
in
batch_exp
],
dtype
=
'uint8'
)
reward
=
np
.
asarray
([
e
[
1
]
for
e
in
batch_exp
],
dtype
=
'float32'
)
action
=
np
.
asarray
([
e
[
2
]
for
e
in
batch_exp
],
dtype
=
'int8'
)
isOver
=
np
.
asarray
([
e
[
3
]
for
e
in
batch_exp
],
dtype
=
'bool'
)
return
[
state
,
action
,
reward
,
isOver
]
return
[
obs
,
action
,
reward
,
isOver
]
def
save_memory
(
self
):
save_data
=
[
self
.
state
,
self
.
reward
,
self
.
action
,
self
.
isOver
,
self
.
_curr_size
,
self
.
obs
,
self
.
reward
,
self
.
action
,
self
.
isOver
,
self
.
_curr_size
,
self
.
_curr_pos
,
self
.
_context
]
np
.
savez
(
self
.
file_path
,
*
save_data
)
...
...
@@ -145,7 +143,7 @@ class ReplayMemory(object):
def
load_memory
(
self
):
container
=
np
.
load
(
self
.
file_path
,
allow_pickle
=
True
)
[
self
.
state
,
self
.
reward
,
self
.
action
,
self
.
isOver
,
self
.
_curr_size
,
self
.
obs
,
self
.
reward
,
self
.
action
,
self
.
isOver
,
self
.
_curr_size
,
self
.
_curr_pos
,
self
.
_context
]
=
[
container
[
key
]
for
key
in
container
]
self
.
_curr_size
=
self
.
_curr_size
.
astype
(
int
)
...
...
examples/offline-Q-learning/rom_files
浏览文件 @
de21118e
../DQN/rom_files/
\ No newline at end of file
../DQN_variant/rom_files
\ No newline at end of file
examples/offline-Q-learning/utils.py
浏览文件 @
de21118e
..
/
DQN
/
utils
.
py
\ No newline at end of file
..
/
DQN_variant
/
utils
.
py
\ No newline at end of file
parl/algorithms/fluid/sac.py
浏览文件 @
de21118e
...
...
@@ -102,11 +102,11 @@ class SAC(Algorithm):
return
cost
def
critic_learn
(
self
,
obs
,
action
,
reward
,
next_obs
,
terminal
):
next_
state_action
,
next_state
_log_pi
=
self
.
sample
(
next_obs
)
next_
obs_action
,
next_obs
_log_pi
=
self
.
sample
(
next_obs
)
qf1_next_target
,
qf2_next_target
=
self
.
target_critic
.
value
(
next_obs
,
next_
state
_action
)
next_obs
,
next_
obs
_action
)
min_qf_next_target
=
layers
.
elementwise_min
(
qf1_next_target
,
qf2_next_target
)
-
next_
state
_log_pi
*
self
.
alpha
qf1_next_target
,
qf2_next_target
)
-
next_
obs
_log_pi
*
self
.
alpha
terminal
=
layers
.
cast
(
terminal
,
dtype
=
'float32'
)
target_Q
=
reward
+
(
1.0
-
terminal
)
*
self
.
gamma
*
min_qf_next_target
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录