Commit f50583b4 authored by: P PaParaZz1

Deploying to gh-pages from @ 87705039902cb91c9eae26ea5842f0aa07160226 🚀

Parent 79f88481
......@@ -214,6 +214,13 @@
              decay=0.99,
              min_win_rate_games=8,
          ),
+         metric=dict(
+             mu=0,
+             sigma=25 / 3,
+             beta=25 / 3 / 2,
+             tau=0.0,
+             draw_probability=0.02,
+         ),
      )
      # override
......
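The ``metric`` block added above mirrors the standard parameters of a TrueSkill-style rating system (``mu``, ``sigma``, ``beta``, ``tau``, ``draw_probability``). Below is a minimal sketch of how such a config could back a rating environment, assuming the third-party ``trueskill`` package; it is an illustration only, not DI-engine's actual ``LeagueMetricEnv``:

.. code:: python

    # Sketch only: builds a TrueSkill environment from the metric config above.
    import trueskill

    metric_cfg = dict(mu=0, sigma=25 / 3, beta=25 / 3 / 2, tau=0.0, draw_probability=0.02)

    env = trueskill.TrueSkill(
        mu=metric_cfg['mu'],
        sigma=metric_cfg['sigma'],
        beta=metric_cfg['beta'],
        tau=metric_cfg['tau'],
        draw_probability=metric_cfg['draw_probability'],
    )

    # Every player starts from the environment's prior rating.
    new_player_rating = env.create_rating()
    print(new_player_rating.mu, new_player_rating.sigma)
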
......@@ -186,7 +186,8 @@
              init_payoff: 'BattleSharedPayoff',  # noqa
              checkpoint_path: str,
              player_id: str,
-             total_agent_step: int
+             total_agent_step: int,
+             rating: 'PlayerRating',  # noqa
      ) -> None:
          """
          Overview:
......@@ -200,6 +201,7 @@
              - player_id (:obj:`str`): Player id in string format.
              - total_agent_step (:obj:`int`): For active player, it should be 0; \
                  For historical player, it should be parent player's ``_total_agent_step`` when ``snapshot``.
+             - rating (:obj:`PlayerRating`): player rating information in total league
          """
          self._cfg = cfg
          self._category = category
......@@ -208,7 +210,8 @@
          assert isinstance(player_id, str)
          self._player_id = player_id
          assert isinstance(total_agent_step, int), (total_agent_step, type(total_agent_step))
-         self._total_agent_step = total_agent_step
+         self._total_agent_step = total_agent_step
+         self._rating = rating

      @property
      def category(self) -> str:
......@@ -232,7 +235,15 @@
      @total_agent_step.setter
      def total_agent_step(self, step: int) -> None:
-         self._total_agent_step = step
+         self._total_agent_step = step
+
+     @property
+     def rating(self) -> 'PlayerRating':  # noqa
+         return self._rating
+
+     @rating.setter
+     def rating(self, _rating: 'PlayerRating') -> None:  # noqa
+         self._rating = _rating

  @PLAYER_REGISTRY.register('historical_player')
......@@ -329,10 +340,12 @@
          else:
              return False

-     def snapshot(self) -> HistoricalPlayer:
+     def snapshot(self, metric_env: 'LeagueMetricEnv') -> HistoricalPlayer:  # noqa
          """
          Overview:
              Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+         Argument:
+             - metric_env (:obj:`LeagueMetricEnv`): player rating environment, one league one env
          Returns:
              - snapshot_player (:obj:`HistoricalPlayer`): new instantiated historical player
......@@ -348,6 +361,7 @@
              path,
              self.player_id + '_{}_historical'.format(int(self._total_agent_step)),
              self._total_agent_step,
+             metric_env.create_rating(mu=self.rating.mu),
              parent_id=self.player_id
          )
......
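In the ``snapshot`` change above, the historical player inherits its parent's mean skill via ``metric_env.create_rating(mu=self.rating.mu)``. The sketch below shows how ratings could be updated after a battle and then carried over to a snapshot, again assuming a TrueSkill-style environment; the variable names are illustrative, not DI-engine APIs:

.. code:: python

    import trueskill

    env = trueskill.TrueSkill(mu=0, sigma=25 / 3, beta=25 / 3 / 2, tau=0.0, draw_probability=0.02)

    rating_a = env.create_rating()  # active player A
    rating_b = env.create_rating()  # active player B

    # After a battle in which A beats B, update both ratings (lower rank wins).
    (rating_a,), (rating_b,) = env.rate([(rating_a,), (rating_b,)], ranks=[0, 1])

    # When A is snapshotted, the historical copy keeps A's mean skill but
    # restarts from the default uncertainty, mirroring create_rating(mu=...).
    historical_rating = env.create_rating(mu=rating_a.mu)
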
This diff is collapsed.
......@@ -186,6 +186,7 @@
  ding.model.template.qmix
  ding.model.template.vac
  ding.policy.a2c
+ ding.policy.acer
  ding.policy.atoc
  ding.policy.c51
  ding.policy.collaq
......
ACER
^^^^^^^

Overview
---------
ACER, short for Actor-Critic with Experience Replay, is an off-policy actor-critic algorithm with experience replay. It greatly increases
sample efficiency and decreases data correlation. ACER uses Retrace Q-value estimation, an efficient TRPO variant, and truncated importance
sampling weights with bias correction to keep the off-policy estimator stable. You can find more details in the paper
`Sample Efficient Actor-Critic with Experience Replay <https://arxiv.org/abs/1611.01224>`_.

Quick Facts
-------------
1. ACER is a **model-free** and **off-policy** RL algorithm.
2. ACER supports both **discrete** and **continuous** action spaces, with several differences between the two versions.
3. ACER is an actor-critic RL algorithm, which optimizes the actor network and the critic network separately.
4. ACER decouples acting from learning. Collectors in ACER need to record the behavior probability distributions.

Key Equations
---------------------------
The loss used in ACER contains a policy loss and a value loss. The two parts are usually updated separately, so it is necessary to control their relative update speed.
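For instance, a simple way to control this relative speed is to scale the two losses with scalar weights before the backward pass. The following minimal sketch is an illustration only; the weight values are placeholders, not DI-engine defaults:

.. code:: python

    import torch

    # Placeholder scalar losses standing in for the policy and value losses.
    policy_loss = torch.tensor(0.8, requires_grad=True)
    value_loss = torch.tensor(1.6, requires_grad=True)

    # The ratio between the two weights sets how fast the actor moves
    # relative to the critic.
    policy_weight, value_weight = 1.0, 0.5
    total_loss = policy_weight * policy_loss + value_weight * value_loss
    total_loss.backward()
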
Retrace Q-value estimation
>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Given a trajectory :math:`\{x_0, a_0, r_0, \mu(\cdot|x_0), \ldots, x_k, a_k, r_k, \mu(\cdot|x_k)\}` generated under the behavior policy :math:`\mu`,
the Retrace estimator can be expressed recursively as follows:

.. math::

    Q^{\text{ret}}(x_t,a_t)=r_t+\gamma\bar{\rho}_{t+1}[Q^{\text{ret}}(x_{t+1},a_{t+1})-Q(x_{t+1},a_{t+1})]+\gamma V(x_{t+1})

where :math:`\bar{\rho}_t` is the truncated importance weight, :math:`\bar{\rho}_t=\min\{c,\rho_t\}` with :math:`\rho_t=\frac{\pi(a_t|x_t)}{\mu(a_t|x_t)}`, and :math:`\pi` is the target policy.
Retrace is an off-policy, return-based algorithm with low variance, and it is proven to converge to the value function of the target policy for any behavior policy.
We approximate the Q value with a neural network :math:`Q_{\theta}`, trained with a mean squared error loss:

.. math::

    L_{\text{value}}=\frac{1}{2}(Q^{\text{ret}}(x_t,a_t)-Q_{\theta}(x_t,a_t))^2.
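
As a concrete illustration of the quantities above, the following PyTorch sketch computes the truncated importance weight :math:`\bar{\rho}_t` and the value loss for an already-computed Retrace target. All tensors here are dummy values; the target itself is produced by ``compute_q_retraces`` in the implementation section below.

.. code:: python

    import torch

    c = 1.0                                        # truncation threshold
    target_pi = torch.tensor([0.3, 0.6, 0.1])      # pi(a_t | x_t), dummy values
    behaviour_mu = torch.tensor([0.5, 0.4, 0.2])   # mu(a_t | x_t), dummy values

    rho = target_pi / behaviour_mu                 # importance weight rho_t
    rho_bar = rho.clamp(max=c)                     # truncated weight min{c, rho_t},
                                                   # used in the Retrace recursion

    q_theta = torch.tensor([1.0, 0.5, 0.2], requires_grad=True)  # Q_theta(x_t, a_t)
    q_retrace = torch.tensor([1.2, 0.4, 0.3])                    # precomputed Q^ret

    value_loss = 0.5 * (q_retrace - q_theta).pow(2).mean()
    value_loss.backward()
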
Policy gradient
>>>>>>>>>>>>>>>>>>>>>>
To safeguard against high variance, ACER uses truncated importance weights and introduces
a correction term via the following decomposition of :math:`g^{\text{acer}}`:

.. math::

    g^{\text{acer}}=\bar{\rho}_t\nabla_\theta\log\pi_{\theta}(a_t|x_t)[Q^{\text{ret}}(x_t,a_t)-V_{\theta}(x_t)]+\mathbb{E}_{a\sim \pi}\left(\left[\frac{\rho_t(a)-c}{\rho_t(a)}\right]_+\nabla_{\theta}\log\pi_{\theta}(a|x_t)[Q_\theta(x_t,a)-V_{\theta}(x_t)]\right).

To ensure stability, ACER limits the per-step change to the policy by solving the following linearized KL divergence constraint:

.. math::

    \begin{split}
    &\text{minimize}_z\quad\frac{1}{2}\|g_t^{\text{acer}}-z\|_2^2\\
    &\text{subject to}\quad \nabla_{\phi_{\theta}(x_t)}D_{KL}[f(\cdot|\phi_{\theta_a}(x_t))\|f(\cdot|\phi_{\theta}(x_t))]^\top z\le\delta
    \end{split}

Here :math:`\phi_{\theta}` is the policy network being optimized and :math:`\phi_{\theta_a}` is the average policy network.
Letting :math:`k=\nabla_{\phi_{\theta}(x_t)}D_{KL}[f(\cdot|\phi_{\theta_a}(x_t))\|f(\cdot|\phi_{\theta}(x_t))]`, the solution can easily be derived in closed form using the KKT conditions:

.. math::

    z^*=g_{t}^{\text{acer}}-\max\left\{0,\frac{k^\top g_t^{\text{acer}}-\delta}{\|k\|_2^2}\right\}k

Pseudocode
----------
There are several differences between applying ACER to a discrete action space and to a continuous action space.

.. image:: images/ACER_alg1.png
   :align: center
   :scale: 70%

.. image:: images/ACER_alg2.png
   :align: center
   :scale: 70%

In a continuous action space, it is impossible to enumerate the Q values of all actions, so ACER uses sampled actions to approximate the expectation.

.. image:: images/ACER_alg3.png
   :align: center
   :scale: 70%
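
For example, with a Gaussian policy the expectation :math:`\mathbb{E}_{a\sim\pi}[Q(x,a)]` can be estimated by Monte Carlo over a few sampled actions. The sketch below is an illustration only; the function name ``sampled_state_value``, the toy critic, and the number of samples are assumptions, not DI-engine code.

.. code:: python

    import torch

    def sampled_state_value(critic, state, policy_dist, n_samples: int = 8) -> torch.Tensor:
        # V(x) = E_{a ~ pi}[Q(x, a)], approximated by a Monte Carlo average
        # over actions sampled from the current policy.
        actions = policy_dist.sample((n_samples,))
        q_values = torch.stack([critic(state, a) for a in actions])
        return q_values.mean(dim=0)

    # Usage with dummy components:
    def toy_critic(state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        # Stand-in for a real Q network.
        return (state.sum() + action.sum()).unsqueeze(0)

    state = torch.zeros(4)
    policy_dist = torch.distributions.Normal(torch.zeros(2), torch.ones(2))
    v_estimate = sampled_state_value(toy_critic, state, policy_dist)
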
Implementations
----------------
Here we show the ACER algorithm on a discrete action space.
The default config is defined as follows:

.. autoclass:: ding.policy.acer.ACERPolicy

Usually, we hope to compute everything as a batch to improve efficiency. This is done in ``policy._get_train_sample``.
Once we execute this function in the collector, the length of the samples will equal the ``unroll_len`` in the config. For details, please
refer to the doc of ``ding.rl_utils.adder``.
You can find more information :ref:`here <ref2other>`.
The whole implementation of ACER can be found `here <https://github.com/opendilab/DI-engine/blob/main/ding/policy/acer.py>`_.
Here we show some details of this algorithm.

First, we use the following function to compute the Retrace Q value.
.. code:: python

    import torch


    def compute_q_retraces(q_values, v_pred, rewards, actions, weights, ratio, gamma=0.9):
        """
        Overview:
            Get Retrace Q value
        Arguments:
            - q_values (:obj:`torch.Tensor`): Q values
            - v_pred (:obj:`torch.Tensor`): V values
            - rewards (:obj:`torch.Tensor`): reward values
            - actions (:obj:`torch.Tensor`): the actions in the replay buffer
            - weights (:obj:`torch.Tensor`): mask marking the padding positions
            - ratio (:obj:`torch.Tensor`): ratio of the new policy to the behavior policy
            - gamma (:obj:`float`): discount factor
        Returns:
            - q_retraces (:obj:`torch.Tensor`): Retrace Q values
        """
        rewards = rewards.unsqueeze(-1)
        actions = actions.unsqueeze(-1)
        weights = weights.unsqueeze(-1)
        q_retraces = torch.zeros_like(v_pred)
        n_len = q_retraces.size()[0]
        # Bootstrap from the value estimate of the last timestep.
        tmp_retraces = v_pred[-1, ...]
        q_retraces[-1, ...] = v_pred[-1, ...]
        q_gather = torch.zeros_like(v_pred)
        q_gather[0:-1, ...] = q_values[0:-1, ...].gather(-1, actions)
        ratio_gather = ratio.gather(-1, actions)
        # Backward recursion of the Retrace estimator:
        # Q^ret_t = r_t + gamma * [rho_bar_{t+1} * (Q^ret_{t+1} - Q_{t+1}) + V_{t+1}]
        for idx in reversed(range(n_len - 1)):
            q_retraces[idx, ...] = rewards[idx, ...] + gamma * weights[idx, ...] * tmp_retraces
            # Prepare the bracketed term for the next (earlier) timestep.
            tmp_retraces = ratio_gather[idx, ...].clamp(max=1.0) * (q_retraces[idx, ...] - q_gather[idx, ...]) + v_pred[idx, ...]
        return q_retraces

After that, we calculate the policy loss: the following function computes the actor loss with truncated importance weights, together with the bias-correction loss.
.. code:: python

    EPS = 1e-8  # small positive constant (value assumed here) to avoid log(0) and division by zero


    def acer_policy_error(q_values, q_retraces, v_pred, target_pi, actions, ratio, c_clip_ratio=10.0):
        """
        Overview:
            Get ACER policy loss
        Arguments:
            - q_values (:obj:`torch.Tensor`): Q values
            - q_retraces (:obj:`torch.Tensor`): Q values computed by the Retrace method
            - v_pred (:obj:`torch.Tensor`): V values
            - target_pi (:obj:`torch.Tensor`): the new policy's probability
            - actions (:obj:`torch.Tensor`): the actions in the replay buffer
            - ratio (:obj:`torch.Tensor`): ratio of the new policy to the behavior policy
            - c_clip_ratio (:obj:`float`): clip value for the ratio
        Returns:
            - actor_loss (:obj:`torch.Tensor`): policy loss from q_retraces
            - bc_loss (:obj:`torch.Tensor`): bias-correction policy loss
        """
        actions = actions.unsqueeze(-1)
        with torch.no_grad():
            advantage_retraces = q_retraces - v_pred
            advantage_native = q_values - v_pred
        # Truncated importance-weighted policy gradient term.
        actor_loss = ratio.gather(-1, actions).clamp(max=c_clip_ratio) * advantage_retraces * (target_pi.gather(-1, actions) + EPS).log()
        # Bias-correction term, summed over all actions of the current policy.
        bc_loss = (1.0 - c_clip_ratio / (ratio + EPS)).clamp(min=0.0) * target_pi.detach() * advantage_native * (target_pi + EPS).log()
        bc_loss = bc_loss.sum(-1).unsqueeze(-1)
        return actor_loss, bc_loss

Then we execute the backward operation with respect to ``target_pi``. Moreover, we need to compute the corrected gradient under the trust-region constraint:
.. code:: python

    def acer_trust_region_update(actor_gradients, target_pi, avg_pi, trust_region_value):
        """
        Overview:
            Calculate gradients with the trust-region constraint
        Arguments:
            - actor_gradients (:obj:`list(torch.Tensor)`): gradients for the different parts
            - target_pi (:obj:`torch.Tensor`): the new policy's probability
            - avg_pi (:obj:`torch.Tensor`): the average policy's probability
            - trust_region_value (:obj:`float`): the size of the trust region
        Returns:
            - update_gradients (:obj:`list(torch.Tensor)`): gradients under the trust-region constraint
        """
        with torch.no_grad():
            # k term from the KL constraint (up to sign), cf. the closed-form solution above.
            KL_gradients = [(avg_pi / (target_pi + EPS))]
        update_gradients = []
        for actor_gradient, KL_gradient in zip(actor_gradients, KL_gradients):
            # Closed-form KKT solution: z* = g - max(0, (k^T g - delta) / ||k||^2) * k
            scale = actor_gradient.mul(KL_gradient).sum(-1).unsqueeze(-1) - trust_region_value
            scale = torch.div(scale, KL_gradient.mul(KL_gradient).sum(-1).unsqueeze(-1)).clamp(min=0.0)
            update_gradients.append(actor_gradient - scale * KL_gradient)
        return update_gradients

With the new, corrected gradients, we can continue backpropagation and then update the parameters.
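
Below is a self-contained sketch of this two-stage backward pass, reusing ``acer_trust_region_update`` from above. The tensor names (``logits``, ``target_pi``, ``avg_pi``) and the sign handling are illustrative assumptions and may differ from the actual ``acer.py`` implementation.

.. code:: python

    import torch

    EPS = 1e-8
    logits = torch.randn(2, 4, requires_grad=True)        # stands in for the policy head output
    target_pi = torch.softmax(logits, dim=-1)
    avg_pi = torch.softmax(torch.randn(2, 4), dim=-1)     # average policy (no gradient needed)

    # Pretend this came from acer_policy_error:
    total_actor_loss = -(target_pi + EPS).log().mean()

    # 1) gradient of the objective (negative loss) w.r.t. the policy output
    actor_gradients = torch.autograd.grad(-total_actor_loss, target_pi, retain_graph=True)

    # 2) project it back into the trust region around the average policy
    actor_gradients = acer_trust_region_update(list(actor_gradients), target_pi, avg_pi, 1.0)

    # 3) push the (negated) corrected gradient through the network so that a
    #    descent step on logits.grad moves along the projected ascent direction
    target_pi.backward(gradient=-actor_gradients[0])
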
Finally, we calculate the Q-value loss to update the Q-network:
.. code:: python

    def acer_value_error(q_values, q_retraces, actions):
        """
        Overview:
            Get ACER critic loss
        Arguments:
            - q_values (:obj:`torch.Tensor`): Q values
            - q_retraces (:obj:`torch.Tensor`): Q values computed by the Retrace method
            - actions (:obj:`torch.Tensor`): the actions in the replay buffer
        Returns:
            - critic_loss (:obj:`torch.Tensor`): critic loss
        """
        actions = actions.unsqueeze(-1)
        # Squared error between the Retrace target and the Q value of the taken action.
        critic_loss = 0.5 * (q_retraces - q_values.gather(-1, actions)).pow(2)
        return critic_loss

Reference
----------
Ziyu Wang, Victor Bapst, Nicolas Heess, Volodymyr Mnih, Remi Munos, Koray Kavukcuoglu, Nando de Freitas: "Sample Efficient Actor-Critic with Experience Replay", 2016; `arXiv:1611.01224 <https://arxiv.org/abs/1611.01224>`_.
......@@ -120,6 +120,7 @@ need all training samples (sequences of training data) to have the same length. This
  Once we execute this function in the collector, the length of the samples will equal the ``unroll_len`` in the config. For details, please
  refer to the doc of ``ding.rl_utils.adder``.

+ .. _ref2other:

  .. code:: python

      def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
......
......@@ -199,7 +199,7 @@
  Player
- class ding.league.player.Player(cfg: easydict.EasyDict, category: str, init_payoff: BattleSharedPayoff, checkpoint_path: str, player_id: str, total_agent_step: int)  [source]
+ class ding.league.player.Player(cfg: easydict.EasyDict, category: str, init_payoff: BattleSharedPayoff, checkpoint_path: str, player_id: str, total_agent_step: int, rating: PlayerRating)  [source]
  Overview:
      Base player class, player is the basic member of a league
......@@ -210,7 +210,7 @@
  ding.league.player.Player.__init__
- __init__(cfg: easydict.EasyDict, category: str, init_payoff: BattleSharedPayoff, checkpoint_path: str, player_id: str, total_agent_step: int) -> None  [source]
+ __init__(cfg: easydict.EasyDict, category: str, init_payoff: BattleSharedPayoff, checkpoint_path: str, player_id: str, total_agent_step: int, rating: PlayerRating) -> None  [source]
  Overview:
      Initialize base player metadata
......@@ -221,6 +221,7 @@
      - checkpoint_path (str): The path to load player checkpoint.
      - player_id (str): Player id in string format.
      - total_agent_step (int): For active player, it should be 0; For historical player, it should be parent player's ``_total_agent_step`` when ``snapshot``.
+     - rating (PlayerRating): player rating information in total league
......@@ -387,10 +388,14 @@ If yes, set self._l
  ding.league.player.NaiveSpPlayer.snapshot
- snapshot() -> ding.league.player.HistoricalPlayer
+ snapshot(metric_env: LeagueMetricEnv) -> ding.league.player.HistoricalPlayer
  Overview:
      Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+ Argument:
+     - metric_env (LeagueMetricEnv): player rating environment, one league one env
  Returns:
      - snapshot_player (HistoricalPlayer): new instantiated historical player
......@@ -487,10 +492,14 @@ If yes, set self._l
  ding.league.starcraft_player.MainPlayer.snapshot
- snapshot() -> ding.league.player.HistoricalPlayer
+ snapshot(metric_env: LeagueMetricEnv) -> ding.league.player.HistoricalPlayer
  Overview:
      Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+ Argument:
+     - metric_env (LeagueMetricEnv): player rating environment, one league one env
  Returns:
      - snapshot_player (HistoricalPlayer): new instantiated historical player
......@@ -575,10 +584,14 @@ If yes, set self._l
  ding.league.starcraft_player.MainExploiter.snapshot
- snapshot() -> ding.league.player.HistoricalPlayer
+ snapshot(metric_env: LeagueMetricEnv) -> ding.league.player.HistoricalPlayer
  Overview:
      Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+ Argument:
+     - metric_env (LeagueMetricEnv): player rating environment, one league one env
  Returns:
      - snapshot_player (HistoricalPlayer): new instantiated historical player
......@@ -662,10 +675,14 @@ If yes, set self._l
  ding.league.starcraft_player.LeagueExploiter.snapshot
- snapshot() -> ding.league.player.HistoricalPlayer
+ snapshot(metric_env: LeagueMetricEnv) -> ding.league.player.HistoricalPlayer
  Overview:
      Generate a snapshot historical player from the current player, called in league's ``_snapshot``.
+ Argument:
+     - metric_env (LeagueMetricEnv): player rating environment, one league one env
  Returns:
      - snapshot_player (HistoricalPlayer): new instantiated historical player
......
......@@ -823,6 +823,8 @@
  A2CPolicy (class in ding.policy.a2c)
+ ACERPolicy (class in ding.policy.acer)
  acquire_space() (ding.utils.default_helper.LimitedSpaceContainer method)
......@@ -835,11 +837,11 @@
  add_one_time_step() (in module ding.utils.log_helper.DistributionTimeImage)
  add_player() (ding.league.shared_payoff.BattleSharedPayoff method)
- add_used_data() (ding.worker.replay_buffer.utils.UsedDataRemover method)
  (second index column)
+ add_used_data() (ding.worker.replay_buffer.utils.UsedDataRemover method)
  Adder (class in ding.rl_utils.adder)
  AdvancedReplayBuffer (class in ding.worker.replay_buffer.advanced_buffer)
......
This diff is collapsed.
......@@ -415,7 +415,7 @@ synchronize after each iteration.
  need all training samples (sequences of training data) to have the same length. This is done in ``policy._get_train_sample``.
  Once we execute this function in the collector, the length of the samples will equal the ``unroll_len`` in the config. For details, please
  refer to the doc of ``ding.rl_utils.adder``.
- <div class="highlight-python notranslate">
+ <div class="highlight-python notranslate" id="ref2other">
      def _get_train_sample(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
          return get_train_sample(data, self._unroll_len)

      def get_train_sample(cls, data: List[Dict[str, Any]], unroll_len: int, last_fn_type: str = 'last') -> List[Dict[str, Any]]:
......
This file type cannot be previewed.
This diff is collapsed.