From 00f3b76e67f1cdfd969e1af982393275ea0b2997 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Fri, 1 Feb 2019 00:13:21 +0800 Subject: [PATCH] Speed up Transformer inference (#1476) * Add py-reader and parallel-executor support in Transformer inference * Add statick k, v cache for encoder output in Transformer inference * Replace the cache from compute_qkv with cahce from split_heads in Transformer inference * Fuse k, q, v projection in Transformer * Revert the fused k, q, v projection in Transformer to be compatible with saved models * Use gather_op to replace sequence_expand_op in Transformer inference * Add fluid_transformer.md * Refine README for released models and data in Transformer * Refine README for released models and data in Transformer --- .../transformer/README_cn.md | 45 ++- .../transformer/config.py | 6 +- .../transformer/images/attention_formula.png | Bin 0 -> 24299 bytes .../transformer/infer.py | 256 +++++++++----- .../transformer/model.py | 323 ++++++++++++------ .../transformer/profile.py | 2 +- .../transformer/train.py | 27 +- 7 files changed, 447 insertions(+), 212 deletions(-) create mode 100644 fluid/PaddleNLP/neural_machine_translation/transformer/images/attention_formula.png diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/README_cn.md b/fluid/PaddleNLP/neural_machine_translation/transformer/README_cn.md index 7e7a09e7..bdac7cb0 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/README_cn.md +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/README_cn.md @@ -69,9 +69,9 @@ WMT 数据集是机器翻译领域公认的主流数据集,[WMT'16 EN-DE 数 └── subword-nmt # BPE 编码的代码 ``` -`gen_data/wmt16_ende_data_bpe` 中是我们最终使用的英德翻译数据,其中 `train.tok.clean.bpe.32000.en-de` 为训练数据,`newstest2016.tok.bpe.32000.en-de` 等为验证和测试数据,。`vocab_all.bpe.32000` 为相应的词典文件(已加入 `` 、`` 和 `` 这三个特殊符号,源语言和目标语言共享该词典文件)。 +`gen_data/wmt16_ende_data_bpe` 中是我们最终使用的英德翻译数据,其中 `train.tok.clean.bpe.32000.en-de` 为训练数据,`newstest2016.tok.bpe.32000.en-de` 等为验证和测试数据,`vocab_all.bpe.32000` 为相应的词典文件(已加入 `` 、`` 和 `` 这三个特殊符号,源语言和目标语言共享该词典文件)。另外我们也整理提供了一份处理好的 WMT'16 EN-DE 数据以供[下载](https://transformer-res.bj.bcebos.com/wmt16_ende_data_bpe_clean.tar.gz)使用(包含训练所需 BPE 数据和词典以及预测和评估所需的 BPE 数据和 tokenize 的数据)。 -对于其他自定义数据,转换为类似 `train.tok.clean.bpe.32000.en-de` 的数据格式(`\t` 分隔的源语言和目标语言句子对,句子中的 token 之间使用空格分隔)即可;如需使用 BPE 编码,可参考,亦可以使用类似 WMT,使用 `gen_data.sh` 进行处理。 +对于其他自定义数据,转换为类似 `train.tok.clean.bpe.32000.en-de` 的数据格式(`\t` 分隔的源语言和目标语言句子对,句子中的 token 之间使用空格分隔)即可;如需使用 BPE 编码,亦可以使用类似 WMT'16 EN-DE 原始数据的格式,参照 `gen_data.sh` 进行处理。 ### 模型训练 @@ -110,11 +110,9 @@ python -u train.py \ --batch_size 3200 \ --sort_type pool \ --pool_size 200000 \ - n_layer 6 \ n_head 16 \ d_model 1024 \ d_inner_hid 4096 \ - n_head 16 \ prepostprocess_dropout 0.3 ``` 有关这些参数更详细信息的请参考 `config.py` 中的注释说明。 @@ -144,30 +142,53 @@ python -u infer.py \ --token_delimiter ' ' \ --batch_size 32 \ model_path trained_models/iter_100000.infer.model \ - beam_size 4 \ + beam_size 5 \ max_out_len 255 ``` -和模型训练时类似,预测时也需要设置数据和 reader 相关的参数,并可以执行 `python infer.py --help` 查看这些参数的说明(部分参数意义和训练时略有不同);同样可以在预测命令中设置模型超参数,但应与模型训练时的设置一致;此外相比于模型训练,预测时还有一些额外的参数,如需要设置 `model_path` 来给出模型所在目录,可以设置 `beam_size` 和 `max_out_len` 来指定 Beam Search 算法的搜索宽度和最大深度(翻译长度),这些参数也可以在 `config.py` 中的 `InferTaskConfig` 内查阅注释说明并进行更改设置。 +和模型训练时类似,预测时也需要设置数据和 reader 相关的参数,并可以执行 `python infer.py --help` 查看这些参数的说明(部分参数意义和训练时略有不同);同样可以在预测命令中设置模型超参数,但应与模型训练时的设置一致,如训练时使用 big model 的参数设置,则预测时对应类似如下命令: +```sh +python -u infer.py \ + --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + 
--trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \ + --special_token '' '' '' \ + --test_file_pattern gen_data/wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de \ + --token_delimiter ' ' \ + --batch_size 32 \ + model_path trained_models/iter_100000.infer.model \ + n_head 16 \ + d_model 1024 \ + d_inner_hid 4096 \ + prepostprocess_dropout 0.3 \ + beam_size 5 \ + max_out_len 255 +``` +此外相比于模型训练,预测时还有一些额外的参数,如需要设置 `model_path` 来给出模型所在目录,可以设置 `beam_size` 和 `max_out_len` 来指定 Beam Search 算法的搜索宽度和最大深度(翻译长度),这些参数也可以在 `config.py` 中的 `InferTaskConfig` 内查阅注释说明并进行更改设置。 执行以上预测命令会打印翻译结果到标准输出,每行输出是对应行输入的得分最高的翻译。对于使用 BPE 的英德数据,预测出的翻译结果也将是 BPE 表示的数据,要还原成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估,可以使用以下命令来恢复 `predict.txt` 内的翻译结果到 `predict.tok.txt` 中(无需再次 tokenize 处理): ```sh sed -r 's/(@@ )|(@@ ?$)//g' predict.txt > predict.tok.txt ``` -接下来就可以使用参考翻译对翻译结果进行 BLEU 指标的评估了。以英德翻译 `newstest2016.tok.de` 数据为例,执行如下命令: +接下来就可以使用参考翻译对翻译结果进行 BLEU 指标的评估了,评估需要用到 mosesdecoder 中的脚本,可以通过以下命令获取: +```sh +git clone https://github.com/moses-smt/mosesdecoder.git +``` +以英德翻译 `newstest2014.tok.de` 数据为例,获取 mosesdecoder 后使用 `multi-bleu.perl` 执行如下命令进行翻译结果评估: ```sh -perl gen_data/mosesdecoder/scripts/generic/multi-bleu.perl gen_data/wmt16_ende_data/newstest2016.tok.de < predict.tok.txt +perl gen_data/mosesdecoder/scripts/generic/multi-bleu.perl gen_data/wmt16_ende_data/newstest2014.tok.de < predict.tok.txt ``` -可以看到类似如下的结果(为单机两卡训练 200K 个 iteration 后模型的预测结果)。 +可以看到类似如下的结果: ``` -BLEU = 33.08, 64.2/39.2/26.4/18.5 (BP=0.994, ratio=0.994, hyp_len=61971, ref_len=62362) +BLEU = 26.35, 57.7/32.1/20.0/13.0 (BP=1.000, ratio=1.013, hyp_len=63903, ref_len=63078) ``` -目前在未使用 model average 的情况下,英德翻译 base model 八卡训练 100K 个 iteration 后测试 BLEU 值如下: +目前在未使用 model average 的情况下,英德翻译 base model 和 big model 八卡训练 100K 个 iteration 后测试 BLEU 值如下: | 测试集 | newstest2014 | newstest2015 | newstest2016 | |-|-|-|-| -| BLEU | 26.25 | 29.15 | 33.64 | +| Base | 26.35 | 29.07 | 33.30 | +| Big | 27.07 | 30.09 | 34.38 | +我们这里也提供了以上 [base model](https://transformer-res.bj.bcebos.com/base_model.tar.gz) 和 [big model](https://transformer-res.bj.bcebos.com/big_model.tar.gz) 模型的下载以供使用。 ### 分布式训练 diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/config.py b/fluid/PaddleNLP/neural_machine_translation/transformer/config.py index ca119aa6..823341ed 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/config.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/config.py @@ -164,7 +164,10 @@ input_descs = { # [batch_size * max_trg_len_in_batch, 1] "lbl_weight": [(batch_size * seq_len, 1), "float32"], # This input is used in beam-search decoder. - "init_score": [(batch_size, 1), "float32"], + "init_score": [(batch_size, 1), "float32", 2], + # This input is used in beam-search decoder for the first gather + # (cell states updation) + "init_idx": [(batch_size, ), "int32"], } # Names of word embedding table which might be reused for weight sharing. 
@@ -194,4 +197,5 @@ label_data_input_fields = (
 fast_decoder_data_input_fields = (
     "trg_word",
     "init_score",
+    "init_idx",
     "trg_src_attn_bias", )
diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/images/attention_formula.png b/fluid/PaddleNLP/neural_machine_translation/transformer/images/attention_formula.png
new file mode 100644
index 0000000000000000000000000000000000000000..249857f524b4137bafc2d4d1b779ed62d1437b6d
GIT binary patch
literal 24299
[binary PNG payload of images/attention_formula.png omitted]
zIl~fVh4den1^aM2{XB{=d&9WWk2SQ^KSkd}3$NDx^)um?$(r2~r2!dWg?ATE#nlbY zQ?;FGk}$KUhGIX2n~Z<6(EH|u@t~4MIv0n0>pB}!Cf@5|7C(@))+ zAG@EfrStRT^11z4bKBklU0V~cuIX9(RWunNE=@aPhA1|+u{7vTDC~QPMrOn8$+Cms%EY10W4HWk=`!aI2DG6760@}49b=- zX@mqUUM76(2`4N^(@@r4yAij8;mS7!bCXP#)Hyv>9Pz`o3ehAS4E;pw$3Q*^?YHk# z0ugem1lXime!J4$scen7SRPo~E ze!>Ar6Ta7s&usY-d^?l8Nev*veSWs%A>l+1;p)5Eb74MUt@_@sDso57WIwEQPn5y7 z!c!<9kiFT6Qs2#Rm~XMeH<(&hN!fa$ArT(X5$Videt4ZtwiK!z)^Km~Vln60 z%YboQ)yv?wto#Q+7RPkHk830l)B?240uoet-8h#qjk7x+=NTpn@6WaNDYIe%l+PQ< ztPr?G;-t8>TTh`#Gdzz>bzX45w=?$Ygso@>fRr)^P+3kv{iT^4K)xgOsW{4_k3^7! z@elu(h=)~&zKQv6;hWlz4gTa0^v?D6)WJgtOs7>*W?4Cd#w;{-PAJf8YC14X)$(|+ z_wsZWmN?B$zU{BT{rM|%^BMazXPUNab>XT|n=%d{4=~`QK9rtv`^s=7pR6dlq zCMOpQ!(-jwokOF9PxZflFk3ycse>S1o{lu9OQ?s0iYC)>zg)!bw-eJnwu>p&uhd(> z2X8pwQ$5+gW*=>IU> ze53liDl;Z>61G7NS3?rzp$(I7FhY;{FNy_i>E3gT0fo-9;VC=_pvR@^_v&MFg6}X4 zuCq&0V3D_H(iPu;gL>xxp#&Nt#3Dfa#{^)&V&)-hJmnz%Rzb7*!-(>L6}6Jl)}T;P zqx3Mbvo3mV0#?!JMm;y$IqGfMR}p9Ih-RZh{B_lCitX!cuDag7Gj>8XuY5*vl#Ft) z!!4Z%0tL0n+~8Jv=PRxF`!@RVRJ8tb7e+Fbbi?b<|IzzF2`#^`%l7=>1;61vOemCH8+pad$tibsU8v_b zxe#{@fMIwtz8Zvcrbpu-UT_+B~H0ZsO!>Yj|{iniAY%%rEJkv)q)c2@V z`JTEEXrH`Mr7wv^;`OPL*|acue>=U}XoK`NXH^5>uA5JdZzwK@)(bP#u4kxcV400QnMhm+yN#)i;(Y|)dyh*DgKR2fLJ#y=xm z1reeHVPtucFZm}B5dZzhzL!BI68~sz3vbGXdnYs$lXXm@Ci4C5W-lY*qXu#M@G zaJ{13Z4Gl*7WNA6&5Q@OoR&1@Q---)&=c4nws<`8KsPbEKVBuLo-Uj_V;`Nj5Pn%j zbdkcIw{iYx$_~JW6|L>Wv_56w^s!oPW>4UO#;V|t``_dAl}r$0g<8qwKQx_gcVl7} z7KheH$kz2e86P_f*1j#H1+g8+vGDk87cnRa`_mOQYAY}Z7?gYb<8DioUUcN zsO^K~Y;O5XQwF7%XJVa%B`+V_8E}O2CDp0~@?%EPrMK^!9$M#Z@s3k)8WbZ$UA+qT z0`;_4+JKbjxZF%LRhi>Ta<@?s&F=Q3vbnSTRYTAQ0QZy3+_%X5__|GN;uLMtrNf0e zFIH!HW^3p2^aXL}ds;^s7rjHLWG5Uyd2r%N$7atjX*bIpn|_!$!W^q;R7LiAHw9^V z^4RdpQqLG+2d*`Svi)XPv2NXJ;PPm`P^v!8o%UpsO(XR;NgEE79RHsb*H>%`<@O0L zi<{rL*xjAqfs6}z8HKIZldm`BWVdTMp87(g-!vKv_-H?*z3J5Uysjk~Q|D+;z>cax zINSH-?h(y{M*{>qEQ&_&tgHUbyb{2C?l`pJbIg^~T6g*&eJ`mM)HIS*-x^(Hks~Qc zS19)6qf}|tSDxR?;yAB$1|7>TgjGImeK2?@4iFDrL(47Lk5E(MZ{kXcI=d?~gm^rR zvOZ%Q2duhBMj=&Nui3j3xp7)9D(@&2UD3hv;PE7hB6%~7_=bE9f#4+;nm3VXp1WFS z{wMs8ayLn1xgfn9Ey_eKj638Gm*b^FQz_``uMFSDnm78z&}()@?%-D81bm=@Ohgpg ziu^FJ5?4t2FfNZyojn_G_g^OsaZCx!>fgd<`wTK(yPXjwZY5M_nuu3$?1+RvewLzr zj*ZKa%~nhHs(fH)cy~;6&#t+sR`;U+Y-S2=VrJox@Vxx*fd#_*SFunN0_#q?4Kd$I zXQPVWoj5*wR>Q$o>$F4~Hr)L-WUNSz7BvWa%`7qn@Np|lEH$&Q=Yj1qETA>Wocb9t z!=K~tNzfWFy3&RTGbenk&Uxn51Z~6`2{2=-=w`sL1QyMwD3bfi)e*CHUhhNbLSxY zm^t3J*C(|9o5?*)w3AAR2AUW7Pd7Ld7xcaDg;A4M)Soo`>n_nNj&_5iP>d0}C9~1) zIDuWiv+sqWDe`CTf7b=Ni`ExQW|{T>99SHakqK=#vWWfS=&u3B?}sMb-@2Uf_{-ft zy8N&0P9hrbCyOHk4c{{EzsKe#KzqXdH|Rgx=f7|IzuV(?MWPGw^2P%LMQ$s%dW3eT PyrUqaB3&eD^7elKWhPFp literal 0 HcmV?d00001 diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/infer.py b/fluid/PaddleNLP/neural_machine_translation/transformer/infer.py index 6fc04a94..57ea546a 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/infer.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/infer.py @@ -1,18 +1,20 @@ import argparse import ast +import multiprocessing import numpy as np +import os from functools import partial import paddle import paddle.fluid as fluid import model +import reader +from config import * from model import wrap_encoder as encoder from model import wrap_decoder as decoder from model import fast_decode as fast_decoder -from config import * -from train import pad_batch_data -import reader +from train import pad_batch_data, prepare_data_generator def parse_args(): @@ -54,6 +56,21 @@ def parse_args(): default=" ", help="The delimiter used to split tokens in source or target sentences. 
" "For EN-DE BPE data we provided, use spaces as token delimiter. ") + parser.add_argument( + "--use_mem_opt", + type=ast.literal_eval, + default=True, + help="The flag indicating whether to use memory optimization.") + parser.add_argument( + "--use_py_reader", + type=ast.literal_eval, + default=True, + help="The flag indicating whether to use py_reader.") + parser.add_argument( + "--use_parallel_exe", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use ParallelExecutor.") parser.add_argument( 'opts', help='See config.py for all options', @@ -123,106 +140,185 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, bos_idx, n_head, trg_word, dtype="float32").reshape(-1, 1), place, [range(trg_word.shape[0] + 1)] * 2) trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2) + init_idx = np.asarray(range(len(insts)), dtype="int32") data_input_dict = dict( zip(data_input_names, [ src_word, src_pos, src_slf_attn_bias, trg_word, init_score, - trg_src_attn_bias + init_idx, trg_src_attn_bias ])) + return data_input_dict + + +def prepare_feed_dict_list(data_generator, count, place): + """ + Prepare the list of feed dict for multi-devices. + """ + feed_dict_list = [] + if data_generator is not None: # use_py_reader == False + data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields + data = next(data_generator) + for idx, data_buffer in enumerate(data): + data_input_dict = prepare_batch_input( + data_buffer, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.bos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model, place) + feed_dict_list.append(data_input_dict) + return feed_dict_list if len(feed_dict_list) == count else None + + +def py_reader_provider_wrapper(data_reader, place): + """ + Data provider needed by fluid.layers.py_reader. + """ - input_dict = dict(data_input_dict.items()) - return input_dict + def py_reader_provider(): + data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields + for batch_id, data in enumerate(data_reader()): + data_input_dict = prepare_batch_input( + data, data_input_names, ModelHyperParams.eos_idx, + ModelHyperParams.bos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model, place) + yield [data_input_dict[item] for item in data_input_names] + return py_reader_provider -def fast_infer(test_data, trg_idx2word): + +def fast_infer(args): """ Inference by beam search decoder based solely on Fluid operators. """ - place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) + out_ids, out_scores, pyreader = fast_decoder( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, + ModelHyperParams.n_head, + ModelHyperParams.d_key, + ModelHyperParams.d_value, + ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, + ModelHyperParams.prepostprocess_dropout, + ModelHyperParams.attention_dropout, + ModelHyperParams.relu_dropout, + ModelHyperParams.preprocess_cmd, + ModelHyperParams.postprocess_cmd, + ModelHyperParams.weight_sharing, + InferTaskConfig.beam_size, + InferTaskConfig.max_out_len, + ModelHyperParams.eos_idx, + use_py_reader=args.use_py_reader) + + # This is used here to set dropout to the test mode. 
+ infer_program = fluid.default_main_program().clone(for_test=True) - out_ids, out_scores = fast_decoder( - ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout, - ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, - ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd, - ModelHyperParams.weight_sharing, InferTaskConfig.beam_size, - InferTaskConfig.max_out_len, ModelHyperParams.eos_idx) + if args.use_mem_opt: + fluid.memory_optimize(infer_program) + + if InferTaskConfig.use_gpu: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) fluid.io.load_vars( exe, InferTaskConfig.model_path, vars=[ - var for var in fluid.default_main_program().list_vars() + var for var in infer_program.list_vars() if isinstance(var, fluid.framework.Parameter) ]) - # This is used here to set dropout to the test mode. - infer_program = fluid.default_main_program().clone(for_test=True) + exec_strategy = fluid.ExecutionStrategy() + # For faster executor + exec_strategy.use_experimental_executor = True + exec_strategy.num_threads = 1 + build_strategy = fluid.BuildStrategy() + infer_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=infer_program, + build_strategy=build_strategy, + exec_strategy=exec_strategy) - for batch_id, data in enumerate(test_data.batch_generator()): - data_input = prepare_batch_input( - data, encoder_data_input_fields + fast_decoder_data_input_fields, - ModelHyperParams.eos_idx, ModelHyperParams.bos_idx, - ModelHyperParams.n_head, ModelHyperParams.d_model, place) - seq_ids, seq_scores = exe.run(infer_program, - feed=data_input, - fetch_list=[out_ids, out_scores], - return_numpy=False) - # How to parse the results: - # Suppose the lod of seq_ids is: - # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] - # then from lod[0]: - # there are 2 source sentences, beam width is 3. 
- # from lod[1]: - # the first source sentence has 3 hyps; the lengths are 12, 12, 16 - # the second source sentence has 3 hyps; the lengths are 14, 13, 15 - hyps = [[] for i in range(len(data))] - scores = [[] for i in range(len(data))] - for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence - start = seq_ids.lod()[0][i] - end = seq_ids.lod()[0][i + 1] - for j in range(end - start): # for each candidate - sub_start = seq_ids.lod()[1][start + j] - sub_end = seq_ids.lod()[1][start + j + 1] - hyps[i].append(" ".join([ - trg_idx2word[idx] - for idx in post_process_seq( - np.array(seq_ids)[sub_start:sub_end]) - ])) - scores[i].append(np.array(seq_scores)[sub_end - 1]) - print(hyps[i][-1]) - if len(hyps[i]) >= InferTaskConfig.n_best: - break - - -def infer(args, inferencer=fast_infer): - place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() - test_data = reader.DataReader( - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - fpattern=args.test_file_pattern, - token_delimiter=args.token_delimiter, - use_token_batch=False, - batch_size=args.batch_size, - pool_size=args.pool_size, - sort_type=reader.SortType.NONE, - shuffle=False, - shuffle_batch=False, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - # count start and end tokens out - max_length=ModelHyperParams.max_length - 2, - clip_last_batch=False) - trg_idx2word = test_data.load_dict( + # data reader settings for inference + args.train_file_pattern = args.test_file_pattern + args.use_token_batch = False + args.sort_type = reader.SortType.NONE + args.shuffle = False + args.shuffle_batch = False + test_data = prepare_data_generator( + args, + is_test=False, + count=dev_count, + pyreader=pyreader, + py_reader_provider_wrapper=py_reader_provider_wrapper, + place=place) + if args.use_py_reader: + pyreader.start() + data_generator = None + else: + data_generator = test_data() + trg_idx2word = reader.DataReader.load_dict( dict_path=args.trg_vocab_fpath, reverse=True) - inferencer(test_data, trg_idx2word) + + while True: + try: + feed_dict_list = prepare_feed_dict_list(data_generator, dev_count, + place) + if args.use_parallel_exe: + seq_ids, seq_scores = infer_exe.run( + fetch_list=[out_ids.name, out_scores.name], + feed=feed_dict_list, + return_numpy=False) + else: + seq_ids, seq_scores = exe.run( + program=infer_program, + fetch_list=[out_ids.name, out_scores.name], + feed=feed_dict_list[0] + if feed_dict_list is not None else None, + return_numpy=False, + use_program_cache=True) + seq_ids_list, seq_scores_list = [seq_ids], [ + seq_scores + ] if isinstance( + seq_ids, paddle.fluid.core.LoDTensor) else (seq_ids, seq_scores) + for seq_ids, seq_scores in zip(seq_ids_list, seq_scores_list): + # How to parse the results: + # Suppose the lod of seq_ids is: + # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] + # then from lod[0]: + # there are 2 source sentences, beam width is 3. 
+ # from lod[1]: + # the first source sentence has 3 hyps; the lengths are 12, 12, 16 + # the second source sentence has 3 hyps; the lengths are 14, 13, 15 + hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)] + scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)] + for i in range(len(seq_ids.lod()[0]) - + 1): # for each source sentence + start = seq_ids.lod()[0][i] + end = seq_ids.lod()[0][i + 1] + for j in range(end - start): # for each candidate + sub_start = seq_ids.lod()[1][start + j] + sub_end = seq_ids.lod()[1][start + j + 1] + hyps[i].append(" ".join([ + trg_idx2word[idx] + for idx in post_process_seq( + np.array(seq_ids)[sub_start:sub_end]) + ])) + scores[i].append(np.array(seq_scores)[sub_end - 1]) + print(hyps[i][-1]) + if len(hyps[i]) >= InferTaskConfig.n_best: + break + except (StopIteration, fluid.core.EOFException): + # The data pass is over. + if args.use_py_reader: + pyreader.reset() + break if __name__ == "__main__": args = parse_args() - infer(args) + fast_infer(args) diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py index 1e510bc6..bf68e089 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/model.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/model.py @@ -7,6 +7,43 @@ import paddle.fluid.layers as layers from config import * +def wrap_layer_with_block(layer, block_idx): + """ + Make layer define support indicating block, by which we can add layers + to other blocks within current block. This will make it easy to define + cache among while loop. + """ + + class BlockGuard(object): + """ + BlockGuard class. + + BlockGuard class is used to switch to the given block in a program by + using the Python `with` keyword. + """ + + def __init__(self, block_idx=None, main_program=None): + self.main_program = fluid.default_main_program( + ) if main_program is None else main_program + self.old_block_idx = self.main_program.current_block().idx + self.new_block_idx = block_idx + + def __enter__(self): + self.main_program.current_block_idx = self.new_block_idx + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.current_block_idx = self.old_block_idx + if exc_type is not None: + return False # re-raise exception + return True + + def layer_wrapper(*args, **kwargs): + with BlockGuard(block_idx): + return layer(*args, **kwargs) + + return layer_wrapper + + def position_encoding_init(n_position, d_pos_vec): """ Generate the initial values for the sinusoid position encoding table. @@ -35,7 +72,9 @@ def multi_head_attention(queries, d_model, n_head=1, dropout_rate=0., - cache=None): + cache=None, + gather_idx=None, + static_kv=False): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activiation to mask certain selected positions so that @@ -56,42 +95,86 @@ def multi_head_attention(queries, size=d_key * n_head, bias_attr=False, num_flatten_dims=2) - k = layers.fc(input=keys, - size=d_key * n_head, - bias_attr=False, - num_flatten_dims=2) - v = layers.fc(input=values, - size=d_value * n_head, - bias_attr=False, - num_flatten_dims=2) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. 
+ fc_layer = wrap_layer_with_block( + layers.fc, fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.fc + k = fc_layer( + input=keys, + size=d_key * n_head, + bias_attr=False, + num_flatten_dims=2) + v = fc_layer( + input=values, + size=d_value * n_head, + bias_attr=False, + num_flatten_dims=2) return q, k, v - def __split_heads(x, n_head): + def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value): """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + Reshape input tensors at the last dimension to split multi-heads + and then transpose. Specifically, transform the input tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] to the output tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. """ - if n_head == 1: - return x - - hidden_size = x.shape[-1] # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) - + reshaped_q = layers.reshape( + x=queries, shape=[0, 0, n_head, d_key], inplace=True) # permuate the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + # For encoder-decoder attention in inference, insert the ops and vars + # into global block to use as cache among beam search. + reshape_layer = wrap_layer_with_block( + layers.reshape, + fluid.default_main_program().current_block() + .parent_idx) if cache is not None and static_kv else layers.reshape + transpose_layer = wrap_layer_with_block( + layers.transpose, + fluid.default_main_program().current_block(). + parent_idx) if cache is not None and static_kv else layers.transpose + reshaped_k = reshape_layer( + x=keys, shape=[0, 0, n_head, d_key], inplace=True) + k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3]) + reshaped_v = reshape_layer( + x=values, shape=[0, 0, n_head, d_value], inplace=True) + v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3]) + + if cache is not None: # only for faster inference + if static_kv: # For encoder-decoder attention in inference + cache_k, cache_v = cache["static_k"], cache["static_v"] + # To init the static_k and static_v in cache. + # Maybe we can use condition_op(if_else) to do these at the first + # step in while loop to replace these, however it might be less + # efficient. + static_cache_init = wrap_layer_with_block( + layers.assign, + fluid.default_main_program().current_block().parent_idx) + static_cache_init(k, cache_k) + static_cache_init(v, cache_v) + else: # For decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + # gather cell states corresponding to selected parent + select_k = layers.gather(cache_k, index=gather_idx) + select_v = layers.gather(cache_v, index=gather_idx) + if not static_kv: + # For self attention in inference, use cache and concat time steps. 
+ select_k = layers.concat([select_k, k], axis=2) + select_v = layers.concat([select_v, v], axis=2) + # update cell states(caches) cached in global block + layers.assign(select_k, cache_k) + layers.assign(select_v, cache_v) + return q, select_k, select_v + return q, k, v def __combine_heads(x): """ Transpose and then reshape the last two dimensions of inpunt tensor x so that it becomes one dimension, which is reverse to __split_heads. """ - if len(x.shape) == 3: return x if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") @@ -107,8 +190,7 @@ def multi_head_attention(queries, """ Scaled Dot-Product Attention """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5) if attn_bias: product += attn_bias weights = layers.softmax(product) @@ -122,23 +204,7 @@ def multi_head_attention(queries, return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_key * n_head]), k], - axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_value * n_head]), v], - axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) + q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate) @@ -327,7 +393,8 @@ def decoder_layer(dec_input, relu_dropout, preprocess_cmd, postprocess_cmd, - cache=None): + cache=None, + gather_idx=None): """ The layer to be stacked in decoder part. The structure of this module is similar to that in the encoder part except a multi-head attention is added to implement encoder-decoder attention. @@ -342,7 +409,8 @@ def decoder_layer(dec_input, d_model, n_head, attention_dropout, - cache, ) + cache=cache, + gather_idx=gather_idx) slf_attn_output = post_process_layer( dec_input, slf_attn_output, @@ -358,7 +426,10 @@ def decoder_layer(dec_input, d_value, d_model, n_head, - attention_dropout, ) + attention_dropout, + cache=cache, + gather_idx=gather_idx, + static_kv=True) enc_attn_output = post_process_layer( slf_attn_output, enc_attn_output, @@ -393,7 +464,8 @@ def decoder(dec_input, relu_dropout, preprocess_cmd, postprocess_cmd, - caches=None): + caches=None, + gather_idx=None): """ The decoder is composed of a stack of identical decoder_layer layers. """ @@ -413,7 +485,8 @@ def decoder(dec_input, relu_dropout, preprocess_cmd, postprocess_cmd, - cache=None if caches is None else caches[i]) + cache=None if caches is None else caches[i], + gather_idx=gather_idx) dec_input = dec_output dec_output = pre_process_layer(dec_output, preprocess_cmd, prepostprocess_dropout) @@ -610,7 +683,8 @@ def wrap_decoder(trg_vocab_size, weight_sharing, dec_inputs=None, enc_output=None, - caches=None): + caches=None, + gather_idx=None): """ The wrapper assembles together all needed layers for the decoder. 
""" @@ -646,7 +720,8 @@ def wrap_decoder(trg_vocab_size, relu_dropout, preprocess_cmd, postprocess_cmd, - caches=caches) + caches=caches, + gather_idx=gather_idx) # Reshape to 2D tensor to use GEMM instead of BatchedGEMM dec_output = layers.reshape( dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) @@ -666,9 +741,43 @@ def wrap_decoder(trg_vocab_size, return predict -def fast_decode( +def fast_decode(src_vocab_size, + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + beam_size, + max_out_len, + eos_idx, + use_py_reader=False): + """ + Use beam search to decode. Caches will be used to store states of history + steps which can make the decoding faster. + """ + data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields + + if use_py_reader: + all_inputs, reader = make_all_py_reader_inputs(data_input_names) + else: + all_inputs = make_all_inputs(data_input_names) + + enc_inputs_len = len(encoder_data_input_fields) + dec_inputs_len = len(fast_decoder_data_input_fields) + enc_inputs = all_inputs[0:enc_inputs_len] + dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] + + enc_output = wrap_encoder( src_vocab_size, - trg_vocab_size, max_in_len, n_layer, n_head, @@ -682,64 +791,60 @@ def fast_decode( preprocess_cmd, postprocess_cmd, weight_sharing, - beam_size, - max_out_len, - eos_idx, ): - """ - Use beam search to decode. Caches will be used to store states of history - steps which can make the decoding faster. - """ - enc_output = wrap_encoder( - src_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, - d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, weight_sharing) - start_tokens, init_scores, trg_src_attn_bias = make_all_inputs( - fast_decoder_data_input_fields) + enc_inputs, ) + start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs def beam_search(): max_len = layers.fill_constant( - shape=[1], dtype=start_tokens.dtype, value=max_out_len) + shape=[1], + dtype=start_tokens.dtype, + value=max_out_len, + force_cpu=True) step_idx = layers.fill_constant( - shape=[1], dtype=start_tokens.dtype, value=0) - cond = layers.less_than(x=step_idx, y=max_len) + shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) + cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write( layers.reshape(start_tokens, (-1, 1)), step_idx) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. - # caches contains states of history steps to reduce redundant - # computation in decoder. - caches = [{ - "k": layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, 0, d_model], - dtype=enc_output.dtype, - value=0), - "v": layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, 0, d_model], - dtype=enc_output.dtype, - value=0) - } for i in range(n_layer)] + # caches contains states of history steps in decoder self-attention + # and static encoder output projections in encoder-decoder attention + # to reduce redundant computation. 
+ caches = [ + { + "k": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_key], + dtype=enc_output.dtype, + value=0), + "v": # for self attention + layers.fill_constant_batch_size_like( + input=start_tokens, + shape=[-1, n_head, 0, d_value], + dtype=enc_output.dtype, + value=0), + "static_k": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype), + "static_v": # for encoder-decoder attention + layers.create_tensor(dtype=enc_output.dtype) + } for i in range(n_layer) + ] + with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) - pre_ids = layers.reshape(pre_ids, (-1, 1, 1)) + # Since beam_search_op dosen't enforce pre_ids' shape, we can do + # inplace reshape here which actually change the shape of pre_ids. + pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) - # sequence_expand can gather sequences according to lod thus can be - # used in beam search to sift states corresponding to selected ids. - pre_src_attn_bias = layers.sequence_expand( - x=trg_src_attn_bias, y=pre_scores) - pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores) - pre_caches = [{ - "k": layers.sequence_expand( - x=cache["k"], y=pre_scores), - "v": layers.sequence_expand( - x=cache["v"], y=pre_scores), - } for cache in caches] + # gather cell states corresponding to selected parent + pre_src_attn_bias = layers.gather( + trg_src_attn_bias, index=parent_idx) pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( - input=pre_enc_output, # cann't use pre_ids here since it has lod + input=pre_src_attn_bias, # cann't use lod tensor here value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), @@ -761,35 +866,33 @@ def fast_decode( postprocess_cmd, weight_sharing, dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), - enc_output=pre_enc_output, - caches=pre_caches) - + enc_output=enc_output, + caches=caches, + gather_idx=parent_idx) + # intra-beam topK topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) accu_scores = layers.elementwise_add( - x=layers.log(topk_scores), - y=layers.reshape( - pre_scores, shape=[-1]), - axis=0) - # beam_search op uses lod to distinguish branches. + x=layers.log(topk_scores), y=pre_scores, axis=0) + # beam_search op uses lod to differentiate branches. topk_indices = layers.lod_reset(topk_indices, pre_ids) - selected_ids, selected_scores = layers.beam_search( + # topK reduction across beams, also contain special handle of + # end beams and end sentences(batch reduction) + selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, - end_id=eos_idx) - + end_id=eos_idx, + return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) - # update states + # cell states(caches) have been updated in wrap_decoder, + # only need to update beam search states here. 
layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) + layers.assign(gather_idx, parent_idx) layers.assign(pre_src_attn_bias, trg_src_attn_bias) - layers.assign(pre_enc_output, enc_output) - for i in range(n_layer): - layers.assign(pre_caches[i]["k"], caches[i]["k"]) - layers.assign(pre_caches[i]["v"], caches[i]["v"]) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) @@ -799,4 +902,4 @@ def fast_decode( return finished_ids, finished_scores finished_ids, finished_scores = beam_search() - return finished_ids, finished_scores + return finished_ids, finished_scores, reader if use_py_reader else None diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/profile.py b/fluid/PaddleNLP/neural_machine_translation/transformer/profile.py index 9a437725..76711ece 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/profile.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/profile.py @@ -186,7 +186,7 @@ def main(args): # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is # `1 / token_number` for average cost. - build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + # build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized train_exe = fluid.ParallelExecutor( use_cuda=TrainTaskConfig.use_gpu, loss_name=avg_cost.name, diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py index 16d48238..4313f8b4 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py @@ -10,7 +10,6 @@ import time import numpy as np import paddle.fluid as fluid -from paddle.fluid.transpiler.details import program_to_code import reader from config import * @@ -258,7 +257,12 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx, return data_input_dict, np.asarray([num_token], dtype="float32") -def prepare_data_generator(args, is_test, count, pyreader): +def prepare_data_generator(args, + is_test, + count, + pyreader, + py_reader_provider_wrapper, + place=None): """ Data generator wrapper for DataReader. If use py_reader, set the data provider for py_reader @@ -319,7 +323,7 @@ def prepare_data_generator(args, is_test, count, pyreader): data_reader = split(data_reader, count) if args.use_py_reader: pyreader.decorate_tensor_provider( - py_reader_provider_wrapper(data_reader)) + py_reader_provider_wrapper(data_reader, place)) data_reader = None else: # Data generator for multi-devices data_reader = stack(data_reader, count) @@ -357,7 +361,7 @@ def prepare_feed_dict_list(data_generator, init_flag, count): return feed_dict_list if len(feed_dict_list) == count else None -def py_reader_provider_wrapper(data_reader): +def py_reader_provider_wrapper(data_reader, place): """ Data provider needed by fluid.layers.py_reader. 
""" @@ -370,8 +374,7 @@ def py_reader_provider_wrapper(data_reader): data, data_input_names, ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model) - total_dict = dict(data_input_dict.items()) - yield [total_dict[item] for item in data_input_names] + yield [data_input_dict[item] for item in data_input_names] return py_reader_provider @@ -406,7 +409,11 @@ def test_context(exe, train_exe, dev_count): is_test=True) test_prog = test_prog.clone(for_test=True) test_data = prepare_data_generator( - args, is_test=True, count=dev_count, pyreader=pyreader) + args, + is_test=True, + count=dev_count, + pyreader=pyreader, + py_reader_provider_wrapper=py_reader_provider_wrapper) exe.run(startup_prog) # to init pyreader for testing if TrainTaskConfig.ckpt_path: @@ -477,7 +484,11 @@ def train_loop(exe, logging.info("begin reader") train_data = prepare_data_generator( - args, is_test=False, count=dev_count, pyreader=pyreader) + args, + is_test=False, + count=dev_count, + pyreader=pyreader, + py_reader_provider_wrapper=py_reader_provider_wrapper) # For faster executor exec_strategy = fluid.ExecutionStrategy() -- GitLab