Commit fa9e459e authored by MegEngine Team, committed by Tian Zhongbo

Add models.

Parent fd807291
__pycache__/
*log*/
MegEngine is licensed under the Apache License Version 2.0, except for the third-party components listed below.
*********************************************************************************************************************************
Software Licensed under the MIT License:
--------------------------------------------------------------------
1. deeplabv3plus-pytorch
Copyright (c) 2019 Hibercraft
Terms of the MIT License:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*********************************************************************************************************************************
*********************************************************************************************************************************
Software Licensed under the BSD 3-Clause License:
--------------------------------------------------------------------
1. pytorch/example
Copyright (c) 2017,
All rights reserved.
Terms of the BSD 3-Clause License:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*********************************************************************************************************************************
*********************************************************************************************************************************
Software Licensed under the Apache License Version 2.0:
--------------------------------------------------------------------
1. pytorch_pretrained_BERT
Copyright (c) 2018. All rights reserved.
This software has been modified by Megvii Inc.
Terms of Apache License Version 2.0
---------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License.
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution.
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions.
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks.
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty.
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability.
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability.
While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
*********************************************************************************************************************************
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to a positive environment for our community include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others’ private information, such as a physical or email address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting
All MegEngine forums and spaces are meant for professional interactions, and any behavior which could reasonably be considered inappropriate in a professional setting is unacceptable.
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at megengine@megvii.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
## Attribution
This Code of Conduct is updated from the Contributor Covenant, version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
# MegEngine Contributor License Agreement
In order to clarify the intellectual property license granted with Contributions from any person or entity, the open source project MegEngine ("MegEngine") must have a Contributor License Agreement (CLA) on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of MegEngine and its users; it does not change your rights to use your own Contributions for any other purpose.
This Agreement allows an individual or an entity to submit Contributions to MegEngine, to authorize Contributions submitted by its designated employees to MegEngine, and to grant copyright and patent licenses thereto. You accept and agree to the following terms and conditions for Your present and future Contributions submitted to MegEngine. Except for the license granted herein to MegEngine and recipients of software distributed by MegEngine, You reserve all right, title, and interest in and to Your Contributions.
1. **Definitions**. "You" (or "Your") shall mean the copyright owner or legal entity authorized by the copyright owner that is making this Agreement with MegEngine. For legal entities, the entity making a Contribution and all other entities that control, are controlled by, or are under common control with that entity are considered to be a single Contributor.
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"Contribution" shall mean the code, documentation or any original work of authorship, including any modifications or additions to an existing work, that is intentionally submitted by You to MegEngine for inclusion in, or documentation of, any of the products owned or managed by MegEngine (the "Work").
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to MegEngine or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, MegEngine for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by You as "Not a Contribution."
2. **Grant of Copyright License**. Subject to the terms and conditions of this Agreement, You hereby grant to MegEngine and to recipients of software distributed by MegEngine a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense, and distribute Your Contributions and such derivative works.
3. **Grant of Patent License**. Subject to the terms and conditions of this Agreement, You hereby grant to MegEngine and to recipients of software distributed by MegEngine a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by You that are necessarily infringed by Your Contribution(s) alone or by combination of Your Contribution(s) with the Work to which such Contribution(s) was submitted. If any entity institutes patent litigation against You or any other entity (including a crossclaim or counterclaim in a lawsuit) alleging that Your Contribution, or the Work to which You have contributed, constitutes direct or contributory patent infringement, then any patent licenses granted to that entity under this Agreement for that Contribution or Work shall terminate as of the date such litigation is filed.
4. You represent that You are legally entitled to grant the above license. If You are an entity, You represent further that each of Your employee designated by You is authorized to submit Contributions on behalf of You. If You are an individual and Your employer(s) has rights to intellectual property that You create that includes Your Contributions, You represent further that You have received permission to make Contributions on behalf of that employer, that Your employer has waived such rights for Your Contributions to MegEngine, or that Your employer has executed a separate CLA with MegEngine.
5. If you do post content or submit material on MegEngine and unless we indicate otherwise, you grant MegEngine a nonexclusive, royalty-free, perpetual, irrevocable, and fully sublicensable right to use, reproduce, modify, adapt, publish, perform, translate, create derivative works from, distribute, and display such content throughout the world in any media. You grant MegEngine and sublicensees the right to use your GitHub Public Profile, including but not limited to name, that you submit in connection with such content. You represent and warrant that you own or otherwise control all of the rights to the content that you post; that the content is accurate; that use of the content you supply does not violate this policy and will not cause injury to any person or entity; and that you will indemnify MegEngine for all claims resulting from content you supply. MegEngine has the right but not the obligation to monitor and edit or remove any activity or content. MegEngine takes no responsibility and assumes no liability for any content posted by you or any third party.
6. You represent that each of Your Contributions is Your original creation. Should You wish to submit work that is not Your original creation, You may submit it to MegEngine separately from any Contribution, identifying the complete details of its source and of any license or other restriction (including, but not limited to, related patents, trademarks, and license agreements) of which You are personally aware, and conspicuously marking the work as "Submitted on behalf of a third party: [named here]".
7. You are not expected to provide support for Your Contributions, except to the extent You desire to provide support. You may provide support for free, for a fee, or not at all. Unless required by applicable law or agreed to in writing, You provide Your Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.
8. You agree to notify MegEngine of any facts or circumstances of which You become aware that would make these representations inaccurate in any respect.
9. The effective date of this Contributor License Agreement is 2020/3/23. MegEngine reserves the right to update or change this Agreement at any time, by posting the most current version of the Agreement on MegEngine, with a new effective date. All such changes in the Agreement are effective from the effective date. Your continued use of MegEngine after we post any such changes signifies your agreement to those changes. If you do not agree to the then-current Agreement, you must immediately discontinue using MegEngine.
MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License.
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License.
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution.
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
You must give any other recipients of the Work or Derivative Works a copy of this License; and
You must cause any modified files to carry prominent notices stating that You changed the files; and
You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions.
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks.
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty.
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability.
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability.
While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
# MegEngine Models
This repository contains implementations of popular deep learning models built with [MegEngine](https://github.com/megengine/megengine).
The [official](./official) directory provides official implementations of classic image classification, object detection, image segmentation, and natural language models. Each model comes with code for model definition, inference, and training.
The code under [official](./official) is actively maintained: it keeps up with the latest MegEngine APIs, provides optimized model implementations, and includes high-quality tutorials that help newcomers learn how to train their own models with MegEngine.
## Overview
For each model we provide at least four scripts: the model definition (`model.py`), inference (`inference.py`), training (`train.py`), and testing (`test.py`).
Each model directory contains a `README` that documents the model in detail and walks through the training and testing procedures, e.g. the [ResNet README](./official/vision/classification/resnet/README.md).
In addition, the models defined under `official` can be loaded directly through `megengine.hub`, for example:
```python
import megengine.hub
# Load the network architecture only
resnet18 = megengine.hub.load("megengine/models", "resnet18")
# Load the architecture together with the pre-trained weights
resnet18 = megengine.hub.load("megengine/models", "resnet18", pretrained=True)
```
See [hubconf.py](./hubconf.py) for the full list of models that can be loaded through the `megengine.hub` interface.
## Installation and Environment Setup
Before running the code in this repository, set up your local environment as follows:
1. Clone the repository
```bash
git clone https://github.com/MegEngine/Models.git
```
2. Install the dependencies
```bash
pip3 install --user -r requirements.txt
```
3. Add the repository directory to the Python path
```bash
export PYTHONPATH=/path/to/models:$PYTHONPATH
```
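A quick, illustrative way to confirm that the repository takes precedence on the path (`/path/to/models` is the same placeholder as above; substitute your actual clone location) is to inspect the first `PYTHONPATH` entry:

```bash
# Prepend the repository to PYTHONPATH (placeholder path; use your clone location)
export PYTHONPATH=/path/to/models:$PYTHONPATH
# The repository should now be the first directory Python searches
echo "$PYTHONPATH" | tr ':' '\n' | head -n 1
```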
## Official Models
### Image Classification
Image classification is a fundamental computer vision task, and many other vision tasks (such as object detection) rely on classification models pre-trained on ImageNet. We therefore provide a variety of classification models pre-trained on ImageNet, including the [ResNet](./official/vision/classification/resnet) family and the [ShuffleNet](./official/vision/classification/shufflenet) family. Their results on the **ImageNet validation set** are listed below:
| Model | top1 acc | top5 acc |
| :---: | :---: | :---: |
| ResNet18 | 70.312 | 89.430 |
| ResNet34 | 73.960 | 91.630 |
| ResNet50 | 76.254 | 93.056 |
| ResNet101 | 77.944 | 93.844 |
| ResNeXt50 32x4d | 77.592 | 93.644 |
| ShuffleNetV2 x1.0 | 69.369 | 88.793 |
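The top1/top5 numbers above follow the usual definition: a prediction counts as top-k correct when the true label is among the k highest-scoring classes. A minimal, framework-free sketch (the toy logits below are made up purely for illustration):

```python
def topk_accuracy(logits, labels, k=1):
    """Fraction of samples whose true label is among the k highest scores."""
    correct = 0
    for scores, label in zip(logits, labels):
        # Indices of the k largest scores, highest first
        topk = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
        if label in topk:
            correct += 1
    return correct / len(labels)

# Toy example: 3 samples, 4 classes
logits = [
    [0.1, 0.7, 0.1, 0.1],  # top-1 prediction: class 1
    [0.5, 0.1, 0.3, 0.1],  # top-1 prediction: class 0
    [0.1, 0.2, 0.3, 0.4],  # top-1 prediction: class 3
]
labels = [1, 2, 3]
print(topk_accuracy(logits, labels, k=1))  # 2 of 3 samples top-1 correct
print(topk_accuracy(logits, labels, k=2))  # class 2 is the 2nd-best score for sample 2
```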
### Object Detection
Object detection is another common computer vision task. We provide a classic detector, [RetinaNet](./official/vision/detection); its results on the **COCO validation set** are listed below:
| Model | mAP<br>@5-95 |
| :---: | :---: |
| retinanet-res50-1x-800size | 36.0 |
### Image Segmentation
We also provide a classic semantic segmentation model, [DeepLabV3+](./official/vision/segmentation/); its results on the **PASCAL VOC validation set** are listed below:
| Model | Backbone | mIoU_single | mIoU_multi |
| :--: |:--: |:--: |:--: |
| Deeplabv3plus | Resnet101 | 79.0 | 79.8 |
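The mIoU metric above averages per-class intersection-over-union between the predicted and ground-truth label maps. A minimal framework-free sketch (using toy flattened label arrays for illustration):

```python
def mean_iou(pred, gt, num_classes):
    """Mean of per-class IoU; classes absent from both maps are skipped."""
    ious = []
    for c in range(num_classes):
        inter = sum(1 for p, g in zip(pred, gt) if p == c and g == c)
        union = sum(1 for p, g in zip(pred, gt) if p == c or g == c)
        if union:  # skip classes that never appear in either map
            ious.append(inter / union)
    return sum(ious) / len(ious)

pred = [0, 0, 1, 1, 2, 2]
gt   = [0, 0, 1, 2, 2, 2]
# class 0: intersection 2, union 2 -> 1.0
# class 1: intersection 1, union 2 -> 0.5
# class 2: intersection 2, union 3 -> 2/3
print(mean_iou(pred, gt, num_classes=3))
```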
### Natural Language Processing
We also support several common natural language processing models. The model weights come from Google's pre-trained models, and the pre-trained BERT models can be loaded easily via `megengine.hub`.
In addition, [bert](./official/nlp/bert) provides convenience scripts that fetch the matching vocabulary, configuration, and pre-trained model directly by task name.
| Model | Vocabulary | Configuration |
| --- | --- | --- |
| wwm_cased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/wwm_cased_L-24_H-1024_A-16/bert_config.json)
| wwm_uncased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/wwm_uncased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/wwm_uncased_L-24_H-1024_A-16/bert_config.json)
| cased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/cased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/cased_L-12_H-768_A-12/bert_config.json)
| cased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/cased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/cased_L-24_H-1024_A-16/bert_config.json)
| uncased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-12_H-768_A-12/bert_config.json)
| uncased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-24_H-1024_A-16/bert_config.json)
| chinese_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/chinese_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/chinese_L-12_H-768_A-12/bert_config.json)
| multi_cased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/multi_cased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/multi_cased_L-12_H-768_A-12/bert_config.json)
Fine-tuning and evaluating on the glue_data/MRPC dataset with the default hyper-parameters yields an evaluation accuracy between 84% and 88%.
| Dataset | pretrained_bert | acc |
| --- | --- | --- |
| glue_data/MRPC | uncased_L-12_H-768_A-12 | 86.25% |
from official.vision.classification.resnet.model import (
BasicBlock,
Bottleneck,
ResNet,
resnet18,
resnet34,
resnet50,
resnet101,
resnet152,
resnext50_32x4d,
resnext101_32x8d,
)
from official.vision.classification.shufflenet.model import (
shufflenet_v2_x0_5,
shufflenet_v2_x1_0,
shufflenet_v2_x1_5,
shufflenet_v2_x2_0,
)
from official.nlp.bert.model import (
uncased_L_12_H_768_A_12,
cased_L_12_H_768_A_12,
uncased_L_24_H_1024_A_16,
cased_L_24_H_1024_A_16,
chinese_L_12_H_768_A_12,
multi_cased_L_12_H_768_A_12,
wwm_uncased_L_24_H_1024_A_16,
wwm_cased_L_24_H_1024_A_16,
)
from official.vision.detection.retinanet_res50_1x_800size import (
retinanet_res50_1x_800size,
RetinaNet,
)
from official.vision.detection.tools.test import DetEvaluator
from official.vision.segmentation.deeplabv3plus import (
deeplabv3plus_res101,
DeepLabV3Plus,
)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# BERT
This directory contains the classic `BERT` architecture implemented with MegEngine, along with complete training and testing code for the GLUE MRPC task.
The following pre-trained models can be loaded and fine-tuned on various downstream tasks.
| Model | Vocabulary | Configuration |
| --- | --- | --- |
| wwm_cased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/wwm_cased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/wwm_cased_L-24_H-1024_A-16/bert_config.json)
| wwm_uncased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/wwm_uncased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/wwm_uncased_L-24_H-1024_A-16/bert_config.json)
| cased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/cased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/cased_L-12_H-768_A-12/bert_config.json)
| cased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/cased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/cased_L-24_H-1024_A-16/bert_config.json)
| uncased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-12_H-768_A-12/bert_config.json)
| uncased_L-24_H-1024_A-16| [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-24_H-1024_A-16/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/uncased_L-24_H-1024_A-16/bert_config.json)
| chinese_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/chinese_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/chinese_L-12_H-768_A-12/bert_config.json)
| multi_cased_L-12_H-768_A-12| [link](https://data.megengine.org.cn/models/weights/bert/multi_cased_L-12_H-768_A-12/vocab.txt) | [link](https://data.megengine.org.cn/models/weights/bert/multi_cased_L-12_H-768_A-12/bert_config.json)
The model weights come from Google's pre-trained models and keep the same semantics. With `megengine.hub`, you can easily load a pre-trained BERT model and download the corresponding `vocab.txt` and `bert_config.json`. We also provide convenience scripts in [models](./official/nlp/bert) that fetch the matching vocabulary, configuration, and pre-trained model directly by task name.
## Training Example
```bash
python3 train.py \
--do_lower_case \
--max_seq_length 128 \
--data_dir ./glue_data/MRPC \
--pretrained_bert uncased_L-12_H-768_A-12 \
--learning_rate 2e-5 \
--num_train_epochs 3
```
## Eval Example
```bash
python3 test.py \
--do_lower_case \
--max_seq_length 128 \
--data_dir ./glue_data/MRPC \
--pretrained_bert uncased_L-12_H-768_A-12
```
# How to Use
`model.py`: the BERT model implemented with MegEngine, along with the settings for the related pre-trained models.
`mrpc_dataset.py`: defines a dataloader generator that produces processed MRPC data ready for training/evaluation.
`train.py`: the training script.
`test.py`: the testing script.
`config.py`: defines all variables needed for testing/training.
- `--data_dir`: the input data directory; it should contain the .tsv files (or other data files) for the task. (The example code supports the MRPC dataset.)
- `--max_seq_length`: the maximum total input sequence length after WordPiece tokenization. Longer sequences are truncated and shorter ones are padded.
- `--do_lower_case`: set this flag if you are using an uncased model.
- `--pretrained_bert`: the pre-trained BERT to use, e.g. `uncased_L-12_H-768_A-12`.
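To illustrate what `--max_seq_length` does, here is a simplified sketch of the truncate-or-pad step applied to a single, already tokenized sequence; the real pipeline in `mrpc_dataset.py` additionally reserves slots for the `[CLS]`/`[SEP]` markers and pairs two sentences (the token ids below are arbitrary illustrative values):

```python
def truncate_or_pad(token_ids, max_seq_length, pad_id=0):
    """Clip a sequence to max_seq_length, or right-pad it with pad_id."""
    token_ids = token_ids[:max_seq_length]        # truncate long sequences
    mask = [1] * len(token_ids)                   # 1 marks a real token
    padding = [pad_id] * (max_seq_length - len(token_ids))
    # Padded positions get mask 0 so attention ignores them
    return token_ids + padding, mask + [0] * len(padding)

ids, mask = truncate_or_pad([101, 2023, 102], max_seq_length=5)
# ids  -> [101, 2023, 102, 0, 0]
# mask -> [1, 1, 1, 0, 0]
```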
**To run the training script `train.py`, you also need to set:**
- `--train_batch_size`: the batch size used for training; default `16`.
- `--eval_batch_size`: the batch size used for evaluation; default `16`.
- `--learning_rate`: the initial learning rate for Adam; default `5e-5`.
- `--num_train_epochs`: the total number of training epochs; default `3`.
- `--save_model_path`: the path to save the model to; default `./check_point_last.pkl`.
**To run the testing script `test.py`, you also need to set:**
- `--eval_batch_size`: the batch size used for evaluation; default `16`.
- `--load_model_path`: the path to load the model from; default `./check_point_last.pkl`.
# Other Data Files
Before running this example, you should prepare all of the GLUE MRPC data; you can download it yourself or use the backup in our repository.
`glue_data/MRPC`: the directory holding the raw MRPC data.
# Results
Fine-tuning and evaluating on the glue_data/MRPC dataset with the default hyper-parameters yields an evaluation accuracy between 84% and 88%.
| Dataset | pretrained_bert | acc |
| --- | --- | --- |
| glue_data/MRPC | uncased_L-12_H-768_A-12 | 86.25% |
# Reference project
- "PyTorch Pretrained Bert" <https://github.com/Meelfy/pytorch_pretrained_BERT>
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
def get_args():
parser = argparse.ArgumentParser()
## parameters
parser.add_argument(
"--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
)
parser.add_argument(
"--pretrained_bert", required=True, type=str, help="pretrained bert name"
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.",
)
parser.add_argument(
"--do_lower_case",
default=False,
action="store_true",
help="Set this flag if you are using an uncased model.",
)
parser.add_argument(
"--train_batch_size",
default=16,
type=int,
help="Total batch size for training.",
)
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.",
)
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.",
)
parser.add_argument(
"--eval_batch_size", default=16, type=int, help="Total batch size for eval."
)
parser.add_argument(
"--load_model_path",
default="./check_point_last.pkl",
type=str,
help="the initial model",
)
parser.add_argument(
"--save_model_path",
default="./check_point_last.pkl",
type=str,
help="the path to save model",
)
return parser.parse_args()
1606495 1606619
3444633 3444733
1196962 1197061
1720166 1720115
2546175 2546198
2167771 2167744
2688145 2688162
2095803 2095786
1629064 1629043
116294 116332
722278 722383
1912524 1912648
726966 726945
936978 937500
2906104 2906322
1095780 1095652
487951 488007
3294207 3294290
1744257 1744378
1490044 1489975
2896308 2896334
2907762 2907649
101747 101777
1490811 1490840
2065523 2065836
886904 887158
2204592 2204588
3084554 3084612
917965 918315
2396937 2396818
3147370 3147525
71501 71627
1166473 1166857
1353356 1353174
1876120 1876059
1708040 1708062
2265271 2265152
2224884 2224819
2551891 2551563
360875 360943
2320654 2320666
3013483 3013540
1224743 1225510
58540 58567
1015249 1015204
3039310 3039413
726399 726078
2029631 2029565
1461629 1461781
1465073 1464854
283751 283290
799346 799268
533823 533909
801552 801516
2829648 2829613
2074182 2074668
1958079 1958143
3140260 3140288
969381 969512
2307064 2307235
271891 271839
1010655 1010430
1954 2142
985015 984975
218848 218851
753928 753890
555617 555528
487993 487952
3428298 3428362
389239 389299
2249237 2249305
970740 971209
2988297 2988555
2204353 2204418
544217 544325
2083598 2083810
263690 263819
2587300 2587243
3070979 3070949
953733 953537
2561999 2561941
684848 684557
162203 162101
958161 957782
2926039 2925982
2112330 2112376
3329379 3329416
961836 962243
1808166 1808434
2632692 2632767
2652187 2652218
1430357 1430425
144089 143697
661390 661218
325763 325928
3320577 3320553
2111629 2111786
2222998 2223097
205100 205145
533903 533818
629316 629289
1033204 1033365
3180014 3179967
2673104 2673130
349215 349241
129995 129864
368067 368018
655498 655391
2139506 2139427
1661381 1661317
426112 426210
1990975 1991132
162632 162653
1889954 1889847
347017 347002
3150803 3150839
173879 173832
431076 431242
1261116 1261234
1636060 1635946
2252795 2252970
1128884 1128865
3214517 3214483
1014983 1014963
1057995 1057778
2587767 2587673
539585 539355
1756329 1756394
2963943 2963880
3242051 3241897
977772 977804
2131318 2131372
103280 103431
581592 581570
2083612 2083810
2155514 2155377
723557 724115
1351550 1351155
611663 611716
886618 886456
1989515 1989458
224932 224868
751520 751373
396041 396188
55187 54831
132553 132725
1673112 1673068
1638813 1639087
2208376 2208198
849291 849442
2638861 2638982
1967578 1967664
781439 781461
1119721 1119714
34513 34742
644788 644816
2495223 2495307
954526 954607
195728 196099
2010705 2010779
1277539 1277527
2638975 2638855
2357324 2357271
2763381 2763517
1597193 1597119
555553 555528
2796658 2796682
101746 101775
2116843 2116883
1100998 1100441
3255597 3255668
1909579 1909408
2919853 2919804
315785 315653
1264509 1264471
3439114 3439084
3062202 3062308
2614947 2614904
1462409 1462504
2199097 2199072
331980 332110
3267026 3266930
698948 698933
461779 461815
1910610 1910455
389117 389052
789691 789665
1348909 1348954
261202 260995
2820371 2820525
696677 696932
54181 53570
589579 589557
2128530 2128455
3113791 3113782
637168 637447
490355 490378
780604 780466
219064 218969
2823575 2823513
3181118 3181443
485999 486011
2304696 2304863
2916199 2916164
2829194 2829229
1167835 1167651
1438666 1438643
98432 98657
249699 249623
347022 347003
2749410 2749625
2517014 2516995
2766112 2766084
2198694 2198937
548867 548785
2758265 2758282
981185 981234
1354501 1354476
2758944 2758975
1865364 1865251
131979 131957
490376 490490
146112 146127
2763517 2763576
327839 327748
3111452 3111428
1831696 1831660
515581 515752
315647 315778
1783137 1782659
1393764 1393984
1980654 1980641
1989213 1989116
3022833 3023029
86007 86373
1685339 1685429
1592037 1592076
2493369 2493428
1726935 1726879
3389318 3389271
3394891 3394775
2324704 2325023
2455942 2455978
192285 192327
3400796 3400822
4733 4557
1050307 1050144
1112021 1111925
3376093 3376101
816867 816831
3218713 3218830
1864253 1863810
3107137 3107119
3039165 3039036
3039007 3038845
2015389 2015410
1605818 1605806
2796978 2797024
1201306 1201329
2339738 2339771
3300040 3299992
2749322 2749663
2745055 2745022
3046488 3046824
2241925 2242066
86020 86007
69773 69792
1057876 1057778
2965576 2965701
577854 578500
221515 221509
587009 586969
3264732 3264648
3023029 3023229
2523564 2523358
1552068 1551928
1439663 1439808
2377289 2377259
2283737 2283794
588637 588864
1825432 1825301
2748287 2748550
1704987 1705268
54142 53641
2259788 2259747
2090911 2091154
3093023 3092996
3122429 3122305
1521034 1520582
2324708 2325028
3261484 3261306
1675025 1675047
2317018 2317252
3448488 3448449
1762569 1762526
1602860 1602844
1824224 1824209
2640607 2640576
2697659 2697747
2440680 2440474
818091 817811
853475 853342
2175939 2176090
314997 315030
1220668 1220801
554905 554627
3035788 3035918
383417 383558
1089053 1089297
1831453 1831491
2274844 2274714
2706154 2706185
2889005 2888954
1355540 1355592
2380695 2380822
1616174 1616206
1528383 1528083
635783 635802
1580638 1580663
1549586 1549609
2826681 2826474
221079 221003
720572 720486
3311600 3311633
460211 460445
2385288 2385256
1908763 1908744
2996241 2996734
2691044 2691264
1386884 1386857
2977500 2977547
1330643 1330622
2240399 2240149
2931098 2931144
919683 919782
60122 60445
805457 805985
3435735 3435717
110731 110648
524136 524119
3439854 3439874
2008984 2009175
260952 260924
844421 844679
872784 872834
1423836 1423708
2079200 2079131
753858 753890
787432 787464
2110220 2110199
1186754 1187056
2110775 2110924
780408 780363
52758 52343
763948 763991
2810634 2810670
2584416 2584653
2268396 2268480
447728 447699
2573262 2573319
1550897 1550977
941617 941673
3310210 3310286
2494149 2494073
1619244 1619274
2531749 2531607
374015 374162
2221603 2221633
2362761 2362698
2834988 2835026
1605350 1605425
1630585 1630657
3464314 3464302
2842562 2842582
1076861 1077018
3028143 3028234
518089 518133
2336453 2336545
3061836 3062031
2738677 2738741
2046630 2046644
1919740 1919926
1721433 1721267
1269572 1269682
1771131 1771091
1757264 1757375
1984039 1983986
1609290 1609098
2728425 2728251
2020252 2020081
665419 665612
2945693 2945847
2217613 2217659
2530671 2530542
2607718 2607708
1015010 1014963
1513190 1513246
969512 969295
1657632 1657619
2385348 2385394
821523 821385
2577517 2577531
862804 862715
977938 978162
3073773 3073779
3107118 3107136
2047034 2046820
308567 308525
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import csv
import os
import megengine as mge
import numpy as np
from megengine.data import DataLoader
from megengine.data.dataset import ArrayDataset
from megengine.data.sampler import RandomSampler, SequentialSampler
from tokenization import BertTokenizer
logger = mge.get_logger(__name__)
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs an InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train"
)
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev"
)
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
)
return examples
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[: (max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
tokens += tokens_b + ["[SEP]"]
segment_ids += [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = label_map[example.label]
if ex_index < 0:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))
features.append(
InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id,
)
)
return features
class MRPCDataset:
def __init__(self, args):
self.args = args
self.processor = MrpcProcessor()
self.label_list = self.processor.get_labels()
self.tokenizer = BertTokenizer(
args.vocab_file, do_lower_case=args.do_lower_case
)
def to_inputs(self, inp):
return (
np.array([f.input_ids for f in inp]).astype(np.int32),
np.array([f.input_mask for f in inp]).astype(np.float32),
np.array([f.segment_ids for f in inp]).astype(np.int32),
np.array([f.label_id for f in inp]).astype(np.int32),
)
def get_dataloader(self, examples, batch_size, is_random=False):
features = convert_examples_to_features(
examples, self.label_list, self.args.max_seq_length, self.tokenizer
)
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = self.to_inputs(
features
)
dataset = ArrayDataset(
all_input_ids, all_input_mask, all_segment_ids, all_label_ids
)
if is_random:
sampler = RandomSampler(
dataset=dataset, batch_size=batch_size, drop_last=True
)
else:
sampler = SequentialSampler(
dataset=dataset, batch_size=batch_size, drop_last=True
)
dataloader = DataLoader(dataset=dataset, sampler=sampler)
return dataloader, len(features)
def get_train_dataloader(self):
examples = self.processor.get_train_examples(self.args.data_dir)
return self.get_dataloader(examples, self.args.train_batch_size, is_random=True)
def get_eval_dataloader(self):
examples = self.processor.get_dev_examples(self.args.data_dir)
return self.get_dataloader(examples, self.args.eval_batch_size, is_random=False)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import megengine as mge
import megengine.functional as F
from megengine.jit import trace
from tqdm import tqdm
from config import get_args
from model import BertForSequenceClassification, create_hub_bert
from mrpc_dataset import MRPCDataset
args = get_args()
logger = mge.get_logger(__name__)
@trace(symbolic=True)
def net_eval(input_ids, segment_ids, input_mask, label_ids, opt=None, net=None):
net.eval()
results = net(input_ids, segment_ids, input_mask, label_ids)
logits, loss = results
return loss, logits, label_ids
def accuracy(out, labels):
outputs = F.argmax(out, axis=1)
return F.sum(outputs == labels)
def eval(dataloader, net):
logger.info("***** Running evaluation *****")
logger.info("batch size = %d", args.eval_batch_size)
sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
input_ids, input_mask, segment_ids, label_ids = tuple(
mge.tensor(t) for t in batch
)
batch_size = input_ids.shape[0]
loss, logits, label_ids = net_eval(
input_ids, segment_ids, input_mask, label_ids, net=net
)
sum_loss += loss.mean().item()
sum_accuracy += accuracy(logits, label_ids)
total_examples += batch_size
total_steps += 1
result = {
"eval_loss": sum_loss / total_steps,
"eval_accuracy": sum_accuracy / total_examples,
}
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info("%s = %s", key, str(result[key]))
if __name__ == "__main__":
bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=False)
args.vocab_file = vocab_file
model = BertForSequenceClassification(config, num_labels=2, bert=bert)
mrpc_dataset = MRPCDataset(args)
model.load_state_dict(mge.load(args.load_model_path))
eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
eval(eval_dataloader, model)
# -*- coding: utf-8 -*-
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# ---------------------------------------------------------------------
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# This file has been modified by Megvii ("Megvii Modifications").
# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved.
# ----------------------------------------------------------------------
"""Tokenization classes."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import collections
import os
import unicodedata
from io import open
import megengine
logger = megengine.get_logger(__name__)
VOCAB_NAME = "vocab.txt"
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(
self,
vocab_file,
do_lower_case=True,
max_len=None,
do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
):
"""Constructs a BertTokenizer.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
"""
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
vocab_file
)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()]
)
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case, never_split=never_split
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
logger.warning(
"Token indices sequence length is longer than the specified maximum"
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(
len(ids), self.max_len
)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids into wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(
vocab_file
)
)
index = token_index
writer.write(token + "\n")
index += 1
return vocab_file
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(
self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"),
):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
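The punctuation-splitting loop above can be distilled into a standalone sketch; this is an illustrative simplification that substitutes an ASCII `isalnum`/`isspace` test for the full Unicode-category check in `_is_punctuation`:

```python
# Standalone sketch of BasicTokenizer's punctuation splitting (illustration
# only): punctuation characters become their own tokens, and each one also
# starts a new word group.
def split_on_punc(text):
    output = []  # list of character groups
    start_new_word = True
    for char in text:
        if not char.isalnum() and not char.isspace():  # simplified punctuation test
            output.append([char])  # punctuation is always its own token
            start_new_word = True
        else:
            if start_new_word:
                output.append([])
            start_new_word = False
            output[-1].append(char)
    return ["".join(x) for x in output]

print(split_on_punc("hello,world!"))  # ['hello', ',', 'world', '!']
```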
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
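The greedy longest-match-first loop in `tokenize` above can be restated as a standalone sketch with a toy vocabulary (illustration only, not the class's API):

```python
# Standalone sketch of greedy longest-match-first WordPiece tokenization.
def wordpiece(token, vocab, unk="[UNK]"):
    chars = list(token)
    start, pieces = 0, []
    while start < len(chars):
        end = len(chars)
        cur = None
        while start < end:  # shrink the window until a vocab entry matches
            substr = "".join(chars[start:end])
            if start > 0:
                substr = "##" + substr  # continuation pieces carry a ## prefix
            if substr in vocab:
                cur = substr
                break
            end -= 1
        if cur is None:  # no prefix of the remainder is in the vocab
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

toy_vocab = {"un", "##aff", "##able"}
print(wordpiece("unaffable", toy_vocab))  # ['un', '##aff', '##able']
```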
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (
(cp >= 33 and cp <= 47)
or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96)
or (cp >= 123 and cp <= 126)
):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
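The codepoint ranges used by `_is_chinese_char` above can be stated compactly; a minimal standalone sketch with the same ranges:

```python
# Same CJK Unicode ranges as BasicTokenizer._is_chinese_char above.
CJK_RANGES = [
    (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
    (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
    (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
]

def is_cjk(char):
    cp = ord(char)
    return any(lo <= cp <= hi for lo, hi in CJK_RANGES)

print(is_cjk("中"), is_cjk("a"))  # True False
```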
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import megengine as mge
import megengine.functional as F
import megengine.optimizer as optim
from megengine.jit import trace
from tqdm import tqdm
from config import get_args
from model import BertForSequenceClassification, create_hub_bert
from mrpc_dataset import MRPCDataset
args = get_args()
logger = mge.get_logger(__name__)
@trace(symbolic=True)
def net_eval(input_ids, segment_ids, input_mask, label_ids, opt=None, net=None):
net.eval()
results = net(input_ids, segment_ids, input_mask, label_ids)
logits, loss = results
return loss, logits, label_ids
@trace(symbolic=True)
def net_train(input_ids, segment_ids, input_mask, label_ids, opt=None, net=None):
net.train()
results = net(input_ids, segment_ids, input_mask, label_ids)
logits, loss = results
opt.backward(loss)
return loss, logits, label_ids
def accuracy(out, labels):
outputs = F.argmax(out, axis=1)
return F.sum(outputs == labels)
def eval(dataloader, net):
logger.info("***** Running evaluation *****")
logger.info("batch size = %d", args.eval_batch_size)
sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
input_ids, input_mask, segment_ids, label_ids = tuple(
mge.tensor(t) for t in batch
)
batch_size = input_ids.shape[0]
if batch_size != args.eval_batch_size:
break
loss, logits, label_ids = net_eval(
input_ids, segment_ids, input_mask, label_ids, net=net
)
sum_loss += loss.mean().item()
sum_accuracy += accuracy(logits, label_ids)
total_examples += batch_size
total_steps += 1
result = {
"eval_loss": sum_loss / total_steps,
"eval_accuracy": sum_accuracy / total_examples,
}
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info("%s = %s", key, str(result[key]))
def train(dataloader, net, opt):
logger.info("***** Running training *****")
logger.info("batch size = %d", args.train_batch_size)
sum_loss, sum_accuracy, total_steps, total_examples = 0, 0, 0, 0
for step, batch in enumerate(tqdm(dataloader, desc="Iteration")):
input_ids, input_mask, segment_ids, label_ids = tuple(
mge.tensor(t) for t in batch
)
batch_size = input_ids.shape[0]
opt.zero_grad()
loss, logits, label_ids = net_train(
input_ids, segment_ids, input_mask, label_ids, opt=opt, net=net
)
opt.step()
sum_loss += loss.mean().item()
sum_accuracy += accuracy(logits, label_ids)
total_examples += batch_size
total_steps += 1
result = {
"train_loss": sum_loss / total_steps,
"train_accuracy": sum_accuracy / total_examples,
}
logger.info("***** Train results *****")
for key in sorted(result.keys()):
logger.info("%s = %s", key, str(result[key]))
if __name__ == "__main__":
bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=True)
args.vocab_file = vocab_file
model = BertForSequenceClassification(config, num_labels=2, bert=bert)
mrpc_dataset = MRPCDataset(args)
optimizer = optim.Adam(model.parameters(requires_grad=True), lr=args.learning_rate,)
train_dataloader, train_size = mrpc_dataset.get_train_dataloader()
eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
for epoch in range(args.num_train_epochs):
logger.info("***** Epoch {} *****".format(epoch + 1))
train(train_dataloader, model, optimizer)
mge.save(model.state_dict(), args.save_model_path)
eval(eval_dataloader, model)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# ResNet Series
This directory contains MegEngine implementations of the classic `ResNet` architectures, along with complete training and testing code for the ImageNet dataset.
`model.py` defines several common network architectures: `resnet18`, `resnet34`, `resnet50`, `resnet101`, `resnet152`, etc.
We currently provide pretrained weights on ImageNet for some of these architectures (see the table below). Their results on the ImageNet validation set are as follows:
| 模型 | top1 acc | top5 acc |
| --- | --- | --- |
| ResNet18 | 70.312 | 89.430 |
| ResNet34 | 73.960 | 91.630 |
| ResNet50 | 76.254 | 93.056 |
| ResNet101 | 77.944 | 93.844 |
| ResNeXt50 32x4d | 77.592 | 93.644 |
Models defined in this directory can be loaded directly through `megengine.hub`, for example:
```python
import megengine.hub
# load the network structure only
resnet18 = megengine.hub.load("megengine/models", "resnet18")
# load the network structure together with pretrained weights
resnet18 = megengine.hub.load("megengine/models", "resnet18", pretrained=True)
```
## Installation and environment setup
Before running the code in this directory, please make sure the environment is configured correctly by following the [README](../../../../README.md).
## How to train
Before training, make sure the [ImageNet dataset](http://image-net.org/download) has been downloaded and extracted into a suitable directory. The prepared dataset should have the following layout:
```bash
/path/to/imagenet
train
n01440764
xxx.jpg
...
n01443537
xxx.jpg
...
...
val
n01440764
xxx.jpg
...
n01443537
xxx.jpg
...
...
```
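As a sketch, the expected layout can be sanity-checked with a few lines of Python (the helper name and the check it performs are illustrative, not part of this repository):

```python
import os

def check_imagenet_layout(root):
    """Return True if root contains train/ and val/ splits,
    each holding at least one per-class subdirectory."""
    for split in ("train", "val"):
        split_dir = os.path.join(root, split)
        if not os.path.isdir(split_dir):
            return False
        classes = [d for d in os.listdir(split_dir)
                   if os.path.isdir(os.path.join(split_dir, d))]
        if not classes:  # every split needs at least one class folder
            return False
    return True
```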
Once the dataset is ready, run the following command to start training:
```bash
python3 train.py --data=/path/to/imagenet
```
`train.py` provides flexible command-line options, including:
- `--data`, root directory of the ImageNet dataset; defaults to `/data/datasets/imagenet`;
- `--arch`, architecture to train; defaults to `resnet18`;
- `--batch-size`, batch size per GPU during training; defaults to 32;
- `--ngpus`, number of nodes/GPUs to train with; defaults to 1. When multiple GPUs are used, distributed training mode is enabled automatically;
- `--save`, directory for saving models and logs; defaults to `/data/models`;
- `--learning-rate`, initial learning rate; defaults to 0.0125. Under distributed training, the actual learning rate equals the initial learning rate multiplied by the number of nodes/GPUs;
- `--epochs`, number of epochs to train; defaults to 90;
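The learning-rate scaling rule described above is a simple linear scaling by GPU count; as a sketch:

```python
def effective_lr(base_lr, ngpus):
    # Under distributed training, the actual learning rate equals the
    # initial learning rate multiplied by the number of nodes/GPUs.
    return base_lr * ngpus

print(effective_lr(0.0125, 2))  # 0.025
```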
For example, the following command trains a `resnet50` model on 2 GPUs with a total batch size of 64 (32 per GPU):
```bash
python3 train.py --data /path/to/imagenet \
--arch resnet50 \
--batch-size 32 \
--learning-rate 0.0125 \
--ngpus 2 \
--save /path/to/save_dir
```
Run `python3 train.py --help` for a more detailed description.
## How to test
During training, you can evaluate the model on the ImageNet validation set with:
```bash
python3 test.py --data=/path/to/imagenet --arch resnet50 --model /path/to/model --ngpus 1
```
Command-line options of `test.py`:
- `--data`, root directory of the ImageNet dataset; defaults to `/data/datasets/imagenet`;
- `--arch`, architecture to test; defaults to `resnet18`;
- `--model`, model to test; defaults to the official pretrained model;
- `--ngpus`, number of GPUs used for testing; defaults to 1;
Run `python3 test.py --help` for more details.
## How to use
Once a model is trained, a single image can be classified with:
```bash
python3 inference.py --model /path/to/model --image /path/to/image.jpg
```
With the default test image and the default resnet18 model, the output is:
```
0: class = lynx with probability = 25.2 %
1: class = Siamese_cat with probability = 12.3 %
2: class = Egyptian_cat with probability = 8.7 %
3: class = Persian_cat with probability = 8.3 %
4: class = tabby with probability = 6.5 %
```
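The percentages above come from a softmax over the network's logits; a minimal pure-Python sketch of that conversion (the logit values here are made up for illustration):

```python
import math

def softmax(logits):
    m = max(logits)  # subtract the max for numerical stability
    exps = [math.exp(x - m) for x in logits]
    total = sum(exps)
    return [e / total for e in exps]

probs = softmax([2.0, 1.0, 0.1])
print([round(100 * p, 1) for p in probs])  # percentages summing to ~100
```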
Command-line options of `inference.py`:
- `--arch`, architecture to use; defaults to `resnet18`;
- `--model`, path to the trained model weights; defaults to the official pretrained resnet18 model;
- `--image`, image to test;
## References
- [Deep Residual Learning for Image Recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf), Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun; The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2016, pp. 770-778
- [Aggregated Residual Transformations for Deep Neural Networks](http://openaccess.thecvf.com/content_cvpr_2017/papers/Xie_Aggregated_Residual_Transformations_CVPR_2017_paper.pdf), Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, Kaiming He; The IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017, pp. 1492-1500
- [Wide Residual Networks](https://arxiv.org/pdf/1605.07146.pdf), Sergey Zagoruyko, Nikos Komodakis, arXiv:1605.07146
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import json
import cv2
import megengine as mge
import megengine.data.transform as T
import megengine.functional as F
import megengine.jit as jit
import numpy as np
import model as M
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--arch", default="resnet18", type=str)
parser.add_argument("-m", "--model", default=None, type=str)
parser.add_argument("-i", "--image", default=None, type=str)
args = parser.parse_args()
model = getattr(M, args.arch)(pretrained=(args.model is None))
if args.model:
state_dict = mge.load(args.model)
model.load_state_dict(state_dict)
if args.image is None:
path = "../../../assets/cat.jpg"
else:
path = args.image
image = cv2.imread(path, cv2.IMREAD_COLOR)
transform = T.Compose(
[
T.Resize(256),
T.CenterCrop(224),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
)
@jit.trace(symbolic=True)
def infer_func(processed_img):
model.eval()
logits = model(processed_img)
probs = F.softmax(logits)
return probs
processed_img = transform.apply(image)[np.newaxis, :]
probs = infer_func(processed_img)
top_probs, classes = F.top_k(probs, k=5, descending=True)
with open("../../../assets/imagenet_class_info.json") as fp:
imagenet_class_index = json.load(fp)
for rank, (prob, classid) in enumerate(
zip(top_probs.numpy().reshape(-1), classes.numpy().reshape(-1))
):
print(
"{}: class = {:20s} with probability = {:4.1f} %".format(
rank, imagenet_class_index[str(classid)][1], 100 * prob
)
)
if __name__ == "__main__":
main()
# BSD 3-Clause License
# Copyright (c) Soumith Chintala 2016,
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# ------------------------------------------------------------------------------
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# This file has been modified by Megvii ("Megvii Modifications").
# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved.
# ------------------------------------------------------------------------------
import math
import megengine.functional as F
import megengine.hub as hub
import megengine.module as M
class BasicBlock(M.Module):
expansion = 1
def __init__(
self,
in_channels,
channels,
stride=1,
groups=1,
base_width=64,
dilation=1,
norm=M.BatchNorm2d,
):
super(BasicBlock, self).__init__()
if groups != 1 or base_width != 64:
raise ValueError("BasicBlock only supports groups=1 and base_width=64")
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
self.conv1 = M.Conv2d(
in_channels, channels, 3, stride, padding=dilation, bias=False
)
self.bn1 = norm(channels)
self.conv2 = M.Conv2d(channels, channels, 3, 1, padding=1, bias=False)
self.bn2 = norm(channels)
self.downsample = (
M.Identity()
if in_channels == channels and stride == 1
else M.Sequential(
M.Conv2d(in_channels, channels, 1, stride, bias=False), norm(channels),
)
)
def forward(self, x):
identity = x
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.bn2(x)
identity = self.downsample(identity)
x += identity
x = F.relu(x)
return x
class Bottleneck(M.Module):
expansion = 4
def __init__(
self,
in_channels,
channels,
stride=1,
groups=1,
base_width=64,
dilation=1,
norm=M.BatchNorm2d,
):
super(Bottleneck, self).__init__()
width = int(channels * (base_width / 64.0)) * groups
self.conv1 = M.Conv2d(in_channels, width, 1, 1, bias=False)
self.bn1 = norm(width)
self.conv2 = M.Conv2d(
width,
width,
3,
stride,
padding=dilation,
groups=groups,
dilation=dilation,
bias=False,
)
self.bn2 = norm(width)
self.conv3 = M.Conv2d(width, channels * self.expansion, 1, 1, bias=False)
self.bn3 = norm(channels * self.expansion)
self.downsample = (
M.Identity()
if in_channels == channels * self.expansion and stride == 1
else M.Sequential(
M.Conv2d(in_channels, channels * self.expansion, 1, stride, bias=False),
norm(channels * self.expansion),
)
)
def forward(self, x):
identity = x
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = F.relu(x)
x = self.conv3(x)
x = self.bn3(x)
identity = self.downsample(identity)
x += identity
x = F.relu(x)
return x
class ResNet(M.Module):
def __init__(
self,
block,
layers,
num_classes=1000,
zero_init_residual=False,
groups=1,
width_per_group=64,
replace_stride_with_dilation=None,
norm=M.BatchNorm2d,
):
super(ResNet, self).__init__()
self.in_channels = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError(
"replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation)
)
self.groups = groups
self.base_width = width_per_group
self.conv1 = M.Conv2d(
3, self.in_channels, kernel_size=7, stride=2, padding=3, bias=False
)
self.bn1 = norm(self.in_channels)
self.maxpool = M.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0], norm=norm)
self.layer2 = self._make_layer(
block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0],
norm=norm,
)
self.layer3 = self._make_layer(
block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
norm=norm,
)
self.layer4 = self._make_layer(
block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2],
norm=norm,
)
self.fc = M.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, M.Conv2d):
M.init.msra_normal_(m.weight, mode="fan_out", nonlinearity="relu")
if m.bias is not None:
fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight)
bound = 1 / math.sqrt(fan_in)
M.init.uniform_(m.bias, -bound, bound)
elif isinstance(m, M.BatchNorm2d):
M.init.ones_(m.weight)
M.init.zeros_(m.bias)
elif isinstance(m, M.Linear):
M.init.msra_uniform_(m.weight, a=math.sqrt(5))
if m.bias is not None:
fan_in, _ = M.init.calculate_fan_in_and_fan_out(m.weight)
bound = 1 / math.sqrt(fan_in)
M.init.uniform_(m.bias, -bound, bound)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
M.init.zeros_(m.bn3.weight)
elif isinstance(m, BasicBlock):
M.init.zeros_(m.bn2.weight)
def _make_layer(
self, block, channels, blocks, stride=1, dilate=False, norm=M.BatchNorm2d
):
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
layers = []
layers.append(
block(
self.in_channels,
channels,
stride,
groups=self.groups,
base_width=self.base_width,
dilation=previous_dilation,
norm=norm,
)
)
self.in_channels = channels * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.in_channels,
channels,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm=norm,
)
)
return M.Sequential(*layers)
def extract_features(self, x):
outputs = {}
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.maxpool(x)
outputs["stem"] = x
x = self.layer1(x)
outputs["res2"] = x
x = self.layer2(x)
outputs["res3"] = x
x = self.layer3(x)
outputs["res4"] = x
x = self.layer4(x)
outputs["res5"] = x
return outputs
def forward(self, x):
x = self.extract_features(x)["res5"]
x = F.avg_pool2d(x, 7)
x = F.flatten(x, 1)
x = self.fc(x)
return x
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/resnet18_naiveaug_70312_78a63ca6.pkl"
)
def resnet18(**kwargs):
r"""ResNet-18 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
"""
return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/resnet34_naiveaug_73960_fd9d869d.pkl"
)
def resnet34(**kwargs):
r"""ResNet-34 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
"""
return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/resnet50_fbaug_76254_4e14b7d1.pkl"
)
def resnet50(**kwargs):
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
"""
return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/resnet101_fbaug_77944_b7932921.pkl"
)
def resnet101(**kwargs):
r"""ResNet-101 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
"""
return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
def resnet152(**kwargs):
r"""ResNet-152 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
"""
return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/resnext50_32x4d_fbaug_77592_c4b04e5e.pkl"
)
def resnext50_32x4d(**kwargs):
r"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs["groups"] = 32
kwargs["width_per_group"] = 4
return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
def resnext101_32x8d(**kwargs):
r"""ResNeXt-101 32x8d model from
`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs["groups"] = 32
kwargs["width_per_group"] = 8
return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
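In the `Bottleneck` block defined above, the grouped-convolution width is computed as `int(channels * base_width / 64) * groups`; a quick sketch of what this gives for the variants defined here:

```python
def bottleneck_width(channels, base_width=64, groups=1):
    # mirrors: width = int(channels * (base_width / 64.0)) * groups
    return int(channels * (base_width / 64.0)) * groups

# Plain ResNet-50, first stage: 64 channels -> width 64.
print(bottleneck_width(64))                           # 64
# ResNeXt-50 32x4d, first stage: groups=32, width_per_group=4 -> width 128.
print(bottleneck_width(64, base_width=4, groups=32))  # 128
```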
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import multiprocessing as mp
import time
import megengine as mge
import megengine.data as data
import megengine.data.transform as T
import megengine.distributed as dist
import megengine.functional as F
import megengine.jit as jit
import model as M
logger = mge.get_logger(__name__)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--arch", default="resnet18", type=str)
parser.add_argument("-d", "--data", default=None, type=str)
parser.add_argument("-m", "--model", default=None, type=str)
parser.add_argument("-n", "--ngpus", default=None, type=int)
parser.add_argument("-w", "--workers", default=4, type=int)
parser.add_argument("--report-freq", default=50, type=int)
args = parser.parse_args()
world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus
if world_size > 1:
# start distributed training, dispatch sub-processes
mp.set_start_method("spawn")
processes = []
for rank in range(world_size):
p = mp.Process(target=worker, args=(rank, world_size, args))
p.start()
processes.append(p)
for p in processes:
p.join()
else:
worker(0, 1, args)
def worker(rank, world_size, args):
if world_size > 1:
# Initialize distributed process group
logger.info("init distributed process group {} / {}".format(rank, world_size))
dist.init_process_group(
master_ip="localhost",
master_port=23456,
world_size=world_size,
rank=rank,
dev=rank,
)
model = getattr(M, args.arch)(pretrained=(args.model is None))
if args.model:
logger.info("load weights from %s", args.model)
model.load_state_dict(mge.load(args.model))
@jit.trace(symbolic=True)
def valid_func(image, label):
model.eval()
logits = model(image)
loss = F.cross_entropy_with_softmax(logits, label)
acc1, acc5 = F.accuracy(logits, label, (1, 5))
if dist.is_distributed(): # all_reduce_mean
loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
return loss, acc1, acc5
logger.info("preparing dataset..")
valid_dataset = data.dataset.ImageNet(args.data, train=False)
valid_sampler = data.SequentialSampler(
valid_dataset, batch_size=100, drop_last=False
)
valid_queue = data.DataLoader(
valid_dataset,
sampler=valid_sampler,
transform=T.Compose(
[
T.Resize(256),
T.CenterCrop(224),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
),
num_workers=args.workers,
)
_, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
logger.info("Valid %.3f / %.3f", valid_acc, valid_acc5)
def infer(model, data_queue, args, epoch=0):
objs = AverageMeter("Loss")
top1 = AverageMeter("Acc@1")
top5 = AverageMeter("Acc@5")
total_time = AverageMeter("Time")
t = time.time()
for step, (image, label) in enumerate(data_queue):
n = image.shape[0]
image = image.astype("float32") # convert np.uint8 to float32
label = label.astype("int32")
loss, acc1, acc5 = model(image, label)
objs.update(loss.numpy()[0], n)
top1.update(100 * acc1.numpy()[0], n)
top5.update(100 * acc5.numpy()[0], n)
total_time.update(time.time() - t)
t = time.time()
if step % args.report_freq == 0 and dist.get_rank() == 0:
logger.info(
"Epoch %d Step %d, %s %s %s %s",
epoch,
step,
objs,
top1,
top5,
total_time,
)
return objs.avg, top1.avg, top5.avg
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":.3f"):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
if __name__ == "__main__":
main()
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import multiprocessing as mp
import os
import shutil
import time
import megengine as mge
import megengine.data as data
import megengine.data.transform as T
import megengine.distributed as dist
import megengine.functional as F
import megengine.jit as jit
import megengine.optimizer as optim
import model as M
logger = mge.get_logger(__name__)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"-a",
"--arch",
default="resnet50",
type=str,
choices=[
"resnet18",
"resnet34",
"resnet50",
"resnet101",
"resnet152",
"resnext50_32x4d",
"resnext101_32x8d",
],
)
parser.add_argument("-d", "--data", default=None, type=str)
parser.add_argument("-s", "--save", default="/data/models", type=str)
parser.add_argument("-b", "--batch-size", default=32, type=int)
parser.add_argument("--learning-rate", default=0.0125, type=float)
parser.add_argument("--momentum", default=0.9, type=float)
parser.add_argument("--weight-decay", default=1e-4, type=float)
parser.add_argument("--epochs", default=90, type=int)
parser.add_argument("-n", "--ngpus", default=None, type=int)
parser.add_argument("-w", "--workers", default=4, type=int)
parser.add_argument("--report-freq", default=50, type=int)
args = parser.parse_args()
save_dir = os.path.join(args.save, args.arch)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
mge.set_log_file(os.path.join(save_dir, "log.txt"))
world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus
if world_size > 1:
# scale learning rate by number of gpus
args.learning_rate *= world_size
# start distributed training, dispatch sub-processes
mp.set_start_method("spawn")
processes = []
for rank in range(world_size):
p = mp.Process(target=worker, args=(rank, world_size, args))
p.start()
processes.append(p)
for p in processes:
p.join()
else:
worker(0, 1, args)
def worker(rank, world_size, args):
if world_size > 1:
# Initialize distributed process group
logger.info("init distributed process group {} / {}".format(rank, world_size))
dist.init_process_group(
master_ip="localhost",
master_port=23456,
world_size=world_size,
rank=rank,
dev=rank,
)
save_dir = os.path.join(args.save, args.arch)
model = getattr(M, args.arch)()
optimizer = optim.SGD(
model.parameters(requires_grad=True),
lr=args.learning_rate,
momentum=args.momentum,
weight_decay=args.weight_decay,
)
scheduler = optim.MultiStepLR(optimizer, [30, 60, 80])
# Define train and valid graph
@jit.trace(symbolic=True)
def train_func(image, label):
model.train()
logits = model(image)
loss = F.cross_entropy_with_softmax(logits, label)
acc1, acc5 = F.accuracy(logits, label, (1, 5))
optimizer.backward(loss) # compute gradients
if dist.is_distributed(): # all_reduce_mean
loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
return loss, acc1, acc5
@jit.trace(symbolic=True)
def valid_func(image, label):
model.eval()
logits = model(image)
loss = F.cross_entropy_with_softmax(logits, label)
acc1, acc5 = F.accuracy(logits, label, (1, 5))
if dist.is_distributed(): # all_reduce_mean
loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
return loss, acc1, acc5
# Build train and valid datasets
logger.info("preparing dataset..")
train_dataset = data.dataset.ImageNet(args.data, train=True)
train_sampler = data.RandomSampler(
train_dataset, batch_size=args.batch_size, drop_last=True
)
train_queue = data.DataLoader(
train_dataset,
sampler=train_sampler,
transform=T.Compose(
[ # Baseline Augmentation for small models
T.RandomResizedCrop(224),
T.RandomHorizontalFlip(),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
)
if args.arch in ("resnet18", "resnet34")
else T.Compose(
[ # Facebook Augmentation for large models
T.RandomResizedCrop(224),
T.RandomHorizontalFlip(),
T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
T.Lighting(0.1),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
),
num_workers=args.workers,
)
valid_dataset = data.dataset.ImageNet(args.data, train=False)
valid_sampler = data.SequentialSampler(
valid_dataset, batch_size=100, drop_last=False
)
valid_queue = data.DataLoader(
valid_dataset,
sampler=valid_sampler,
transform=T.Compose(
[
T.Resize(256),
T.CenterCrop(224),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
),
num_workers=args.workers,
)
# Start training
top1_acc = 0
for epoch in range(0, args.epochs):
logger.info("Epoch %d LR %.3e", epoch, scheduler.get_lr()[0])
_, train_acc, train_acc5 = train(
train_func, train_queue, optimizer, args, epoch=epoch
)
logger.info("Epoch %d Train %.3f / %.3f", epoch, train_acc, train_acc5)
_, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args, epoch=epoch)
logger.info("Epoch %d Valid %.3f / %.3f", epoch, valid_acc, valid_acc5)
scheduler.step()
if rank == 0: # save checkpoint
mge.save(
{
"epoch": epoch + 1,
"state_dict": model.state_dict(),
"accuracy": valid_acc,
},
os.path.join(save_dir, "checkpoint.pkl"),
)
if valid_acc > top1_acc:
top1_acc = valid_acc
shutil.copy(
os.path.join(save_dir, "checkpoint.pkl"),
os.path.join(save_dir, "model_best.pkl"),
)
def train(model, data_queue, optimizer, args, epoch=0):
objs = AverageMeter("Loss")
top1 = AverageMeter("Acc@1")
top5 = AverageMeter("Acc@5")
total_time = AverageMeter("Time")
t = time.time()
for step, (image, label) in enumerate(data_queue):
n = image.shape[0]
image = image.astype("float32") # convert np.uint8 to float32
label = label.astype("int32")
optimizer.zero_grad()
loss, acc1, acc5 = model(image, label)
optimizer.step()
objs.update(loss.numpy()[0], n)
top1.update(100 * acc1.numpy()[0], n)
top5.update(100 * acc5.numpy()[0], n)
total_time.update(time.time() - t)
t = time.time()
if step % args.report_freq == 0 and dist.get_rank() == 0:
logger.info(
"Epoch %d Step %d, %s %s %s %s",
epoch,
step,
objs,
top1,
top5,
total_time,
)
return objs.avg, top1.avg, top5.avg
def infer(model, data_queue, args, epoch=0):
objs = AverageMeter("Loss")
top1 = AverageMeter("Acc@1")
top5 = AverageMeter("Acc@5")
total_time = AverageMeter("Time")
t = time.time()
for step, (image, label) in enumerate(data_queue):
n = image.shape[0]
image = image.astype("float32") # convert np.uint8 to float32
label = label.astype("int32")
loss, acc1, acc5 = model(image, label)
objs.update(loss.numpy()[0], n)
top1.update(100 * acc1.numpy()[0], n)
top5.update(100 * acc5.numpy()[0], n)
total_time.update(time.time() - t)
t = time.time()
if step % args.report_freq == 0 and dist.get_rank() == 0:
logger.info(
"Epoch %d Step %d, %s %s %s %s",
epoch,
step,
objs,
top1,
top5,
total_time,
)
return objs.avg, top1.avg, top5.avg
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":.3f"):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
if __name__ == "__main__":
main()
# ShuffleNet Series
This directory contains a MegEngine implementation of the `ShuffleNet V2` family, together with full training and testing code on the ImageNet dataset.
`model.py` defines the following common architectures: `shufflenet_v2_x0_5`, `shufflenet_v2_x1_0`, `shufflenet_v2_x1_5`, `shufflenet_v2_x2_0`.
We currently provide pre-trained ImageNet weights for some of these models (see the table below). Their accuracy on the ImageNet validation set is:
| Model | top1 acc | top5 acc |
| --- | --- | --- |
| shufflenet_v2_x1_0 | 69.369 | 88.793 |
Models defined in this directory can be loaded directly via `megengine.hub`, for example:
```python
import megengine.hub
# load the network structure only
shufflenet = megengine.hub.load("megengine/models", "shufflenet_v2_x1_0")
# load the network structure together with pre-trained weights
shufflenet = megengine.hub.load("megengine/models", "shufflenet_v2_x1_0", pretrained=True)
```
## Installation and Environment
Before running the code in this directory, please make sure the environment is configured correctly as described in the [README](../../../../README.md).
## How to Train
Before training, download and extract the [ImageNet dataset](http://image-net.org/download) and place it in an appropriate directory. The prepared dataset should have the following layout:
```bash
/path/to/imagenet
train
n01440764
xxx.jpg
...
n01443537
xxx.jpg
...
...
val
n01440764
xxx.jpg
...
n01443537
xxx.jpg
...
...
```
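Before launching a run, it can help to sanity-check this layout by counting class folders and images per split. Below is a minimal standard-library sketch; the `summarize_split` helper is ours for illustration, not part of this repo:

```python
import os

def summarize_split(root: str, split: str):
    """Count class directories and images under <root>/<split>,
    assuming the <split>/<wnid>/<image>.jpg layout shown above."""
    split_dir = os.path.join(root, split)
    classes = sorted(
        d for d in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, d))
    )
    n_images = sum(
        len(os.listdir(os.path.join(split_dir, c))) for c in classes
    )
    return len(classes), n_images
```

For example, `summarize_split("/path/to/imagenet", "train")` returns a `(num_classes, num_images)` pair for the training split.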
Once the dataset is ready, training can be started with:
```bash
python3 train.py --data=/path/to/imagenet
```
`train.py` provides flexible command-line options, including:
- `--data`, root directory of the ImageNet dataset, default `/data/datasets/imagenet`;
- `--arch`, network architecture to train, default `resnet18`;
- `--batch-size`, batch size per GPU during training, default 128;
- `--ngpus`, number of nodes/GPUs used for training, default 1; distributed training is enabled automatically when multiple GPUs are used;
- `--save`, directory for saving models and logs, default `/data/models`;
- `--learning-rate`, initial learning rate, default 0.0625; in distributed training the effective learning rate equals the initial learning rate multiplied by the number of nodes/GPUs;
- `--steps`, number of training iterations, default 300,000;
For example, the following command trains a `shufflenet_v2_x1_5` model on 8 GPUs with a total batch size of 128 x 8 = 1024:
```bash
python3 train.py --data /path/to/imagenet \
--arch shufflenet_v2_x1_5 \
--batch-size 128 \
--learning-rate 0.0625 \
--ngpus 8 \
--save /path/to/save_dir
```
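The linear scaling rule used above (the effective learning rate is the base rate times the number of GPUs, and the global batch is the per-GPU batch times the number of GPUs) can be sketched in a couple of lines; `effective_lr` and `global_batch_size` are hypothetical helper names, not part of this repo:

```python
# Linear LR scaling for distributed training: the effective learning
# rate is the base learning rate multiplied by the number of nodes/GPUs.
def effective_lr(base_lr: float, ngpus: int) -> float:
    return base_lr * ngpus

# The global batch size is the per-GPU batch times the number of GPUs.
def global_batch_size(per_gpu_batch: int, ngpus: int) -> int:
    return per_gpu_batch * ngpus

print(effective_lr(0.0625, 8))    # 0.5
print(global_batch_size(128, 8))  # 1024
```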
Run `python3 train.py --help` for more details.
## How to Test
During training, you can evaluate the model on the ImageNet validation set with:
```bash
python3 test.py --dataset-dir=/path/to/imagenet --arch shufflenet_v2_x1_5 --model /path/to/model --ngpus 1
```
The command-line options of `test.py` are:
- `--dataset-dir`, root directory of the ImageNet dataset, default `/data/datasets/imagenet`;
- `--arch`, network architecture to test, default ``;
- `--model`, model weights to test; the official pre-trained weights are used by default;
- `--ngpus`, number of GPUs used for testing, default 1;
Run `python3 test.py --help` for more details.
## How to Use
After training, you can run inference on a single image with:
```bash
python3 inference.py --model /path/to/model --image /path/to/image.jpg
```
With the default test image and the default `shufflenet_v2_x1_0` pre-trained model, the output is:
```
0: class = Siamese_cat with probability = 53.5 %
1: class = lynx with probability = 6.9 %
2: class = tabby with probability = 4.6 %
3: class = Persian_cat with probability = 2.6 %
4: class = Angora with probability = 1.4 %
```
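The ranking above comes from applying softmax to the logits and keeping the five most probable classes, as `inference.py` does with `F.softmax` and `F.top_k`. A minimal NumPy sketch of that post-processing step follows; the `top_k_probs` helper is ours for illustration, and the class ids here are not the ImageNet mapping:

```python
import numpy as np

def top_k_probs(logits: np.ndarray, k: int = 5):
    """Softmax over logits, then return (probability, class_id) pairs
    for the k highest-scoring classes, best first."""
    z = logits - logits.max()            # stabilize the exponentials
    probs = np.exp(z) / np.exp(z).sum()
    top = np.argsort(probs)[::-1][:k]
    return [(float(probs[i]), int(i)) for i in top]

logits = np.array([2.0, 0.5, 1.0, -1.0])
for rank, (prob, classid) in enumerate(top_k_probs(logits, k=3)):
    print("{}: class id = {} with probability = {:4.1f} %".format(rank, classid, 100 * prob))
```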
The command-line options of `inference.py` are:
- `--arch`, network architecture to use, default `shufflenet_v2_x1_0`;
- `--model`, path to trained model weights; the official pre-trained `shufflenet_v2_x1_0` model is used by default;
- `--image`, the image to test;
## References
- [ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design](https://arxiv.org/abs/1807.11164), Ma, Ningning, et al. Proceedings of the European Conference on Computer Vision (ECCV). 2018.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import json
import cv2
import megengine as mge
import megengine.data.transform as T
import megengine.functional as F
import megengine.jit as jit
import numpy as np
import model as M
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--arch", default="shufflenet_v2_x1_0", type=str)
parser.add_argument("-m", "--model", default=None, type=str)
parser.add_argument("-i", "--image", default=None, type=str)
args = parser.parse_args()
model = getattr(M, args.arch)(pretrained=(args.model is None))
if args.model:
state_dict = mge.load(args.model)
model.load_state_dict(state_dict)
if args.image is None:
path = "../../../assets/cat.jpg"
else:
path = args.image
image = cv2.imread(path, cv2.IMREAD_COLOR)
transform = T.Compose(
[
T.Resize(256),
T.CenterCrop(224),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
)
@jit.trace(symbolic=False)
def infer_func(processed_img):
model.eval()
logits = model(processed_img)
probs = F.softmax(logits)
return probs
processed_img = transform.apply(image)[np.newaxis, :]
probs = infer_func(processed_img)
top_probs, classes = F.top_k(probs, k=5, descending=True)
with open("../../../assets/imagenet_class_info.json") as fp:
imagenet_class_index = json.load(fp)
for rank, (prob, classid) in enumerate(
zip(top_probs.numpy().reshape(-1), classes.numpy().reshape(-1))
):
print(
"{}: class = {:20s} with probability = {:4.1f} %".format(
rank, imagenet_class_index[str(classid)][1], 100 * prob
)
)
if __name__ == "__main__":
main()
# -*- coding: utf-8 -*-
# MIT License
#
# Copyright (c) 2019 Megvii Technology
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ------------------------------------------------------------------------------
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# This file has been modified by Megvii ("Megvii Modifications").
# All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights reserved.
# ------------------------------------------------------------------------------
import megengine.functional as F
import megengine.hub as hub
import megengine.module as M
class ShuffleV2Block(M.Module):
def __init__(self, inp, oup, mid_channels, *, ksize, stride):
super(ShuffleV2Block, self).__init__()
self.stride = stride
assert stride in [1, 2]
self.mid_channels = mid_channels
self.ksize = ksize
pad = ksize // 2
self.pad = pad
self.inp = inp
outputs = oup - inp
branch_main = [
# pw
M.Conv2d(inp, mid_channels, 1, 1, 0, bias=False),
M.BatchNorm2d(mid_channels),
M.ReLU(),
# dw
M.Conv2d(
mid_channels,
mid_channels,
ksize,
stride,
pad,
groups=mid_channels,
bias=False,
),
M.BatchNorm2d(mid_channels),
# pw-linear
M.Conv2d(mid_channels, outputs, 1, 1, 0, bias=False),
M.BatchNorm2d(outputs),
M.ReLU(),
]
self.branch_main = M.Sequential(*branch_main)
if stride == 2:
branch_proj = [
# dw
M.Conv2d(inp, inp, ksize, stride, pad, groups=inp, bias=False),
M.BatchNorm2d(inp),
# pw-linear
M.Conv2d(inp, inp, 1, 1, 0, bias=False),
M.BatchNorm2d(inp),
M.ReLU(),
]
self.branch_proj = M.Sequential(*branch_proj)
else:
self.branch_proj = None
def forward(self, old_x):
if self.stride == 1:
x_proj, x = self.channel_shuffle(old_x)
return F.concat((x_proj, self.branch_main(x)), 1)
elif self.stride == 2:
x_proj = old_x
x = old_x
return F.concat((self.branch_proj(x_proj), self.branch_main(x)), 1)
def channel_shuffle(self, x):
batchsize, num_channels, height, width = x.shape
# assert (num_channels % 4 == 0)
x = x.reshape(batchsize * num_channels // 2, 2, height * width)
x = x.dimshuffle(1, 0, 2)
x = x.reshape(2, -1, num_channels // 2, height, width)
return x[0], x[1]
class ShuffleNetV2(M.Module):
def __init__(self, input_size=224, num_classes=1000, model_size="1.5x"):
super(ShuffleNetV2, self).__init__()
self.stage_repeats = [4, 8, 4]
self.model_size = model_size
if model_size == "0.5x":
self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif model_size == "1.0x":
self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif model_size == "1.5x":
self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif model_size == "2.0x":
self.stage_out_channels = [-1, 24, 244, 488, 976, 2048]
else:
raise NotImplementedError
# building first layer
input_channel = self.stage_out_channels[1]
self.first_conv = M.Sequential(
M.Conv2d(3, input_channel, 3, 2, 1, bias=False),
M.BatchNorm2d(input_channel),
M.ReLU(),
)
self.maxpool = M.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.features = []
for idxstage in range(len(self.stage_repeats)):
numrepeat = self.stage_repeats[idxstage]
output_channel = self.stage_out_channels[idxstage + 2]
for i in range(numrepeat):
if i == 0:
self.features.append(
ShuffleV2Block(
input_channel,
output_channel,
mid_channels=output_channel // 2,
ksize=3,
stride=2,
)
)
else:
self.features.append(
ShuffleV2Block(
input_channel // 2,
output_channel,
mid_channels=output_channel // 2,
ksize=3,
stride=1,
)
)
input_channel = output_channel
self.features = M.Sequential(*self.features)
self.conv_last = M.Sequential(
M.Conv2d(input_channel, self.stage_out_channels[-1], 1, 1, 0, bias=False),
M.BatchNorm2d(self.stage_out_channels[-1]),
M.ReLU(),
)
self.globalpool = M.AvgPool2d(7)
if self.model_size == "2.0x":
self.dropout = M.Dropout(0.2)
self.classifier = M.Sequential(
M.Linear(self.stage_out_channels[-1], num_classes, bias=False)
)
self._initialize_weights()
def forward(self, x):
x = self.first_conv(x)
x = self.maxpool(x)
x = self.features(x)
x = self.conv_last(x)
x = self.globalpool(x)
if self.model_size == "2.0x":
x = self.dropout(x)
x = x.reshape(-1, self.stage_out_channels[-1])
x = self.classifier(x)
return x
def _initialize_weights(self):
for name, m in self.named_modules():
if isinstance(m, M.Conv2d):
if "first" in name:
M.init.normal_(m.weight, 0, 0.01)
else:
M.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1])
if m.bias is not None:
M.init.fill_(m.bias, 0)
elif isinstance(m, M.BatchNorm2d):
M.init.fill_(m.weight, 1)
if m.bias is not None:
M.init.fill_(m.bias, 0.0001)
M.init.fill_(m.running_mean, 0)
elif isinstance(m, M.BatchNorm1d):
M.init.fill_(m.weight, 1)
if m.bias is not None:
M.init.fill_(m.bias, 0.0001)
M.init.fill_(m.running_mean, 0)
elif isinstance(m, M.Linear):
M.init.normal_(m.weight, 0, 0.01)
if m.bias is not None:
M.init.fill_(m.bias, 0)
def shufflenet_v2_x2_0(num_classes=1000):
return ShuffleNetV2(num_classes=num_classes, model_size="2.0x")
def shufflenet_v2_x1_5(num_classes=1000):
return ShuffleNetV2(num_classes=num_classes, model_size="1.5x")
@hub.pretrained(
"https://data.megengine.org.cn/models/weights/snetv2_x1_0_69369_daf9dba0.pkl"
)
def shufflenet_v2_x1_0(num_classes=1000):
return ShuffleNetV2(num_classes=num_classes, model_size="1.0x")
def shufflenet_v2_x0_5(num_classes=1000):
return ShuffleNetV2(num_classes=num_classes, model_size="0.5x")
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import argparse
import multiprocessing as mp
import time
import megengine as mge
import megengine.data as data
import megengine.data.transform as T
import megengine.distributed as dist
import megengine.functional as F
import megengine.jit as jit
import model as M
logger = mge.get_logger(__name__)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--arch", default="shufflenet_v2_x1_0", type=str)
parser.add_argument("-d", "--data", default=None, type=str)
parser.add_argument("-m", "--model", default=None, type=str)
parser.add_argument("-n", "--ngpus", default=None, type=int)
parser.add_argument("-w", "--workers", default=4, type=int)
parser.add_argument("--report-freq", default=50, type=int)
args = parser.parse_args()
world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus
if world_size > 1:
# start distributed training, dispatch sub-processes
mp.set_start_method("spawn")
processes = []
for rank in range(world_size):
p = mp.Process(target=worker, args=(rank, world_size, args))
p.start()
processes.append(p)
for p in processes:
p.join()
else:
worker(0, 1, args)
def worker(rank, world_size, args):
if world_size > 1:
# Initialize distributed process group
logger.info("init distributed process group {} / {}".format(rank, world_size))
dist.init_process_group(
master_ip="localhost",
master_port=23456,
world_size=world_size,
rank=rank,
dev=rank,
)
model = getattr(M, args.arch)(pretrained=(args.model is None))
if args.model:
logger.info("load weights from %s", args.model)
model.load_state_dict(mge.load(args.model))
@jit.trace(symbolic=True)
def valid_func(image, label):
model.eval()
logits = model(image)
loss = F.cross_entropy_with_softmax(logits, label)
acc1, acc5 = F.accuracy(logits, label, (1, 5))
if dist.is_distributed(): # all_reduce_mean
loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
return loss, acc1, acc5
logger.info("preparing dataset..")
valid_dataset = data.dataset.ImageNet(args.data, train=False)
valid_sampler = data.SequentialSampler(
valid_dataset, batch_size=100, drop_last=False
)
valid_queue = data.DataLoader(
valid_dataset,
sampler=valid_sampler,
transform=T.Compose(
[
T.Resize(256),
T.CenterCrop(224),
T.Normalize(
mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395]
), # BGR
T.ToMode("CHW"),
]
),
num_workers=args.workers,
)
_, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
logger.info("Valid %.3f / %.3f", valid_acc, valid_acc5)
def infer(model, data_queue, args, epoch=0):
objs = AverageMeter("Loss")
top1 = AverageMeter("Acc@1")
top5 = AverageMeter("Acc@5")
total_time = AverageMeter("Time")
t = time.time()
for step, (image, label) in enumerate(data_queue):
n = image.shape[0]
image = image.astype("float32") # convert np.uint8 to float32
label = label.astype("int32")
loss, acc1, acc5 = model(image, label)
objs.update(loss.numpy()[0], n)
top1.update(100 * acc1.numpy()[0], n)
top5.update(100 * acc5.numpy()[0], n)
total_time.update(time.time() - t)
t = time.time()
if step % args.report_freq == 0 and dist.get_rank() == 0:
logger.info(
"Epoch %d Step %d, %s %s %s %s",
epoch,
step,
objs,
top1,
top5,
total_time,
)
return objs.avg, top1.avg, top5.avg
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":.3f"):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
if __name__ == "__main__":
main()
# MegEngine RetinaNet
## Introduction
This directory contains a MegEngine implementation of the classic [RetinaNet](https://arxiv.org/pdf/1708.02002) architecture, together with full training and testing code on the COCO2017 dataset.
The results on the COCO2017 validation set are:
| Model | mAP<br>@5-95 | batch<br>/gpu | gpu | speed<br>(8gpu) | speed<br>(1gpu)|
| --- | --- | --- | --- | --- | --- |
| retinanet-res50-1x-800size | 36.0 | 2 | 2080 | 2.27(it/s) | 3.7(it/s) |
* MegEngine v0.3.0
## How to Use
After training, you can run inference on a single image with:
```bash
python3 tools/inference.py -f retinanet_res50_1x_800size.py \
-i ../../assets/cat.jpg \
-m /path/to/retinanet_weights.pkl
```
The command-line options of `tools/inference.py` are:
- `-f`, description file of the network to test.
- `-m`, trained weights matching the network description file; pre-trained detector weights can be downloaded from the table at the top.
- `-i`, the sample image to test.
The result with the default image and the default model is shown below:
![demo image](../../assets/cat_det_out.jpg)
## How to Train
1. Before training, download and extract the [COCO dataset](http://cocodataset.org/#download)
and place it in an appropriate data directory. The prepared dataset should have the following layout (COCO2017 is used by default):
```
/path/to/
|->coco
| |annotations
| |train2017
| |val2017
```
2. Prepare the pre-trained `backbone` weights: you can use megengine.hub to download the official `megengine` resnet50 model trained on ImageNet and store it at `/path/to/pretrain.pkl`.
3. Before running the code in this directory, make sure the environment is configured correctly as described in the [README](../../../README.md).
4. Start training:
```bash
python3 tools/train.py -f retinanet_res50_1x_800size.py \
-n 8 \
--batch_size 2 \
-w /path/to/pretrain.pkl
```
`tools/train.py` provides flexible command-line options, including:
- `-f`, description file of the network to train.
- `-n`, number of devices (GPUs) used for training; all available GPUs are used by default.
- `-w`, path to the pre-trained backbone weights.
- `--batch_size`, `batch size` used during training, default 2, i.e. each GPU trains on 2 images.
- `--dataset-dir`, root directory of the COCO dataset, default `/data/datasets/coco`.
By default the model is saved under the `log-of-retinanet_res50_1x_800size` directory.
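The training code pads each input batch so that its height and width become multiples of 32 (the largest FPN stride) before it enters the network. Below is a minimal NumPy sketch of that padding, modeled on this repo's `get_padded_array_np` helper; the `pad_to_multiple` name is ours for illustration:

```python
import numpy as np

def pad_to_multiple(batch: np.ndarray, multiple: int = 32, pad_value: float = 0.0) -> np.ndarray:
    """Pad a [N, C, H, W] batch with pad_value so H and W become
    multiples of `multiple`; the content stays in the top-left corner."""
    n, c, h, w = batch.shape
    ph = (h + multiple - 1) // multiple * multiple   # round H up
    pw = (w + multiple - 1) // multiple * multiple   # round W up
    out = np.full((n, c, ph, pw), pad_value, dtype=batch.dtype)
    out[:, :, :h, :w] = batch
    return out

images = np.ones((2, 3, 100, 150), dtype=np.float32)
padded = pad_to_multiple(images)
print(padded.shape)  # (2, 3, 128, 160)
```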
## How to Test
During training, you can evaluate the model on the `COCO2017` validation set with:
```bash
python3 tools/test.py -n 8 \
-f retinanet_res50_1x_800size.py \
--model /path/to/retinanet_weights.pkl \
--dataset_dir /data/datasets/coco
```
The command-line options of `tools/test.py` are:
- `-f`, description file of the network to test.
- `-n`, number of devices (GPUs) used for testing, default 1;
- `--model`, the model to test; pre-trained detector weights can be downloaded from the table at the top, or you can use your own trained weights.
- `--dataset_dir`, root directory of the COCO dataset, default `/data/datasets`.
## References
- [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002), Lin, Tsung-Yi, et al. Proceedings of the IEEE International Conference on Computer Vision. 2017: 2980-2988.
- [Microsoft COCO: Common Objects in Context](https://arxiv.org/pdf/1405.0312.pdf), Lin, Tsung-Yi, et al. European Conference on Computer Vision. Springer, Cham, 2014: 740-755.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .basic import *
from .det import *
_EXCLUDE = {}
__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .functional import *
from .nn import *
from .norm import *
_EXCLUDE = {}
__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import megengine as mge
import megengine.functional as F
import numpy as np
from megengine import _internal as mgb
from megengine.core import Tensor, wrap_io_tensor
def get_padded_array_np(
array: np.ndarray, multiple_number: int = 32, pad_value: float = 0
) -> np.ndarray:
""" pad the nd-array to multiple stride of th e
Args:
array (np.ndarray):
the array with the shape of [batch, channel, height, width]
multiple_number (int):
make the height and width can be divided by multiple_number
pad_value (int): the value to be padded
Returns:
padded_array (np.ndarray)
"""
batch, chl, t_height, t_width = array.shape
padded_height = (
(t_height + multiple_number - 1) // multiple_number * multiple_number
)
padded_width = (t_width + multiple_number - 1) // multiple_number * multiple_number
padded_array = (
np.ones([batch, chl, padded_height, padded_width], dtype=np.float32) * pad_value
)
ndim = array.ndim
if ndim == 4:
padded_array[:, :, :t_height, :t_width] = array
elif ndim == 3:
padded_array[:, :t_height, :t_width] = array
else:
raise Exception("Not supported tensor dim: %d" % ndim)
return padded_array
def get_padded_tensor(
array: Tensor, multiple_number: int = 32, pad_value: float = 0
) -> Tensor:
""" pad the nd-array to multiple stride of th e
Args:
array (Tensor):
the tensor with the shape of [batch, channel, height, width]
multiple_number (int):
make the height and width can be divided by multiple_number
pad_value (int): the value to be padded
Returns:
padded_array (Tensor)
"""
batch, chl, t_height, t_width = array.shape
padded_height = (
(t_height + multiple_number - 1) // multiple_number * multiple_number
)
padded_width = (t_width + multiple_number - 1) // multiple_number * multiple_number
padded_array = (
mge.ones(
F.concat([batch, chl, padded_height, padded_width], axis=0),
dtype=np.float32,
)
* pad_value
)
ndim = array.ndim
if ndim == 4:
padded_array = padded_array.set_subtensor(array)[:, :, :t_height, :t_width]
elif ndim == 3:
padded_array = padded_array.set_subtensor(array)[:, :t_height, :t_width]
else:
raise Exception("Not supported tensor dim: %d" % ndim)
return padded_array
@wrap_io_tensor
def indexing_set_one_hot(inp, axis, idx, value) -> Tensor:
return mgb.opr.indexing_set_one_hot(inp, axis, idx, value)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .anchor import *
from .box_utils import *
from .fpn import *
from .loss import *
from .retinanet import *
_EXCLUDE = {}
__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]