From 0d020394a2ed98d12a361f116f977a0a83081e45 Mon Sep 17 00:00:00 2001
From: Cheerego <35982308+shanyi15@users.noreply.github.com>
Date: Fri, 22 Feb 2019 17:36:32 +0800
Subject: [PATCH] thorough clean for doc (#644) (#645)
* thorough clean
* delete_DS_Store
---
.../deploy/anakin/anakin_arm_benchmark.md | 56 -
.../deploy/anakin/anakin_example.md | 38 -
.../deploy/anakin/anakin_gpu_benchmark.md | 154 --
.../deploy/anakin/anakin_parser_design.md | 92 -
.../deploy/anakin/anakin_run_on_arm.md | 193 --
.../deploy/anakin/anakin_tutorial.md | 645 ------
.../deploy/anakin/convert_paddle_to_anakin.md | 73 -
.../deploy/anakin/how_to_add_anakin_op.md | 405 ----
.../how_to_support_new_device_in_anakin.md | 459 -----
.../advanced_usage/deploy/anakin/index_cn.rst | 29 -
.../deploy/anakin/install_anakin.md | 76 -
.../deploy/anakin/run_anakin_on_arm.md | 185 --
.../development/profiling/benchmark.rst | 120 --
.../profiling/gpu_profiling_cn.rst | 239 ---
.../development/write_docs_cn.md | 204 +-
.../basics/learning_materials.md | 54 -
.../basics/learning_materials_en.md | 72 -
doc/fluid/beginners_guide/index.rst | 5 -
doc/fluid/beginners_guide/index_en.rst | 7 +-
.../build_from_source_cn.rst | 225 --
.../build_from_source_en.rst | 237 ---
.../build_and_install/docker_install_cn.rst | 146 --
.../build_and_install/docker_install_en.rst | 153 --
doc/fluid/build_and_install/index_cn.rst | 56 -
doc/fluid/build_and_install/index_en.rst | 56 -
doc/fluid/build_and_install/paddleci.png | Bin 40242 -> 0 bytes
.../build_and_install/pip_install_cn.rst | 105 -
.../build_and_install/pip_install_en.rst | 123 --
doc/fluid/dev/api_doc_std_cn.md | 221 --
doc/fluid/dev/api_doc_std_en.md | 227 ---
doc/fluid/dev/ci_build_whl.png | Bin 287162 -> 0 bytes
doc/fluid/dev/contribute_to_paddle_cn.md | 1 -
doc/fluid/dev/contribute_to_paddle_en.md | 1 -
doc/fluid/dev/index_cn.rst | 17 -
doc/fluid/dev/index_en.rst | 14 -
doc/fluid/dev/name_convention.md | 65 -
doc/fluid/dev/new_op_cn.md | 452 ----
doc/fluid/dev/new_op_kernel.md | 121 --
doc/fluid/dev/op_markdown_format.md | 64 -
doc/fluid/dev/releasing_process_cn.md | 195 --
doc/fluid/dev/releasing_process_en.md | 228 ---
doc/fluid/dev/src/fc.py | 81 -
doc/fluid/dev/support_new_device.md | 240 ---
doc/fluid/dev/use_eigen_cn.md | 146 --
doc/fluid/dev/use_eigen_en.md | 146 --
doc/fluid/dev/versioning_en.md | 66 -
doc/fluid/dev/write_docs_cn.md | 1 -
doc/fluid/dev/write_docs_cn.rst | 1 -
doc/fluid/dev/write_docs_en.rst | 1 -
doc/fluid/faq/faq.rst | 12 -
doc/fluid/faq/index_cn.rst | 9 -
doc/fluid/faq/index_en.rst | 2 -
.../Developer's_Guide_to_Paddle_Fluid.md | 1814 -----------------
doc/fluid/getstarted/concepts/index_cn.rst | 4 -
doc/fluid/getstarted/concepts/index_en.rst | 4 -
.../getstarted/concepts/reader/README.md | 206 --
.../concepts/save_model/model_format.md | 76 -
doc/fluid/getstarted/index_cn.rst | 20 -
doc/fluid/getstarted/index_en.rst | 19 -
doc/fluid/getstarted/quickstart_cn.rst | 45 -
doc/fluid/getstarted/quickstart_en.rst | 49 -
.../howto/cluster/fluid_cluster_train_cn.md | 181 --
.../howto/cluster/fluid_cluster_train_en.md | 153 --
doc/fluid/howto/cluster/fluid_recordio.md | 127 --
.../howto/cluster/nccl2_rdma_training.md | 110 -
doc/fluid/howto/index_cn.rst | 8 -
doc/fluid/howto/index_en.rst | 7 -
.../inference/build_and_install_lib_cn.rst | 97 -
doc/fluid/howto/inference/index_cn.rst | 8 -
.../inference_support_in_fluid_cn.md | 304 ---
.../howto/optimization/benchmark/index_cn.rst | 8 -
.../howto/optimization/benchmark/index_en.rst | 8 -
doc/fluid/howto/optimization/index_cn.rst | 9 -
doc/fluid/howto/optimization/index_en.rst | 9 -
doc/fluid/howto/optimization/pprof_1.png | Bin 352710 -> 0 bytes
doc/fluid/howto/optimization/pprof_2.png | Bin 194000 -> 0 bytes
doc/fluid/howto/optimization/timeline.jpeg | Bin 70606 -> 0 bytes
doc/fluid/howto/optimization/tracing.jpeg | Bin 30668 -> 0 bytes
doc/fluid/howto/performance/error_clip.md | 92 -
.../howto/performance/images/profiler.png | Bin 51116 -> 0 bytes
doc/fluid/howto/performance/profiler.md | 116 --
.../images/multigpu_allreduce.graffle | Bin 5489 -> 0 bytes
.../third_party/images/multigpu_allreduce.png | Bin 110982 -> 0 bytes
.../images/multigpu_before_convert.graffle | Bin 3056 -> 0 bytes
.../images/multigpu_before_convert.png | Bin 33557 -> 0 bytes
doc/fluid/howto/third_party/mkldnn_fluid.md | 149 --
doc/fluid/howto/third_party/paddle_nccl.md | 65 -
doc/fluid/index_en.rst | 3 +-
doc/fluid/overview.md | 25 -
doc/fluid/read_source.md | 67 -
90 files changed, 205 insertions(+), 10096 deletions(-)
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_arm_benchmark.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_example.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_gpu_benchmark.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_parser_design.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_run_on_arm.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/anakin_tutorial.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/convert_paddle_to_anakin.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/how_to_add_anakin_op.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/how_to_support_new_device_in_anakin.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/index_cn.rst
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/install_anakin.md
delete mode 100644 doc/fluid/advanced_usage/deploy/anakin/run_anakin_on_arm.md
delete mode 100644 doc/fluid/advanced_usage/development/profiling/benchmark.rst
delete mode 100644 doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst
mode change 120000 => 100644 doc/fluid/advanced_usage/development/write_docs_cn.md
delete mode 100644 doc/fluid/beginners_guide/basics/learning_materials.md
delete mode 100644 doc/fluid/beginners_guide/basics/learning_materials_en.md
delete mode 100644 doc/fluid/build_and_install/build_from_source_cn.rst
delete mode 100644 doc/fluid/build_and_install/build_from_source_en.rst
delete mode 100644 doc/fluid/build_and_install/docker_install_cn.rst
delete mode 100644 doc/fluid/build_and_install/docker_install_en.rst
delete mode 100644 doc/fluid/build_and_install/index_cn.rst
delete mode 100644 doc/fluid/build_and_install/index_en.rst
delete mode 100644 doc/fluid/build_and_install/paddleci.png
delete mode 100644 doc/fluid/build_and_install/pip_install_cn.rst
delete mode 100644 doc/fluid/build_and_install/pip_install_en.rst
delete mode 100644 doc/fluid/dev/api_doc_std_cn.md
delete mode 100644 doc/fluid/dev/api_doc_std_en.md
delete mode 100644 doc/fluid/dev/ci_build_whl.png
delete mode 120000 doc/fluid/dev/contribute_to_paddle_cn.md
delete mode 120000 doc/fluid/dev/contribute_to_paddle_en.md
delete mode 100644 doc/fluid/dev/index_cn.rst
delete mode 100644 doc/fluid/dev/index_en.rst
delete mode 100644 doc/fluid/dev/name_convention.md
delete mode 100644 doc/fluid/dev/new_op_cn.md
delete mode 100644 doc/fluid/dev/new_op_kernel.md
delete mode 100644 doc/fluid/dev/op_markdown_format.md
delete mode 100644 doc/fluid/dev/releasing_process_cn.md
delete mode 100644 doc/fluid/dev/releasing_process_en.md
delete mode 100644 doc/fluid/dev/src/fc.py
delete mode 100644 doc/fluid/dev/support_new_device.md
delete mode 100644 doc/fluid/dev/use_eigen_cn.md
delete mode 100644 doc/fluid/dev/use_eigen_en.md
delete mode 100644 doc/fluid/dev/versioning_en.md
delete mode 120000 doc/fluid/dev/write_docs_cn.md
delete mode 120000 doc/fluid/dev/write_docs_cn.rst
delete mode 100644 doc/fluid/dev/write_docs_en.rst
delete mode 100644 doc/fluid/faq/faq.rst
delete mode 100644 doc/fluid/faq/index_cn.rst
delete mode 100644 doc/fluid/faq/index_en.rst
delete mode 100644 doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
delete mode 100644 doc/fluid/getstarted/concepts/index_cn.rst
delete mode 100644 doc/fluid/getstarted/concepts/index_en.rst
delete mode 100644 doc/fluid/getstarted/concepts/reader/README.md
delete mode 100644 doc/fluid/getstarted/concepts/save_model/model_format.md
delete mode 100644 doc/fluid/getstarted/index_cn.rst
delete mode 100644 doc/fluid/getstarted/index_en.rst
delete mode 100644 doc/fluid/getstarted/quickstart_cn.rst
delete mode 100644 doc/fluid/getstarted/quickstart_en.rst
delete mode 100644 doc/fluid/howto/cluster/fluid_cluster_train_cn.md
delete mode 100644 doc/fluid/howto/cluster/fluid_cluster_train_en.md
delete mode 100644 doc/fluid/howto/cluster/fluid_recordio.md
delete mode 100644 doc/fluid/howto/cluster/nccl2_rdma_training.md
delete mode 100644 doc/fluid/howto/index_cn.rst
delete mode 100644 doc/fluid/howto/index_en.rst
delete mode 100644 doc/fluid/howto/inference/build_and_install_lib_cn.rst
delete mode 100644 doc/fluid/howto/inference/index_cn.rst
delete mode 100644 doc/fluid/howto/inference/inference_support_in_fluid_cn.md
delete mode 100644 doc/fluid/howto/optimization/benchmark/index_cn.rst
delete mode 100644 doc/fluid/howto/optimization/benchmark/index_en.rst
delete mode 100644 doc/fluid/howto/optimization/index_cn.rst
delete mode 100644 doc/fluid/howto/optimization/index_en.rst
delete mode 100644 doc/fluid/howto/optimization/pprof_1.png
delete mode 100644 doc/fluid/howto/optimization/pprof_2.png
delete mode 100644 doc/fluid/howto/optimization/timeline.jpeg
delete mode 100644 doc/fluid/howto/optimization/tracing.jpeg
delete mode 100644 doc/fluid/howto/performance/error_clip.md
delete mode 100644 doc/fluid/howto/performance/images/profiler.png
delete mode 100644 doc/fluid/howto/performance/profiler.md
delete mode 100644 doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
delete mode 100644 doc/fluid/howto/third_party/images/multigpu_allreduce.png
delete mode 100644 doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
delete mode 100644 doc/fluid/howto/third_party/images/multigpu_before_convert.png
delete mode 100644 doc/fluid/howto/third_party/mkldnn_fluid.md
delete mode 100644 doc/fluid/howto/third_party/paddle_nccl.md
delete mode 100644 doc/fluid/overview.md
delete mode 100644 doc/fluid/read_source.md
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_arm_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_arm_benchmark.md
deleted file mode 100644
index e8701b2b5..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_arm_benchmark.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Anakin ARM Performance Benchmarks
-
-## Test Environment and Parameters
-+ Models under test: MobileNet v1, MobileNet v2, MobileNet-SSD
-+ Cross-compiled with the Android NDK, gcc 4.9, NEON enabled; ABI: armeabi-v7a with NEON, -mfloat-abi=softfp
-+ Test devices
-    - Honor V9 (rooted): Kirin 960 SoC, 4 big cores at 2.36 GHz, 4 little cores at 1.8 GHz
-    - nubia Z17: Snapdragon 835 SoC, 4 big cores at 2.36 GHz, 4 little cores at 1.9 GHz
-    - 360 N5: Snapdragon 653 SoC, 4 big cores at 1.8 GHz, 4 little cores at 1.4 GHz
-+ Multi-threading: OpenMP
-+ Timing: 10 warm-up runs, then the mean of 10 timed runs
-+ ncnn version: GitHub master branch, commit ID 307a77f04be29875f40d337cfff6df747df09de6 (msg: convert LogisticRegressionOutput)
-+ TFlite version: GitHub master branch, commit ID 65c05bc2ac19f51f7027e66350bc71652662125c (msg: Removed unneeded file copy that was causing failure in Pi builds)
-
-This benchmark compares the performance of **`ncnn`**, **`TFlite`**, and **`Anakin`**.
-
-## Benchmark Models
-
-> Note: before benchmarking, convert the test models to Anakin models with the [External Converter](#10003).
-> For these models, we run single-batch, multi-threaded tests on ARM.
-
-- [Mobilenet v1](#11) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
-- [Mobilenet v2](#22) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
-- [mobilenet-ssd](#33) *the caffe model can be downloaded [here](https://github.com/chuanqi305/MobileNet-SSD)*
-
-### MobileNet v1
-
- |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
- |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
- |Kirin 960|107.7ms|61.1ms|38.2ms|152.8ms|85.2ms|51.9ms|152.6ms|nan|nan|
- |Snapdragon 835|105.7ms|63.1ms|~~46.8ms~~|152.7ms|87.0ms|~~92.7ms~~|146.9ms|nan|nan|
- |Snapdragon 653|120.3ms|64.2ms|46.6ms|202.5ms|117.6ms|84.8ms|158.6ms|nan|nan|
-
-### MobileNet v2
-
- |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
- |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
- |Kirin 960|93.1ms|53.9ms|34.8ms|144.4ms|84.3ms|55.3ms|100.6ms|nan|nan|
- |Snapdragon 835|93.0ms|55.6ms|41.1ms|139.1ms|88.4ms|58.1ms|95.2ms|nan|nan|
- |Snapdragon 653|106.6ms|64.2ms|48.0ms|199.9ms|125.1ms|98.9ms|108.5ms|nan|nan|
-
-### MobileNet-SSD
-
- |platform | Anakin (1) | Anakin (2) | Anakin (4) | ncnn (1) | ncnn (2) | ncnn (4) | TFlite (1) | TFlite (2) | TFlite (4)|
- |:---: | :---: | :---: | :---:| :---:| :---:| :---:| :---:| :---:| :---:|
- |Kirin 960|213.9ms|120.5ms|74.5ms|307.9ms|166.5ms|104.2ms|nan|nan|nan|
- |Snapdragon 835|213.0ms|125.7ms|~~98.4ms~~|292.9ms|177.9ms|~~167.8ms~~|nan|nan|nan|
- |Snapdragon 653|236.0ms|129.6ms|96.0ms|377.7ms|228.9ms|165.0ms|nan|nan|nan|
-
-## How to run those Benchmark models?
-
- 1. First, convert the caffe model with the [External Converter](./convert_paddle_to_anakin.html).
- 2. Then push the converted Anakin model and the compiled benchmark_arm binary to the test device with 'adb push'.
- 3. Next, on the device, run './benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1' in the directory containing the Anakin model.
- 4. Finally, the model's runtime is printed to the terminal.
- 5. Run './benchmark_arm' with no arguments to see the number and meaning of the command's parameters. A condensed sketch of steps 2-3 follows.
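-
-A condensed sketch of steps 2-3, assuming the device directory reachable over ADB is /data/local/tmp (file names are placeholders):
-
-```bash
-adb push benchmark_arm /data/local/tmp/
-adb push anakin_model.anakin.bin /data/local/tmp/
-adb shell "cd /data/local/tmp && chmod +x benchmark_arm && ./benchmark_arm ./ anakin_model.anakin.bin 1 10 10 1"
-```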
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_example.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_example.md
deleted file mode 100644
index 3cd684982..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_example.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Anakin Model Inference Examples
-
-Anakin currently supports only the NCHW layout.
-
-The example files live under test/framework/net.
-
-## Running a CNN Model on an NVIDIA GPU
-
-The example file is example_nv_cnn_net.cpp; the overall flow is:
-
-- Set the model path to the Anakin model and initialize a graph object for the NV platform. An Anakin model is produced by converting a Caffe or Paddle model with the converter
-- Set the network's input shape on the graph according to the model, then optimize the graph
-- Initialize the network executor from the optimized graph
-- Fetch the network's input tensors and copy your data into them
-- Run inference
-- Fetch the network's output tensors
-
-The NV platform is used here to demonstrate how the Anakin framework is used; note that the GPU build switch must be enabled at compile time. A minimal sketch of this flow follows.
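-
-A minimal sketch of the flow above, using only the Graph/Net calls documented in the Anakin tutorial (the model path, input shape, and tensor node names are placeholders):
-
-```c++
-// Build and run a CNN on the NV platform; requires the GPU build switch.
-auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-auto status = graph->load("path/to/model.anakin.bin"); // converted Caffe/Paddle model
-graph->Reshape("input_0", {1, 3, 224, 224});           // set the input size for your model
-graph->Optimize();                                     // optimize the raw graph
-Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);   // init the network executor
-auto* d_in = executor.get_in("input_0");               // fetch the input tensor
-// ... fill d_in via a host tensor and copy_from(), see the tutorial ...
-executor.prediction();                                 // run inference
-auto* d_out = executor.get_out("pred_out");            // fetch the output tensor
-```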
-
-## Running an RNN Model on X86
-
-The example file is example_x86_rnn_net.cpp.
-
-The overall flow is similar to running a CNN model on an NVIDIA GPU, with the following differences:
-
-- Initialize the graph object and the network executor with the X86 target
-- RNN models take variable-length input: the input dimension used when initializing the graph is the maximum, and the input dimension N is the total number of words. You also need to set the input tensor's seq_offset to mark how the words are split into sentences: e.g. {0, 5, 12} means there are 12 words in total, words 0-4 form the first sentence and words 5-11 the second
-
-The X86 platform is used here to demonstrate how the Anakin framework is used; note that the X86 build switch must be enabled at compile time. An illustration of seq_offset follows.
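-
-An illustration of the offsets described above (the setter name is an assumption -- check example_x86_rnn_net.cpp for the exact call):
-
-```c++
-// 12 words forming two sentences: words [0,5) and words [5,12).
-std::vector<int> seq_offset = {0, 5, 12};
-auto* d_in = executor.get_in("input_0");  // `executor` is an X86 Net; node name is a placeholder
-// d_in->set_seq_offset(seq_offset);      // assumed setter; see the example source
-```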
-
-## Running a CNN Model on an NVIDIA GPU with Anakin's Thread Pool
-
-The example file is example_nv_cnn_net_multi_thread.cpp; it uses the worker's synchronous prediction interface.
-
-The overall flow is similar to running a CNN model on an NVIDIA GPU, with the following differences:
-
-- Initialize the worker object with the model path and the thread-pool size
-- Push the input tensors into the task queue and collect the output tensors
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_gpu_benchmark.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_gpu_benchmark.md
deleted file mode 100644
index 159286481..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_gpu_benchmark.md
+++ /dev/null
@@ -1,154 +0,0 @@
-# Anakin GPU Performance Benchmarks
-
-## Environment
-
-> CPU: `12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz`
-> GPU: `Tesla P4`
-> cuDNN: `v7`
-
-
-## Comparison Target
-
-**`Anakin`** is compared against the high-performance inference engine **`NVIDIA TensorRT 3`**.
-
-## Benchmark Models
-
-> Note: before benchmarking, convert the test models to Anakin models with the `External Converter` tool.
-> For these models, the tests below run single-threaded on a single GPU.
-
-- [Vgg16](#1) *the caffe model can be downloaded [here](https://gist.github.com/jimmie33/27c1c0a7736ba66c2395)*
-- [Yolo](#2) *the caffe model can be downloaded [here](https://github.com/hojel/caffe-yolo-model)*
-- [Resnet50](#3) *the caffe model can be downloaded [here](https://github.com/KaimingHe/deep-residual-networks#models)*
-- [Resnet101](#4) *the caffe model can be downloaded [here](https://github.com/KaimingHe/deep-residual-networks#models)*
-- [Mobilenet v1](#5) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
-- [Mobilenet v2](#6) *the caffe model can be downloaded [here](https://github.com/shicai/MobileNet-Caffe)*
-- [RNN](#7) *not yet supported*
-
-### VGG16
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 8.53945 | 8.18737 |
-| 2 | 14.2269 | 13.8976 |
-| 4 | 24.2803 | 21.7976 |
-| 8 | 45.6003 | 40.319 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1053.88 | 762.73 |
-| 2 | 1055.71 | 762.41 |
-| 4 | 1003.22 | 832.75 |
-| 8 | 1108.77 | 926.9 |
-
-
-### Yolo
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 8.41606| 7.07977 |
-| 2 | 16.6588| 15.2216 |
-| 4 | 31.9955| 30.5102 |
-| 8 | 66.1107 | 64.3658 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1054.71 | 299.8 |
-| 2 | 951.51 | 347.47 |
-| 4 | 846.9 | 438.47 |
-| 8 | 1042.31 | 515.15 |
-
-### Resnet50
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 4.10063 | 3.33845 |
-| 2 | 6.10941 | 5.54814 |
-| 4 | 9.90233 | 10.2763 |
-| 8 | 17.3287 | 20.0783 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1059.15 | 299.86 |
-| 2 | 1077.8 | 340.78 |
-| 4 | 903.04 | 395 |
-| 8 | 832.53 | 508.86 |
-
-### Resnet101
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 7.29828 | 5.672 |
-| 2 | 11.2037 | 9.42352 |
-| 4 | 17.9306 | 18.0936 |
-| 8 | 31.4804 | 35.7439 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1161.94 | 429.22 |
-| 2 | 1190.92 | 531.92 |
-| 4 | 994.11 | 549.7 |
-| 8 | 945.47 | 653.06 |
-
-### MobileNet V1
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1.52692 | 1.39282 |
-| 2 | 1.98091 | 2.05788 |
-| 4 | 3.2705 | 4.03476 |
-| 8 | 5.15652 | 7.06651 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1144.35 | 99.6 |
-| 2 | 1160.03 | 199.75 |
-| 4 | 1098 | 184.33 |
-| 8 | 990.71 | 232.11 |
-
-### MobileNet V2
-
-- Latency (`ms`) of different batch
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1.95961 | 1.78249 |
-| 2 | 2.8709 | 3.01144 |
-| 4 | 4.46131 | 5.43946 |
-| 8 | 7.161 | 10.2081 |
-
-- GPU Memory Used (`MB`)
-
-| BatchSize | TensorRT | Anakin |
-| --- | --- | --- |
-| 1 | 1154.69 | 195.25 |
-| 2 | 1187.25 | 227.6 |
-| 4 | 1053 | 241.75 |
-| 8 | 1062.48 | 352.18 |
-
-
-## How to run those Benchmark models
-
-1. First, convert the caffe models with the [External Converter](./convert_paddle_to_anakin.html).
-2. Then go to the *source_root/benchmark/CNN* directory, create a model directory with 'mkdir ./models', and place the converted Anakin models in it.
-3. Run the script `sh run.sh`; when it finishes, the model's runtime is printed to the terminal (see the sketch below).
-4. To get the runtime of every OP, just set `ENABLE_OP_TIMER` in CMakeLists.txt to `YES`.
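-
-A condensed sketch of steps 2-3 (source_root and the model file name are placeholders):
-
-```bash
-cd source_root/benchmark/CNN
-mkdir ./models                                    # directory scanned by run.sh
-cp /path/to/converted/model.anakin.bin ./models/
-sh run.sh                                         # prints each model's runtime
-```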
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_parser_design.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_parser_design.md
deleted file mode 100644
index e2ec0c68d..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_parser_design.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Guide to Writing a Parser
-
- A Parser is a network-framework conversion tool: it converts the network structure of another framework, such as Caffe or TensorFlow, into an Anakin graph, which is then used for inference.
-
- This document describes the structure of the Parser framework and how to adapt the Parser to an existing framework, so that its models can be parsed into Anakin graphs for Anakin inference.
-
- Below, Anakin is called AK and operators are called OPs. The text follows the TensorFlow Parser; the reference code is under tools/external_converter_v2/parser/tensorflow.
-
-## What the Parser Does and How It Runs
-
- The Parser converts models from other deep-learning frameworks (such as Caffe, TensorFlow, or ONNX) into AK models.
-
- Its value to AK is hiding the differences between frameworks: how models are stored, how OPs are defined, and how graphs differ.
-
- The Parser therefore runs as follows:
-
- - load the source framework's model into the Parser
- - parse the source graph into AK OP nodes and the connections between them
- - convert the OP definitions and optimize the graph
- - write the AK-conformant graph into protobuf
-
-## Parser Directory Layout
-
- The Parser tool lives under tools/external_converter_v2/parser.
-
- The Parser directory has three main parts:
-
- - the Parser's run configuration: config.py, config.yaml, and converter.py. Users only run converter.py; the Parser then parses the model as declared in config.yaml
- - the Parser's shared definitions, in the operations, pbs, and proto directories, plus the shared utilities graph*.py, logger.py, and utils.py
- - one Parser per framework, each in a directory named after the framework, e.g. Caffe, TensorFlow
-
-## Steps to Write a Parser
-
-### 1. Declare your Parser
-
- - Fill in the information your Parser needs to run in config.yaml, including ProtoPath and SavePath. Change OPTIONS/Framework to your Parser's type and fill in the corresponding parameter list under TARGET
- - Add your Parser directory (e.g. TensorFlow) and export your Parser symbol. Note that the Parser framework by default calls your Parser class's __call__ method to run the parse; it must return a fully populated GraphProtoIO object
- - In config.py, inside Configuration's __init__, add a call to your Parser that passes it the configuration read from the yaml; this invokes your Parser's __init__
-
-### 2. Add your Parser body
-
- See parser_tf.py for reference; a skeleton is sketched after this list.
-
- - Your Parser body's constructor must capture everything parsing needs, such as the model path and the input and output names
- - __call__ must return the populated GraphProtoIO object, the helper used to fill in the protobuf
- - We recommend splitting the parse into three stages: load the source model and convert it into an intermediate graph form that is easy to modify; modify the intermediate graph until it satisfies AK's requirements; fill the conforming intermediate graph into protobuf with the NodeProtoIO and GraphProtoIO helper classes. See parser_tf for details
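-
- A minimal skeleton under these conventions (the class name, config attributes, and helper methods are illustrative):
-
- ```python
- class MyFrameworkParser:
-     def __init__(self, config):
-         # capture what parsing needs: model path, input/output names, ...
-         self.config = config
-
-     def __call__(self):
-         med_graph = self.load_to_med_graph()   # stage 1: source model -> intermediate graph
-         self.optimize_med_graph(med_graph)     # stage 2: make the graph satisfy AK's rules
-         graph_io = GraphProtoIO()
-         self.fill_proto(med_graph, graph_io)   # stage 3: fill protobuf via NodeProtoIO/GraphProtoIO
-         return graph_io                        # the framework expects a filled GraphProtoIO
- ```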
-
-### 3. Load the source model and convert it to the intermediate form
-
- See parse_tf_2_med.py for reference.
-
- - This step is tightly coupled to the source framework; you may need to import the source framework's utilities to prune, freeze, and load the model
- - Most frameworks connect OPs through tensors, but in AK the OPs connect to each other directly; keep this in mind
- - AK shapes default to 4 dimensions; parameters whose shape has fewer dimensions must be padded out by the Parser
-
-### 4. Optimize the intermediate graph
-
- See med_graph.py for reference.
-
- - AK does not support ordinary OPs with multiple outputs, so a Splite-type OP node must be appended after any multi-output OP
- - Merges that do not change an OP's definition, such as a Convlution followed by a Batchnorm, must be folded by the Parser in this step
- - AK requires every input OP to be named input_x, where x is a number starting from 0
-
-### 5. Save the intermediate graph through GraphProtoIO
-
- See parse_med_2_ak.py and parser_tf.py for reference; a condensed sketch follows this list.
-
- - First construct the Node: a Node's name is the OP's name (e.g. conv2d_1_a_0), and the name of the OP member inside the Node is the Node's type (e.g. Convlution)
- - Fill in the names of the input Nodes, in input order, with the Node's add_in method, and the names of the output Nodes, in order, with add_out
- - Add the constructed Node to the AK graph by passing the return value of the Node's __call__ method to GraphProtoIO's add_node
- - Build the relations between OPs in the AK graph with GraphProtoIO's add_in_edge and add_out_edge. If each Node's in and out are filled correctly, you can instead call GraphProtoIO's format_edge_from_nodes to do this work
- - An AK model requires the Parser to name the output Nodes; register them with GraphProtoIO's add_out method
-
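- A condensed sketch of these calls (node names are illustrative; only the helper methods named above are used -- see parse_med_2_ak.py for the remaining fill-in calls):
-
- ```python
- graph_io = GraphProtoIO()
-
- node = NodeProtoIO()               # node name = OP name, e.g. conv2d_1_a_0
- node.add_in('input_0')             # input Node names, in order
- node.add_out('relu_1')             # output Node names, in order
- graph_io.add_node(node())          # add_node takes node.__call__()'s return value
-
- graph_io.format_edge_from_nodes()  # derive edges from the nodes' in/out lists
- graph_io.add_out('pred_out')       # register the graph's output Node name
- ```
-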
-### 6. Verify the parse
-
- - The default config.yaml starts a web server after parsing that displays the parsed AK model graph; verify it against the source framework's graph. The most common error is wrong edge relations, which shows up as a very tangled graph; check it edge by edge. The second most common is a missed parameter; check the attributes on each OP
- - Run the parsed model in AK with the same input; the source framework and AK should produce the same output. If the outputs differ, enable AK's DEBUG mode and print every layer's output in net.cpp; if AK falls into an infinite loop during parsing, the edge relations are almost certainly wrong
-
-## How to Add a New OP
-
- - Implement the OP in the AK code base: the Saber OP for the target device, the Saber unit test, and the Framework OP
- - Add the Parser's shared OP definition in ops.py, matching the Framework OP
- - Parse the OP's node out of the source framework's model and fill that OP node into the AK graph
-
-## How AK Models Differ from Other Frameworks' Models
-
- + AK models are similar to Caffe models, so they differ from other models in many ways that the Parser must resolve while parsing
- + The biggest difference is OP granularity: OPs in PaddlePaddle or TensorFlow models are very fine-grained, while AK OPs are coarse-grained (to save memory-access overhead), so parsing those frameworks' models involves many fusion operations
- + OP behavior also differs: TensorFlow's Pooling is exclusive by default while AK's is inclusive, and with odd padding TensorFlow pads more on the right and bottom while AK pads more on the left and top
- + AK's default layout is NCHW; if another framework's OP uses a different layout, the Parser must convert the weights' layout and handle the resulting reshape issues
- + Some weights in AK need a layout transform in advance (e.g. GRU, LSTM), and AK also supports different algorithms for the same OP (e.g. GRU, Pooling)
-
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_run_on_arm.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_run_on_arm.md
deleted file mode 100644
index cdebd4ae0..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_run_on_arm.md
+++ /dev/null
@@ -1,193 +0,0 @@
-## Building Anakin from Source for ARM ##
-
-Anakin currently supports the ARM Android platform using the Android NDK cross-compilation toolchain; it has been built and tested on macOS and CentOS.
-
-### Installation Overview ###
-
-* [System requirements](#0001)
-* [Installing third-party dependencies](#0002)
-* [Building Anakin from source](#0003)
-* [Verifying the installation](#0004)
-
-
-### 1. System Requirements ###
-
-* Host: Linux or macOS
-* cmake 3.8.2+
-* Android NDK r14; download the Linux build [here](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
-
-### 2. Installing Third-Party Dependencies ###
-
-- 2.1 protobuf 3.4.0
-
-    Download the source [here](https://github.com/google/protobuf/releases/tag/v3.4.0)
-
-    - 2.1.1 Build protobuf for the host
-
- ```bash
- $ tar -xzf protobuf-3.4.0.tar.gz
- $ cd protobuf-3.4.0
- $ ./autogen.sh
- $ ./configure
- $ make
- $ make check
- $ make install
- ```
-
-    After `make install` above, the headers libprotobuf needs can be found in `/usr/local/include/google`. Copy the whole google folder into Anakin/third-party/arm-android/protobuf/, then clear the generated files with the command below.
-
-    If you run into problems, see [this](https://github.com/google/protobuf/blob/v3.4.0/src/README.md).
-
- ```bash
- $ make distclean
- ```
-
-    - 2.1.2 Cross-compile protobuf for Android `armeabi-v7a`; be sure to set the ANDROID_NDK path and the values of ARCH_ABI and HOSTOSN
-
- ```bash
-
- $ export ANDROID_NDK=your_ndk_path
- $ ARCH_ABI="arm-linux-androideabi-4.9"
- $ HOSTOSN="darwin-x86_64"
- $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm
- $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
- $ export LDFLAGS="--sysroot=$SYSROOT"
- $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
- $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
- $ export CPPFLAGS=""
- $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
- $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
- $ export CCFLAGS="$CXXFLAGS"
- $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
- $ export CC="$CXX"
- $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"
- $ ./autogen.sh
- $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"
- $ make
- ```
-
-    This builds *.a static libraries; to build *.so shared libraries instead, change --disable-shared to --disable-static --enable-shared in the ./configure arguments
-
-    The output files are under `src/.libs/`; copy them into `Anakin/third-party/arm-android/protobuf/lib`
-
-    Update the `ARM_RPOTO_ROOT` path in [cmake](../../cmake/find_modules.cmake).
-
- ```cmake
- set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
- ```
-
-- 2.2 OpenCV 2.4.3+ (optional)
-
-    Anakin uses OpenCV only in the examples
-
-    Download OpenCV for Android [here](https://opencv.org/releases.html)
-
-    After unpacking, copy the library files in `3rdparty/libs/armeabi-v7a` into `libs/armeabi-v7a`
-
-    In [cmake](../../cmake/find_modules.cmake), find `anakin_find_opencv`
-
-    and set `include_directories` and `LINK_DIRECTORIES` to the paths of your installed libraries
-
- ```cmake
- include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
- LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
- ```
-
-### 3. Building Anakin from Source ###
-
-#### Building for Android
-
-Clone the [source](https://github.com/PaddlePaddle/Anakin/tree/arm)
-
-```bash
- cd your_dir
- git clone https://github.com/PaddlePaddle/Anakin.git
- cd Anakin
- git fetch origin arm
- git checkout arm
-```
-
-Edit `android_build.sh`:
-
- - Set the NDK path
-
- ```bash
- #modify "your_ndk_path" to your NDK path
- export ANDROID_NDK=your_ndk_path
- ```
-
- - Set the ARM processor architecture
-
-    For 32-bit ARM processors, set ANDROID_ABI to `armeabi-v7a with NEON`
-    For 64-bit ARM processors, set ANDROID_ABI to `armeabi-v7a with NEON` or `arm64-v8a`
-    Currently only `armeabi-v7a with NEON` is supported; `arm64-v8a` is still in development
-
- ```bash
- -DANDROID_ABI="armeabi-v7a with NEON"
- ```
-
-- Set the Android API level
-
-    Set the API level according to the Android version, e.g. API level 21 -> Android 5.0.1
-
- ```bash
- -DANDROID_NATIVE_API_LEVEL=21
- ```
-
-- Choose static or shared libraries
-
-    Set `BUILD_SHARED=NO` to build static libraries
-    Set `BUILD_SHARED=YES` to build shared libraries
-
- ```bash
- -DBUILD_SHARED=NO
- ```
-
-- OpenMP multi-threading support
-
-    Set `USE_OPENMP=YES` to enable OpenMP multi-threading
-
- ```bash
- -DUSE_OPENMP=YES
- ```
-
-- Build the unit tests
-
-    Set `BUILD_WITH_UNIT_TEST=YES` to build the unit tests
-
- ```bash
- -DBUILD_WITH_UNIT_TEST=YES
- ```
-
-- Build the examples
-
-    Set `BUILD_EXAMPLES=YES` to build the example files
- ```bash
- -DBUILD_EXAMPLES=YES
- ```
-
-- Enable OpenCV
-
-    If you use OpenCV, set `USE_OPENCV=YES`
-
- ```bash
- -DUSE_OPENCV=YES
- ```
-
-- Start the build
-
-    Run the script `android_build.sh` to build Anakin automatically
-
- ```bash
- ./android_build.sh
- ```
-
-### 4. Verifying the Installation ###
-
-The built libraries are placed under `${Anakin_root}/output`;
-
-the built unit tests are under `${Anakin_root}/output/unit_test`;
-
-and the built examples are under `${Anakin_root}/output/examples`.
-
-On Android, enable the device's debugging mode; the directory accessible through ADB is `data/local/tmp`. Push the test binaries, models, and data to that directory with ADB push, then run the tests on the device, for example:
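-
-(The test binary and model names below are placeholders.)
-
-```bash
-adb push ${Anakin_root}/output/unit_test/test_xxx /data/local/tmp/
-adb push your_model.anakin.bin /data/local/tmp/
-adb shell "cd /data/local/tmp && chmod +x test_xxx && ./test_xxx"
-```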
diff --git a/doc/fluid/advanced_usage/deploy/anakin/anakin_tutorial.md b/doc/fluid/advanced_usage/deploy/anakin/anakin_tutorial.md
deleted file mode 100644
index 1658aae63..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/anakin_tutorial.md
+++ /dev/null
@@ -1,645 +0,0 @@
-# Anakin Tutorial ##
-
-This tutorial briefly introduces how Anakin works, its basic APIs, and how to call them.
-
-## Contents ###
-
-- [How Anakin works](#principle)
-- [Anakin APIs](#api)
-- [Example code](#example)
-
-## How Anakin Works ###
-
-![Anakin_principle](../pics/anakin_fm_ch.png)
-
-Forward computation with Anakin takes three main steps:
-
- - Convert the external model into an Anakin model with the [Anakin Parser](./convert_paddle_to_anakin.html)
-   Before using Anakin, users must convert all other models into Anakin models; we provide a conversion script, and models can be converted through the [Anakin Parser](./convert_paddle_to_anakin.html).
- - Build the Anakin compute graph
-   Load the Anakin model to build the raw compute graph, then optimize the graph; you only need to call the corresponding API.
- - Execute the graph
-   Anakin executes the compute graph on the chosen hardware platform.
-
-
-## Anakin APIs ###
-
-### Tensor ####
-
-`Tensor` provides basic data manipulation and management, giving ops a unified data interface. A `Tensor` has the following attributes:
-
-- Buffer
-  the data storage area
-- Shape
-  the dimension information of the data
-- Event
-  synchronization for asynchronous computation
-
-The `Tensor` class holds three `Shape` objects: `_shape`, `_valid_shape`, and `_offset`
-
- - `_shape` is the tensor's full storage extent
- - `_valid_shape` is the extent of the storage the tensor currently uses
- - `_offset` is the offset of the tensor's current data pointer relative to the start of the full data storage
-
-Tensors of different ranks correspond to the mathematical entities shown in the table below
-
-Dimensions | Math entity |
-:----: | :----:
-1 | vector
-2 | matrix
-3 | 3-tensor
-n | n-tensor
-
-#### Declaring a tensor object
-
-`Tensor` takes three template parameters:
-
-
-```c++
- template<typename TargetType, DataType datatype, typename LayOutType = NCHW>
- class Tensor .../* Inherit other class */{
- //some implements
- ...
- };
-```
-
-TargetType is the platform type, such as X86 or GPU; Anakin has a corresponding internal identifier for each. datatype is the ordinary data type, which also has a corresponding identifier inside Anakin.
-
-[LayOutType](#layout) is the data layout, such as batch x channel x height x width [NxCxHxW]; Anakin represents it internally with a struct.
-
-Anakin types map to the basic data types as follows:
-
- 1. TargetType
-
- Anakin TargetType | platform
- :----: | :----:
- NV | NVIDIA GPU
- ARM | ARM
- AMD | AMD GPU
- X86 | X86
- NVHX86 | NVIDIA GPU with Pinned Memory
-
- 2. DataType
-
- Anakin DataType | C++ | Description
- :---: | :---: | :---:
- AK_HALF | short | fp16
- AK_FLOAT | float | fp32
- AK_DOUBLE | double | fp64
- AK_INT8 | char | int8
- AK_INT16 | short | int16
- AK_INT32 | int | int32
- AK_INT64 | long | int64
- AK_UINT8 | unsigned char | uint8
-    AK_UINT16 | unsigned short | uint16
- AK_UINT32 | unsigned int | uint32
- AK_STRING | std::string | /
- AK_BOOL | bool | /
- AK_SHAPE | / | Anakin Shape
- AK_TENSOR | / | Anakin Tensor
-
- 3. LayOutType
-
- Anakin LayOutType ( Tensor LayOut ) | Tensor Dimention | Tensor Support | Op Support
- :---: | :---: | :---: | :---:
- W | 1-D | YES | NO
- HW | 2-D | YES | NO
- WH | 2-D | YES | NO
- NW | 2-D | YES | YES
- NHW | 3-D | YES |YES
- NCHW ( default ) | 4-D | YES | YES
- NHWC | 4-D | YES | NO
- NCHW_C4 | 5-D | YES | YES
-
- In principle, Anakin supports declaring tensors with one or more dimensions, but for Anakin OPs only the NW, NHW, NCHW, and NCHW_C4 layouts are supported, where NCHW is the default LayOutType and NCHW_C4 is specialized for the int8 data type.
-
- Example
-
- The code below shows how to use tensors; we recommend looking at these examples first.
-
- For more information about tensors, see *source_path/core/tensor.h*
-
-    > 1. Initializing a tensor with a shape object
-
- ```c++
- //create a null tensor. A null tensor holds for nothing.
- //tensor's buffer is resident at CPU and its datatype is AK_FLOAT.
- //tensor's Layout is NCHW(default)
-    Tensor<X86, AK_FLOAT> mytensor;
-
-    //1. using shape object to create a tensor.
-    Shape shape1(NUM); //1-D shape. NUM is the extent of the single dimension.
-    Tensor<X86, AK_FLOAT, W> mytensor1(shape1); //1-D tensor.
-
- // A 4-D shape
- Shape shape2(N, C, H, W); // batch x channel x height x width
- ```
-
-    > Note: the Shape's rank must match the tensor's [LayoutType](#layout); e.g. for Shape(N,C,H,W) the Tensor's LayoutType must be NCHW, otherwise it is an error, as the code below shows.
-
- ```c++
- // A 4-D tensor.
-    Tensor<X86, AK_FLOAT> mytensor2(shape2); //right
-
-    //A 4-D tensor which is resident at GPU and its datatype is AK_INT8
-    Tensor<NV, AK_INT8> mytensor3(shape2); //right
-
-    Tensor<X86, AK_FLOAT, W> mytensor4(shape2); //wrong!! shape's dimension must be equal to tensor's Layout.
-    Tensor<NV, AK_FLOAT, NHW> mytensor5(shape2); //wrong!!!!
-
- ```
-
-    > 2. Initializing a tensor with existing data and a shape
-
- ```c++
-
- /**
-     * A constructor of Tensor.
-     * data_ptr is a pointer to any data type of data
-     * TargetType is type of a platform [Anakin TargetType]
-     * id : device id
-     * shape: an Anakin shape
-     */
-    Tensor(Dtype* data_ptr, TargetType_t target, int id, Shape shape);
-
-    //using existing data feed to a tensor
-    Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape); //shape must have dimensions (N, C, H, W).
-
- ```
-
-    > 3. Initializing a tensor from another tensor
-
- ```c++
-    Tensor<NV, AK_FLOAT> tensor(exist_tensor);
- ```
-
-    > Tip: you can use `typedef Tensor<X86, AK_FLOAT> Tensor4d_X86` to define tensors conveniently.
-
-#### Filling the tensor's data area
-
-How you fill the data area depends on how the tensor was declared; the methods are shown below.
-
-First, recall the four ways to declare a tensor:
-
-```c++
-    1. Tensor<X86, AK_FLOAT> mytensor;
-    2. Tensor<X86, AK_FLOAT, W> mytensor1(shape1);
-    3. Tensor<X86, AK_FLOAT> mytensor(data_ptr, TargetType, device_id, shape);
-    4. Tensor<NV, AK_FLOAT> tensor(exist_tensor);
-```
-
-The data-filling method for each declaration is as follows:
-
-- For an empty tensor declaration, no memory has been allocated yet, so we must allocate it manually.
-
-```c++
-
-    //param shape
- mytensor.re_alloc(Shape shape);
-
- //Get writable pointer to mytensor.
-    //param index (int): where you start to write.
- //Dtype is your data type such int, float or double.
- Dtype *p = mytensor.mutable_data(index/*=0*/);
- //write data to mytensor
- for(int i = 0; i < mytensor.size(); i++){
- p[i] = 1.0f;
- }
- //do something ...
-```
-
-- This declaration allocates memory automatically.
-
-```c++
- //Get writable pointer to mytensor.
-    //param index (int): where you start to write.
- //Dtype is your data type such int, float or double.
- Dtype *p = mytensor1.mutable_data(index/*=0*/);
- //write data to mytensor
- for(int i = 0; i < mytensor.size(); i++){
- p[i] = 1.0f;
- }
- //do something ...
-```
-
-- With this declaration we still do not allocate memory manually, but whether the constructor allocates depends on the situation: if data_ptr and the declared
-  tensor are on the same target platform, the tensor shares data_ptr's memory; if they are on different platforms (e.g. data_ptr on X86 while
-  the tensor is on the GPU), the tensor allocates new memory and copies the data pointed to by data_ptr into its buffer.
-
-```c++
- //Get writable pointer to mytensor.
-    //param index (int): where you start to write.
- //Dtype is your data type such int, float or double.
- Dtype *p = mytensor.mutable_data(index/*=0*/);
- //write data to mytensor
- for(int i = 0; i < mytensor.size(); i++){
- p[i] = 1.0f;
- }
- //do something ...
-```
-
-- This form likewise needs no manual allocation.
-
-```c++
- //Get writable pointer to mytensor.
-    //param index (int): where you start to write.
- //Dtype is your data type such int, float or double.
- Dtype *p = mytensor.mutable_data(index/*=0*/);
- //write data to mytensor
- for(int i = 0; i < mytensor.size(); i++){
- p[i] = 1.0f;
- }
- //do something ...
-```
-
-- You can also get a read-only pointer to a tensor, as follows:
-
-```c++
- //Get read-only pointer to mytensor.
-    //param index (int): where you start to read.
- //Dtype is your data type such int, float or double.
- Dtype *p = mytensor.data(index/*=0*/);
- //do something ...
-```
-
-For more detail on tensors, see *source_path/saber/core/tensor.h*
-
-#### Getting a tensor's shape
-
-```c++
- //some declarations
- // ...
- Shape shape = mytensor.shape();
-
-    //Get the first dimension size of the tensor, if it has one.
- int d1 = shape[0];
-
-    //Get the second dimension size of the tensor, if it has one.
- int d2 = shape[1];
-
- ...
-
-    //Get the n-th dimension size of the tensor, if it has one.
- int dn = shape[n-1];
-
-
-    //Get the tensor's number of dimensions
- int dims = mytensor.dims();
-
- //Get the size of tensor.
- //size = d1 x d2 x ... x dn.
- int size = mytensor.size();
-
- //Get the size of tensor at interval [Di, Dj)
-    // from the i-th dimension to the j-th dimension, not including the j-th.
- // which means di x (di+1) x ... x (dj -1)
- int size = mytensor.count(start, end);
-```
-
-#### Setting a tensor's shape
-
-Use the tensor's member function set_shape to set its shape. The definition of set_shape:
-
-```c++
- /**
- * \brief set a tensor's shape
- * \param valid_shape [a Shape object]
- * \param shape [a Shape object]
- * \param offset [a Shape object]
-     * \return the status of this operation, indicating whether it succeeded or not.
- */
- SaberStatus set_shape(Shape valid_shape, Shape shape = Shape::zero(TensorAPI::layout_dims::value), Shape offset = Shape::minusone(TensorAPI::layout_dims::value));
-```
-
-This member function only sets the tensor's shape. The [LayOutType](#layout) of these shape objects (valid_shape, shape, offset) must be the same as that of the tensor's corresponding three shape objects; otherwise the call fails and returns SaberInvalidValue. If they match, the tensor's shape is set successfully.
-
-```c++
-
- // some declarations
- // ...
- //valid_shape, shape , offset are Shape object;
- //All these Shape object's LayOutType must be equal to mytensor's.
- mytensor.set_shape(valid_shape, shape, offset);
-
-```
-
-#### Reshaping a tensor
-
-```c++
- //some declarations
- Shape shape, valid_shape, offset;
-
- //do some initializations
- ...
- mytensor.reshape(valid_shape, shape, offset);
-```
-
-Note: the reshape operation still requires the shape's [LayOutType](#layout) to be the same as the tensor's
-
-### Graph ###
-
-The `Graph` class loads an Anakin model to build the compute graph, optimizes the graph, saves the model, and so on.
-
-#### Declaring a graph
-
-Like `Tensor`, a graph takes three template parameters.
-
-```c++
-
- template<typename TargetType, DataType Dtype, Precision Ptype>
- class Graph ... /* inherit other class*/{
-
- //some implements
- ...
-
- };
-```
-
-As introduced earlier, [TargetType](#target) and [DataType](#datatype) are Anakin-internal custom types: [TargetType](#target) is the platform type (e.g. NV, X86), and [DataType](#datatype) is the Anakin basic data type corresponding to the basic C++/C types. [Precision](#precision) is the precision type the op supports; we introduce it shortly.
-
-```c++
-
-    //Create an empty graph object.
-    Graph<NV, AK_FLOAT, Precision::FP32> graph;
-
-    //Create a pointer to an empty graph.
-    Graph<NV, AK_FLOAT, Precision::FP32>* graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-
-    //Create a pointer to an empty graph.
-    auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-
-```
-
-#### Loading an Anakin model
-
-```c++
- //some declarations
- ...
-    auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
- std::string model_path = "the/path/to/where/your/models/are";
- const char *model_path1 = "the/path/to/where/your/models/are";
-
- //Loading Anakin model to generate a compute graph.
- auto status = graph->load(model_path);
-
- //Or this way.
- auto status = graph->load(model_path1);
- //Check whether load operation success.
- if(!status){
- std::cout << "error" << endl;
- //do something...
- }
-
-```
-
-#### Optimizing the compute graph
-
-```c++
- //some declarations
- ...
- //Load graph.
- ...
- //According to the ops of loaded graph, optimize compute graph.
- graph->Optimize();
-
-```
-
-> Note: the first time you load the raw graph, you must optimize it.
-
-#### Saving a model
-
-You can save a model at any time; in particular, you can save an optimized model, so that the next time you load it no optimization step is needed.
-
-```c++
- //some declarations
- ...
- //Load graph.
- ...
- // save a model
- //save_model_path: the path to where your model is.
- auto status = graph->save(save_model_path);
-
- //Checking
- if(!status){
- cout << "error" << endl;
-        //do something...
- }
-```
-
-#### Resetting a tensor's shape inside the graph
-
-```c++
- //some declarations
- ...
- //Load graph.
- ...
-    std::vector<int> shape{10, 256, 256, 10};
- //input_name : std::string.
- //Reshape a tensor named input_name.
- graph->Reshape(input_name, shape);//Note: shape is a vector, not a Shape object.
-```
-
-#### Setting the batch size
-
-`Graph` supports resetting the batch size.
-
-```c++
- //some declarations
- ...
- //Load graph.
- ...
- //input_name : std::string.
- //Reset a tensor named input_name.
- int new_batch_size = 4;
- graph->ResetBatchSize(input_name, new_batch_size);
-```
-
-### Net ###
-
-
-`Net` is the executor of the compute graph. You obtain the inputs and outputs through a Net object.
-#### Creating a graph executor
-
-`Net` takes four template parameters.
-
-
-```c++
-    template<typename TargetType, DataType Dtype, Precision PType, OpRunType RunType = OpRunType::ASYNC>
- class Net{
- //some implements
- ...
-
- };
-```
-Because some ops may support multiple precisions, Precision lets you specify one. OpRunType selects synchronous or asynchronous execution; asynchronous is the default. OpRunType::SYNC means synchronous, with a single stream on the GPU; OpRunType::ASYNC means asynchronous, with multiple streams executing asynchronously on the GPU. Both Precision and OpRunType are enum classes; see *source_root/framework/core/types.h* for the detailed design.
-
-
-1. Precision
-
- Precision | Op support
- :---: | :---:
- Precision::INT4 | NO
- Precision::INT8 | NO
- Precision::FP16 | NO
- Precision::FP32 | YES
- Precision::FP64 | NO
-
-Ops currently support only FP32 precision, but the remaining precisions will be supported in the future.
-
-2. OpRunType
-
-    OpRunType | Sync/Async | Description
-    :---: | :---: | :---:
-    OpRunType::SYNC | Synchronous | single stream on GPU
-    OpRunType::ASYNC | Asynchronous | multiple streams on GPU
-
-Create an executor from the graph object:
-
-```c++
- //some declarations
- ...
-    //Create a pointer to a graph.
-    auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
-    //do something...
-    ...
-
-    //create an executor
-    Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
-
-```
-
-#### Getting input and output tensors
-
-To get the input and output tensors and fill the input tensors' buffers, you must refer to inputs by name, such as "input_0", "input_1", "input_2", ...; only by passing such strings can you get the input tensors. To find out which input an input_i corresponds to, check the dash board; see [Anakin Parser](./convert_paddle_to_anakin.html) for how to use it. Example code:
-
-```c++
- //some declaratinos
- ...
-
-    //create an executor
-    //TargetType is NV [NVIDIA GPU]
-    Net<NV, AK_FLOAT, Precision::FP32> executor(*graph);
-
- //Get the first input tensor.
- //The following tensors(tensor_in0, tensor_in2 ...) are resident at GPU.
-    //Note: Member function get_in returns a pointer to the tensor.
-    Tensor<NV, AK_FLOAT>* tensor_in0 = executor.get_in("input_0");
-
-    //If you have multiple input tensors
-    //You just type this code below.
-    Tensor<NV, AK_FLOAT>* tensor_in1 = executor.get_in("input_1");
- ...
- auto tensor_inn = executor.get_in("input_n");
-```
-
-Once you have the input tensor, you can fill its data area.
-
-```c++
- //This tensor is resident at GPU.
- auto tensor_d_in = executor.get_in("input_0");
-
- //If we want to feed above tensor, we must feed the tensor which is resident at host. And then copy the host tensor to the device's one.
-
-    //using Tensor4d = Tensor<X86, AK_FLOAT>;
-    Tensor4d tensor_h_in; //host tensor;
-    //Tensor<X86, AK_FLOAT> tensor_h_in;
-
- //Allocate memory for host tensor.
- tensor_h_in.re_alloc(tensor_d_in->valid_shape());
- //Get a writable pointer to tensor.
- float *h_data = tensor_h_in.mutable_data();
-
- //Feed your tensor.
- /** example
- for(int i = 0; i < tensor_h_in.size(); i++){
- h_data[i] = 1.0f;
- }
- */
- //Copy host tensor's data to device tensor.
- tensor_d_in->copy_from(tensor_h_in);
-
- // And then
-```
-
-Similarly, we can use the member function get_out to get an output tensor. Unlike for inputs, we need to give the name of the output tensor's node, which can be found on the dash board; see [Anakin Parser](./convert_paddle_to_anakin.html) for how to use it. Suppose there is an output node called pred_out; then the output tensor is obtained as follows:
-
-```c++
-    //Note: this tensor is resident at GPU.
-    Tensor<NV, AK_FLOAT>* tensor_out_d = executor.get_out("pred_out");
-
-```
-
-#### Executing graph
-
-Once everything is ready, we can run the real computation!
-```c++
- executor.prediction();
-```
-
-## Example Code ##
-
-The example below shows how to call Anakin.
-
-Before this, please make sure you already have an Anakin model; if not, convert your model with the [Anakin Parser](./convert_paddle_to_anakin.html).
-
-### Single-thread
-
-The single-threaded example is in *`source_root/test/framework/net/net_exec_test.cpp`*
-
-```c++
-
- std::string model_path = "your_Anakin_models/xxxxx.anakin.bin";
- // Create an empty graph object.
- auto graph = new Graph<NV, AK_FLOAT, Precision::FP32>();
- // Load Anakin model.
- auto status = graph->load(model_path);
- if(!status ) {
-     LOG(FATAL) << " [ERROR] " << status.info();
- }
- // Reshape
- graph->Reshape("input_0", {10, 384, 960, 10});
- // You must optimize graph for the first time.
- graph->Optimize();
- // Create a executer.
- Net<NV, AK_FLOAT, Precision::FP32> net_executer(*graph);
-
- //Get your input tensors through some specific string such as "input_0", "input_1", and
- //so on.
- //And then, feed the input tensor.
- //If you don't know which input a specific string ("input_0", "input_1") corresponds to, you can launch the dash board to find out.
- auto d_tensor_in_p = net_executer.get_in("input_0");
- Tensor4d<X86, AK_FLOAT> h_tensor_in;
- auto valid_shape_in = d_tensor_in_p->valid_shape();
- // Allocate a host tensor with the input's valid shape, fill it, and copy it to the device.
- h_tensor_in.re_alloc(valid_shape_in);
- float* h_data = h_tensor_in.mutable_data();
- for (int i = 0; i < h_tensor_in.size(); i++) {
-     h_data[i] = 1.0f;
- }
- d_tensor_in_p->copy_from(h_tensor_in);
-
- //Do inference.
- net_executer.prediction();
-
- //Get result tensor through the name of output node.
- //And also, you need to see the dash board again to find out how many output nodes there are and remember their names.
-
- //For example, you've got an output node named obj_pred_out
- //Then, you can get an output tensor.
- auto d_tensor_out_0_p = net_executer.get_out("obj_pred_out"); //get_out returns a pointer to output tensor.
- auto d_tensor_out_1_p = net_executer.get_out("lc_pred_out"); //get_out returns a pointer to output tensor.
- //......
- // do something else ...
- //...
- //save model.
- //You might not optimize the graph when you load the saved model again.
- std::string save_model_path = model_path + std::string(".saved");
- status = graph->save(save_model_path);
- if (!status ) {
-     LOG(FATAL) << " [ERROR] " << status.info();
- }
-
-```
diff --git a/doc/fluid/advanced_usage/deploy/anakin/convert_paddle_to_anakin.md b/doc/fluid/advanced_usage/deploy/anakin/convert_paddle_to_anakin.md
deleted file mode 100644
index 8a3587540..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/convert_paddle_to_anakin.md
+++ /dev/null
@@ -1,73 +0,0 @@
-# Model Conversion Guide
-
-Anakin runs inference on models from different frameworks, but because the formats differ, the model must be converted in advance; this document explains how.
-
-## Introduction
-
-The Anakin model converter accepts inference models in the Caffe and Paddle formats; a model consists of the network structure (model or prototxt) and the weights (param or caffemodel).
-
-The converter outputs a bin file, which is imported as the graph parameter of the Anakin framework.
-
-You can also use the converter's launch board feature to generate an HTML preview of the network structure.
-
-
-## Requirements
-
-- python 2.7+
-- pyyaml
-- flask
-- protobuf 3.5+
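-
-These can be installed with pip, for example (the version pin is illustrative):
-
-```bash
-pip install pyyaml flask "protobuf>=3.5"
-```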
-
-
-## Usage
-
-### 1. Environment
-The converter's dependencies are listed in the *Requirements* section.
-
-### 2. Configuration
-Edit the *config.yaml* file to describe your needs. The project ships a sample *config.yaml*; further notes follow.
-
-#### config.yaml
-```yaml
-OPTIONS:
-    Framework: CAFFE # CAFFE or Paddle, depending on the source framework
-    SavePath: ./output # where the model is saved after conversion
-    ResultName: googlenet # name of the output model
-    Config:
-        LaunchBoard: ON # whether to generate the network-structure preview page
-        Server:
-            ip: 0.0.0.0
-            port: 8888 # serve the preview page on an available port
-        OptimizedGraph: # enable only if you have used the Anakin framework's Optimized feature
-            enable: OFF
-            path: /path/to/anakin_optimized_anakin_model/googlenet.anakin.bin.saved
-    LOGGER:
-        LogToPath: ./log/ # where logs are written
-        WithColor: ON
-
-TARGET:
-    CAFFE:
-        # fill in when Framework is CAFFE
-        ProtoPaths:
-            - /path/to/caffe/src/caffe/proto/caffe.proto
-        PrototxtPath: /path/to/your/googlenet.prototxt
-        ModelPath: /path/to/your/googlenet.caffemodel
-
-    Paddle:
-        # fill in when Framework is Paddle
-        Debug: NULL
-        ProtoPaths:
-            - /
-        PrototxtPath: /path/to/paddle/inference_model
-        ModelPath: /path/to/paddle/inference_model
-    # ...
-```
-
-### 3. Conversion
-Once the configuration file is edited, simply run ```python converter.py``` to convert the model.
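-
-For example, from the converter directory (tools/external_converter_v2 in the Anakin tree, as noted in the Parser guide; the exact location may vary):
-
-```bash
-cd tools/external_converter_v2
-python converter.py    # converts the model declared in config.yaml
-```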
-
-
-### 4. Preview
-The last step is to view the conversion result in a browser! The address is configured in *config.yaml*, e.g. http://0.0.0.0:8888 .
-
-> Note: if you used the default IP address 0.0.0.0, substitute the server's real address real_ip:port when previewing.
diff --git a/doc/fluid/advanced_usage/deploy/anakin/how_to_add_anakin_op.md b/doc/fluid/advanced_usage/deploy/anakin/how_to_add_anakin_op.md
deleted file mode 100644
index f2783eb9f..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/how_to_add_anakin_op.md
+++ /dev/null
@@ -1,405 +0,0 @@
-# How to Add a New Operator
-
-## Basic Concepts
-
-A quick introduction to the basic concepts around Operators; see the design documents for details.
-
-```framework```: the upper-level logic. It obtains parameters and weights from the parser; adding an op mainly modifies the framework/operator directory.
-
-```saber```: the lower-level implementation. Anakin wraps the different backends through saber; each implementation (impl) is specialized separately, and the outer framework dispatches into each impl through templates. Each op's parameters live in saber/saber_funcs_param.h; adding an op mainly modifies saber/funcs.
-
-saber's file layout:
-* saber/funcs holds the external interfaces of the funcs; ops at this level are independent of any specific device and are defined only by what they compute. Being impl-independent, file names at this level carry no impl.
-* saber/funcs/impl holds each op's impl declarations; a device must provide specializations of these declarations, e.g. saber/funcs/impl/x86 implements the x86 specializations and saber/funcs/impl/cuda the NV ones. Adding a new backend requires new specializations. Code at this level is implementation-specific and carries the ```impl_``` prefix.
-* saber/funcs/impl/cuda/base/cuda_c holds the cuda ```.cu``` files; new cuda kernels are added under this directory.
-* saber/funcs/impl/cuda/base/sass holds static libraries built from assembly for different architectures.
-
-### The base classes involved and how they relate
-
-A quick look at the relevant base classes
-
-* ```anakin::Operator```: the framework operator base class, in framework/core/operator/operator.h
-
-* ```anakin::saber::BaseFunc```: the base class of saber's external op interface, providing the unified external API, in saber/funcs/base.h. BaseFunc's ```compute_output_shape``` computes the output shape from the input shape and the param only, and writes it into the output through ```tensor```'s ```set_shape``` (which only sets the shape, without allocating). The ```operator()``` interface is each op's compute entry.
-
-* ```anakin::saber::ImplBase```: the interface of saber's device-level op implementations; the base class of all device-specific impls, in saber/funcs/impl/impl_base.h. Implementations come in two kinds. Those prefixed ```vender_``` implement the op with a third-party library, such as cudnn's conv or mkl's conv; their performance is hard for us to tune, so they form their own group. The others are saber implementations with source code, prefixed ```saber_```; having source, their performance can keep improving through later optimization. Keep this in mind when naming an implementation.
-
-## Adding an Operator
-
-Adding a new op takes the following steps:
-
-1. add the saber param
-2. define the saber Operator class
-3. declare the new impl
-4. implement the new impl
-5. add the framework implementation or specialization
-
-These steps are walked through below using a simple example.
-
-Suppose we want to add a new Mul op, with the formula: $$Out = \alpha \cdot X * Y$$
-
-### Adding the operator's param
-
-File involved: ```saber/saber_funcs_param.h```. If the op's param already exists, this step can be skipped.
-Here ```XXXParam``` is a ```struct``` containing a no-argument constructor, a parameterized constructor, a copy constructor, ```operator=()```, and ```operator==()```.
-```
-template <typename opTensor> // from opTensor the target, datatype and layout can be obtained
-struct MulParam{
-  MulParam()
-    : alpha(0)
-  {}
-  MulParam(float alpha_in)
-    : alpha(alpha_in)
-  {}
-  MulParam(const MulParam& right)
-    : alpha(right.alpha)
-  {}
-  MulParam &operator=(const MulParam &right) {
-    alpha = right.alpha;
-    return *this;
-  }
-  bool operator==(const MulParam &right) {
-    return alpha == right.alpha;
-  }
-  float alpha;
-};
-```
-
-### Defining the Operator class
-File involved: ```saber/funcs/mul.h```. If the op's class has been defined before, update the included impl headers here.
-A relatively complete definition skeleton is given below for reference.
-```
-//Each device must include its corresponding operator implementation. [Details](#impl)
-#ifdef NVIDIA_GPU
-#include "saber/funcs/impl/cuda/saber_mul.h"
-#include "saber/funcs/impl/cuda/vender_mul.h"
-#endif
-//If a device has no operator implementation yet, include the declaration. [Details](#declare)
-#ifdef USE_X86_PLACE
-#include "saber/funcs/impl/impl_mul.h"
-#endif
-namespace anakin {
-namespace saber {
-template<typename TargetType,
-        DataType OpDtype,
-        DataType inDtype = AK_FLOAT,
-        DataType outDtype = AK_FLOAT,
-        typename LayOutType_op = NCHW,
-        typename LayOutType_in = NCHW,
-        typename LayOutType_out = NCHW>
-class Mul : public BaseFunc<
-    Tensor<TargetType, inDtype, LayOutType_in>,
-    Tensor<TargetType, outDtype, LayOutType_out>,
-    Tensor<TargetType, OpDtype, LayOutType_op>,
-    ImplBase, MulParam> {
-public:
-    using BaseFunc<
-        Tensor<TargetType, inDtype, LayOutType_in>,
-        Tensor<TargetType, outDtype, LayOutType_out>,
-        Tensor<TargetType, OpDtype, LayOutType_op>,
-        ImplBase, MulParam>::BaseFunc;
-    Mul() = default;
-    typedef Tensor<TargetType, inDtype, LayOutType_in> InDataTensor;
-    typedef Tensor<TargetType, outDtype, LayOutType_out> OutDataTensor;
-    typedef Tensor<TargetType, OpDtype, LayOutType_op> OpTensor;
-    typedef MulParam<OpTensor> Param_t;
-    typedef std::vector<InDataTensor*> Input_v;
-    typedef std::vector<OutDataTensor*> Output_v;
-    typedef std::vector<Shape> Shape_v;
-
-    virtual SaberStatus compute_output_shape(const Input_v &input,
-        Output_v &output, Param_t &param) override {
-        //compute the output shape
-        Shape output_shape = (input[0]->valid_shape());
-        /* code */
-        return output[0]->set_shape(output_shape);
-    }
-    virtual SaberStatus init_impl(ImplEnum implenum) override {
-        // every device uses this init_impl; it creates the matching impl.
-        switch (implenum) {
-            case VENDER_IMPL:
-                this->_impl.push_back(new VenderMul <TargetType,
-                    OpDtype, inDtype, outDtype,
-                    LayOutType_op, LayOutType_in, LayOutType_out>);
-                return SaberSuccess;
-            case SABER_IMPL:
-                this->_impl.push_back(new SaberMul <TargetType,
-                    OpDtype, inDtype, outDtype,
-                    LayOutType_op, LayOutType_in, LayOutType_out>);
-                return SaberSuccess;
-            default:
-                return SaberUnImplError;
-        }
-    }
-private:
-    virtual void pick_best_static() override {
-        if (true) // some condition?
-            this->_best_impl = this->_impl[0];
-    }
-    virtual void pick_best_specify(ImplEnum implenum) override {
-        this->_best_impl = this->_impl[0];
-    }
-};
-} // namespace saber
-} // namespace anakin
-```
-
-### Declaring the op's new impl
-
-File involved: ```saber/funcs/impl/impl_mul.h```. Every device specializes this same declaration, with the specialized versions placed in the corresponding folders; the declaration here is the unified one for all devices. A reference follows.
-```
-#include "saber/funcs/impl/impl_macro.h"
-namespace anakin{
-namespace saber{
-DEFINE_OP_CLASS(Mul, MulParam); // first argument: the op's name; second: the name of the corresponding param
-}
-}
-```
-
-### Implementing the op for a specific backend
-
-Files involved: ```saber/funcs/impl/xxx/vender_mul.h``` or ```saber/funcs/impl/xxx/saber_mul.h```
-Here ```xxx``` stands for a particular device; ```vender``` marks ops implemented with third-party libraries and ```saber``` marks ops implemented in source. Taking cuda's vender implementation as an example, the basic interfaces of the specialization are:
-
-```
-// include the corresponding declaration
-#include "saber/funcs/impl/impl_mul.h"
-
-namespace anakin{
-namespace saber{
-template <DataType OpDtype,
-        DataType inDtype,
-        DataType outDtype,
-        typename LayOutType_op,
-        typename LayOutType_in,
-        typename LayOutType_out>
-class VenderMul<NV, OpDtype, inDtype, outDtype,
-        LayOutType_op, LayOutType_in, LayOutType_out> :
-    public ImplBase<
-        Tensor<NV, inDtype, LayOutType_in>,
-        Tensor<NV, outDtype, LayOutType_out>,
-        Tensor<NV, OpDtype, LayOutType_op>,
-        MulParam<Tensor<NV, OpDtype, LayOutType_op> > >
-{
-public:
-    typedef Tensor<NV, inDtype, LayOutType_in> DataTensor_in;
-    typedef Tensor<NV, outDtype, LayOutType_out> DataTensor_out;
-    typedef Tensor<NV, OpDtype, LayOutType_op> OpTensor;
-    typedef typename DataTensor_in::Dtype InDataType;
-    typedef typename DataTensor_out::Dtype OutDataType;
-    typedef typename OpTensor::Dtype OpDataType;
-    VenderMul(){}
-    ~VenderMul() {}
-
-    virtual SaberStatus init(const std::vector<DataTensor_in*>& inputs,
-                    std::vector<DataTensor_out*>& outputs,
-                    MulParam<OpTensor>& param, Context<NV>& ctx) {
-        this->_ctx = ctx;
-        return create(inputs, outputs, param, ctx);
-    }
-
-    virtual SaberStatus create(const std::vector<DataTensor_in*>& inputs,
-                    std::vector<DataTensor_out*>& outputs,
-                    MulParam<OpTensor>& param, Context<NV>& ctx) {
-        // set internal parameters
-    }
-
-    virtual SaberStatus dispatch(const std::vector<DataTensor_in*>& inputs,
-                    std::vector<DataTensor_out*>& outputs,
-                    MulParam<OpTensor>& param) {
-        // dispatch kernel.
-    }
-
-private:
-};
-}
-}
-```
-The difference between ```init``` and ```create```: ```init``` is entered only the first time the op is initialized, so it holds code that should run once, such as malloc or create-style calls. ```create``` runs not only during the first init but again whenever the input or the param changes; it usually holds set-style code for internal variables, executed when the input or weights change. Because ```create``` is triggered inside the network, expensive operations there will slow the whole op down, so choose carefully where each operation goes.
-### Adding the framework specialization
-
-Files involved: ```framework/operators/mul.h``` and ```framework/operators/mul.cpp```.
-A quick look at how to add or modify an operator inside framework
-
-```
-#include "framework/core/base.h"
-#include "framework/core/data_types.h"
-#include "framework/core/operator/operator.h"
-#include "utils/logger/logger.h"
-#include "saber/funcs/mul.h" // include the corresponding saber header
-namespace anakin {
-namespace ops {
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class MulHelper;
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class Mul : public Operator<Ttype, Dtype, Ptype> {
-public:
-    Mul() {}
-    /// forward impl
-    virtual void operator() (OpContext<Ttype> &ctx,
-        const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-        LOG(ERROR) << "Not Impl Yet Operator Mul<"
-            << type_id<typename DataTypeWarpper<Dtype>::type>().type_info() << ">";
-    }
-    friend class MulHelper<Ttype, Dtype, Ptype>;
-};
-template<typename Ttype, DataType Dtype, Precision Ptype>
-class MulHelper : public OperatorHelper<Ttype, Dtype, Ptype> {
-public:
-    MulHelper() = default;
-    ~MulHelper();
-    Status InitParam() override;
-
-    Status Init(OpContext<Ttype> &ctx,
-        const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
-    Status InferShape(const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-        std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) override;
-
-public:
-    saber::MulParam<Tensor4d<Ttype, Dtype> > _param_mul;
-    saber::Mul<Ttype, Dtype> _funcs_mul;
-};
-}
-} /* namespace anakin */
-```
-The corresponding ```.cpp``` file:
-```
-#include "framework/operators/mul.h"
-
-namespace anakin {
-namespace ops {
-
-#ifdef USE_CUDA
-template<>
-void Mul<NV, AK_FLOAT, Precision::FP32>::operator()(
-    OpContext<NV>& ctx,
-    const std::vector<Tensor4dPtr<NV, AK_FLOAT> >& ins,
-    std::vector<Tensor4dPtr<NV, AK_FLOAT> >& outs) {
-    auto* impl =
-        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper);
-    auto& param =
-        static_cast<MulHelper<NV, AK_FLOAT, Precision::FP32>*>(this->_helper)->_param_mul;
-    impl->_funcs_mul(ins, outs, param, ctx);
-}
-#endif
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::InitParam() {
-    auto alpha = GET_PARAMETER(float, alpha);
-    MulParam<Tensor4d<Ttype, Dtype> > param_mul(alpha);
-    _param_mul = param_mul;
-    return Status::OK();
-}
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::Init(OpContext<Ttype>& ctx,
-    const std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-    std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-
-    SABER_CHECK(_funcs_mul.init(ins, outs, _param_mul, SPECIFY, VENDER_IMPL, ctx));
-    return Status::OK();
-}
-
-template<typename Ttype, DataType Dtype, Precision Ptype>
-Status MulHelper<Ttype, Dtype, Ptype>::InferShape(const
-    std::vector<Tensor4dPtr<Ttype, Dtype> >& ins,
-    std::vector<Tensor4dPtr<Ttype, Dtype> >& outs) {
-    SABER_CHECK(_funcs_mul.compute_output_shape(ins, outs, _param_mul));
-    return Status::OK();
-}
-
-#ifdef USE_CUDA
-template class MulHelper<NV, AK_FLOAT, Precision::FP32>;
-#endif
-#ifdef USE_ARM_PLACE
-template class MulHelper<ARM, AK_FLOAT, Precision::FP32>;
-#endif
-// register helper
-#ifdef USE_CUDA
-ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, NV, AK_FLOAT, Precision::FP32);
-#endif
-#ifdef USE_ARM_PLACE
-ANAKIN_REGISTER_OP_HELPER(Mul, MulHelper, ARM, AK_FLOAT, Precision::FP32);
-#endif
-//! register op
-ANAKIN_REGISTER_OP(Mul)
-.Doc("Mul operator")
-#ifdef USE_CUDA
-.__alias__<NV, AK_FLOAT, Precision::FP32>("mul")
-#endif
-#ifdef USE_ARM_PLACE
-.__alias__<ARM, AK_FLOAT, Precision::FP32>("mul")
-#endif
-.num_in(1)
-.num_out(1)
-.Args<float>("alpha", " alpha of Mul "); // register
-
-} /* namespace ops */
-
-} /* namespace anakin */
-```
-
-## Writing the Unit Test
-File involved: ```test/saber/xxx/test_saber_funcs_mul_xxx.cpp```
-Add a new unit test under the corresponding test directory
-
-```
-TEST(TestSaberFuncNV, test_depthwise_conv) {
-
- // init tensors and some param.
-
- // start Reshape & doInfer
-    Context<NV> ctx1(0, 1, 1);
-
- // create param
-    MulParam<Tensor<NV, AK_FLOAT, NCHW> > param(alpha);
-
-    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> input;
-    std::vector<Tensor<NV, AK_FLOAT, NCHW>*> output;
-
-    // create saber op
-    Mul<NV, AK_FLOAT> mul;
-
- // compute output shape
- mul.compute_output_shape(input, output, param);
-
- // re_alloc output tensors memory based on output shape
- output[0]->re_alloc(output[0]->shape());
-
- // init saber op(calling init and create)
- mul.init(input, output, param, SPECIFY, VENDER_IMPL, ctx1);
-
- // call operator()
- mul(input, output, param, ctx1);
-
- // cuda specified, record events
- cudaStream_t cuda_stream = ctx1.get_compute_stream();
- output[0]->record_event(cuda_stream);
- output_dev.sync();
-
- // param changed
- param.alpha = 2.0;
- // auto calling saber op(create and dispatch)
- mul(input, output, param, ctx1);
-
- cudaDeviceSynchronize();
- CUDA_CHECK(cudaPeekAtLastError());
-}
-
-int main(int argc, const char** argv){
-    anakin::saber::Env<NV>::env_init();
-
- // initial logger
- //logger::init(argv[0]);
- InitTest();
- RUN_ALL_TESTS(argv[0]);
- return 0;
-}
-
-```
-## Debugging and Caveats
-
-An op needs both an external op interface and internal implementations. Because non-specialized declarations exist under saber/funcs/impl, the code still compiles when an op has no implementation for some device -- but what runs then is an empty implementation that does nothing.
diff --git a/doc/fluid/advanced_usage/deploy/anakin/how_to_support_new_device_in_anakin.md b/doc/fluid/advanced_usage/deploy/anakin/how_to_support_new_device_in_anakin.md
deleted file mode 100644
index da2c64cf4..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/how_to_support_new_device_in_anakin.md
+++ /dev/null
@@ -1,459 +0,0 @@
-# How to Support a New Device
-
-## Overview
-
-Adding a new device takes the following 3 steps:
-
-* [Add device support in `CMakeList`](#0001)
-* [Add the device implementation in `saber`](#0002)
-* [Add the device specializations and instantiations in `framework`](#0003)
-
-Suppose the new device is named `TNEW`; the demonstration below uses that name.
-
-## Adding Device Support in `CMakeList` ##
-
-* Edit the root `CMakeList.txt`
-```cmake
-#select the platform to build
-anakin_option(USE_GPU_PLACE "Select the build mode for GPU place." NO)
-anakin_option(USE_X86_PLACE "Select the build mode for X86 place." NO)
-anakin_option(USE_ARM_PLACE "Select the build mode for ARM place." NO)
-anakin_option(USE_TNEW_PLACE "Select the build mode for TNEW place." YES)
-```
-
-* Edit `saber/CMakeList.txt`
-
-Extend the `CMakeList.txt` under the `saber` directory according to the new device's directories.
-```cmake
-if(USE_TNEW_PLACE)
- anakin_fetch_files_with_suffix(${ANAKIN_SABER}/core/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
- anakin_fetch_files_with_suffix(${ANAKIN_SABER}/funcs/impl/tnew "cpp" ANAKIN_SABER_BASE_SRC)
-endif()
-```
-
-* Edit `test/CMakeList.txt`
-
-The new device's unit tests go under `test/saber/tnew`; update the `CMakeList.txt` under the `test` directory.
-```cmake
-if(USE_TNEW_PLACE)
- anakin_fetch_files_with_suffix(${ANAKIN_UNIT_TEST}/saber/tnew "cpp" ANAKIN_TEST_CASE_SRC)
-endif()
-```
-
-* Edit `cmake/anakin_config.h.in`
-```c++
-// platform to use
-#cmakedefine USE_GPU_PLACE
-
-#cmakedefine USE_X86_PLACE
-
-#cmakedefine USE_ARM_PLACE
-
-#cmakedefine USE_TNEW_PLACE
-```
-
-* Other dependencies and compile options
-Update `compiler_options.cmake` and `find_modules.cmake` under the `cmake` directory; a sketch follows.
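-
-A hypothetical sketch of what the `TNEW` branch in `compiler_options.cmake` might gain (paths and flags are placeholders for the device toolchain):
-
-```cmake
-if(USE_TNEW_PLACE)
-    # point the build at the TNEW toolchain; paths are placeholders
-    include_directories(/path/to/tnew/sdk/include)
-    link_directories(/path/to/tnew/sdk/lib)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_TNEW_PLACE")
-endif()
-```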
-
-
-## Adding the Device Implementation in `saber` ##
-`saber` is `Anakin`'s core compute library; it exposes a unified, device-independent API, and all device-specific implementation is wrapped inside `TargetWrapper`.
-
-### Add the device in `saber/saber_types.h`
-
-```c++
-enum TargetTypeEnum {
- eINVALID = -1,
- eNV = 1,
- eAMD = 2,
- eARM = 3,
- eX86 = 4,
- eNVHX86 = 5,
- eTNEW = 6
-};
-
-typedef TargetType<eNV> NV;
-typedef TargetType<eARM> ARM;
-typedef TargetType<eAMD> AMD;
-typedef TargetType<eX86> X86;
-typedef TargetType<eTNEW> TNEW;
-
-```
-
-### Add the device implementation in `saber/core`
-
-1. Add the new device in `target_traits.h`
-
-* Add the device type
-```c++
-struct __cuda_device{};
-struct __arm_device{};
-struct __amd_device{};
-struct __x86_device{};
-struct __tnew_device{};
-```
-
-* Specialize the `TargetTypeTraits` template
-```c++
-template <>
-struct TargetTypeTraits<TNEW> {
-    typedef __xxx_target target_category; //choose __host_target or __device_target depending on whether the device is host- or device-side
- typedef __tnew_device target_type;
-};
-```
-
-2. Specialize the `DataTrait` template class in `data_traits.h`
-
-If the device needs special data types, specialize the device's `DataTrait` implementation; for example, the OpenCL data types are implemented as follows:
-```c++
-#ifdef USE_OPENCL
-struct ClMem{
- ClMem(){
- dmem = nullptr;
- offset = 0;
- }
-
- ClMem(cl_mem* mem_in, int offset_in = 0) {
- dmem = mem_in;
- offset = offset_in;
- }
-
- ClMem(ClMem& right) {
- dmem = right.dmem;
- offset = right.offset;
- }
-
- ClMem& operator=(ClMem& right) {
- this->dmem = right.dmem;
- this->offset = right.offset;
- return *this;
- }
-
- ClMem& operator+(int offset_in) {
- this->offset += offset_in;
- return *this;
- }
-
- int offset{0};
- cl_mem* dmem;
-};
-
-template <>
-struct DataTrait<AMD, AK_FLOAT> { // specialized per (target, data type) pair
-    typedef ClMem Dtype;
-    typedef float dtype;
-};
-
-template <>
-struct DataTrait<AMD, AK_DOUBLE> {
-    typedef ClMem Dtype;
-    typedef double dtype;
-};
-
-template <>
-struct DataTrait<AMD, AK_INT8> {
-    typedef ClMem Dtype;
-    typedef char dtype;
-};
-#endif // USE_OPENCL
-```
-
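-For context, here is a minimal sketch of how such a trait is typically consumed (all names below are illustrative stand-ins, not Anakin's actual API): generic code asks the trait for the storage type instead of hard-coding it, so a device with exotic storage such as `ClMem` can be plugged in without touching callers.
-
-```c++
-#include <vector>
-
-struct X86 {};                     // stand-in target tag
-enum DataType { AK_FLOAT };
-
-template <typename Target, DataType D>
-struct DataTrait;                  // primary template, specialized per device
-
-template <>
-struct DataTrait<X86, AK_FLOAT> {
-    typedef float Dtype;           // plain float storage on x86
-    typedef float dtype;
-};
-
-// Generic code never names float/ClMem directly; it goes through the trait.
-template <typename Target, DataType D>
-std::vector<typename DataTrait<Target, D>::Dtype> make_buffer(int n) {
-    return std::vector<typename DataTrait<Target, D>::Dtype>(n);
-}
-
-int main() {
-    std::vector<float> buf = make_buffer<X86, AK_FLOAT>(16);
-    return buf.size() == 16 ? 0 : 1;
-}
-```
-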
-3. Specialize the `TargetWrapper` template class in `target_wrapper.h`
-
-Specialize the `TargetWrapper` template class and declare its functions in `target_wrapper.h` as follows (a usage sketch follows the declaration):
-```c++
-template <>
-struct TargetWrapper<TNEW, __xxx_target> { // replace __xxx_target with __host_target or __device_target according to TNEW's actual type
-
-    typedef xxx_event event_t;   // implement xxx_event for the device
-    typedef xxx_stream stream_t; // implement xxx_stream for the device
-
- static void get_device_count(int& count);
-
- static void set_device(int id);
-
- //We should add strategy to avoid malloc directly
- static void mem_alloc(void** ptr, size_t n);
-
- static void mem_free(void* ptr);
-
- static void mem_set(void* ptr, int value, size_t n);
-
- static void create_event(event_t& event, bool flag = false);
-
- static void create_stream(stream_t& stream);
-
- static void create_stream_with_flag(stream_t& stream, unsigned int flag);
-
- static void create_stream_with_priority(stream_t& stream, unsigned int flag, int priority);
-
- static void destroy_stream(stream_t& stream);
-
- static void destroy_event(event_t& event);
-
- static void record_event(event_t& event, stream_t stream);
-
- static void query_event(event_t& event);
-
- static void sync_event(event_t& event);
-
- static void sync_stream(event_t& event, stream_t& stream);
-
- static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, __DtoD);
-
- static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, stream_t& stream, __DtoD);
-
- static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, __HtoD);
-
- static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, stream_t& stream, __HtoD);
-
- static void sync_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, __DtoH);
-
- static void async_memcpy(void* dst, int dst_id, const void* src, int src_id, \
- size_t count, stream_t& stream, __DtoH);
-
- static void sync_memcpy_p2p(void* dst, int dst_dev, const void* src, \
- int src_dev, size_t count);
-
- static void async_memcpy_p2p(void* dst, int dst_dev, const void* src, \
- int src_dev, size_t count, stream_t& stream);
-
- static int get_device_id();
-};
-
-```
-
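-Before moving to the implementation, here is a minimal self-contained sketch of how device-independent code drives these statics (the toy wrapper below only mimics the interface declared above; it is not Anakin's real implementation):
-
-```c++
-#include <cstdlib>
-#include <cstring>
-
-struct __DtoD {};  // copy-direction tag, as in the declaration above
-
-// Toy host-backed wrapper exposing the same static interface.
-struct TNEWWrapper {
-    static void mem_alloc(void** ptr, size_t n) { *ptr = std::malloc(n); }
-    static void mem_free(void* ptr) { std::free(ptr); }
-    static void mem_set(void* ptr, int v, size_t n) { std::memset(ptr, v, n); }
-    static void sync_memcpy(void* dst, int /*dst_id*/, const void* src, int /*src_id*/,
-                            size_t count, __DtoD) { std::memcpy(dst, src, count); }
-};
-
-// Generic saber-style code is written once against the wrapper's statics,
-// so swapping in another device only swaps the wrapper.
-template <typename API>
-void fill_and_copy(size_t bytes) {
-    void* a = nullptr; void* b = nullptr;
-    API::mem_alloc(&a, bytes);
-    API::mem_alloc(&b, bytes);
-    API::mem_set(a, 0, bytes);
-    API::sync_memcpy(b, 0, a, 0, bytes, __DtoD());
-    API::mem_free(a);
-    API::mem_free(b);
-}
-
-int main() { fill_and_copy<TNEWWrapper>(64); return 0; }
-```
-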
-4. Add a device directory and implementation under `impl/`
-
-Add the device directory `tnew` under `saber/core/impl`.
-* Define each function of the `TargetWrapper` struct there (a hedged example of one definition follows the skeleton below).
-If the `TargetWrapper` implementation matches the default template class, the specialization does not need to be written.
-
-```c++
-typedef TargetWrapper<TNEW> TNEW_API;
-void TNEW_API::get_device_count(int &count) {
- // add implementation
-}
-
-void TNEW_API::set_device(int id){
- // add implementation
-}
-
-void TNEW_API::mem_alloc(void** ptr, size_t n){
- // add implementation
-}
-
-void TNEW_API::mem_free(void* ptr){
- if(ptr != nullptr){
- // add implementation
- }
-}
-...
-
-```
-
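-As a concrete but hedged illustration of filling in one of these definitions, suppose the TNEW SDK exposed a malloc/free pair; `tnewMalloc` and `tnewFree` below are hypothetical placeholders for whatever vendor API the device actually ships with, and the struct is a stand-in for the `TargetWrapper<TNEW>` specialization so the sketch compiles on its own:
-
-```c++
-#include <cstdlib>
-
-// Hypothetical vendor runtime API (placeholders, not a real SDK).
-static int tnewMalloc(void** p, size_t n) { *p = std::malloc(n); return *p ? 0 : -1; }
-static int tnewFree(void* p) { std::free(p); return 0; }
-
-// Stand-in for the TargetWrapper<TNEW> typedef'd as TNEW_API above.
-struct TNEW_API {
-    static void mem_alloc(void** ptr, size_t n);
-    static void mem_free(void* ptr);
-};
-
-void TNEW_API::mem_alloc(void** ptr, size_t n) {
-    if (tnewMalloc(ptr, n) != 0) {  // delegate to the vendor allocator
-        *ptr = nullptr;             // callers check for nullptr on failure
-    }
-}
-
-void TNEW_API::mem_free(void* ptr) {
-    if (ptr != nullptr) {
-        tnewFree(ptr);
-    }
-}
-
-int main() {
-    void* p = nullptr;
-    TNEW_API::mem_alloc(&p, 128);
-    TNEW_API::mem_free(p);
-    return 0;
-}
-```
-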
-* Specialize and implement `Device` from `device.h`
-
-```c++
-template <>
-void Device<TNEW>::create_stream() {
-    // add implementation
-}
-
-template <>
-void Device<TNEW>::get_info() {
-    // add implementation
-}
-
-```
-
-### Implement device-specific ops in `saber/funcs`
-
-See [How to add a new Operator](./how_to_add_anakin_op.html); a hedged sketch of the usual implementation shape follows.
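-
-For orientation, saber funcs follow an init/create/dispatch lifecycle (visible in the test program at the end of the operator guide). The sketch below mirrors that shape with plain stand-in types; none of these names are Anakin's real base classes:
-
-```c++
-#include <vector>
-
-struct Tensor { std::vector<float> data; };   // illustrative tensor stand-in
-struct MulParam { float alpha = 1.0f; };
-struct Context {};
-
-// init() once, create() on shape/param change, dispatch() per run.
-class SaberMulTNEW {
-public:
-    int init(const std::vector<Tensor*>& in, std::vector<Tensor*>& out,
-             MulParam& param, Context& ctx) {
-        _ctx = &ctx;
-        return create(in, out, param);
-    }
-    int create(const std::vector<Tensor*>& in, std::vector<Tensor*>& out,
-               MulParam& /*param*/) {
-        out[0]->data.resize(in[0]->data.size());  // re-shape the output
-        return 0;
-    }
-    int dispatch(const std::vector<Tensor*>& in, std::vector<Tensor*>& out,
-                 MulParam& param) {
-        for (size_t i = 0; i < in[0]->data.size(); ++i)
-            out[0]->data[i] = param.alpha * in[0]->data[i] * in[1]->data[i];
-        return 0;
-    }
-private:
-    Context* _ctx = nullptr;
-};
-
-int main() {
-    Tensor a, b, c;
-    a.data = {1.f, 2.f}; b.data = {3.f, 4.f};
-    std::vector<Tensor*> in = {&a, &b}, out = {&c};
-    MulParam p; Context ctx;
-    SaberMulTNEW mul;
-    mul.init(in, out, p, ctx);
-    mul.dispatch(in, out, p);
-    return c.data[1] == 8.f ? 0 : 1;
-}
-```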
-
-
-## Add Device Specializations or Instantiations in `framework` ##
-
-### `framework/core`
-
-* Add instantiations in `net.cpp`
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
-template class Net<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
-#endif
-```
-
-* Add instantiations in `operator_func.cpp`
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class OperatorFunc<TNEW, AK_FLOAT, Precision::FP32>;
-#endif
-```
-
-* Add instantiations in `worker.cpp`
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::ASYNC>;
-template class Worker<TNEW, AK_FLOAT, Precision::FP32, OpRunType::SYNC>;
-#endif
-```
-
-* Add instantiations in `operator_attr.cpp`
-
-```c++
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP32>(const std::string& op_name);
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::FP16>(const std::string& op_name);
-template
-OpAttrWarpper& OpAttrWarpper::__alias__<TNEW, AK_FLOAT, Precision::INT8>(const std::string& op_name);
-```
-
-* Add the device implementation in `parameter.h`
-
-```c++
-#ifdef USE_TNEW_PLACE
-template <typename Ttype>
-class PBlock {
-public:
-    typedef Tensor4d<Ttype, DataTypeTraits<AK_FLOAT>::type> type;
-
-    PBlock() {
-        _inner_tensor = std::make_shared<type>();
-    }
-    ...
-};
-#endif //TNEW
-```
-
-* Add the device implementation in `type_traits_extend.h`
-
-```c++
-template<>
-struct target_host<saber::TNEW> {
-    typedef saber::X86 type; // pick the correct host type for TNEW
-};
-```
-
-### `framework/graph`
-
-* Add instantiations in `graph.cpp`
-
-```c++
- #ifdef USE_TNEW_PLACE
- template class Graph<TNEW, AK_FLOAT, Precision::FP32>;
- template class Graph<TNEW, AK_FLOAT, Precision::FP16>;
- template class Graph<TNEW, AK_FLOAT, Precision::INT8>;
- #endif
-```
-
-### `framework/model_parser`
-
-* Add instantiations in `parser.cpp`
-
-```c++
- #ifdef USE_TNEW_PLACE
- template
- Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-                                              const char* model_path);
- template
- Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-                                              const char* model_path);
- template
- Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-                                              const char* model_path);
-
- template
- Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-                                              std::string& model_path);
- template
- Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-                                              std::string& model_path);
- template
- Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-                                              std::string& model_path);
-
- template
- Status load<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-                                              std::string& model_path);
- template
- Status load<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-                                              std::string& model_path);
- template
- Status load<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-                                              std::string& model_path);
-
- template
- Status save<TNEW, AK_FLOAT, Precision::FP32>(graph::Graph<TNEW, AK_FLOAT, Precision::FP32>* graph,
-                                              const char* model_path);
- template
- Status save<TNEW, AK_FLOAT, Precision::FP16>(graph::Graph<TNEW, AK_FLOAT, Precision::FP16>* graph,
-                                              const char* model_path);
- template
- Status save<TNEW, AK_FLOAT, Precision::INT8>(graph::Graph<TNEW, AK_FLOAT, Precision::INT8>* graph,
-                                              const char* model_path);
- #endif
-```
-
-* Add instantiations in `model_io.cpp`
-
-```c++
-#ifdef USE_TNEW_PLACE
-template class NodeIO<TNEW, AK_FLOAT, Precision::FP32>;
-template class NodeIO<TNEW, AK_FLOAT, Precision::FP16>;
-template class NodeIO<TNEW, AK_FLOAT, Precision::INT8>;
-#endif
-```
-
-### `framework/operators`
-
-Add instantiations or specializations for every op under `framework/operators`.
-Taking `activation.cpp` as an example, the instantiation looks like this:
-
-```c++
-#ifdef USE_TNEW_PLACE
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
-template class ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>;
-ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
-#endif
-```
-
-If the TNEW implementation of a function differs from the existing template implementation, it can be specialized as follows (using `Init()` as an example):
-```c++
-#ifdef USE_TNEW_PLACE
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP32);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::FP16);
-INSTANCE_ACTIVATION(TNEW, AK_FLOAT, Precision::INT8);
-template <>
-Status ActivationHelper<TNEW, AK_FLOAT, Precision::FP32>::Init(OpContext<TNEW>& ctx, \
-    const std::vector<Tensor4dPtr<TNEW, AK_FLOAT>>& ins, \
-    std::vector<Tensor4dPtr<TNEW, AK_FLOAT>>& outs) {
-    SABER_CHECK(_funcs_activation.init(ins, outs, _param_activation, SPECIFY, SABER_IMPL, ctx)); // choose the implementation (SABER_IMPL or VENDER_IMPL) here
- return Status::OK();
-}
-ANAKIN_REGISTER_OP_HELPER(Activation, ActivationHelper, TNEW, AK_FLOAT, Precision::FP32);
-#endif
-```
-
-Add the TNEW registration inside `ANAKIN_REGISTER_OP(Activation)`:
-
-```c++
-#ifdef USE_TNEW_PLACE
-.__alias__<TNEW, AK_FLOAT, Precision::FP32>("activation")
-#endif
-```
-
-## Caveats
-Do not change the interfaces or implementations of the `Tensor`/`Buffer`/`Env`/`Context` classes and their member functions.
diff --git a/doc/fluid/advanced_usage/deploy/anakin/index_cn.rst b/doc/fluid/advanced_usage/deploy/anakin/index_cn.rst
deleted file mode 100644
index 3027bd541..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/index_cn.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-Anakin Inference Engine
-#######################
-
-
-User Guides
-~~~~~~~~~~~
-
-.. toctree::
- :maxdepth: 1
-
- install_anakin.md
- convert_paddle_to_anakin.md
- anakin_tutorial.md
- anakin_run_on_arm.md
- anakin_example.md
- int8_design_anakin.md
- anakin_gpu_benchmark.md
- anakin_arm_benchmark.md
-
-
-Developer Guides
-~~~~~~~~~~~~~~~~
-
-.. toctree::
- :maxdepth: 1
-
- how_to_add_anakin_op.md
- how_to_support_new_device_in_anakin.md
- anakin_parser_design.md
diff --git a/doc/fluid/advanced_usage/deploy/anakin/install_anakin.md b/doc/fluid/advanced_usage/deploy/anakin/install_anakin.md
deleted file mode 100644
index 0b44a6be3..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/install_anakin.md
+++ /dev/null
@@ -1,76 +0,0 @@
-## Building and Installing Anakin from Source ##
-
-We have installed and tested Anakin successfully on CentOS 7.3. Support for other operating systems is coming soon.
-
-### Installation Overview ###
-
-* [Install Anakin on CentOS]()
-* [Install Anakin on Ubuntu]()
-* [Install Anakin on ARM](./anakin_run_on_arm.html)
-* [Verify the installation]()
-
-
-### Install Anakin on CentOS ###
-#### 1. System requirements ####
-
-* make 3.82+
-* cmake 2.8.12+
-* gcc 4.8.2+
-* g++ 4.8.2+
-
-#### 2. Build the CPU version of Anakin ####
-
-Not supported yet.
-
-#### 3. Build Anakin with NVIDIA GPU support ####
-
-- 3.1 Install dependencies
-
-    - 3.1.1 protobuf
-
- ```
- > git clone https://github.com/google/protobuf
- > cd protobuf
- > git submodule update --init --recursive
- > ./autogen.sh
-    > ./configure --prefix=/path/to/your/install_dir
- > make
- > make check
- > make install
- > sudo ldconfig
- ```
-
-    If you hit any problem installing protobuf, see [here](https://github.com/google/protobuf/blob/master/src/README.md).
-
-- 3.2 CUDA Toolkit
-
-    - [CUDA 8.0](https://developer.nvidia.com/cuda-zone) or higher; see [NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) for details.
-    - [cuDNN v7](https://developer.nvidia.com/cudnn); see [NVIDIA's documentation](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) for details.
-
-- 3.3 Build Anakin
-
- ```
- > git clone https:/xxxxx
- > cd anakin
-    > mkdir build
-    > cd build
-    > cmake ..
- > make
- ```
-
-#### 4. Build Anakin with AMD GPU support ####
-
-Not supported yet.
-
-
-### Install Anakin on Ubuntu ###
-
-Not supported yet.
-
-
-### Install Anakin on ARM ###
-
-See the [ARM installation guide](./anakin_run_on_arm.html).
-
-### Verify the installation ###
-
-After installation finishes without errors, you can verify the build by running the unit-test binaries under `output/unit_test`. A minimal smoke test is also sketched below.
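-
-Beyond the shipped unit tests, a small linking smoke test can also confirm the install. This is only a sketch: the header path and the `Env<NV>` template parameter are assumptions based on the unit-test example in the operator guide, so adjust both to your build:
-
-```c++
-// smoke_test.cpp -- compile and link against the built Anakin libraries.
-#include "saber/core/env.h"  // assumed header location
-
-int main() {
-    // Initializes the target device environment; NV assumes a CUDA build.
-    anakin::saber::Env<anakin::saber::NV>::env_init();
-    return 0;
-}
-```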
diff --git a/doc/fluid/advanced_usage/deploy/anakin/run_anakin_on_arm.md b/doc/fluid/advanced_usage/deploy/anakin/run_anakin_on_arm.md
deleted file mode 100644
index f61beca7e..000000000
--- a/doc/fluid/advanced_usage/deploy/anakin/run_anakin_on_arm.md
+++ /dev/null
@@ -1,185 +0,0 @@
-## Building Anakin from Source for ARM ##
-
-Anakin currently supports the ARM Android platform using the Android NDK cross-compilation toolchain, and has been built and tested on macOS and CentOS.
-
-### Installation Overview ###
-
-* [System requirements](#0001)
-* [Install third-party dependencies](#0002)
-* [Build Anakin from source](#0003)
-* [Verify the installation](#0004)
-
-
-### 1. System requirements ###
-
-* Host machine: Linux or macOS
-* cmake 3.8.2+
-* Android NDK r14; the Linux version can be [downloaded here](https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip)
-
-### 2. Install third-party dependencies ###
-
-- 2.1 protobuf 3.4.0
-
-    Download the source from [here](https://github.com/google/protobuf/releases/tag/v3.4.0)
-
-    - 2.1.1 Build protobuf for the host machine
-
-```bash
- $ tar -xzf protobuf-3.4.0.tar.gz
- $ cd protobuf-3.4.0
- $ ./autogen.sh
- $ ./configure
- $ make
- $ make check
- $ make install
-```
-
-After `make install` above, the headers needed by libprotobuf can be found in `/usr/local/include/google`. Copy the whole `google` folder into `Anakin/third-party/arm-android/protobuf/`.
-
-If anything goes wrong, see [here](https://github.com/google/protobuf/blob/v3.4.0/src/README.md), then clean up the generated files:
-
-```bash
- $ make distclean
-```
-
-    - 2.1.2 Cross-compile protobuf for Android `armeabi-v7a`; be sure to set the ANDROID_NDK path and the values of ARCH_ABI and HOSTOSN
-
- ```bash
-
- $ export ANDROID_NDK=your_ndk_path
- $ ARCH_ABI="arm-linux-androideabi-4.9"
- $ HOSTOSN="darwin-x86_64"
- $ export SYSROOT=$ANDROID_NDK/platforms/android-9/arch-arm
- $ export PREBUILT=$ANDROID_NDK/toolchains/$ARCH_ABI
- $ export LDFLAGS="--sysroot=$SYSROOT"
- $ export LD="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/arm-linux-androideabi/bin/ld $LDFLAGS"
- $ export LIBS="-llog $ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/libgnustl_static.a"
- $ export CPPFLAGS=""
- $ export INCLUDES="-I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/include/ -I$ANDROID_NDK/platforms/android-9/arch-arm/usr/include/ -I$ANDROID_NDK/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a/include/"
- $ export CXXFLAGS="-march=armv7-a -mfloat-abi=softfp -DGOOGLE_PROTOBUF_NO_RTTI --sysroot=$SYSROOT"
- $ export CCFLAGS="$CXXFLAGS"
- $ export CXX="$PREBUILT/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-g++ $CXXFLAGS"
- $ export CC="$CXX"
- $ export RANLIB="$ANDROID_NDK/toolchains/$ARCH_ABI/prebuilt/$HOSTOSN/bin/arm-linux-androideabi-ranlib"
- $ ./autogen.sh
- $ ./configure --host=arm-linux-androideabi --with-sysroot=$SYSROOT --enable-cross-compile --with-protoc=protoc --disable-shared CXX="$CXX" CC="$CC" LD="$LD"
- $ make
-```
-
-This builds `*.a` static libraries. If you want `*.so` shared libraries instead, change `--disable-shared` to `--disable-static --enable-shared` in the `./configure` arguments.
-The generated files are under `src/.libs/`; copy them into `Anakin/third-party/arm-android/protobuf/lib`.
-Update the `ARM_RPOTO_ROOT` path in [cmake](../../cmake/find_modules.cmake).
-
-```cmake
- set(ARM_RPOTO_ROOT "${CMAKE_SOURCE_DIR}/third-party/arm-android/protobuf")
-```
-
-- 2.2 opencv 2.4.3+ (optional)
-
-    Anakin uses opencv only in the examples.
-    Download opencv for Android from [here](https://opencv.org/releases.html).
-    After unpacking, copy the libraries in `3rdparty/libs/armeabi-v7a` to `libs/armeabi-v7a`.
-    Search for `anakin_find_opencv` in [cmake](../../cmake/find_modules.cmake),
-    and set `include_directories` and `LINK_DIRECTORIES` to the paths of your installed libraries.
-
- ```cmake
- include_directories(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/jni/include/)
- LINK_DIRECTORIES(${CMAKE_SOURCE_DIR}/third-party/arm-android/opencv/sdk/native/libs/armeabi-v7a/)
- ```
-### 3. Build Anakin from source ###
-
-#### Build the Android version
-
-    Clone the [source code](https://github.com/PaddlePaddle/Anakin/tree/arm):
-
-```bash
- cd your_dir
- git clone https://github.com/PaddlePaddle/Anakin.git
- cd Anakin
- git fetch origin arm
- git checkout arm
- ```
-
-    Modify `android_build.sh`:
-
-- Set the NDK path
-
- ```bash
- #modify "your_ndk_path" to your NDK path
- export ANDROID_NDK=your_ndk_path
- ```
-
-- Set the ARM processor architecture
-
-    For 32-bit ARM processors, set ANDROID_ABI to `armeabi-v7a with NEON`.
-    For 64-bit ARM processors, ANDROID_ABI can be set to `armeabi-v7a with NEON` or `arm64-v8a`.
-    At the moment only `armeabi-v7a with NEON` is supported; `arm64-v8a` is still under development.
-
- ```bash
- -DANDROID_ABI="armeabi-v7a with NEON"
- ```
-
-- Set the Android API level
-
-    Set the API level according to the Android version, e.g. API Level 21 -> Android 5.0.1
- ```bash
- -DANDROID_NATIVE_API_LEVEL=21
- ```
-
-- Choose a static or shared library build
-
-    Set `BUILD_SHARED=NO` to build a static library.
-    Set `BUILD_SHARED=YES` to build a shared library.
-
- ```bash
- -DBUILD_SHARED=NO
- ```
-- OpenMP multithreading support
-
-    Set `USE_OPENMP=YES` to enable OpenMP multithreading.
-
- ```bash
- -DUSE_OPENMP=YES
- ```
-
-- Build the unit tests
-
-    Set `BUILD_WITH_UNIT_TEST=YES` to build the unit-test binaries.
-
- ```bash
- -DBUILD_WITH_UNIT_TEST=YES
- ```
-
-- Build the examples
-
-    Set `BUILD_EXAMPLES=YES` to build the example binaries.
-
- ```bash
- -DBUILD_EXAMPLES=YES
- ```
-
-- Enable opencv
-
-    If you use opencv, set `USE_OPENCV=YES`.
-
- ```bash
- -DUSE_OPENCV=YES
- ```
-
-- Start the build
-
-    Running the `android_build.sh` script builds Anakin automatically:
-
- ```bash
- ./android_build.sh
- ```
-
-### 4. Verify the installation ###
-
-    The built libraries are placed under `${Anakin_root}/output`.
-
-    The built unit-test binaries are placed under `${Anakin_root}/output/unit_test`.
-
-    The built example binaries are placed under `${Anakin_root}/output/examples`.
-
-    On Android, enable debugging mode on the device; the directory accessible via ADB is `data/local/tmp`. Push the test binaries, models, and data to that directory with `adb push`, then run the tests there.
diff --git a/doc/fluid/advanced_usage/development/profiling/benchmark.rst b/doc/fluid/advanced_usage/development/profiling/benchmark.rst
deleted file mode 100644
index 7854263bf..000000000
--- a/doc/fluid/advanced_usage/development/profiling/benchmark.rst
+++ /dev/null
@@ -1,120 +0,0 @@
-#####################
-How to Run Benchmarks
-#####################
-
-This article describes how to benchmark deep learning frameworks. Benchmarking covers two aspects, model accuracy and performance; the sections below cover setting up the test environment, choosing benchmark models, and validating results.
-
-Validating a deep learning framework can be split into a training phase and an inference phase, with slightly different metrics; this article covers only the training-phase metrics. Training cares about accuracy on the training set, which is fully available, so it focuses on throughput at large batch sizes, e.g. batch_size=128 for common image models, larger on multiple cards. Inference cares about accuracy on the test set; online-serving data cannot be collected in advance, so it focuses on latency at small batch sizes, e.g. batch_size=1 or 4 for serving.
-
-Fluid is the design PaddlePaddle introduced starting from version 0.11.0, and the benchmarks in this article were done on that version.
-
-
-Environment setup
-""""""""""""""""""
-
-In benchmarking, model accuracy is independent of hardware and framework; it is determined jointly by the model structure and the data. Performance is determined by the test hardware and the framework. To compare frameworks fairly, keep the hardware environment and system library versions identical. All comparison experiments below were run under the same hardware and system environment.
-
-
-GPUs of different architectures differ hugely in performance. When validating training performance on GPU, use NVIDIA's :code:`nvidia-smi` tool to check which GPU model is in use; when testing multi-card training, confirm whether the cards are connected via NVLink or PCIe. Likewise, the CPU model strongly affects training performance on CPU; read :code:`/proc/cpuinfo` to confirm which CPU model is in use.
-
-Download the CUDA Toolkit and cuDNN matching the GPU, or use NVIDIA's official nvidia-docker images, which already contain CUDA and cuDNN; this article takes the docker approach. The CUDA Toolkit contains the base libraries used by GPU code, and it affects the runtime performance of the Fluid binaries built on top of it.
-
-With the CUDA environment ready, download Paddle from GitHub and build from source, which produces binaries for the sm_arch best matching the current GPU. Note also that cuDNN has a large impact on convolution workloads, so keep even the minor versions identical across benchmarks; for example, cuDNN 7.0.2 and cuDNN 7.1.4 differ by more than 5% on Resnet.
-
-
-Choosing benchmark models
-""""""""""""""""""""""""""
-
-Benchmarking a framework should cover different training tasks and model sizes. This article picks five of the most commonly used image and NLP models.
-
-=====================  ==============  ==============  ============
-Task                   Model           Network         Dataset
-=====================  ==============  ==============  ============
-Image classification   mnist           Lenet           mnist
-Image classification   VGG             VGG-16          Flowers102
-Image classification   Resnet          Resnet-50       Flowers102
-Text classification    Stacked-LSTM    Stacked-LSTM    IMDB
-Machine translation    seq-seq         Stacked-LSTM    wmt14
-=====================  ==============  ==============  ============
-
-Among them, mnist, VGG, and Resnet are CNN models, while stacked-lstm and seq2seq represent RNN models.
-The benchmark test scripts all skip the first few training batches, because data loading and GPU memory allocation depend on whatever the system happens to be running at the time, which would make the measured performance inaccurate. The corresponding metrics are collected after several epochs have run.
-
-
-For benchmark data, large public datasets with widely validated results are the first choice. For the image models VGG and Resnet, this article uses flowers102, with images preprocessed to the same size as ImageNet, so performance is directly comparable.
-Public, influential datasets for NLP models are scarcer; the seq2seq model uses the wmt14 data, and the stacked-lstm model uses the imdb data.
-
-
-Note that image samples are all the same size: after transformation the images are identical in shape, so they take essentially the same compute path, and speed and memory usage fluctuate little; current training performance can therefore be sampled from a few batches. NLP samples vary in length, so compute paths and memory usage differ, and speed and memory consumption can only be measured by running several full epochs.
-GPU memory allocation is particularly expensive, so Fluid by default grabs all available GPU memory as a memory pool to speed up allocation during computation. To measure a model's true memory consumption, set the environment variable :code:`FLAGS_fraction_of_gpu_memory_to_use=0.0` and observe the peak memory usage.
-
-
-Test procedure
-""""""""""""""""
-
-- CPU, single machine, single thread
-
-To test single-threaded CPU performance, first clear the CUDA environment variable, ``CUDA_VISIBLE_DEVICES=``, and disable OpenMP and MKL multithreading via the environment variables ``OMP_NUM_THREADS=1`` and ``MKL_NUM_THREADS=1``.
-Then set the code to use CPUPlace; if you use the scripts from the Paddle repository, simply pass use_gpu=False on the command line.
-
-.. code-block:: python
-
- >>> import paddle.fluid as fluid
- >>> place = fluid.CPUPlace()
-
-.. code:: bash
-
- docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark paddlepaddle/paddle:latest-dev /bin/bash
-
-
-- GPU, single machine, single card
-
-This tutorial uses CUDA 8 and cuDNN 7.0.1, from the image :code:`nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`.
-
-.. code:: bash
-
- nvidia-docker run -it --name CASE_NAME --security-opt seccomp=unconfined -v $PWD/benchmark:/benchmark -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu paddlepaddle/paddle:latest-dev /bin/bash
-
-To test on a single card, set the CUDA environment variable to use one GPU: ``CUDA_VISIBLE_DEVICES=0``.
-Then set the code to use CUDAPlace; if you use the scripts from the Paddle repository, simply pass use_gpu=True on the command line.
-
-.. code-block:: python
-
- >>> import paddle.fluid as fluid
-    >>> place = fluid.CUDAPlace(0)  # 0 means the 0th GPU
-
-
-Results
-""""""""""""
-
-This tutorial compares the performance of Fluid 0.12.0 and TensorFlow 1.4.0 in the same environment.
-Hardware: CPU Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz; GPU TITAN X (Pascal) 12G x 1; NVIDIA driver 384.90.
-System: Ubuntu 16.04.3 LTS, run inside docker (nvidia-docker 17.05.0-ce).
-The Fluid version tested is v0.12.0; the TensorFlow version is v1.4.0-rc1.
-The scripts and configurations used live in the benchmark repository.
-The unit in the tables is samples per second.
-
-- CPU single-machine single-thread results
-
-    ================  ====================  ===================
-    Speed             Fluid CPU             TensorFlow CPU
-    ================  ====================  ===================
-    mnist             1298.75 samples/s     637.57 samples/s
-    VGG-16            0.4147 images/s      0.1229 images/s
-    Resnet-50         1.6935 images/s      0.3657 images/s
-    Stacked-LSTM      472.3225 words/s     48.2293 words/s
-    Seq2Seq           217.1655 words/s     28.6164 words/s
-    ================  ====================  ===================
-
-- GPU single-machine single-card results
-
-    ===============  =====================  ==================
-    Speed            Fluid GPU              TensorFlow GPU
-    ===============  =====================  ==================
-    mnist            19710.90 samples/s     15576.3 samples/s
-    VGG-16           59.83327 images/s      40.9967 images/s
-    Resnet-50        105.84412 images/s     97.8923 images/s
-    Stacked-LSTM     1319.99315 words/s     1608.2526 words/s
-    Seq2Seq          7147.89081 words/s     6845.1161 words/s
-    ===============  =====================  ==================
diff --git a/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst b/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst
deleted file mode 100644
index 481fc9166..000000000
--- a/doc/fluid/advanced_usage/development/profiling/gpu_profiling_cn.rst
+++ /dev/null
@@ -1,239 +0,0 @@
-======================
-GPU Performance Tuning
-======================
-
-.. contents::
-
-This tutorial walks you through using the built-in timing tool, **nvprof**, or **nvvp** to profile and tune performance, step by step.
-
-- What is profiling?
-- Why profile?
-- How to profile?
-- Profiling tools
-- Detailed tutorial
-- Profiling tips
-
-What is profiling?
-==================
-In software engineering, profiling is a term for dynamic program analysis. It can mean measuring a program's space (memory) or time complexity, the usage of particular instructions, or the frequency and duration of function calls. The information gathered is usually used to assist program optimization.
-
-Simply put, a profiler quantifies an application's performance. To really understand a program's behavior, a profiler is an indispensable tool. A simple profile tells you how long an operation took; a deeper analysis can even explain why it took that long.
-
-Why profile?
-============================
-Training a deep neural network usually takes a very long time, so performance has gradually become one of the most important metrics in deep learning.
-The first task of performance optimization is to understand which steps slow down the whole process.
-If some part hardly costs any time, there is no hurry to optimize it!
-
-How to profile?
-========================
-To reach optimal performance, you can take the following five steps:
-
-- Profile the code
-- Find the slow parts
-- Find out why they are slow
-- Rewrite them to be faster
-- Profile the code again
-
-Usually, a processor has two key performance limits: floating-point throughput and memory throughput. A GPU additionally needs enough parallelism to realize its full potential; this is why GPUs can be so fast.
-
-Profiling tools
-======================
-For everyday GPU profiling, there are already many tools available from NVIDIA and third parties.
-
-**nvprof** is NVIDIA's profiler, and **nvvp** is NVIDIA's visual profiler with a GUI.
-This tutorial mainly introduces nvprof and nvvp.
-
-:code:`test_GpuProfiler` from the :code:`paddle/legacy/math/tests` directory will be used to demonstrate the profilers above.
-
-.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
- :language: c++
- :lines: 137-151
- :linenos:
-
-The code snippet above contains two methods; you can use either or both to profile the code you care about.
-
-1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper that measures the time spent in CPU functions or CUDA kernels.
-2. :code:`REGISTER_GPU_PROFILER` is a wrapper around :code:`cudaProfilerStart` and :code:`cudaProfilerStop`; its implementation also keeps a pure-CPU build of PaddlePaddle from crashing when this statement executes. A usage sketch follows.
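-
-A minimal usage sketch of the two wrappers (self-contained: the stand-in no-op macro definitions below only mimic the real macros from the PaddlePaddle source tree, and the two-argument form is an assumption based on the log output of :code:`test_GpuProfiler.cpp`):
-
-.. code-block:: c++
-
-    #include <cstdio>
-
-    // Stand-in definitions so this sketch compiles outside the Paddle tree.
-    // The real REGISTER_TIMER_INFO starts a scoped timer; the real
-    // REGISTER_GPU_PROFILER brackets cudaProfilerStart/cudaProfilerStop.
-    #ifndef REGISTER_TIMER_INFO
-    #define REGISTER_TIMER_INFO(name, msg)
-    #endif
-    #ifndef REGISTER_GPU_PROFILER
-    #define REGISTER_GPU_PROFILER(name, msg)
-    #endif
-
-    static void bilinear_fwd_bwd() {
-        volatile double acc = 0;                 // placeholder workload
-        for (int i = 0; i < 1000000; ++i) acc += i * 0.5;
-    }
-
-    int main() {
-        // Wrap the region of interest, as test_GpuProfiler.cpp does.
-        REGISTER_GPU_PROFILER("testBilinearFwdBwd", "numSamples = 10, channels = 16");
-        {
-            REGISTER_TIMER_INFO("testBilinearFwdBwd", "forward+backward");
-            bilinear_fwd_bwd();
-        }
-        std::printf("done\n");
-        return 0;
-    }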
-
-More details are given in the sections that follow.
-
-Detailed tutorial
-=================
-
-Built-in timer
---------------
-
-To enable PaddlePaddle's built-in timer, first add :code:`REGISTER_TIMER_INFO` to the code of interest.
-Then use the :code:`printStatus` or :code:`printAllStatus` function to print the information.
-A simple example:
-
-1. Add the :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the highlighted lines).
-
- .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
- :language: c++
- :lines: 137-151
- :emphasize-lines: 8-12,14
- :linenos:
-
-2. Turn on **WITH_TIMER** in the cmake configuration and rebuild PaddlePaddle.
-
- .. code-block:: bash
-
- cmake .. -DWITH_TIMER=ON
- make
-
-3. Run your code and inspect the output (see the highlighted lines).
-
- .. code-block:: bash
- :emphasize-lines: 1,12-15
-
- > ./paddle/legacy/math/tests/test_GpuProfiler
- I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
- I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
- I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
- [==========] Running 1 test from 1 test case.
- [----------] Global test environment set-up.
- [----------] 1 test from Profiler
- [ RUN ] Profiler.BilinearFwdBwd
- I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
- gSizeX = 64, imgSizeY = 64"
- I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
- I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
- I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
- I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
- I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
- [ OK ] Profiler.BilinearFwdBwd (136 ms)
- [----------] 1 test from Profiler (136 ms total)
-
- [----------] Global test environment tear-down
- [==========] 1 test from 1 test case ran. (136 ms total)
- [ PASSED ] 1 test.
-
-The nvprof tool
-----------------
-
-To use the command-line profiling tool **nvprof**, just follow these steps:
-
-1. Add the :code:`REGISTER_GPU_PROFILER` function to the code (see the highlighted lines).
-
- .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
- :language: c++
- :lines: 137-151
- :emphasize-lines: 6-7
- :linenos:
-
-2. Turn on the **WITH_PROFILER** option in cmake and rebuild PaddlePaddle.
-
- .. code-block:: bash
-
- cmake .. -DWITH_PROFILER=ON
- make
-
-3. Analyze the executable with **nvprof**.
-
- .. code-block:: bash
-
- nvprof ./paddle/legacy/math/tests/test_GpuProfiler
-
-You will then get profiling results like the following:
-
-.. code-block:: bash
-
- ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
- ==78544== Profiling result:
- Time(%) Time Calls Avg Min Max Name
- 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
- 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw
- 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw
- 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH]
-
- ==78544== API calls:
- Time(%) Time Calls Avg Min Max Name
- 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags
- 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree
- 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate
- 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy
- 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize
- 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc
- 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc
- 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice
- 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags
- 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute
- 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount
- 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties
- 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch
- 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName
- 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem
- 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice
- 0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate
- 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute
- 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart
- 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall
- 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError
- 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument
- 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet
- 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount
- 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion
- 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit
- 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion
-
-
-The nvvp tool
---------------
-
-To use the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or run your application from within the tool's UI.
-
-**Note: nvvp also supports CPU profiling** (it has to be enabled in the nvvp UI).
-
-.. image:: nvvp1.png
- :align: center
- :scale: 33%
-
-From the kernel's point of view, **nvvp** can explain precisely why a long-running operation is slow.
-At the same time, as the figures below show, nvvp's views of kernel block usage, register usage, and shared-memory usage give a better overall picture of how the GPU is being used.
-
-
-.. image:: nvvp2.png
- :align: center
- :scale: 33%
-
-From the application's point of view, **nvvp** can offer suggestions for locating performance bottlenecks.
-For example, the figures below show suggestions about memory-data movement and compute-resource utilization that point the way for performance tuning.
-
-.. image:: nvvp3.png
- :align: center
- :scale: 33%
-
-.. image:: nvvp4.png
- :align: center
- :scale: 33%
-
-Profiling tips
-==================
-
-- Early on, starting from the output of **nvprof** and **nvvp** is a good choice.
-- Next, consider a timeline analysis.
-- Before digging deep into a kernel, first confirm that its share of total time is really high enough to be worth the effort.
-- Where possible, check the measured numbers against theoretical values.
-
-    1)  For example, if I know a kernel spends 10ms moving 1GB of data, I expect the profiler to report 100GB/s.
-    2)  If there is a mismatch, the application is very likely not running the way you expect.
-- Know your hardware: if your GPU can theoretically reach 6 TFLOPS (six trillion floating-point operations per second) and it is already doing 5.5 TFLOPS, there is probably not much headroom left here...
-
-Profiling is the key step in performance optimization. Sometimes a simple change yields a clear performance win!
-Of course, results vary from case to case.
-
-References
-===========
-Jeremy Appleyard, GPU Profiling for Deep Learning, 2015
diff --git a/doc/fluid/advanced_usage/development/write_docs_cn.md b/doc/fluid/advanced_usage/development/write_docs_cn.md
deleted file mode 120000
index ea03c90b5..000000000
--- a/doc/fluid/advanced_usage/development/write_docs_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-../../dev/write_docs_cn.md
\ No newline at end of file
diff --git a/doc/fluid/advanced_usage/development/write_docs_cn.md b/doc/fluid/advanced_usage/development/write_docs_cn.md
new file mode 100644
index 000000000..aeeabdec8
--- /dev/null
+++ b/doc/fluid/advanced_usage/development/write_docs_cn.md
@@ -0,0 +1,203 @@
+# How to Contribute Documentation
+
+PaddlePaddle warmly welcomes documentation contributions. If the documentation you write or translate meets our requirements, it will be shown on the paddlepaddle.org website and on GitHub for PaddlePaddle users to read.
+
+Paddle's documentation is organized into the following groups:
+
+- Beginner's Guide: installation instructions, deep learning basics, learning materials, etc., to help users install and get started quickly;
+
+- User Guides: data preparation, network configuration, training, debugging, inference deployment, and the model library, explaining PaddlePaddle's basic usage;
+
+- Advanced Usage: server-side and mobile deployment, how to contribute code/documentation, performance tuning, etc., to meet developers' needs;
+
+Our documentation accepts contributions in [reStructured Text](http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) and [Markdown](https://guides.github.com/features/mastering-markdown/) (GitHub flavored) formats.
+
+After finishing a document, you can use the preview tool to check how it renders on the official site and verify that it displays correctly.
+
+
+## How to Use the Preview Tool
+
+If you are modifying code documentation (i.e. the APIs) and use PaddlePaddle inside a Docker container, run the following steps inside your docker container, because the API documentation generator depends on PaddlePaddle.
+
+If you are only improving text or media content (no need to install or build PaddlePaddle), or are building PaddlePaddle on the host, run the following steps on the host.
+
+### 1. Clone the repositories you want to update or test:
+
+First download the complete documentation repository. `--recurse-submodules` also updates the submodules in FluidDoc (they all live in `FluidDoc/external`) so that all documents display correctly:
+
+```
+git clone --recurse-submodules https://github.com/PaddlePaddle/FluidDoc
+```
+
+Other repositories you can pull are:
+
+
+```
+git clone https://github.com/PaddlePaddle/book.git
+git clone https://github.com/PaddlePaddle/models.git
+git clone https://github.com/PaddlePaddle/Mobile.git
+
+```
+
+You can put these local copies in any directory on your computer; we will point to their locations when starting PaddlePaddle.org later.
+
+### 2. Pull PaddlePaddle.org into a new directory and install its dependencies
+
+Before that, make sure your operating system has the Python dependencies installed.
+
+On Ubuntu, for example, run:
+
+```
+sudo apt-get update && sudo apt-get install -y python-dev build-essential
+```
+
+Then:
+
+```
+git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+cd PaddlePaddle.org/portal
+# To install in a virtual environment.
+# virtualenv venv; source venv/bin/activate
+pip install -r requirements.txt
+```
+
+**Optional**: if you want to work on the Chinese/English site switching to improve PaddlePaddle.org, install [GNU gettext](https://www.gnu.org/software/gettext/)
+
+### 3. Run PaddlePaddle.org locally
+
+Add the list of directories you want to load and build content from (the options are: --paddle, --book, --models, --mobile)
+
+Run:
+
+```
+./runserver --paddle
+```
+
+**Note:** `` is the path on your machine of the paddle copy from step 1.
+
+If you need to work on documents that depend on content from the `book`, `models`, or `mobile` repositories, you can add one or more of the optional flags:
+
+```
+./runserver --paddle \
+ --book /external/book \
+ --models /external/models \
+ --mobile /external/mobile
+```
+Then open a browser and navigate to http://localhost:8000.
+
+>*The site may take a few seconds to load because building takes a while*
+
+>*If you ran these steps in a docker environment, check the IP and make sure port 8000 is mapped to your host*
+
+## Contribute New Documentation or Update the API
+
+All content should be written in [Markdown](https://guides.github.com/features/mastering-markdown/) (GitHub flavored), although some legacy content in the documentation uses the .rst format.
+
+
+After completing the installation steps, you also need to do the following:
+
+ - Before you start writing, we recommend reviewing these guidelines on contributing content
+
+ ---
+
+ **Contribute new documentation**
+
+
+  - Create a new `.md` file, or modify an existing article, in the repository you are working on
+  - Add the new document's file name to the corresponding index file
+
+ ---
+
+ **Contribute or modify the Python API**
+
+
+ Inside the docker container used to build the code, or at the corresponding location on the host:
+
+  - Run the script `paddle/scripts/paddle_build.sh` (in the Paddle repo)
+
+ ```bash
+    # build paddle's python library
+ cd Paddle
+ ./paddle/scripts/paddle_docker_build.sh gen_doc_lib full
+ cd ..
+ ```
+
+  - Run the preview tool
+
+ ```
+    # run the preview tool in the docker image used to build paddle
+
+ docker run -it -v /Users/xxxx/workspace/paddlepaddle_workplace:/workplace -p 8000:8000 [images_id] /bin/bash
+ ```
+
+    > Replace `/Users/xxxx/workspace/paddlepaddle_workplace` with your local paddle workspace and `/workplace` with the corresponding working directory in docker. This mapping lets us build the python library, modify FluidDoc, and use the preview tool at the same time.
+
+    > [images_id] is the id of the paddlepaddle docker image you are using.
+
+  - Set the environment variable
+
+ ```
+    # inside the docker environment
+    # set `PYTHONPATH` so the preview tool can find paddle's python library
+ export PYTHONPATH=/workplace/Paddle/build/python/
+ ```
+
+  - Clean up old files
+
+ ```
+    # remove files generated by earlier runs; skip this step if this is your first time using the preview tool
+ rm -rf /workplace/FluidDoc/doc/fluid/menu.json /workplace/FluidDoc/doc/fluid/api/menu.json /tmp/docs/ /tmp/api/
+ ```
+
+  - Launch the preview tool
+
+ ```
+ cd /workplace/PaddlePaddle.org/portal
+ pip install -r requirements.txt
+ ./runserver --paddle /workplace/FluidDoc/
+ ```
+
+---
+
+ **Preview the changes**
+
+
+
+ Open a browser and navigate to http://localhost:8000.
+
+ On the page you want to update, click Refresh Content in the upper-right corner.
+
+ After entering the user-guide section, the API section contains no content yet; to preview the API documentation, click the API directory, and after a few minutes you will see the generated API reference.
+
+
+## Submit Your Changes
+
+If you want to modify code, follow [How to contribute code](../development/contribute_to_paddle.html) in the `Paddle` repository.
+
+If you are only modifying documentation:
+
+  - If the content you modified is in the `doc` folder, you only need to open a `PR` in the `FluidDoc` repository
+
+  - If the content you modified is in the `external` folder:
+
+    1. Open a PR in the repository you modified. This is because the `FluidDoc` repository is just a wrapper that gathers links to other repositories (git "submodules").
+
+    2. Once your change is accepted, update the corresponding `submodule` in FluidDoc to the latest commit-id of the source repository.
+
+       > For example, if you updated documentation on the develop branch of the book repository:
+
+
+       > - Enter the `FluidDoc/external/book` directory
+       > - Update the commit-id to the latest commit: `git pull origin develop`
+       > - Commit your change in `FluidDoc`
+
+    3. Open a PR for your change in the `FluidDoc` repository
+
+The steps for committing changes and opening a PR are covered in [How to contribute code](../development/contribute_to_paddle.html)
+
+## Help Improve the Preview Tool
+
+We welcome contributions to all aspects of the platform and its supporting content so that it is presented better. You can fork or clone the repository, raise issues and give feedback, or file bug reports in issues. See the [development guide](https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/DEVELOPING.md) for details.
+
+## Copyright and Licensing
+PaddlePaddle.org is provided under the Apache-2.0 license.
diff --git a/doc/fluid/beginners_guide/basics/learning_materials.md b/doc/fluid/beginners_guide/basics/learning_materials.md
deleted file mode 100644
index a27499c6e..000000000
--- a/doc/fluid/beginners_guide/basics/learning_materials.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Learning Materials
-
-## The First Book to Read
-The most direct source for learning the fundamentals is a book. Organized into machine learning theory, deep learning theory, and programming languages, the following books are recommended to support you.
-
-
-### Machine Learning Theory
-
-Before starting deep learning, you need to master machine learning theory first. Deep learning is a branch of machine learning, and the two share a strongly related theoretical foundation.
-There are many machine learning textbooks; here we recommend one that is easy to understand and easy to learn from. Pay particular attention to the neural network chapters.
-
-Title: "Machine Learning" (Zhihua Zhou, Tsinghua University Press, 2016)
-
-### Deep Learning Theory
-
-With a solid grounding in machine learning, you can start digging into deep learning theory. Deep learning theory often leaves an abstract, hard-to-grasp impression and is tightly coupled with mathematics.
-To help you get started smoothly, we recommend an approachable textbook that covers both the deep learning theory and the necessary mathematics in a single volume.
-
-Title: "Deep Learning" (Goodfellow, Bengio, and Courville; Chinese translation by Shenjian Zhao, Yujun Li, Tianfan Fu, and Kai Li; Posts & Telecom Press, 2017)
-The electronic edition of this book is open-sourced on GitHub; see [Deep Learning](https://github.com/exacity/deeplearningbook-chinese) for details.
-
-### Programming Languages
-
-Python: We recommend learning Python. On the one hand, Python is the primary supported language of the mainstream deep learning frameworks; on the other hand, compared with other languages, Python is relatively easy to learn.
-There are many Python textbooks; here we recommend one that balances practice and theory. Work through its 52 exercises, run the code, and fix the problems you hit, and you will gradually get up to speed.
-
-Title: "Learn Python the Hard Way" (Zed Shaw; Chinese translation by Weiwei Wang; Posts & Telecom Press, November 2014)
-
-
-C++: C++ is widely used in the lower layers of frameworks. Once you have gradually mastered the basic operation of an open-source framework, you will need this skill for more advanced framework work.
-Like Python above, learning C++ requires lots of hands-on practice. The book recommended here gets you productive in C++ quickly; it teaches not only features and structure but also provides worked solutions.
-
-Title: "Essential C++" (Lippman, S.B.; Chinese translation by Jie Hou; Publishing House of Electronics Industry, August 2013)
-
-
-
-## Video Courses to Watch
-
-When learning a new technology, besides reading books, face-to-face teaching helps you learn faster and better. Compared with offline classes, open video courses achieve the same ease of learning while saving money and effort.
-Most deep learning courses today are free and public. Through them you can more easily understand the abstract theory of deep learning and avoid detours in practice.
-Weighing liveliness, practicality, compactness, and continuity, we recommend the following courses, with links attached for easy access.
-
-### Courses on Theory
-[Machine Learning](http://open.163.com/special/opencourse/machinelearning.html) Stanford professor Andrew Ng's open course, with detailed explanations of the relevant algorithms.
-
-[AI Technology](https://ai.baidu.com/paddlepaddle/player?id=13) Baidu's "Mastering Core AI Technology" course; each lesson runs about 20-30 minutes and gives a thorough walkthrough from AI technology to deep learning.
-
-[Deep Learning](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) Prof. Hung-yi Lee's online course from Taiwan. The course is in English and draws on research results from abroad, but it also suits newcomers getting started with and understanding deep learning.
-
-[Programming Languages](https://ai.baidu.com/paddlepaddle/openCourses) A Python course covering everything from basics to advanced usage in detail; each lesson is about 20 minutes.
-
-### Hands-on PaddlePaddle Video Courses
-Once you have the theory and the programming skills, you can start working hands-on with PaddlePaddle Fluid, beginning at the introductory level and working toward intermediate and advanced levels.
-Official PaddlePaddle video courses are available on the official site, covering PaddlePaddle in practice, PaddlePaddle application scenarios, and machine learning models, helping developers start PaddlePaddle from scratch and progress from simple scenarios to industrial-grade applications. [Click here](http://ai.baidu.com/paddlepaddle/openCourses) to start your video course journey.
diff --git a/doc/fluid/beginners_guide/basics/learning_materials_en.md b/doc/fluid/beginners_guide/basics/learning_materials_en.md
deleted file mode 100644
index ef870c9ab..000000000
--- a/doc/fluid/beginners_guide/basics/learning_materials_en.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Learning Materials
-
-## The first book to start your journey
-
-Books are the most direct resources to pick up the rationale of a subject. We recommend the following books for you which are categorized into machine learning theory, deep learning theory and programming languages.
-
-### Books for Machine Learning Theory
-
-Machine learning theory is a prerequisite to deep learning. Deep learning, one of the branches of machine learning, has a theoretical basis strongly relevant to machine learning.
-There have been various textbooks nowadays, from which we select an easier one for you. Please pay more attention to the chapters involving Neural Networks in the textbook.
-
-Book: "Machine Learning" (Zhihua Zhou, Tsinghua University Press, 2016)
-
-### Books for Deep Learning Theory
-
-Having consolidated your grounding in machine learning, it is time to dive into deep learning.
-Deep learning theory commonly leaves an obscure, abstract impression on learners and is tightly connected with mathematics.
-To help you get started smoothly, we recommend the following accessible textbook, which explains both deep learning theory and the related mathematical foundations well.
-
-
-Book: "Deep Learning" (Goodfellow, Bengio, Courville)
-
-
-### Books for Programming Languages
-
-Python:
-
-Python is our recommended programming language. On the one hand, Python is the main supported language of the mainstream deep learning frameworks; on the other hand, Python is easier for beginners than other languages.
-Python textbooks abound, and the one recommended here ingeniously balances theoretical knowledge with practical exercises. By solving the 52 exercises in the book, running your code, and fixing the problems that occur along the way, you can gradually get the hang of Python.
-
-
-Book: "Learn Python the Hard Way" (Zed Shaw)
-
-
-C++:
-
-C++ is widely adopted in the low-level parts of frameworks. Once you have gradually mastered the basic operation of an open-source framework, programming in C++ becomes an important skill for more advanced framework work.
-Like Python above, C++ requires frequent hands-on practice.
-The book recommended here is a quick-start textbook that introduces functions and structures and provides worked examples.
-
-
-Book: "Essential C++" (Lippman, S.B.)
-
-
-
-## Open Lectures
-
-Besides textbooks, face-to-face instruction from a teacher gives a robust, quick boost when learning a new technology. Compared with on-campus lectures, open video lectures not only make learning simpler but also save time and energy.
-
-Currently, most deep learning courses are free and public. They make it easier to grasp the abstract theory behind deep learning and point you straight toward practical applications. Weighing liveliness, practicality, continuity, and compactness, we recommend the following courses, with their links attached to save you the search.
-
-### Lectures Aimed at Theory Analysis
-
-[Machine Learning](http://open.163.com/special/opencourse/machinelearning.html) : Delivered by Andrew Ng, Stanford University. This series of lectures includes detailed analysis of the relevant algorithms.
-
-[Deep Learning](http://speech.ee.ntu.edu.tw/~tlkagk/courses_ML17_2.html) : An online English course delivered by Prof. Hung-yi Lee. It draws on research contributions from abroad and is also suitable for novices getting started with and understanding deep learning.
-
-The following are several lectures delivered in Chinese:
-
-[AI tech](https://ai.baidu.com/paddlepaddle/player?id=13) : The course named "Master Core AI Technology", organized by Baidu, walks from AI technology through deep learning in a comprehensive, fine-grained way. Each lesson lasts 20 - 30 minutes.
-
-
-[Programming Languages](https://ai.baidu.com/paddlepaddle/openCourses) : Python tutorials, about 20 minutes per lesson, covering everything from the basics to advanced usage.
-
-### PaddlePaddle Hands-on Training
-
-Equipped with a firm theoretical grasp and programming ability, you can now begin a hands-on adventure with PaddlePaddle Fluid and grow from beginner level to intermediate and advanced levels.
-
-Our official open courses are presented on the official site. They cover hands-on PaddlePaddle practice, PaddlePaddle application scenarios, and introductions to PaddlePaddle machine learning models. Developers can take full advantage of these courses to start PaddlePaddle from scratch and gradually move to industrial applications.
-
-[Click here](http://ai.baidu.com/paddlepaddle/openCourses) to embark on our official deep learning video lectures.
-
diff --git a/doc/fluid/beginners_guide/index.rst b/doc/fluid/beginners_guide/index.rst
index 5cade329f..a01ae5c40 100644
--- a/doc/fluid/beginners_guide/index.rst
+++ b/doc/fluid/beginners_guide/index.rst
@@ -10,10 +10,6 @@ PaddlePaddle (PArallel Distributed Deep LEarning)是一个易用、高效、灵
- `安装说明 <../beginners_guide/install/index_cn.html>`_:我们支持在Ubuntu/CentOS/Windows/MacOS环境上的安装
-如果您初次接触深度学习,在学习PaddlePaddle之前建议您先阅读以下资料:
-
- - `学习资料 <../beginners_guide/basics/learning_materials.html>`_:推荐机器学习、深度学习和编程语言三个方面的书籍与视频公开课
-
如果您已经具备一定的深度学习基础,第一次使用PaddlePaddle时,可以跟随下列简单的模型案例供您快速上手:
- `Fluid编程指南 <../beginners_guide/programming_guide/programming_guide.html>`_:介绍 Fluid 的基本概念和使用方法
@@ -29,5 +25,4 @@ PaddlePaddle (PArallel Distributed Deep LEarning)是一个易用、高效、灵
install/index_cn.rst
quick_start/index.rst
basics/index.rst
- basics/learning_materials.md
programming_guide/programming_guide.md
diff --git a/doc/fluid/beginners_guide/index_en.rst b/doc/fluid/beginners_guide/index_en.rst
index b4aed76cc..3e60ffb58 100644
--- a/doc/fluid/beginners_guide/index_en.rst
+++ b/doc/fluid/beginners_guide/index_en.rst
@@ -11,22 +11,17 @@ For beginners of PaddlePaddle, the following documentation will tutor you about
- `Installation Manuals <../beginners_guide/install/index_en.html>`_ :Installation on Ubuntu/CentOS/Windows/MacOS is supported.
-The following resources are recommended for novices in deep learning:
-
- - `Resources <../beginners_guide/basics/learning_materials_en.html>`_ :Selected books and lectures about machine learning, deep learning and programming languages.
-
If you have been armed with certain level of deep learning knowledge, and it happens to be the first time to try PaddlePaddle, the following cases of model building will expedite your learning process:
- `Programming with Fluid <../beginners_guide/programming_guide/programming_guide_en.html>`_ : Core concepts and basic usage of Fluid
- `Quick Start <../beginners_guide/quick_start/index_en.html>`_: Two easy-to-go models, linear regression model and digit recognition model, are in place to speed up your study of training neural networks
- - `Deep Learning Basics <../beginners_guide/basics/index_en.html>`_: This section encompasses various fields of fundamental deep learning knowledge, such as image classification, customized recommendation, machine translation, and examples implemented by Fluid are provided.
+ - `Deep Learning Basics <../beginners_guide/basics/index_en.html>`_: This section encompasses various fields of fundamental deep learning knowledge, such as image classification, customized recommendation, machine translation, and examples implemented by Fluid are provided.
.. toctree::
:hidden:
install/index_en.rst
- basics/learning_materials_en.md
programming_guide/programming_guide_en.md
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
deleted file mode 100644
index d0dacb104..000000000
--- a/doc/fluid/build_and_install/build_from_source_cn.rst
+++ /dev/null
@@ -1,225 +0,0 @@
-Build from Source
-======================
-
-.. _requirements:
-
-Requirements
-----------------
-
-To build PaddlePaddle, we need
-
-1. A computer running Linux, Windows, or MacOS
-2. Docker
-
-Nothing else is required. Not even Python or GCC, because all build tools are installed into a Docker image.
-
-.. _build_step:
-
-How to Build
-----------------
-
-PaddlePaddle is built inside a Docker environment, which avoids installing the build dependencies separately. Pre-built Docker images for the different build environments, as well as instructions for building and using the paddle_manylinux_devel image, are available online. Alternatively, follow the optional step below to build the Docker image used to compile PaddlePaddle from source.
-
-If you choose not to use a Docker image, you must install the :ref:`compile dependencies <_compile_deps>` listed in the section below on your machine before starting the build.
-
-To build PaddlePaddle, run:
-
-.. code-block:: bash
-
-   # 1. get the source code
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   # 2. optional: build the Docker image used to compile PaddlePaddle from source
-   docker build -t paddle:dev .
-   # 3. run the following command to build CPU-only binaries
-   docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
-   # 4. or use the image built in the optional step above (step 2 must be run first)
-   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
-
-Notes:
-
-- The commands above map the current directory (the root of the source tree) to the :code:`/paddle` directory inside the container.
-
-- If you build with a manylinux image, you need to specify a Python ABI via the environment variable :code:`PYTHON_ABI`.
-  The Python ABIs currently supported by PaddlePaddle are :code:`cp27-cp27m` and :code:`cp27-cp27mu`.
-
-When the build finishes, the output whl package is generated under build/python/dist. You can install it on the current machine or copy it to a target machine:
-
-.. code-block:: bash
-
- pip install build/python/dist/*.whl
-
-If PaddlePaddle was installed on the machine before, there are two options:
-
-.. code-block:: bash
-
-   1. uninstall the previous version, then reinstall
-   pip uninstall paddlepaddle
-   pip install build/python/dist/*.whl
-
-   2. or upgrade directly to the newer version
-   pip install build/python/dist/*.whl -U
-
-.. _run_test:
-
-Run Unit Tests
-----------------
-
-If you want all unit tests to run immediately after the build, do the following:
-
-Setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` runs the unit tests right after the build finishes.
-Turning on :code:`WITH_GPU=ON` also runs the unit tests on GPU.
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
-
-To run a single unit test (e.g. :code:`test_sum_op`):
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- ./paddle/scripts/paddle_build.sh build
- cd build
- ctest -R test_sum_op -V
-
-.. _faq_docker:
-
-Frequently Asked Questions
----------------------------
-
-- What is Docker?
-
-  If you have not heard of Docker, think of it as a virtualenv-like system, except that it virtualizes more than just the Python runtime.
-
-- Docker or a virtual machine?
-
-  Some people liken Docker to virtual machines, but it is worth emphasizing that Docker does not virtualize any hardware: the build tools running in a Docker container run directly on the local CPU and operating system, with the same performance as installing them on the host.
-
-- Why Docker?
-
-  Installing the tools and configuration into a Docker image standardizes the build environment. If a problem comes up, others can reproduce it and help.
-
-  Besides, developers used to Windows and MacOS do not need to set up a cross-compilation environment when using Docker.
-
-- Can I choose not to use Docker?
-
-  Of course. You can install the development tools onto your machine in the same way they would be installed into a Docker image. This document describes the Docker-based workflow because it is simpler than any other approach.
-
-- How hard is it to learn Docker?
-
-  Understanding Docker is not hard: spend roughly ten minutes reading an introduction to Docker. That saves you an hour of installing and configuring development tools, plus the fresh installs needed whenever you switch machines. Keep in mind that PaddlePaddle updates may require new development tools, not to mention the benefit of making problems easier to reproduce.
-
-- Can I use an IDE?
-
-  Of course, because the source code is on your machine. IDEs by default call programs like make to build the source; you only need to configure the IDE to invoke the Docker command instead.
-
-  Many PaddlePaddle developers use Emacs. They add two lines to their `~/.emacs` configuration file:
-
-  .. code-block:: emacs
-
-     (global-set-key "\C-cc" 'compile)
-     (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-
-  and can then press `Ctrl-C` and `c` to start a build.
-
-- Can the build run in parallel?
-
-  Yes. Our Docker image runs a Paddle build Bash script that calls `make -j$(nproc)` to start as many processes as there are CPU cores.
-
-- Docker needs sudo
-
-  If you develop on your own machine, you naturally have administrator rights (sudo). If you develop on a shared machine, ask the administrator to install and configure Docker. In addition, the PaddlePaddle project is working to support container technologies that do not require sudo, such as rkt.
-
-- Building on Windows/MacOS is slow
-
-  Docker runs on both Windows and MacOS, but actually inside a Linux virtual machine. You may need to give that VM more CPU and memory to keep the build efficient. See the Docker documentation on allocating more memory and CPUs to the VM for details.
-
-- Not enough disk space
-
-  The examples in this document all pass the `--rm` option to `docker run`, which ensures containers are not kept on disk after they stop. Use `docker ps -a` to see stopped but undeleted containers. `docker build` sometimes produces unnamed intermediate images that also take disk space; see the Docker documentation on removing containers and images for cleanup.
-
-
-.. _compile_deps:
-
-Appendix: Compile Dependencies
---------------------------------
-
-PaddlePaddle needs the following dependencies (among others) to build; other dependent software is downloaded automatically at build time.
-
-.. csv-table:: PaddlePaddle compile dependencies
-   :header: "Dependency", "Version", "Notes"
-   :widths: 10, 15, 30
-
-   "CMake", ">=3.2", ""
-   "GCC", "4.8.2", "devtools2 recommended on CentOS"
-   "Python", "2.7.x", "depends on libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
-   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "optional"
-
-
-.. _build_options:
-
-Appendix: Build Options
--------------------------
-
-PaddlePaddle's build options cover whether to produce CPU/GPU binaries, which BLAS library to link, and so on.
-They can be set when invoking cmake; for detailed cmake usage see the
-official cmake documentation.
-
-On the cmake command line, set such options with ``-D``, for example:
-
-.. code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-.. csv-table:: Build option descriptions
-   :header: "Option", "Description", "Default"
-   :widths: 1, 7, 2
-
-   "WITH_GPU", "build with GPU support", "ON"
-   "WITH_C_API", "build only the C API", "OFF"
-   "WITH_DOUBLE", "use double-precision floating point", "OFF"
-   "WITH_DSO", "load the CUDA libraries dynamically at runtime instead of linking them statically", "ON"
-   "WITH_AVX", "build PaddlePaddle binaries with the AVX instruction set", "ON"
-   "WITH_PYTHON", "embed the Python interpreter", "ON"
-   "WITH_STYLE_CHECK", "check code style during the build", "ON"
-   "WITH_TESTING", "enable unit tests", "OFF"
-   "WITH_DOC", "build the Chinese and English documentation", "OFF"
-   "WITH_SWIG_PY", "build the Python SWIG interface, usable for inference and customized training", "Auto"
-   "WITH_GOLANG", "build the fault-tolerant parameter server written in Go", "OFF"
-   "WITH_MKL", "use the MKL math library; if off, OpenBLAS is used", "ON"
-
-BLAS
-+++++
-
-PaddlePaddle supports both the MKL and OpenBLAS BLAS libraries, with MKL used by default. If MKL is used and the machine supports the AVX2 instruction set,
-the MKL-DNN math library is downloaded as well; see the mkldnn design document for details.
-
-If MKL is turned off, OpenBLAS is used as the BLAS library.
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle automatically finds the CUDA and cuDNN libraries installed on the system at build time and run time.
-Passing the parameter :code:`-DCUDA_ARCH_NAME=Auto` enables automatic detection of the SM architecture, which speeds up the build.
-
-PaddlePaddle can be built and run with any cuDNN version from v5.1 onward, but try to keep the build-time and run-time cuDNN versions identical.
-We recommend the latest cuDNN version.
-
-Setting build options
-++++++++++++++++++++++
-
-PaddlePaddle locates the BLAS/CUDA/cuDNN libraries via paths given at build time. When cmake runs, it first searches the system paths ( :code:`/usr/lib:/usr/local/lib` ) and also reads the related path variables. Set them with ``-D``, for example:
-
-.. code-block:: bash
-
-    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**Note: these options only take effect the first time cmake runs. If you want to change them later, it is recommended to clean the whole build directory (** :code:`rm -rf` **) and then set them again.**
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
deleted file mode 100644
index 664b68da8..000000000
--- a/doc/fluid/build_and_install/build_from_source_en.rst
+++ /dev/null
@@ -1,237 +0,0 @@
-Build from Sources
-==========================
-
-.. _requirements:
-
-Requirements
-----------------
-
-To build PaddlePaddle, you need
-
-1. A computer -- Linux, Windows, MacOS.
-2. Docker.
-
-Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
-We run all the tools by running this image.
-
-.. _build_step:
-
-How To Build
-----------------
-
-You need to use Docker to build PaddlePaddle
-to avoid installing dependencies by yourself. We have several pre-built
-Docker images available online, along with instructions for building and
-using the paddle_manylinux_devel Docker image.
-Or you can build your own image from source, as the optional step below shows.
-
-If you don't wish to use docker, you need to install several compile dependencies manually, as :ref:`Compile Dependencies <_compile_deps>` shows, before starting compilation.
-
-.. code-block:: bash
-
- # 1. clone the source code
- git clone https://github.com/PaddlePaddle/Paddle.git
- cd Paddle
- # 2. Optional: build development docker image from source
- docker build -t paddle:dev .
- # 3. Run the following command to build a CPU-Only binaries
- docker run -it -v $PWD:/paddle -w /paddle -e "PYTHON_ABI=cp27-cp27mu" -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
- # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
- docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
-
-NOTE:
-
-- The above command tries to mount the current working directory (root directory of the source code)
-into the :code:`/paddle` directory inside the docker container.
-
-- You need to pass in the required environment variable :code:`PYTHON_ABI` to specify a `Python ABI `__.
-Currently PaddlePaddle supported Python ABIs include :code:`cp27-cp27m` and :code:`cp27-cp27mu` .
-
-When the compile finishes, you can get the output whl package under
-build/python/dist, then you can choose to install the whl on local
-machine or copy it to the target machine.
-
-.. code-block:: bash
-
- pip install build/python/dist/*.whl
-
-If PaddlePaddle has been installed on the machine before, there are two methods:
-
-.. code-block:: bash
-
- 1. uninstall and reinstall
- pip uninstall paddlepaddle
- pip install build/python/dist/*.whl
-
- 2. upgrade directly
- pip install build/python/dist/*.whl -U
-
-.. _run_test:
-
-Run Tests
-----------------
-
-If you wish to run the tests, follow the steps below:
-
-When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` runs the tests immediately after the build.
-Setting :code:`WITH_GPU=ON` also runs the tests on GPU.
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
-
-If you wish to run only one unit test, like :code:`test_sum_op`:
-
-.. code-block:: bash
-
- docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
- ./paddle/scripts/paddle_build.sh build
- cd build
- ctest -R test_sum_op -V
-
-.. _faq_docker:
-
-Frequently Asked Questions
----------------------------
-
-- What is Docker?
-
- If you haven't heard of it, consider it something like Python's virtualenv.
-
-- Docker or virtual machine?
-
-  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware, nor does it run a guest OS, which means there is no compromise on performance.
-
-- Why Docker?
-
- Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
-
- Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
-
-- Can I choose not to use Docker?
-
- Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker would make the development way easier.
-
-- How difficult is it to learn Docker?
-
-  It takes you ten minutes to read an introductory article, and saves you more than an hour of installing and configuring the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce an issue you report.
-
-- Can I use my favorite IDE?
-
- Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
-
- Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
-
- .. code-block:: emacs
-
- (global-set-key "\C-cc" 'compile)
- (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-
- so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
-
-- Does Docker do parallel building?
-
-  Our building Docker image runs a Bash script which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
-
-- Docker requires sudo
-
- An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
-
-- Docker on Windows/MacOS builds slowly
-
-  On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs to make the building efficient. Please refer to the Docker documentation for details.
-
-- Not enough disk space
-
-  Examples in this article use the option `--rm` with the `docker run` command, which ensures that stopped containers are not left on disk. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates intermediate dangling images, which also take disk space. To clean them, please refer to the Docker documentation.
-
-.. _compile_deps:
-
-Appendix: Compile Dependencies
--------------------------------
-
-PaddlePaddle needs the following dependencies when compiling; other dependencies
-will be downloaded automatically.
-
-.. csv-table:: PaddlePaddle Compile Dependencies
- :header: "Dependency", "Version", "Description"
- :widths: 10, 15, 30
-
- "CMake", ">=3.2", ""
- "GCC", "4.8.2", "Recommend devtools2 for CentOS"
- "Python", "2.7.x", "Need libpython2.7.so"
- "pip", ">=9.0", ""
- "numpy", "", ""
- "SWIG", ">=2.0", ""
- "Go", ">=1.8", "Optional"
-
-
-.. _build_options:
-
-Appendix: Build Options
--------------------------
-
-Build options include whether to build binaries for CPU or GPU, which BLAS
-library to use, etc. You may pass these settings when running cmake.
-For a detailed cmake tutorial, please refer to the official cmake documentation.
-
-
-You can add :code:`-D` argument to pass such options, like:
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=OFF
-
-.. csv-table:: Bool Type Options
- :header: "Option", "Description", "Default"
- :widths: 1, 7, 2
-
- "WITH_GPU", "Build with GPU support", "ON"
- "WITH_C_API", "Build only CAPI", "OFF"
- "WITH_DOUBLE", "Build with double precision", "OFF"
- "WITH_DSO", "Dynamically load CUDA libraries", "ON"
- "WITH_AVX", "Build with AVX support", "ON"
- "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
- "WITH_STYLE_CHECK", "Check code style when building", "ON"
- "WITH_TESTING", "Build unit tests", "OFF"
- "WITH_DOC", "Build documentations", "OFF"
- "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
- "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
- "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
-
-
-BLAS
-+++++
-
-PaddlePaddle supports `MKL `_ and
-`OpenBLAS `_ as its BLAS library. By default it uses MKL.
-If you are using MKL and your machine supports AVX2, MKL-DNN will also be
-downloaded and used; for more information, see `details `_ .
-
-If you choose not to use MKL, then OpenBLAS will be used.
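-
-Since :code:`WITH_MKL` defaults to ON (see the options table above), switching to OpenBLAS is a matter of turning that flag off. A minimal sketch:
-
-.. code-block:: bash
-
-   # use OpenBLAS instead of MKL as the BLAS library
-   cmake .. -DWITH_MKL=OFF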
-
-CUDA/cuDNN
-+++++++++++
-
-PaddlePaddle automatically finds CUDA and cuDNN when compiling and running.
-The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM
-architecture automatically in order to speed up the build.
-
-PaddlePaddle can be built with any cuDNN version later than v5.1, and we
-intend to keep up with the latest cuDNN versions. Be sure to run with the
-same version of cuDNN that you built with.
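-
-Putting these flags together, a GPU build that auto-detects the SM architecture could be configured as follows (a sketch using only options documented above):
-
-.. code-block:: bash
-
-   # GPU build; detect the SM architecture of the local GPU automatically
-   cmake .. -DWITH_GPU=ON -DCUDA_ARCH_NAME=Auto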
-
-Pass Compile Options
-++++++++++++++++++++++
-
-You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries.
-When running the cmake command, it will first search system paths such as
-:code:`/usr/lib:/usr/local/lib` and then the paths that you
-pass to cmake, e.g.
-
-.. code-block:: bash
-
- cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
-
-**NOTE: These options only take effect when running cmake for the first time. You need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change them.**
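-
-For instance, to point the build at a different cuDNN location after an initial configuration, clear the build directory first and then reconfigure (a sketch of the note above; the paths are illustrative):
-
-.. code-block:: bash
-
-   # remove the cached configuration, then run cmake again
-   cd build && rm -rf *
-   cmake .. -DWITH_GPU=ON -DCUDNN_ROOT=/opt/cudnnv5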
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
deleted file mode 100644
index 106c86bac..000000000
--- a/doc/fluid/build_and_install/docker_install_cn.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-Install and Run with Docker
-================================
-
-Installing and running PaddlePaddle with Docker lets you run it without worrying about dependency environments; it also works inside Docker on Windows.
-You can find basic instructions for installing and using Docker on the `Docker website `_ .
-
-If you are using Windows, you can refer to
-`this `_
-tutorial to install and use Docker on Windows.
-
-Once you are familiar with basic Docker usage, you can begin with the following steps:
-
-.. _docker_pull:
-
-Pull the PaddlePaddle Docker Image
-------------------------------------
-
-Run the following command to pull the latest PaddlePaddle Docker image (version cpu_avx_mkl):
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle
-
-For users in China, we provide a mirror for faster access:
-
- .. code-block:: bash
-
- docker pull docker.paddlepaddlehub.com/paddle
-
-Download the GPU version (cuda8.0_cudnn5_avx_mkl) of the Docker image:
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle:latest-gpu
- docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
-
-Choose a Docker image that uses a different BLAS library:
-
- .. code-block:: bash
-
- # the image uses MKL by default
- docker pull paddlepaddle/paddle
- # the image uses OpenBLAS
- docker pull paddlepaddle/paddle:latest-openblas
-
-To download a specific version of the Docker image, pick a tag from the `DockerHub website `_ and run the following command:
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle:[tag]
- # e.g.:
- docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
-
-.. _docker_run:
-
-Run a PaddlePaddle Training Program in Docker
-----------------------------------------------
-
-Suppose you have written a PaddlePaddle program :code:`train.py` in the current directory (e.g. /home/work; refer to
-`PaddlePaddleBook `_
-for examples). You can then start training with the following command:
-
- .. code-block:: bash
-
- cd /home/work
- docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-
-In the above command, the :code:`-it` flag runs the container interactively; :code:`-v $PWD:/work`
-mounts the current directory (in Linux, the $PWD variable expands to the absolute path of the current directory) to the :code:`/work`
-directory inside the container; :code:`paddlepaddle/paddle` specifies the image to use; finally, :code:`/work/train.py`
-is the command executed inside the container, i.e. the training program.
-
-Of course, you can also enter the Docker container and run or debug your code interactively:
-
- .. code-block:: bash
-
- docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
- cd /work
- python train.py
-
-**Note: To reduce its size, the PaddlePaddle Docker image does not install vim by default. You can run** :code:`apt-get install -y vim` **inside the container and then edit code there.**
-
-.. _docker_run_book:
-
-Launch the PaddlePaddle Book Tutorial with Docker
---------------------------------------------------
-
-With Docker, you can quickly start a local Jupyter Notebook containing the official PaddlePaddle Book tutorials and browse it in a web page.
-PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
-If you want to dig deeper into deep learning, PaddlePaddle Book is definitely your best choice.
-You can use it to read the tutorials, or to create and share interactive documents that combine code, formulas, charts, and text.
-
-We provide a Docker image that runs PaddlePaddle Book directly; simply run:
-
- .. code-block:: bash
-
- docker run -p 8888:8888 paddlepaddle/book
-
-Users in China can use the following mirror for faster access:
-
- .. code-block:: bash
-
- docker run -p 8888:8888 docker.paddlepaddlehub.com/book
-
-Then enter the following address in your browser:
-
- .. code-block:: text
-
- http://localhost:8888/
-
-That's all. Enjoy your journey!
-
-.. _docker_run_gpu:
-
-GPU Training with Docker
-------------------------------
-
-To ensure the GPU driver works properly inside the image, we recommend using
-`nvidia-docker `_ to run the image.
-Don't forget to install the latest GPU driver on the physical machine beforehand.
-
- .. code-block:: bash
-
- nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-
-**Note: If nvidia-docker is not installed, you can try the following method to mount the CUDA libraries and Linux devices into the Docker container:**
-
- .. code-block:: bash
-
- export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
- export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
- docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-
-**About AVX:**
-
-AVX is a CPU instruction set that can accelerate PaddlePaddle's computations. The latest PaddlePaddle Docker images
-are built with AVX enabled by default, so if your computer does not support AVX, you need to
-`build <./build_from_source_cn.html>`_ a no-avx version of PaddlePaddle yourself.
-
-The following command checks whether a Linux computer supports AVX:
-
- .. code-block:: bash
-
- if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-If the output is No, you need to use the no-AVX image.
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
deleted file mode 100644
index 25aecb8d0..000000000
--- a/doc/fluid/build_and_install/docker_install_en.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-Run in Docker Containers
-=================================
-
-Running PaddlePaddle in a Docker container means you don't need to care about
-runtime dependencies, and you can also run it on Windows. You can find
-tutorials `here `_ .
-
-If you are using Windows, please refer to
-`this `_
-tutorial to start running Docker on Windows.
-
-After you've read the above tutorials, you may proceed with the following steps.
-
-.. _docker_pull:
-
-Pull PaddlePaddle Docker Image
-------------------------------
-
-Run the following command to download the latest Docker image (version cpu_avx_mkl):
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle
-
-For users in China, we provide a faster mirror:
-
- .. code-block:: bash
-
- docker pull docker.paddlepaddlehub.com/paddle
-
-Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle:latest-gpu
- docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
-
-Choose between images built with different BLAS libraries:
-
- .. code-block:: bash
-
- # image using MKL by default
- docker pull paddlepaddle/paddle
- # image using OpenBLAS
- docker pull paddlepaddle/paddle:latest-openblas
-
-
-If you want to use legacy versions, choose a tag from
-`DockerHub `_
-and run:
-
- .. code-block:: bash
-
- docker pull paddlepaddle/paddle:[tag]
- # i.e.
- docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
-
-.. _docker_run:
-
-Launch your training program in Docker
---------------------------------------
-
-Assume that you have already written a PaddlePaddle program
-named :code:`train.py` under directory :code:`/home/work` (refer to
-`PaddlePaddleBook `_
-for more samples), then run the following command:
-
- .. code-block:: bash
-
- cd /home/work
- docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-
-In the above command, :code:`-it` means to run the container interactively;
-:code:`-v $PWD:/work` mounts the current directory ($PWD expands to the
-current absolute path in Linux) under :code:`/work` in the container;
-:code:`paddlepaddle/paddle` specifies the image to use; finally,
-:code:`/work/train.py` is the command to run inside the container.
-
-Also, you can go into the container shell and run or debug your code
-interactively:
-
- .. code-block:: bash
-
- docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
- cd /work
- python train.py
-
-**NOTE: We did not install vim in the default Docker image to reduce the image size. You can run** :code:`apt-get install -y vim` **to install it if you need to edit Python files.**
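-
-For example, assuming you are inside a running container with network access, installing vim looks like this (a convenience sketch, not an official step):
-
- .. code-block:: bash
-
-    # inside the container: refresh the package index, then install vim
-    apt-get update && apt-get install -y vim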
-
-.. _docker_run_book:
-
-PaddlePaddle Book
-------------------
-
-With Docker, you can create a container serving the PaddlePaddle Book as a
-Jupyter Notebook in one minute. PaddlePaddle Book is an interactive Jupyter
-Notebook for users and developers. If you want to dig deeper into deep
-learning, PaddlePaddle Book is definitely your best choice.
-
-We provide a packaged Book image; simply issue the command:
-
- .. code-block:: bash
-
- docker run -p 8888:8888 paddlepaddle/book
-
-For users in China, we provide a faster mirror:
-
- .. code-block:: bash
-
- docker run -p 8888:8888 docker.paddlepaddlehub.com/book
-
-Then open the following address in your local browser:
-
- .. code-block:: text
-
- http://localhost:8888/
-
-That's all. Enjoy your journey!
-
-.. _docker_run_gpu:
-
-Train on GPU with Docker
-------------------------------
-
-We recommend using
-`nvidia-docker `_
-to run GPU training jobs. Please ensure you have the latest
-GPU driver installed before moving on.
-
- .. code-block:: bash
-
- nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-
-**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
-
- .. code-block:: bash
-
- export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
- export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
- docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
-
-**About AVX:**
-
-AVX is a CPU instruction set extension that can accelerate PaddlePaddle's calculations.
-The latest PaddlePaddle Docker image turns AVX on by default, so if your
-computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.
-
-The following command will tell you whether your computer supports AVX:
-
- .. code-block:: bash
-
- if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
deleted file mode 100644
index 1a9305ac4..000000000
--- a/doc/fluid/build_and_install/index_cn.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-Install and Compile
-====================
-
-.. _install_steps:
-
-PaddlePaddle provides several installation methods for different groups of users.
-
-Focus on Deep Learning Model Development
------------------------------------------
-
-PaddlePaddle provides several Python wheel packages that can be installed with a single pip command:
-
-.. toctree::
- :maxdepth: 1
-
- pip_install_cn.rst
-
-This is the most convenient installation method. Please choose the package that matches your machine configuration and operating system.
-
-Focus on the Underlying Framework
-----------------------------------
-
-PaddlePaddle can also be installed with Docker. Please refer to the following tutorial:
-
-.. toctree::
- :maxdepth: 1
-
- docker_install_cn.rst
-
-We recommend running PaddlePaddle in Docker, which has the following advantages:
-
-- No need to install third-party dependencies separately
-- Easy to share the runtime environment and reproduce issues
-
-For users who need customized binaries, we also provide instructions for building and installing PaddlePaddle from source:
-
-.. toctree::
- :maxdepth: 1
-
- build_from_source_cn.rst
-
-.. warning::
-
- Please note that this installation method involves downloading, compiling, and installing a number of third-party libraries, so the whole process takes quite a long time.
-
-
-FAQ
---------------
-
-If you run into problems during installation, please first look for answers on the following page:
-
-:ref:`FAQ `
-
-If the problem is not resolved, you are welcome to report it to the PaddlePaddle community:
-
-`create an issue `_
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
deleted file mode 100644
index 7990bacbd..000000000
--- a/doc/fluid/build_and_install/index_en.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-Install and Compile
-======================
-
-.. _install_steps:
-
-PaddlePaddle provides various installation methods for different kinds of users.
-
-Focus on Deep Learning Model Development
-----------------------------------------
-
-PaddlePaddle provides a number of Python wheel packages that can be installed with pip:
-
-.. toctree::
- :maxdepth: 1
-
- pip_install_en.rst
-
-This is the most convenient installation method. Please choose the installation package that matches your machine configuration and operating system.
-
-Focus on the Underlying Framework
-----------------------------------
-
-PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
-
-.. toctree::
- :maxdepth: 1
-
- docker_install_en.rst
-
-We recommend running PaddlePaddle in Docker. This method has the following advantages:
-
-- Does not require installation of third-party dependencies.
-- Easy to share runtime environment.
-
-Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
-
-.. toctree::
- :maxdepth: 1
-
- build_from_source_en.rst
-
-.. warning::
-
- One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time-consuming.
-
-
-FAQ
------------
-
-For any problems during installation, please refer to the page below for answers:
-
-:ref:`FAQ `
-
-If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
-
-`create an issue `_
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
deleted file mode 100644
index 16087ce059aa3c07ce8c927d983eb86351915825..0000000000000000000000000000000000000000