Unverified commit 46b18b10 authored by tensor-tang, committed by GitHub

Ngraph (#1916)

* Added README for the ngraph engine

* Fix --model_save_dir

* Uncomment naive tests

* Update run.sh script

* update to use ngraph

* update readme

* Numactl -l was removed

* update instruction

* download pretrained model for inference only

* fix typo

* update
Parent f49ec1d7
# PaddlePaddle inference and training script
This directory contains the configuration and instructions to run PaddlePaddle + nGraph for local training and inference.
# How to build the PaddlePaddle framework with the nGraph engine
In order to build the PaddlePaddle + nGraph engine and run the proper script, follow these steps:
1. Install the PaddlePaddle project
2. Set environment variables for nGraph and OpenMP
3. Run the inference/training script
Currently supported models:
* ResNet50 (inference and training). Only the Adam optimizer is supported so far.
A short description of the aforementioned steps:
## 1. Install PaddlePaddle
Follow the PaddlePaddle [installation instructions](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments and make sure that `-DWITH_NGRAPH=ON` is set.
```
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_MKLDNN=ON -DWITH_NGRAPH=ON
```
Note: MKLDNN and MKL are required.
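For reference, a from-source build typically looks like the sketch below; the clone location, build directory, and wheel filename are assumptions and may differ in your environment.
```bash
# Sketch of an out-of-source build with the nGraph engine enabled (paths are assumptions).
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle && mkdir -p build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=ON \
         -DWITH_MKLDNN=ON -DWITH_NGRAPH=ON
make -j"$(nproc)"
# Install the freshly built wheel (the exact filename depends on your Python/Paddle version).
pip install python/dist/paddlepaddle-*.whl
```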
## 2. Set environment variables for nGraph and OpenMP
Set the following environment variables required to run nGraph:
```
export FLAGS_use_ngraph=true
export OMP_NUM_THREADS=<num_cpu_cores>
```
Optional exports for better performance:
```
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1
```
## 3. Run the benchmark script
If everything built successfully, you can run the commands from the ResNet50 nGraph section of the [run.sh](https://github.com/PaddlePaddle/models/blob/develop/fluid/PaddleCV/image_classification/run.sh) script to start the benchmark job locally. You will need to uncomment the `#ResNet50 nGraph` part of the script.
The above runs the training job with nGraph. To run the inference job with nGraph, download the pre-trained ResNet50 model from [supported models](https://github.com/PaddlePaddle/models/tree/72dcc7c1a8d5de9d19fbd65b4143bd0d661eee2c/fluid/PaddleCV/image_classification#supported-models-and-performances) and pass it to the inference script.
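For reference, a minimal end-to-end CPU session might look like the sketch below. It assumes you are inside the `fluid/PaddleCV/image_classification` directory and simply combines the exports from step 2 with the (uncommented) ResNet50 nGraph commands from `run.sh`; the flag values and the `ResNet50_pretrained` directory name are taken from that script and may need adjusting for your setup.
```bash
# Sketch: ResNet50 training and inference with nGraph on CPU
# (flags mirror the "#ResNet50 nGraph" block of run.sh).
export FLAGS_use_ngraph=true
export OMP_NUM_THREADS=$(nproc)

# Training
python train.py \
    --model=ResNet50 \
    --batch_size=128 \
    --total_images=1281167 \
    --class_dim=1000 \
    --image_shape=3,224,224 \
    --lr_strategy=adam \
    --lr=0.001 \
    --num_epochs=120 \
    --with_mem_opt=False \
    --model_category=models_name \
    --model_save_dir=output/ \
    --use_gpu=False

# Inference with the downloaded pre-trained weights
python infer.py \
    --use_gpu=false \
    --model=ResNet50 \
    --pretrained_model=ResNet50_pretrained
```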
run.sh:

@@ -192,3 +192,26 @@ python train.py \
 # --model_category=models_name \
 # --model_save_dir=output/ \
 # --l2_decay=3e-4
+#ResNet50 nGraph:
+# Training:
+#OMP_NUM_THREADS=`nproc` FLAGS_use_ngraph=true python train.py \
+# --model=ResNet50 \
+# --batch_size=128 \
+# --total_images=1281167 \
+# --class_dim=1000 \
+# --image_shape=3,224,224 \
+# --lr_strategy=none \
+# --lr=0.001 \
+# --num_epochs=120 \
+# --with_mem_opt=False \
+# --model_category=models_name \
+# --model_save_dir=output/ \
+# --lr_strategy=adam \
+# --use_gpu=False
+# Inference:
+#OMP_NUM_THREADS=`nproc` FLAGS_use_ngraph=true python infer.py \
+# --use_gpu=false \
+# --model=ResNet50 \
+# --pretrained_model=ResNet50_pretrained
train.py:

@@ -116,6 +116,9 @@ def optimizer_setting(params):
             learning_rate=lr,
             momentum=momentum_rate,
             regularization=fluid.regularizer.L2Decay(l2_decay))
+    elif ls["name"] == "adam":
+        lr = params["lr"]
+        optimizer = fluid.optimizer.Adam(learning_rate=lr)
     else:
         lr = params["lr"]
         l2_decay = params["l2_decay"]

@@ -264,14 +267,17 @@ def train(args):
         fluid.io.load_vars(
             exe, pretrained_model, main_program=train_prog, predicate=if_exist)

-    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
-    if visible_device:
-        device_num = len(visible_device.split(','))
+    if args.use_gpu:
+        visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
+        if visible_device:
+            device_num = len(visible_device.split(','))
+        else:
+            device_num = subprocess.check_output(
+                ['nvidia-smi', '-L']).decode().count('\n')
     else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
+        device_num = 1

     train_batch_size = args.batch_size / device_num
     test_batch_size = 16
     if not args.enable_ce:
         train_reader = paddle.batch(

@@ -292,10 +298,15 @@ def train(args):
     train_py_reader.decorate_paddle_reader(train_reader)
     test_py_reader.decorate_paddle_reader(test_reader)

-    train_exe = fluid.ParallelExecutor(
-        main_program=train_prog,
-        use_cuda=bool(args.use_gpu),
-        loss_name=train_cost.name)
+    use_ngraph = os.getenv('FLAGS_use_ngraph')
+    if not use_ngraph:
+        train_exe = fluid.ParallelExecutor(
+            main_program=train_prog,
+            use_cuda=bool(args.use_gpu),
+            loss_name=train_cost.name)
+    else:
+        train_exe = exe

     train_fetch_list = [
         train_cost.name, train_acc1.name, train_acc5.name, global_lr.name

@@ -314,9 +325,13 @@ def train(args):
             try:
                 while True:
                     t1 = time.time()
-                    loss, acc1, acc5, lr = train_exe.run(
-                        fetch_list=train_fetch_list)
+                    if use_ngraph:
+                        loss, acc1, acc5, lr = train_exe.run(
+                            train_prog, fetch_list=train_fetch_list)
+                    else:
+                        loss, acc1, acc5, lr = train_exe.run(
+                            fetch_list=train_fetch_list)
                     t2 = time.time()
                     period = t2 - t1
                     loss = np.mean(np.array(loss))