提交 6f6ecbec 编写于 作者: G guru4elephant 提交者: Tao Luo

remove benchmark folder, since there is a benchmark repo already, distributed...

remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537)

test=develop
上级 1f1cc222
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out
name: "alexnet"
input: "data"
input_dim: 64
input_dim: 3
input_dim: 227
input_dim: 227
input: "label"
input_dim: 64
input_dim: 1
input_dim: 1
input_dim: 1
force_backward: true
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "norm2"
type: "LRN"
bottom: "conv2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "norm2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
此差异已折叠。
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet
# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet
# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
batch_per_gpu=`expr ${batch} / 4`
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "1c\net : \"${cfg}\"" solver.prototxt
caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet
# googlnet
test googlenet.prototxt 512 googlenet
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/
# Fluid Benchmark
This directory contains several models configurations and tools that used to run
Fluid benchmarks for local and distributed training.
## Run the Benchmark
To start, run the following command to get the full help message:
```bash
python fluid_benchmark.py --help
```
Currently supported `--model` argument include:
* mnist
* resnet
* you can chose to use different dataset using `--data_set cifar10` or
`--data_set flowers`.
* vgg
* stacked_dynamic_lstm
* machine_translation
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
`--gpus <gpu_num>` to run multi GPU training.
You can set async mode parameter server. With async mode, you can specify
`--async_mode` to train model asynchronous.
* Run distributed training with parameter servers:
* see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
sleep 15
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Prepare the RecordIO file to Achieve Better Performance
Run the following command will generate RecordIO files like "mnist.recordio" under the path
and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
have to start all those processes manually on each node, which is not recommended.
To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it by your own. Once you've got the "whl" package, put it under the current directory and run:
```bash
docker build -t [your docker image name]:[your docker image tag] .
```
Then push the image to a Docker registry that your Kubernetes cluster can reach.
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
```bash
kubectl create -f myjob/
```
The job shall start.
## Notes for Run Fluid Distributed with NCCL2 and RDMA
Before running NCCL2 distributed jobs, please check that whether your node has multiple network
interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual
network device.
To run high-performance distributed training, you must prepare your hardware environment to be
able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
note for details.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "se_resnext", "vgg", "mnist",
"stacked_dynamic_lstm", "resnet_with_preprocess"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
# args related to learning rate
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
help='The first num of minibatch num to skip, for better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
help='The data data_format, now only support NCHW.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
help='If set, do not test the testset during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
help='If set, optimize runtime memory before start.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether split variables into blocks when update_method is pserver')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether start pserver in async mode to support ASGD')
parser.add_argument(
'--use_reader_op',
action='store_true',
help='Whether to use reader op, and must specify the data path if set this to true.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--test_data_path',
type=str,
default="",
help='Directory that contains all the test data (NOT recordio).')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, use inference transpiler to optimize the program.')
parser.add_argument(
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
parser.add_argument(
'--fuse_broadcast_op',
action='store_true',
help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.'
)
args = parser.parse_args()
return args
#!/bin/bash
if [ "`uname -s`" != "Linux" ]; then
echo "Current scenario only support in Linux yet!"
exit 0
fi
echo "========================= Hardware Information ========================="
sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
physical_cores=$((sockets * cores_per_socket))
virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
echo "Socket Number : $sockets"
echo "Cores Per Socket : $cores_per_socket"
echo "Total Physical Cores : $physical_cores"
echo "Total Virtual Cores : $virtual_cores"
if [ $ht -eq 1 ]; then
echo "Hyper Threading : OFF"
if [ $physical_cores -ne $virtual_cores ]; then
echo "Error: HT logical error"
fi
else
echo "Hyper Threading : ON"
if [ $physical_cores -ge $virtual_cores ]; then
echo "Error: HT logical error"
fi
fi
echo "NUMA Nodes : $numa_nodes"
if [ $numa_nodes -lt $sockets ]; then
echo "Warning: NUMA node is not enough for the best performance,\
at least $sockets"
fi
echo "-------------------------- Memory Information --------------------------"
# dmidecode support start from 2.11
dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
if [ $dmi_ver -lt 2 ]; then
echo "Error: dmidecode unknown or version is too old"
exit 0
fi
if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
echo "Error: need root to run dmidecode"
exit 0
fi
max_dimms=0
num_dimms_installed=0
for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
num_refered=`dmidecode |grep -wc "$dimm_id"`
# the actual dimm id should be refered only once
if [ $num_refered -eq 1 ]; then
num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
/Unknown/ {f=1};
/Manufacturer/ {if (s==1) {print f; exit 0;}};'`
if [ $num_unknown -eq 0 ]; then
dimms_installed="$dimms_installed \n $dimm_id"
((num_dimms_installed++))
else
dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
fi
((max_dimms++))
fi
done
echo "Installed DIMM number : $num_dimms_installed"
num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
fi
num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
if [ $num_dimms_installed -ne $num_clock_configed ]; then
echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
fi
echo -e "Installed DIMMs Locator: $dimms_installed"
echo -e "Not installed DIMMs : $dimms_uninstalled"
max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
echo "DIMMs max slots : $max_dimm_slots"
if [ $max_dimms -ne $max_dimm_slots ]; then
echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
fi
free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB"
swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
else
mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
fi
echo "Memory Size : $mem_sz"
echo "Swap Memory Size : $swap_sz"
echo "Total Memory Size : $total_sz"
echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
# DIMMs fequency
clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
echo "Configed Clock Speed : $clock_speeds"
num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
if [ $num_clock_type -ne 1 ]; then
echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
fi
echo "-------------------------- Turbo Information --------------------------"
scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
echo "Scaling Driver : $scaling_drive"
if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
if [ $turbo -eq 1 ]; then
echo "Turbo Status : OFF"
else
echo "Turbo Status : ON"
fi
else
echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
echo "Turbo Status : Unknown"
fi
# cpu frequency
num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
if [ $num_max_freq -ne 1 ]; then
echo "Error: the max_frequency of all CPU should be equal"
fi
if [ $num_min_freq -ne 1 ]; then
echo "Error: the min_frequency of all CPU should be equal"
fi
max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
echo "CPU Max Frequency : $max_freq GHz"
echo "CPU Min Frequency : $min_freq GHz"
# cpu governor
num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
if [ $num_governor -ne 1 ]; then
echo "Error: the governor of all CPU should be the same"
fi
governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
echo "CPU Freq Governor : $governor"
echo "========================= Software Information ========================="
echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
echo "OS Version : `cat /etc/redhat-release`"
echo "Kernel Release Version : `uname -r`"
echo "Kernel Patch Version : `uname -v`"
echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
if command -v cmake >/dev/null 2>&1; then
cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
else
cmake_ver=" Not installed"
fi
echo "CMake Version :$cmake_ver"
echo "------------------ Environment Variables Information -------------------"
kmp_affinity=`env | grep KMP_AFFINITY`
omp_dynamic=`env | grep OMP_DYNAMIC`
omp_nested=`env | grep OMP_NESTED`
omp_num_threads=`env | grep OMP_NUM_THREADS`
mkl_num_threads=`env | grep MKL_NUM_THREADS`
mkl_dynamic=`env | grep MKL_DYNAMIC`
if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
if [ ! $omp_nested ]; then omp_nested="unset"; fi
if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
echo "KMP_AFFINITY : $kmp_affinity"
echo "OMP_DYNAMIC : $omp_dynamic"
echo "OMP_NESTED : $omp_nested"
echo "OMP_NUM_THREADS : $omp_num_threads"
echo "MKL_NUM_THREADS : $mkl_num_threads"
echo "MKL_DYNAMIC : $mkl_dynamic"
# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
mkldnn_found=`find $path -name "libmkldnn.so"`
if [ "$mkldnn_found" ]; then
echo "Found MKL-DNN : $mkldnn_found"
fi
mklml_found=`find $path -name "libmklml_intel.so"`
if [ "$mklml_found" ]; then
echo "Found MKLML : $mklml_found"
fi
iomp_found=`find $path -name "libiomp5.so"`
if [ "$iomp_found" ]; then
echo "Found IOMP : $iomp_found"
fi
done
# dump all details for fully check
lscpu > lscpu.dump
dmidecode > dmidecode.dump
# The expected result would be like:
# ========================= Hardware Information =========================
# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
# CPU Family : 6
# Socket Number : 2
# Cores Per Socket : 20
# Total Physical Cores : 40
# Total Virtual Cores : 40
# Hyper Threading : OFF
# NUMA Nodes : 2
# -------------------------- Memory Information --------------------------
# Installed DIMM number : 12
# Installed DIMMs Locator:
# CPU1_DIMM_A1
# CPU1_DIMM_B1
# CPU1_DIMM_C1
# CPU1_DIMM_D1
# CPU1_DIMM_E1
# CPU1_DIMM_F1
# CPU2_DIMM_A1
# CPU2_DIMM_B1
# CPU2_DIMM_C1
# CPU2_DIMM_D1
# CPU2_DIMM_E1
# CPU2_DIMM_F1
# Not installed DIMMs :
# CPU1_DIMM_A2
# CPU1_DIMM_B2
# CPU1_DIMM_C2
# CPU1_DIMM_D2
# CPU1_DIMM_E2
# CPU1_DIMM_F2
# CPU2_DIMM_A2
# CPU2_DIMM_B2
# CPU2_DIMM_C2
# CPU2_DIMM_D2
# CPU2_DIMM_E2
# CPU2_DIMM_F2
# DIMMs max slots : 24
# Memory Size : 376G
# Swap Memory Size : 4.0G
# Total Memory Size : 380G
# Max Memory Capacity : 2304 GB
# Configed Clock Speed : 2666 MHz
# -------------------------- Turbo Information --------------------------
# Scaling Driver : intel_pstate
# Turbo Status : ON
# CPU Max Frequency : 3.70 GHz
# CPU Min Frequency : 1.00 GHz
# CPU Freq Governor : performance
# ========================= Software Information =========================
# BIOS Release Date : 03/10/2017
# OS Version : CentOS Linux release 7.3.1611 (Core)
# Kernel Release Version : 3.10.0-514.el7.x86_64
# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016
# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11)
# CMake Version : 3.5.2
# ------------------ Environment Variables Information -------------------
# KMP_AFFINITY : unset
# OMP_DYNAMIC : unset
# OMP_NESTED : unset
# OMP_NUM_THREADS : unset
# MKL_NUM_THREADS : unset
# MKL_DYNAMIC : unset
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import cProfile
import time
import os
import traceback
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
from args import *
def append_nccl2_prepare(trainer_id, startup_prog):
if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT")
worker_ips = os.getenv("PADDLE_TRAINER_IPS")
worker_endpoints = []
for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port]))
num_trainers = len(worker_endpoints)
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint)
nccl_id_var = startup_prog.global_block().create_var(
name="NCCLID",
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
startup_prog.global_block().append_op(
type="gen_nccl_id",
inputs={},
outputs={"NCCLID": nccl_id_var},
attrs={
"endpoint": current_endpoint,
"endpoint_list": worker_endpoints,
"trainer_id": trainer_id
})
return nccl_id_var, num_trainers, trainer_id
else:
raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"nccl-based dist train.")
def dist_transpile(trainer_id, args, train_prog, startup_prog):
if trainer_id < 0:
return None, None
# the port of all pservers, needed by both trainer and pserver
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
# comma separated ips of all pservers, needed by trainer and
# pserver
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
# total number of workers/trainers in the job, needed by
# trainer and pserver
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = not args.no_split_var
config.min_block_size = 1048576
t = distribute_transpiler.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
# NOTE: *MUST* use train_prog, for we are using with guard to
# generate different program for train and test.
program=train_prog,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(
current_endpoint, pserver_program, startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
return train_program, startup_prog
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
def test_parallel(exe, test_args, args, test_prog, feeder):
acc_evaluators = []
for i in xrange(len(test_args[2])):
acc_evaluators.append(fluid.metrics.Accuracy())
to_fetch = [v.name for v in test_args[2]]
if args.use_reader_op:
test_args[4].start()
while True:
try:
acc_rets = exe.run(fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(
value=np.array(acc_rets[i]), weight=args.batch_size)
except fluid.core.EOFException as eof:
test_args[4].reset()
break
else:
for batch_id, data in enumerate(test_args[3]()):
acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(value=np.array(acc_rets[i]), weight=len(data))
return [e.eval() for e in acc_evaluators]
# NOTE: only need to benchmark using parallelexe
def train_parallel(train_args, test_args, args, train_prog, test_prog,
startup_prog, nccl_id_var, num_trainers, trainer_id):
over_all_start = time.time()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
feeder = None
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block()._clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
# parameter server mode distributed training, merge
# gradients on local server, do not initialize
# ParallelExecutor with multi server all-reduce mode.
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor(
True,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
if not args.no_test:
if args.update_method == "pserver":
test_scope = None
else:
# NOTE: use an empty scope to avoid test exe using NCCLID
test_scope = fluid.Scope()
test_exe = fluid.ParallelExecutor(
True, main_program=test_prog, share_vars_from=exe)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
if not args.use_reader_op:
reader_generator = train_args[3]() #train_reader
batch_id = 0
data = None
if args.use_reader_op:
train_args[4].start()
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if args.profile and batch_id == 5:
profiler.start_profiler("All")
profiler.reset_profiler()
elif args.profile and batch_id == 10:
print("profiling total time: ", time.time() - start_time)
profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
(trainer_id, pass_id))
if iters == args.iterations:
reader_generator.close()
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
fetch_list = [avg_loss.name]
acc_name_list = [v.name for v in train_args[2]]
fetch_list.extend(acc_name_list)
if args.use_fake_data or args.use_reader_op:
try:
fetch_ret = exe.run(fetch_list)
except fluid.core.EOFException as eof:
break
except fluid.core.EnforceNotMet as ex:
traceback.print_exc()
break
else:
fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1
if batch_id % 1 == 0:
fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
print("Pass %d, batch %d, loss %s, accucacys: %s" %
(pass_id, batch_id, fetched_data[0], fetched_data[1:]))
batch_id += 1
print_train_time(start_time, time.time(), num_samples)
if args.use_reader_op:
train_args[4].reset() # reset reader handle
else:
del reader_generator
if not args.no_test and test_args[2]:
test_feeder = None
if not args.use_reader_op:
test_feed_var_list = [
var for var in test_prog.global_block().vars.itervalues()
if var.is_data
]
test_feeder = fluid.DataFeeder(test_feed_var_list, place)
test_ret = test_parallel(test_exe, test_args, args, test_prog,
test_feeder)
print("Pass: %d, Test Accuracy: %s\n" %
(pass_id, [np.mean(np.array(v)) for v in test_ret]))
print("total train time: ", time.time() - over_all_start)
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def print_train_time(start_time, end_time, num_samples):
train_elapsed = end_time - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main():
args = parse_args()
print_arguments(args)
print_paddle_envs()
if args.no_random:
fluid.default_startup_program().random_seed = 1
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
if args.use_cprof:
pr = cProfile.Profile()
pr.enable()
model_def = __import__("models.%s" % args.model, fromlist=["models"])
train_prog = fluid.Program()
test_prog = fluid.Program()
startup_prog = fluid.Program()
train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
all_args = [train_args, test_args, args]
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
startup_prog)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
all_args.extend([train_prog, test_prog, startup_prog])
if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
# start pserver with Executor
server_exe = fluid.Executor(fluid.CPUPlace())
server_exe.run(startup_prog)
server_exe.run(train_prog)
exit(0)
# for other update methods, use default programs
all_args.extend([train_prog, test_prog, startup_prog])
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
trainer_id, startup_prog)
if args.device == "CPU":
raise Exception("Only support GPU perf with parallel exe")
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
if __name__ == "__main__":
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue, file_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import copy
import argparse
import random
import os
import copy
from kube_templates import pserver, trainer, envs
def parse_args():
parser = argparse.ArgumentParser(description='Generate dist job yamls.')
parser.add_argument(
'--jobname', default="paddlejob", help='unique job name')
parser.add_argument(
'--cpu', default=1, type=int, help='CPU cores per trainer node')
parser.add_argument(
'--pscpu', default=1, type=int, help='CPU cores per pserver node')
parser.add_argument(
'--gpu', default=0, type=int, help='num of GPUs per node')
parser.add_argument(
'--image',
default="bootstrapper:5000/fluid_benchmark:gpu",
help='num of GPUs per node')
parser.add_argument(
'--pservers', default=1, type=int, help='num of pservers')
parser.add_argument(
'--trainers', default=1, type=int, help='num of trainers')
parser.add_argument('--memory', default=1, type=int, help='trainer memory')
parser.add_argument(
'--psmemory', default=1, type=int, help='pserver memory')
parser.add_argument(
'--port', default=30236, type=int, help='num of trainers')
parser.add_argument(
'--entry', default="python train.py", help='command to run')
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
'--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",
type=str,
choices=['pserver', 'nccl2', 'local'],
help='pserver or nccl2 or local')
args = parser.parse_args()
return args
def gen_job():
ps = pserver
tn = trainer
args = parse_args()
ps_container = ps["spec"]["template"]["spec"]["containers"][0]
tn_container = tn["spec"]["template"]["spec"]["containers"][0]
if args.fluid == 1:
ps_container["command"] = \
["paddle_k8s", "start_fluid"]
tn_container["command"] = \
["paddle_k8s", "start_fluid"]
ps["metadata"]["name"] = args.jobname + "-pserver"
ps["spec"]["template"]["metadata"]["labels"][
"paddle-job-pserver"] = args.jobname
tn["metadata"]["name"] = args.jobname + "-trainer"
tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
ps_container["image"] = args.image
tn_container["image"] = args.image
ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
if args.gpu > 0:
tn_container["resources"]["requests"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
tn_container["resources"]["limits"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
ps["spec"]["replicas"] = int(args.pservers)
tn["spec"]["parallelism"] = int(args.trainers)
tn["spec"]["completions"] = int(args.trainers)
ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
ps_container["ports"][0]["containerPort"] = args.port
spreadport = random.randint(40000, 60000)
tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
"name": "LD_LIBRARY_PATH",
"value":
"/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
})
volumes = [{
"name": "nvidia-driver",
"hostPath": {
"path": "/usr/local/nvidia/lib64"
}
}]
volumeMounts = [{
"mountPath": "/usr/local/nvidia/lib64",
"name": "nvidia-driver"
}]
if args.rdma:
volumes.extend([{
"name": "ibetc",
"hostPath": {
"path": "/etc/libibverbs.d"
}
}, {
"name": "iblibs",
"hostPath": {
"path": "/usr/local/rdma"
}
}, {
"name": "valgrind",
"hostPath": {
"path": "/usr/lib64/mlnx_ofed/valgrind"
}
}])
volumeMounts.extend([{
"mountPath": "/etc/libibverbs.d",
"name": "ibetc"
}, {
"mountPath": "/usr/local/rdma",
"name": "iblibs"
}, {
"mountPath": "/usr/lib64/mlnx_ofed/valgrind",
"name": "valgrind"
}])
# append shm for NCCL2
volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
# add ceph volumes
volumes.append({
"name": "ceph-data",
"cephfs": {
"monitors": ["192.168.16.23:6789"],
"secretRef": {
"name": "ceph-secret"
},
"user": "admin",
}
})
volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs
if args.disttype == "pserver":
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER"
})
elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname)
if args.disttype == "pserver":
with open("%s/pserver.yaml" % args.jobname, "w") as fn:
yaml.dump(ps, fn)
with open("%s/trainer.yaml" % args.jobname, "w") as fn:
yaml.dump(tn, fn)
if __name__ == "__main__":
gen_job()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pserver import pserver
from trainer import trainer
__all__ = ["pserver", "trainer", "envs"]
envs = [
# envs that don't need to change
{
"name": "GLOG_v",
"value": "0"
},
{
"name": "GLOG_logtostderr",
"value": "1"
},
{
"name": "TOPOLOGY",
"value": ""
},
{
"name": "TRAINER_PACKAGE",
"value": "/workspace"
},
{
"name": "PADDLE_INIT_NICS",
"value": "eth2"
},
{
"name": "NAMESPACE",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.namespace"
}
}
},
{
"name": "POD_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pserver = {
"apiVersion": "extensions/v1beta1",
"kind": "ReplicaSet",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"replicas": 1,
"template": {
"metadata": {
"labels": {
"paddle-job-pserver": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"containers": [{
"name": "pserver",
"image": "",
"imagePullPolicy": "Always",
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_pserver"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4"
},
"limits": {
"memory": "10Gi",
"cpu": "4"
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
trainer = {
"apiVersion": "batch/v1",
"kind": "Job",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"parallelism": 4,
"completions": 4,
"template": {
"metadata": {
"labels": {
"paddle-job": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"restartPolicy": "Never",
"containers": [{
"name": "trainer",
"image": "",
"imagePullPolicy": "Always",
# to let container set rlimit
"securityContext": {
"privileged": True
# TODO(wuyi): use below specific cap instead of privileged,
# using privileged will cause all GPU device are visible
# in the container.
# "capabilities": {
# "add": ["SYS_RESOURCE"]
# }
},
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_trainer", "v2"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4",
},
"limits": {
"memory": "10Gi",
"cpu": "4",
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
"resnet_with_preprocess"
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""seq2seq model for fluid."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import distutils.util
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
x=input_gate, y=cell_tilde)
])
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t))
return hidden_t, cell_t
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
"""Construct a seq2seq network."""
def bi_lstm_encoder(input_seq, gate_size):
# Linear transformation part for input gate, output gate, forget gate
# and cell activation vectors need be done outside of dynamic_lstm.
# So the output size is 4 times of gate_size.
input_forward_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
input_reversed_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
reversed, _ = fluid.layers.dynamic_lstm(
input=input_reversed_proj,
size=gate_size * 4,
is_reverse=True,
use_peepholes=False)
return forward, reversed
src_word_idx = fluid.layers.data(
name='source_sequence', shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_idx,
size=[source_dict_dim, embedding_dim],
dtype='float32')
src_forward, src_reversed = bi_lstm_encoder(
input_seq=src_embedding, gate_size=encoder_size)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_reversed], axis=1)
encoded_proj = fluid.layers.fc(input=encoded_vector,
size=decoder_size,
bias_attr=False)
backward_first = fluid.layers.sequence_pool(
input=src_reversed, pool_type='first')
decoder_boot = fluid.layers.fc(input=backward_first,
size=decoder_size,
bias_attr=False,
act='tanh')
def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
decoder_boot, decoder_size):
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
concated = fluid.layers.concat(
input=[encoder_proj, decoder_state_expand], axis=1)
attention_weights = fluid.layers.fc(input=concated,
size=1,
act='tanh',
bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(
x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
rnn = fluid.layers.DynamicRNN()
cell_init = fluid.layers.fill_constant_batch_size_like(
input=decoder_boot,
value=0.0,
shape=[-1, decoder_size],
dtype='float32')
cell_init.stop_gradient = False
with rnn.block():
current_word = rnn.step_input(target_embedding)
encoder_vec = rnn.static_input(encoder_vec)
encoder_proj = rnn.static_input(encoder_proj)
hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
cell_mem = rnn.memory(init=cell_init)
context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
decoder_inputs = fluid.layers.concat(
input=[context, current_word], axis=1)
h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
rnn.update_memory(hidden_mem, h)
rnn.update_memory(cell_mem, c)
out = fluid.layers.fc(input=h,
size=target_dict_dim,
bias_attr=True,
act='softmax')
rnn.output(out)
return rnn()
if not is_generating:
trg_word_idx = fluid.layers.data(
name='target_sequence', shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_word_idx,
size=[target_dict_dim, embedding_dim],
dtype='float32')
prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
encoded_proj, decoder_boot,
decoder_size)
label = fluid.layers.data(
name='label_sequence', shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
return avg_cost, feeding_list
def lodtensor_to_ndarray(lod_tensor):
dims = lod_tensor.get_dims()
ndarray = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ndarray.ravel()[i] = lod_tensor.get_float_element(i)
return ndarray
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
encoder_size = 512
decoder_size = 512
dict_size = 30000
beam_size = 3
max_length = 250
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
if is_train:
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size)
if is_train else paddle.dataset.wmt14.test(dict_size),
buf_size=1000),
batch_size=args.batch_size * args.gpus)
return avg_cost, optimizer, [], batch_generator, None
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import cProfile
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
SEED = 1
DTYPE = "float32"
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
input, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
# Optimization
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# Reader
if is_train:
reader = paddle.dataset.mnist.train()
else:
reader = paddle.dataset.mnist.test()
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus)
return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
#!/bin/bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
sleep 15
CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
You also should install tflearn:
```bash
pip install -r requirements.txt
```
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册