Commit 6f6ecbec authored by guru4elephant, committed by Tao Luo

remove benchmark folder, since there is a benchmark repo already, distributed benchmark will be maintained in fleet repo (#18537)

test=develop
Parent 1f1cc222
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
fluid/models/*.pyc
fluid/logs
fluid/nohup.out
name: "alexnet"
input: "data"
input_dim: 64
input_dim: 3
input_dim: 227
input_dim: 227
input: "label"
input_dim: 64
input_dim: 1
input_dim: 1
input_dim: 1
force_backward: true
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "norm2"
type: "LRN"
bottom: "conv2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "norm2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
name: "googlenet"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 224
input_dim: 224
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1/7x7_s2"
type: "Convolution"
bottom: "data"
top: "conv1/7x7_s2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 3
kernel_size: 7
stride: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv1/relu_7x7"
type: "ReLU"
bottom: "conv1/7x7_s2"
top: "conv1/7x7_s2"
}
layer {
name: "pool1/3x3_s2"
type: "Pooling"
bottom: "conv1/7x7_s2"
top: "pool1/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
#layer {
# name: "pool1/norm1"
# type: "LRN"
# bottom: "pool1/3x3_s2"
# top: "pool1/norm1"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "conv2/3x3_reduce"
type: "Convolution"
# bottom: "pool1/norm1"
bottom: "pool1/3x3_s2"
top: "conv2/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3_reduce"
type: "ReLU"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3_reduce"
}
layer {
name: "conv2/3x3"
type: "Convolution"
bottom: "conv2/3x3_reduce"
top: "conv2/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "conv2/relu_3x3"
type: "ReLU"
bottom: "conv2/3x3"
top: "conv2/3x3"
}
#layer {
# name: "conv2/norm2"
# type: "LRN"
# bottom: "conv2/3x3"
# top: "conv2/norm2"
# lrn_param {
# local_size: 5
# alpha: 0.0001
# beta: 0.75
# }
#}
layer {
name: "pool2/3x3_s2"
type: "Pooling"
# bottom: "conv2/norm2"
bottom: "conv2/3x3"
top: "pool2/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_3a/1x1"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_1x1"
type: "ReLU"
bottom: "inception_3a/1x1"
top: "inception_3a/1x1"
}
layer {
name: "inception_3a/3x3_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3_reduce"
}
layer {
name: "inception_3a/3x3"
type: "Convolution"
bottom: "inception_3a/3x3_reduce"
top: "inception_3a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_3x3"
type: "ReLU"
bottom: "inception_3a/3x3"
top: "inception_3a/3x3"
}
layer {
name: "inception_3a/5x5_reduce"
type: "Convolution"
bottom: "pool2/3x3_s2"
top: "inception_3a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5_reduce"
}
layer {
name: "inception_3a/5x5"
type: "Convolution"
bottom: "inception_3a/5x5_reduce"
top: "inception_3a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_5x5"
type: "ReLU"
bottom: "inception_3a/5x5"
top: "inception_3a/5x5"
}
layer {
name: "inception_3a/pool"
type: "Pooling"
bottom: "pool2/3x3_s2"
top: "inception_3a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3a/pool_proj"
type: "Convolution"
bottom: "inception_3a/pool"
top: "inception_3a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3a/relu_pool_proj"
type: "ReLU"
bottom: "inception_3a/pool_proj"
top: "inception_3a/pool_proj"
}
layer {
name: "inception_3a/output"
type: "Concat"
bottom: "inception_3a/1x1"
bottom: "inception_3a/3x3"
bottom: "inception_3a/5x5"
bottom: "inception_3a/pool_proj"
top: "inception_3a/output"
}
layer {
name: "inception_3b/1x1"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_1x1"
type: "ReLU"
bottom: "inception_3b/1x1"
top: "inception_3b/1x1"
}
layer {
name: "inception_3b/3x3_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3_reduce"
}
layer {
name: "inception_3b/3x3"
type: "Convolution"
bottom: "inception_3b/3x3_reduce"
top: "inception_3b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_3x3"
type: "ReLU"
bottom: "inception_3b/3x3"
top: "inception_3b/3x3"
}
layer {
name: "inception_3b/5x5_reduce"
type: "Convolution"
bottom: "inception_3a/output"
top: "inception_3b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5_reduce"
}
layer {
name: "inception_3b/5x5"
type: "Convolution"
bottom: "inception_3b/5x5_reduce"
top: "inception_3b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_5x5"
type: "ReLU"
bottom: "inception_3b/5x5"
top: "inception_3b/5x5"
}
layer {
name: "inception_3b/pool"
type: "Pooling"
bottom: "inception_3a/output"
top: "inception_3b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_3b/pool_proj"
type: "Convolution"
bottom: "inception_3b/pool"
top: "inception_3b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_3b/relu_pool_proj"
type: "ReLU"
bottom: "inception_3b/pool_proj"
top: "inception_3b/pool_proj"
}
layer {
name: "inception_3b/output"
type: "Concat"
bottom: "inception_3b/1x1"
bottom: "inception_3b/3x3"
bottom: "inception_3b/5x5"
bottom: "inception_3b/pool_proj"
top: "inception_3b/output"
}
layer {
name: "pool3/3x3_s2"
type: "Pooling"
bottom: "inception_3b/output"
top: "pool3/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_4a/1x1"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_1x1"
type: "ReLU"
bottom: "inception_4a/1x1"
top: "inception_4a/1x1"
}
layer {
name: "inception_4a/3x3_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3_reduce"
}
layer {
name: "inception_4a/3x3"
type: "Convolution"
bottom: "inception_4a/3x3_reduce"
top: "inception_4a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 208
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_3x3"
type: "ReLU"
bottom: "inception_4a/3x3"
top: "inception_4a/3x3"
}
layer {
name: "inception_4a/5x5_reduce"
type: "Convolution"
bottom: "pool3/3x3_s2"
top: "inception_4a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 16
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5_reduce"
}
layer {
name: "inception_4a/5x5"
type: "Convolution"
bottom: "inception_4a/5x5_reduce"
top: "inception_4a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_5x5"
type: "ReLU"
bottom: "inception_4a/5x5"
top: "inception_4a/5x5"
}
layer {
name: "inception_4a/pool"
type: "Pooling"
bottom: "pool3/3x3_s2"
top: "inception_4a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4a/pool_proj"
type: "Convolution"
bottom: "inception_4a/pool"
top: "inception_4a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4a/relu_pool_proj"
type: "ReLU"
bottom: "inception_4a/pool_proj"
top: "inception_4a/pool_proj"
}
layer {
name: "inception_4a/output"
type: "Concat"
bottom: "inception_4a/1x1"
bottom: "inception_4a/3x3"
bottom: "inception_4a/5x5"
bottom: "inception_4a/pool_proj"
top: "inception_4a/output"
}
#layer {
# name: "loss1/ave_pool"
# type: "Pooling"
# bottom: "inception_4a/output"
# top: "loss1/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss1/conv"
# type: "Convolution"
# bottom: "loss1/ave_pool"
# top: "loss1/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_conv"
# type: "ReLU"
# bottom: "loss1/conv"
# top: "loss1/conv"
#}
#layer {
# name: "loss1/fc"
# type: "InnerProduct"
# bottom: "loss1/conv"
# top: "loss1/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss1/relu_fc"
# type: "ReLU"
# bottom: "loss1/fc"
# top: "loss1/fc"
#}
#layer {
# name: "loss1/drop_fc"
# type: "Dropout"
# bottom: "loss1/fc"
# top: "loss1/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss1/classifier"
# type: "InnerProduct"
# bottom: "loss1/fc"
# top: "loss1/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss1/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss1/classifier"
# bottom: "label"
# top: "loss1/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4b/1x1"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_1x1"
type: "ReLU"
bottom: "inception_4b/1x1"
top: "inception_4b/1x1"
}
layer {
name: "inception_4b/3x3_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3_reduce"
}
layer {
name: "inception_4b/3x3"
type: "Convolution"
bottom: "inception_4b/3x3_reduce"
top: "inception_4b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 224
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_3x3"
type: "ReLU"
bottom: "inception_4b/3x3"
top: "inception_4b/3x3"
}
layer {
name: "inception_4b/5x5_reduce"
type: "Convolution"
bottom: "inception_4a/output"
top: "inception_4b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5_reduce"
}
layer {
name: "inception_4b/5x5"
type: "Convolution"
bottom: "inception_4b/5x5_reduce"
top: "inception_4b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_5x5"
type: "ReLU"
bottom: "inception_4b/5x5"
top: "inception_4b/5x5"
}
layer {
name: "inception_4b/pool"
type: "Pooling"
bottom: "inception_4a/output"
top: "inception_4b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4b/pool_proj"
type: "Convolution"
bottom: "inception_4b/pool"
top: "inception_4b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4b/relu_pool_proj"
type: "ReLU"
bottom: "inception_4b/pool_proj"
top: "inception_4b/pool_proj"
}
layer {
name: "inception_4b/output"
type: "Concat"
bottom: "inception_4b/1x1"
bottom: "inception_4b/3x3"
bottom: "inception_4b/5x5"
bottom: "inception_4b/pool_proj"
top: "inception_4b/output"
}
layer {
name: "inception_4c/1x1"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_1x1"
type: "ReLU"
bottom: "inception_4c/1x1"
top: "inception_4c/1x1"
}
layer {
name: "inception_4c/3x3_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3_reduce"
}
layer {
name: "inception_4c/3x3"
type: "Convolution"
bottom: "inception_4c/3x3_reduce"
top: "inception_4c/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_3x3"
type: "ReLU"
bottom: "inception_4c/3x3"
top: "inception_4c/3x3"
}
layer {
name: "inception_4c/5x5_reduce"
type: "Convolution"
bottom: "inception_4b/output"
top: "inception_4c/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 24
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5_reduce"
}
layer {
name: "inception_4c/5x5"
type: "Convolution"
bottom: "inception_4c/5x5_reduce"
top: "inception_4c/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_5x5"
type: "ReLU"
bottom: "inception_4c/5x5"
top: "inception_4c/5x5"
}
layer {
name: "inception_4c/pool"
type: "Pooling"
bottom: "inception_4b/output"
top: "inception_4c/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4c/pool_proj"
type: "Convolution"
bottom: "inception_4c/pool"
top: "inception_4c/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4c/relu_pool_proj"
type: "ReLU"
bottom: "inception_4c/pool_proj"
top: "inception_4c/pool_proj"
}
layer {
name: "inception_4c/output"
type: "Concat"
bottom: "inception_4c/1x1"
bottom: "inception_4c/3x3"
bottom: "inception_4c/5x5"
bottom: "inception_4c/pool_proj"
top: "inception_4c/output"
}
layer {
name: "inception_4d/1x1"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 112
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_1x1"
type: "ReLU"
bottom: "inception_4d/1x1"
top: "inception_4d/1x1"
}
layer {
name: "inception_4d/3x3_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 144
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3_reduce"
}
layer {
name: "inception_4d/3x3"
type: "Convolution"
bottom: "inception_4d/3x3_reduce"
top: "inception_4d/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 288
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_3x3"
type: "ReLU"
bottom: "inception_4d/3x3"
top: "inception_4d/3x3"
}
layer {
name: "inception_4d/5x5_reduce"
type: "Convolution"
bottom: "inception_4c/output"
top: "inception_4d/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5_reduce"
}
layer {
name: "inception_4d/5x5"
type: "Convolution"
bottom: "inception_4d/5x5_reduce"
top: "inception_4d/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_5x5"
type: "ReLU"
bottom: "inception_4d/5x5"
top: "inception_4d/5x5"
}
layer {
name: "inception_4d/pool"
type: "Pooling"
bottom: "inception_4c/output"
top: "inception_4d/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4d/pool_proj"
type: "Convolution"
bottom: "inception_4d/pool"
top: "inception_4d/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 64
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4d/relu_pool_proj"
type: "ReLU"
bottom: "inception_4d/pool_proj"
top: "inception_4d/pool_proj"
}
layer {
name: "inception_4d/output"
type: "Concat"
bottom: "inception_4d/1x1"
bottom: "inception_4d/3x3"
bottom: "inception_4d/5x5"
bottom: "inception_4d/pool_proj"
top: "inception_4d/output"
}
#layer {
# name: "loss2/ave_pool"
# type: "Pooling"
# bottom: "inception_4d/output"
# top: "loss2/ave_pool"
# pooling_param {
# pool: AVE
# kernel_size: 5
# stride: 3
# }
#}
#layer {
# name: "loss2/conv"
# type: "Convolution"
# bottom: "loss2/ave_pool"
# top: "loss2/conv"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# convolution_param {
# num_output: 128
# kernel_size: 1
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_conv"
# type: "ReLU"
# bottom: "loss2/conv"
# top: "loss2/conv"
#}
#layer {
# name: "loss2/fc"
# type: "InnerProduct"
# bottom: "loss2/conv"
# top: "loss2/fc"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1024
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0.2
# }
# }
#}
#layer {
# name: "loss2/relu_fc"
# type: "ReLU"
# bottom: "loss2/fc"
# top: "loss2/fc"
#}
#layer {
# name: "loss2/drop_fc"
# type: "Dropout"
# bottom: "loss2/fc"
# top: "loss2/fc"
# dropout_param {
# dropout_ratio: 0.7
# }
#}
#layer {
# name: "loss2/classifier"
# type: "InnerProduct"
# bottom: "loss2/fc"
# top: "loss2/classifier"
# param {
# lr_mult: 1
# decay_mult: 1
# }
# param {
# lr_mult: 2
# decay_mult: 0
# }
# inner_product_param {
# num_output: 1000
# weight_filler {
# type: "xavier"
# }
# bias_filler {
# type: "constant"
# value: 0
# }
# }
#}
#layer {
# name: "loss2/loss"
# type: "SoftmaxWithLoss"
# bottom: "loss2/classifier"
# bottom: "label"
# top: "loss2/loss1"
# loss_weight: 0.3
#}
layer {
name: "inception_4e/1x1"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_1x1"
type: "ReLU"
bottom: "inception_4e/1x1"
top: "inception_4e/1x1"
}
layer {
name: "inception_4e/3x3_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3_reduce"
}
layer {
name: "inception_4e/3x3"
type: "Convolution"
bottom: "inception_4e/3x3_reduce"
top: "inception_4e/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_3x3"
type: "ReLU"
bottom: "inception_4e/3x3"
top: "inception_4e/3x3"
}
layer {
name: "inception_4e/5x5_reduce"
type: "Convolution"
bottom: "inception_4d/output"
top: "inception_4e/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5_reduce"
}
layer {
name: "inception_4e/5x5"
type: "Convolution"
bottom: "inception_4e/5x5_reduce"
top: "inception_4e/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_5x5"
type: "ReLU"
bottom: "inception_4e/5x5"
top: "inception_4e/5x5"
}
layer {
name: "inception_4e/pool"
type: "Pooling"
bottom: "inception_4d/output"
top: "inception_4e/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_4e/pool_proj"
type: "Convolution"
bottom: "inception_4e/pool"
top: "inception_4e/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_4e/relu_pool_proj"
type: "ReLU"
bottom: "inception_4e/pool_proj"
top: "inception_4e/pool_proj"
}
layer {
name: "inception_4e/output"
type: "Concat"
bottom: "inception_4e/1x1"
bottom: "inception_4e/3x3"
bottom: "inception_4e/5x5"
bottom: "inception_4e/pool_proj"
top: "inception_4e/output"
}
layer {
name: "pool4/3x3_s2"
type: "Pooling"
bottom: "inception_4e/output"
top: "pool4/3x3_s2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "inception_5a/1x1"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_1x1"
type: "ReLU"
bottom: "inception_5a/1x1"
top: "inception_5a/1x1"
}
layer {
name: "inception_5a/3x3_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 160
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3_reduce"
}
layer {
name: "inception_5a/3x3"
type: "Convolution"
bottom: "inception_5a/3x3_reduce"
top: "inception_5a/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 320
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_3x3"
type: "ReLU"
bottom: "inception_5a/3x3"
top: "inception_5a/3x3"
}
layer {
name: "inception_5a/5x5_reduce"
type: "Convolution"
bottom: "pool4/3x3_s2"
top: "inception_5a/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 32
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5_reduce"
}
layer {
name: "inception_5a/5x5"
type: "Convolution"
bottom: "inception_5a/5x5_reduce"
top: "inception_5a/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_5x5"
type: "ReLU"
bottom: "inception_5a/5x5"
top: "inception_5a/5x5"
}
layer {
name: "inception_5a/pool"
type: "Pooling"
bottom: "pool4/3x3_s2"
top: "inception_5a/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5a/pool_proj"
type: "Convolution"
bottom: "inception_5a/pool"
top: "inception_5a/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5a/relu_pool_proj"
type: "ReLU"
bottom: "inception_5a/pool_proj"
top: "inception_5a/pool_proj"
}
layer {
name: "inception_5a/output"
type: "Concat"
bottom: "inception_5a/1x1"
bottom: "inception_5a/3x3"
bottom: "inception_5a/5x5"
bottom: "inception_5a/pool_proj"
top: "inception_5a/output"
}
layer {
name: "inception_5b/1x1"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/1x1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_1x1"
type: "ReLU"
bottom: "inception_5b/1x1"
top: "inception_5b/1x1"
}
layer {
name: "inception_5b/3x3_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/3x3_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 192
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3_reduce"
type: "ReLU"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3_reduce"
}
layer {
name: "inception_5b/3x3"
type: "Convolution"
bottom: "inception_5b/3x3_reduce"
top: "inception_5b/3x3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_3x3"
type: "ReLU"
bottom: "inception_5b/3x3"
top: "inception_5b/3x3"
}
layer {
name: "inception_5b/5x5_reduce"
type: "Convolution"
bottom: "inception_5a/output"
top: "inception_5b/5x5_reduce"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 48
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5_reduce"
type: "ReLU"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5_reduce"
}
layer {
name: "inception_5b/5x5"
type: "Convolution"
bottom: "inception_5b/5x5_reduce"
top: "inception_5b/5x5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
pad: 2
kernel_size: 5
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_5x5"
type: "ReLU"
bottom: "inception_5b/5x5"
top: "inception_5b/5x5"
}
layer {
name: "inception_5b/pool"
type: "Pooling"
bottom: "inception_5a/output"
top: "inception_5b/pool"
pooling_param {
pool: MAX
kernel_size: 3
stride: 1
pad: 1
}
}
layer {
name: "inception_5b/pool_proj"
type: "Convolution"
bottom: "inception_5b/pool"
top: "inception_5b/pool_proj"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 128
kernel_size: 1
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0.2
}
}
}
layer {
name: "inception_5b/relu_pool_proj"
type: "ReLU"
bottom: "inception_5b/pool_proj"
top: "inception_5b/pool_proj"
}
layer {
name: "inception_5b/output"
type: "Concat"
bottom: "inception_5b/1x1"
bottom: "inception_5b/3x3"
bottom: "inception_5b/5x5"
bottom: "inception_5b/pool_proj"
top: "inception_5b/output"
}
layer {
name: "pool5/7x7_s1"
type: "Pooling"
bottom: "inception_5b/output"
top: "pool5/7x7_s1"
pooling_param {
pool: AVE
kernel_size: 7
stride: 1
}
}
layer {
name: "pool5/drop_7x7_s1"
type: "Dropout"
bottom: "pool5/7x7_s1"
top: "pool5/7x7_s1"
dropout_param {
dropout_ratio: 0.4
}
}
layer {
name: "loss3/classifier"
type: "InnerProduct"
bottom: "pool5/7x7_s1"
top: "loss3/classifier"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss3/loss3"
type: "SoftmaxWithLoss"
bottom: "loss3/classifier"
bottom: "label"
top: "loss3/loss3"
loss_weight: 1
}
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet
# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet
# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
batch_per_gpu=`expr ${batch} / 4`
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "1c\net : \"${cfg}\"" solver.prototxt
caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet
# googlenet
test googlenet.prototxt 512 googlenet
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
# Use UBUNTU_MIRROR can speed up apt-get speed.
# ARG UBUNTU_MIRROR
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i "s#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g" /etc/apt/sources.list; fi'
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
# IMPORTANT:
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
RUN chmod +x /usr/bin/paddle_k8s
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD fluid_benchmark.py recordio_converter.py args.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/
ADD models/ /workspace/models/
# Fluid Benchmark
This directory contains several model configurations and tools used to run
Fluid benchmarks for local and distributed training.
## Run the Benchmark
To start, run the following command to get the full help message:
```bash
python fluid_benchmark.py --help
```
Currently supported `--model` arguments include:
* mnist
* resnet
* you can choose a different dataset using `--data_set cifar10` or
`--data_set flowers`.
* vgg
* stacked_dynamic_lstm
* machine_translation
* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --device GPU
```
You can choose between GPU and CPU training. With GPU training, you can specify
`--gpus <gpu_num>` to run multi-GPU training. You can also run the parameter
server in async mode: specify `--async_mode` to train the model asynchronously.
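For example, a multi-GPU local run might look like this (a sketch; `--gpus` and
`--batch_size` are flags defined in `args.py`):
```bash
python fluid_benchmark.py --model resnet --device GPU --gpus 4 --batch_size 128
```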
* Run distributed training with parameter servers:
* see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
* start parameter servers:
```bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
sleep 15
```
* start trainers:
```bash
PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver
```
* Run distributed training using NCCL2
```bash
PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```
## Prepare the RecordIO file to Achieve Better Performance
Running the following command will generate RecordIO files like "mnist.recordio" under the path
and batch_size you choose. You can use batch_size=1 so that a later reader can change the
batch_size at any time using `fluid.batch`.
```bash
python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
```
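Once generated, the RecordIO files can be consumed through the reader op. A sketch, assuming
the files were generated under `./data` (the `--use_reader_op` and `--data_path` flags are
defined in `args.py`):
```bash
python fluid_benchmark.py --model mnist --device GPU --use_reader_op --data_path ./data
```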
## Run Distributed Benchmark on Kubernetes Cluster
You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you will
have to start all those processes manually on each node, which is not recommended.
To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
download it from
http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
build it on your own. Once you have the "whl" package, put it in the current directory and run:
```bash
docker build -t [your docker image name]:[your docker image tag] .
```
Then push the image to a Docker registry that your Kubernetes cluster can reach.
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
The yaml files are then generated under the `myjob` directory, and you can run:
```bash
kubectl create -f myjob/
```
The job should then start.
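To verify that the pods actually came up, the standard kubectl commands work; for example
(the pod name below is hypothetical, adjust it to whatever `kube_gen_job.py` generated):
```bash
kubectl get pods                  # pserver and trainer pods should reach Running
kubectl logs -f myjob-trainer-0   # hypothetical pod name
```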
## Notes for Running Fluid Distributed Training with NCCL2 and RDMA
Before running NCCL2 distributed jobs, please check whether your node has multiple network
interfaces; if it does, set the environment variable `export NCCL_SOCKET_IFNAME=eth0` to select
your actual network device.
To run high-performance distributed training, your hardware environment must be able to run
RDMA-enabled network communication; please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
note for details.
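Putting the two notes together, a minimal environment sketch for an NCCL2 job on a multi-NIC,
RDMA-capable node might look like this (`NCCL_IB_DISABLE` is a standard NCCL variable, not
anything Paddle-specific, and is included here as an assumption; adjust `eth0` to your actual
device):
```bash
# Bind NCCL to the NIC that actually carries training traffic.
export NCCL_SOCKET_IFNAME=eth0
# Assumption: keep the InfiniBand/RDMA transport enabled (0 = use IB verbs when available).
export NCCL_IB_DISABLE=0
PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=192.168.0.2 \
PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
```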
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
__all__ = ['parse_args', ]
BENCHMARK_MODELS = [
"machine_translation", "resnet", "se_resnext", "vgg", "mnist",
"stacked_dynamic_lstm", "resnet_with_preprocess"
]
def parse_args():
parser = argparse.ArgumentParser('Fluid model benchmarks.')
parser.add_argument(
'--model',
type=str,
choices=BENCHMARK_MODELS,
default='resnet',
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
# args related to learning rate
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
        help='The number of initial minibatches to skip, for a better performance test'
)
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=100, help='The number of passes.')
parser.add_argument(
'--data_format',
type=str,
default='NCHW',
choices=['NCHW', 'NHWC'],
        help='The data format; currently only NCHW is supported.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--gpus',
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
# this option is available only for vgg and resnet.
parser.add_argument(
'--cpus',
type=int,
default=1,
help='If cpus > 1, will set ParallelExecutor to use multiple threads.')
parser.add_argument(
'--data_set',
type=str,
default='flowers',
choices=['cifar10', 'flowers', 'imagenet'],
help='Optional dataset for benchmark.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
action='store_true',
        help='If set, do not evaluate on the test set during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
        help='If set, optimize runtime memory before starting.')
parser.add_argument(
'--use_fake_data',
action='store_true',
        help='If set, omit the actual data-reading operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
        help='Whether to split variables into blocks when update_method is pserver')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
        help='Whether to start the pserver in async mode to support ASGD')
parser.add_argument(
'--use_reader_op',
action='store_true',
        help='Whether to use the reader op; the data path must be specified if this is set.'
)
parser.add_argument(
'--data_path',
type=str,
default="",
help='Directory that contains all the training recordio files.')
parser.add_argument(
'--test_data_path',
type=str,
default="",
help='Directory that contains all the test data (NOT recordio).')
parser.add_argument(
'--use_inference_transpiler',
action='store_true',
help='If set, use inference transpiler to optimize the program.')
parser.add_argument(
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
parser.add_argument(
'--fuse_broadcast_op',
action='store_true',
        help='If set, fuse multiple broadcast operators into one fused_broadcast operator.'
)
args = parser.parse_args()
return args
#!/bin/bash
if [ "`uname -s`" != "Linux" ]; then
echo "Current scenario only support in Linux yet!"
exit 0
fi
echo "========================= Hardware Information ========================="
sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
physical_cores=$((sockets * cores_per_socket))
virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
echo "Socket Number : $sockets"
echo "Cores Per Socket : $cores_per_socket"
echo "Total Physical Cores : $physical_cores"
echo "Total Virtual Cores : $virtual_cores"
if [ $ht -eq 1 ]; then
echo "Hyper Threading : OFF"
if [ $physical_cores -ne $virtual_cores ]; then
echo "Error: HT logical error"
fi
else
echo "Hyper Threading : ON"
if [ $physical_cores -ge $virtual_cores ]; then
echo "Error: HT logical error"
fi
fi
echo "NUMA Nodes : $numa_nodes"
if [ $numa_nodes -lt $sockets ]; then
echo "Warning: NUMA node is not enough for the best performance,\
at least $sockets"
fi
echo "-------------------------- Memory Information --------------------------"
# dmidecode support start from 2.11
dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
if [ $dmi_ver -lt 2 ]; then
echo "Error: dmidecode unknown or version is too old"
exit 0
fi
if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
echo "Error: need root to run dmidecode"
exit 0
fi
max_dimms=0
num_dimms_installed=0
for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
num_refered=`dmidecode |grep -wc "$dimm_id"`
  # the actual dimm id should be referred to only once
if [ $num_refered -eq 1 ]; then
num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
/Unknown/ {f=1};
/Manufacturer/ {if (s==1) {print f; exit 0;}};'`
if [ $num_unknown -eq 0 ]; then
dimms_installed="$dimms_installed \n $dimm_id"
((num_dimms_installed++))
else
dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
fi
((max_dimms++))
fi
done
echo "Installed DIMM number : $num_dimms_installed"
num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
fi
num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
if [ $num_dimms_installed -ne $num_clock_configed ]; then
echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
fi
echo -e "Installed DIMMs Locator: $dimms_installed"
echo -e "Not installed DIMMs : $dimms_uninstalled"
max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
echo "DIMMs max slots : $max_dimm_slots"
if [ $max_dimms -ne $max_dimm_slots ]; then
echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
fi
free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB"
swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
else
mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
fi
echo "Memory Size : $mem_sz"
echo "Swap Memory Size : $swap_sz"
echo "Total Memory Size : $total_sz"
echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
# DIMMs frequency
clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
echo "Configured Clock Speed : $clock_speeds"
num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
if [ $num_clock_type -ne 1 ]; then
echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
fi
echo "-------------------------- Turbo Information --------------------------"
scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
echo "Scaling Driver : $scaling_drive"
if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
if [ $turbo -eq 1 ]; then
echo "Turbo Status : OFF"
else
echo "Turbo Status : ON"
fi
else
echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
echo "Turbo Status : Unknown"
fi
# cpu frequency
num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
if [ $num_max_freq -ne 1 ]; then
echo "Error: the max_frequency of all CPU should be equal"
fi
if [ $num_min_freq -ne 1 ]; then
echo "Error: the min_frequency of all CPU should be equal"
fi
max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
echo "CPU Max Frequency : $max_freq GHz"
echo "CPU Min Frequency : $min_freq GHz"
# cpu governor
num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
if [ $num_governor -ne 1 ]; then
echo "Error: the governor of all CPU should be the same"
fi
governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
echo "CPU Freq Governor : $governor"
echo "========================= Software Information ========================="
echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
echo "OS Version : `cat /etc/redhat-release`"
echo "Kernel Release Version : `uname -r`"
echo "Kernel Patch Version : `uname -v`"
echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
if command -v cmake >/dev/null 2>&1; then
cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
else
cmake_ver=" Not installed"
fi
echo "CMake Version :$cmake_ver"
echo "------------------ Environment Variables Information -------------------"
kmp_affinity=`env | grep KMP_AFFINITY`
omp_dynamic=`env | grep OMP_DYNAMIC`
omp_nested=`env | grep OMP_NESTED`
omp_num_threads=`env | grep OMP_NUM_THREADS`
mkl_num_threads=`env | grep MKL_NUM_THREADS`
mkl_dynamic=`env | grep MKL_DYNAMIC`
if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
if [ ! $omp_nested ]; then omp_nested="unset"; fi
if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
echo "KMP_AFFINITY : $kmp_affinity"
echo "OMP_DYNAMIC : $omp_dynamic"
echo "OMP_NESTED : $omp_nested"
echo "OMP_NUM_THREADS : $omp_num_threads"
echo "MKL_NUM_THREADS : $mkl_num_threads"
echo "MKL_DYNAMIC : $mkl_dynamic"
# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
mkldnn_found=`find $path -name "libmkldnn.so"`
if [ "$mkldnn_found" ]; then
echo "Found MKL-DNN : $mkldnn_found"
fi
mklml_found=`find $path -name "libmklml_intel.so"`
if [ "$mklml_found" ]; then
echo "Found MKLML : $mklml_found"
fi
iomp_found=`find $path -name "libiomp5.so"`
if [ "$iomp_found" ]; then
echo "Found IOMP : $iomp_found"
fi
done
# dump all details for fully check
lscpu > lscpu.dump
dmidecode > dmidecode.dump
# The expected result would be like:
# ========================= Hardware Information =========================
# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
# CPU Family : 6
# Socket Number : 2
# Cores Per Socket : 20
# Total Physical Cores : 40
# Total Virtual Cores : 40
# Hyper Threading : OFF
# NUMA Nodes : 2
# -------------------------- Memory Information --------------------------
# Installed DIMM number : 12
# Installed DIMMs Locator:
# CPU1_DIMM_A1
# CPU1_DIMM_B1
# CPU1_DIMM_C1
# CPU1_DIMM_D1
# CPU1_DIMM_E1
# CPU1_DIMM_F1
# CPU2_DIMM_A1
# CPU2_DIMM_B1
# CPU2_DIMM_C1
# CPU2_DIMM_D1
# CPU2_DIMM_E1
# CPU2_DIMM_F1
# Not installed DIMMs :
# CPU1_DIMM_A2
# CPU1_DIMM_B2
# CPU1_DIMM_C2
# CPU1_DIMM_D2
# CPU1_DIMM_E2
# CPU1_DIMM_F2
# CPU2_DIMM_A2
# CPU2_DIMM_B2
# CPU2_DIMM_C2
# CPU2_DIMM_D2
# CPU2_DIMM_E2
# CPU2_DIMM_F2
# DIMMs max slots : 24
# Memory Size : 376G
# Swap Memory Size : 4.0G
# Total Memory Size : 380G
# Max Memory Capacity : 2304 GB
# Configed Clock Speed : 2666 MHz
# -------------------------- Turbo Information --------------------------
# Scaling Driver : intel_pstate
# Turbo Status : ON
# CPU Max Frequency : 3.70 GHz
# CPU Min Frequency : 1.00 GHz
# CPU Freq Governor : performance
# ========================= Software Information =========================
# BIOS Release Date : 03/10/2017
# OS Version : CentOS Linux release 7.3.1611 (Core)
# Kernel Release Version : 3.10.0-514.el7.x86_64
# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016
# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11)
# CMake Version : 3.5.2
# ------------------ Environment Variables Information -------------------
# KMP_AFFINITY : unset
# OMP_DYNAMIC : unset
# OMP_NESTED : unset
# OMP_NUM_THREADS : unset
# MKL_NUM_THREADS : unset
# MKL_DYNAMIC : unset
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import cProfile
import time
import os
import traceback
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
from args import *
def append_nccl2_prepare(trainer_id, startup_prog):
if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT")
worker_ips = os.getenv("PADDLE_TRAINER_IPS")
worker_endpoints = []
for ip in worker_ips.split(","):
worker_endpoints.append(':'.join([ip, port]))
num_trainers = len(worker_endpoints)
current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
worker_endpoints.remove(current_endpoint)
nccl_id_var = startup_prog.global_block().create_var(
name="NCCLID",
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
startup_prog.global_block().append_op(
type="gen_nccl_id",
inputs={},
outputs={"NCCLID": nccl_id_var},
attrs={
"endpoint": current_endpoint,
"endpoint_list": worker_endpoints,
"trainer_id": trainer_id
})
return nccl_id_var, num_trainers, trainer_id
else:
raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"nccl-based dist train.")
def dist_transpile(trainer_id, args, train_prog, startup_prog):
if trainer_id < 0:
return None, None
# the port of all pservers, needed by both trainer and pserver
port = os.getenv("PADDLE_PSERVER_PORT", "6174")
# comma separated ips of all pservers, needed by trainer and
# pserver
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist)
# total number of workers/trainers in the job, needed by
# trainer and pserver
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
config = fluid.DistributeTranspilerConfig()
config.slice_var_up = not args.no_split_var
config.min_block_size = 1048576
t = distribute_transpiler.DistributeTranspiler(config=config)
t.transpile(
trainer_id,
# NOTE: *MUST* use train_prog, for we are using with guard to
# generate different program for train and test.
program=train_prog,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(
current_endpoint, pserver_program, startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
return train_program, startup_prog
else:
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
def test_parallel(exe, test_args, args, test_prog, feeder):
acc_evaluators = []
for i in xrange(len(test_args[2])):
acc_evaluators.append(fluid.metrics.Accuracy())
to_fetch = [v.name for v in test_args[2]]
if args.use_reader_op:
test_args[4].start()
while True:
try:
acc_rets = exe.run(fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(
value=np.array(acc_rets[i]), weight=args.batch_size)
except fluid.core.EOFException as eof:
test_args[4].reset()
break
else:
for batch_id, data in enumerate(test_args[3]()):
acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch)
for i, e in enumerate(acc_evaluators):
e.update(value=np.array(acc_rets[i]), weight=len(data))
return [e.eval() for e in acc_evaluators]
# NOTE: only need to benchmark using parallelexe
def train_parallel(train_args, test_args, args, train_prog, test_prog,
startup_prog, nccl_id_var, num_trainers, trainer_id):
over_all_start = time.time()
place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
feeder = None
if not args.use_reader_op:
feed_var_list = [
var for var in train_prog.global_block().vars.itervalues()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
# generate fake:
if args.use_fake_data:
for var in feed_var_list:
v = startup_prog.global_block()._clone_variable(var)
var.persistable = True
v.persistable = True
real_shape = list(var.shape)
real_shape[0] = args.batch_size / args.gpus
startup_prog.global_block().append_op(
outputs={"Out": v},
type="fill_constant",
attrs={"shape": real_shape,
"value": 1.0,
"dtype": var.dtype})
if nccl_id_var and trainer_id == 0:
#FIXME(wuyi): wait other trainer to start listening
time.sleep(30)
startup_exe = fluid.Executor(place)
startup_exe.run(startup_prog)
strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
# parameter server mode distributed training, merge
# gradients on local server, do not initialize
# ParallelExecutor with multi server all-reduce mode.
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor(
True,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
if not args.no_test:
if args.update_method == "pserver":
test_scope = None
else:
# NOTE: use an empty scope to avoid test exe using NCCLID
test_scope = fluid.Scope()
test_exe = fluid.ParallelExecutor(
True, main_program=test_prog, share_vars_from=exe)
for pass_id in range(args.pass_num):
num_samples = 0
iters = 0
start_time = time.time()
if not args.use_reader_op:
reader_generator = train_args[3]() #train_reader
batch_id = 0
data = None
if args.use_reader_op:
train_args[4].start()
while True:
if not args.use_reader_op:
data = next(reader_generator, None)
if data == None:
break
if args.profile and batch_id == 5:
profiler.start_profiler("All")
profiler.reset_profiler()
elif args.profile and batch_id == 10:
print("profiling total time: ", time.time() - start_time)
profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" %
(trainer_id, pass_id))
if iters == args.iterations:
reader_generator.close()
break
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
fetch_list = [avg_loss.name]
acc_name_list = [v.name for v in train_args[2]]
fetch_list.extend(acc_name_list)
if args.use_fake_data or args.use_reader_op:
try:
fetch_ret = exe.run(fetch_list)
except fluid.core.EOFException as eof:
break
except fluid.core.EnforceNotMet as ex:
traceback.print_exc()
break
else:
fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
if args.use_reader_op:
num_samples += args.batch_size * args.gpus
else:
num_samples += len(data)
iters += 1
if batch_id % 1 == 0:
fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
print("Pass %d, batch %d, loss %s, accucacys: %s" %
(pass_id, batch_id, fetched_data[0], fetched_data[1:]))
batch_id += 1
print_train_time(start_time, time.time(), num_samples)
if args.use_reader_op:
train_args[4].reset() # reset reader handle
else:
del reader_generator
if not args.no_test and test_args[2]:
test_feeder = None
if not args.use_reader_op:
test_feed_var_list = [
var for var in test_prog.global_block().vars.itervalues()
if var.is_data
]
test_feeder = fluid.DataFeeder(test_feed_var_list, place)
test_ret = test_parallel(test_exe, test_args, args, test_prog,
test_feeder)
print("Pass: %d, Test Accuracy: %s\n" %
(pass_id, [np.mean(np.array(v)) for v in test_ret]))
print("total train time: ", time.time() - over_all_start)
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def print_train_time(start_time, end_time, num_samples):
train_elapsed = end_time - start_time
examples_per_sec = num_samples / train_elapsed
print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
(num_samples, train_elapsed, examples_per_sec))
def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print('------------------------------------------------')
def main():
args = parse_args()
print_arguments(args)
print_paddle_envs()
if args.no_random:
fluid.default_startup_program().random_seed = 1
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
if args.use_cprof:
pr = cProfile.Profile()
pr.enable()
model_def = __import__("models.%s" % args.model, fromlist=["models"])
train_prog = fluid.Program()
test_prog = fluid.Program()
startup_prog = fluid.Program()
train_args = list(model_def.get_model(args, True, train_prog, startup_prog))
test_args = list(model_def.get_model(args, False, test_prog, startup_prog))
all_args = [train_args, test_args, args]
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog,
startup_prog)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
all_args.extend([train_prog, test_prog, startup_prog])
if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
# start pserver with Executor
server_exe = fluid.Executor(fluid.CPUPlace())
server_exe.run(startup_prog)
server_exe.run(train_prog)
exit(0)
# for other update methods, use default programs
all_args.extend([train_prog, test_prog, startup_prog])
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(
trainer_id, startup_prog)
if args.device == "CPU":
raise Exception("Only support GPU perf with parallel exe")
all_args.extend([nccl_id_var, num_trainers, trainer_id])
train_parallel(*all_args)
if __name__ == "__main__":
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import random
import functools
import numpy as np
from threading import Thread
import subprocess
import time
from Queue import Queue
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
DATA_DIM = 224
THREAD = int(os.getenv("PREPROCESS_THREADS", "10"))
BUF_SIZE = 5120
DATA_DIR = '/mnt/ImageNet'
TRAIN_LIST = '/mnt/ImageNet/train.txt'
TEST_LIST = '/mnt/ImageNet/val.txt'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
if center == True:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = random.randint(0, width - size)
h_start = random.randint(0, height - size)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]):
aspect_ratio = math.sqrt(random.uniform(*ratio))
w = 1. * aspect_ratio
h = 1. / aspect_ratio
bound = min((float(img.size[0]) / img.size[1]) / (w**2),
(float(img.size[1]) / img.size[0]) / (h**2))
scale_max = min(scale[1], bound)
scale_min = min(scale[0], bound)
target_area = img.size[0] * img.size[1] * random.uniform(scale_min,
scale_max)
target_size = math.sqrt(target_area)
w = int(target_size * w)
h = int(target_size * h)
i = random.randint(0, img.size[0] - w)
j = random.randint(0, img.size[1] - h)
img = img.crop((i, j, i + w, j + h))
img = img.resize((size, size), Image.LANCZOS)
return img
def rotate_image(img):
angle = random.randint(-10, 10)
img = img.rotate(angle)
return img
def distort_color(img):
def random_brightness(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
def random_contrast(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
def random_color(img, lower=0.5, upper=1.5):
e = random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
random.shuffle(ops)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
return img
def process_image(sample, mode, color_jitter, rotate):
img_path = sample[0]
img = Image.open(img_path)
if mode == 'train':
if rotate: img = rotate_image(img)
img = random_crop(img, DATA_DIM)
else:
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if mode == 'train':
if color_jitter:
img = distort_color(img)
if random.randint(0, 1) == 1:
img = img.transpose(Image.FLIP_LEFT_RIGHT)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
if mode == 'train' or mode == 'val':
return img, sample[1]
elif mode == 'test':
return [img]
class XmapEndSignal():
pass
def xmap_readers(mapper,
reader,
process_num,
buffer_size,
order=False,
print_queue_state=True):
end = XmapEndSignal()
# define a worker to read samples from reader to in_queue
def read_worker(reader, in_queue):
for i in reader():
in_queue.put(i)
in_queue.put(end)
# define a worker to read samples from reader to in_queue with order flag
def order_read_worker(reader, in_queue, file_queue):
in_order = 0
for i in reader():
in_queue.put((in_order, i))
in_order += 1
in_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue
def handle_worker(in_queue, out_queue, mapper):
sample = in_queue.get()
while not isinstance(sample, XmapEndSignal):
r = mapper(sample)
out_queue.put(r)
sample = in_queue.get()
in_queue.put(end)
out_queue.put(end)
# define a worker to handle samples from in_queue by mapper
# and put mapped samples into out_queue by order
def order_handle_worker(in_queue, out_queue, mapper, out_order):
ins = in_queue.get()
while not isinstance(ins, XmapEndSignal):
order, sample = ins
r = mapper(sample)
while order != out_order[0]:
pass
out_queue.put(r)
out_order[0] += 1
ins = in_queue.get()
in_queue.put(end)
out_queue.put(end)
def xreader():
file_queue = Queue()
in_queue = Queue(buffer_size)
out_queue = Queue(buffer_size)
out_order = [0]
# start a read worker in a thread
target = order_read_worker if order else read_worker
t = Thread(target=target, args=(reader, in_queue))
t.daemon = True
t.start()
# start several handle_workers
target = order_handle_worker if order else handle_worker
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
for w in workers:
w.start()
sample = out_queue.get()
start_t = time.time()
while not isinstance(sample, XmapEndSignal):
yield sample
sample = out_queue.get()
if time.time() - start_t > 3:
if print_queue_state:
print("queue sizes: ", in_queue.qsize(), out_queue.qsize())
start_t = time.time()
finish = 1
while finish < process_num:
sample = out_queue.get()
if isinstance(sample, XmapEndSignal):
finish += 1
else:
yield sample
return xreader
def _reader_creator(file_list,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
xmap=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
if mode == 'train':
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
elif mode == 'val':
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "val", img_path)
yield (img_path, int(label))
elif mode == 'test':
img_path = os.path.join(DATA_DIR, line)
yield [img_path]
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
def load_raw_image_uint8(sample):
img_arr = np.array(Image.open(sample[0])).astype('int64')
return img_arr, int(sample[1])
def train_raw(file_list=TRAIN_LIST, shuffle=True):
def reader():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(full_lines)
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
per_node_lines = len(full_lines) / trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) *
per_node_lines]
print("read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
for line in lines:
img_path, label = line.split()
img_path = img_path.replace("JPEG", "jpeg")
img_path = os.path.join(DATA_DIR, "train", img_path)
yield (img_path, int(label))
return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD,
BUF_SIZE)
def train(file_list=TRAIN_LIST, xmap=True):
return _reader_creator(
file_list,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
xmap=xmap)
def val(file_list=TEST_LIST, xmap=True):
return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap)
def test(file_list=TEST_LIST):
return _reader_creator(file_list, 'test', shuffle=False)
if __name__ == "__main__":
c = 0
start_t = time.time()
for d in train()():
c += 1
if c >= 10000:
break
spent = time.time() - start_t
print("read 10000 speed: ", 10000 / spent, spent)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
import copy
import argparse
import random
import os
import copy
from kube_templates import pserver, trainer, envs
def parse_args():
parser = argparse.ArgumentParser(description='Generate dist job yamls.')
parser.add_argument(
'--jobname', default="paddlejob", help='unique job name')
parser.add_argument(
'--cpu', default=1, type=int, help='CPU cores per trainer node')
parser.add_argument(
'--pscpu', default=1, type=int, help='CPU cores per pserver node')
parser.add_argument(
'--gpu', default=0, type=int, help='num of GPUs per node')
parser.add_argument(
'--image',
default="bootstrapper:5000/fluid_benchmark:gpu",
help='num of GPUs per node')
parser.add_argument(
'--pservers', default=1, type=int, help='num of pservers')
parser.add_argument(
'--trainers', default=1, type=int, help='num of trainers')
parser.add_argument('--memory', default=1, type=int, help='trainer memory')
parser.add_argument(
'--psmemory', default=1, type=int, help='pserver memory')
parser.add_argument(
'--port', default=30236, type=int, help='num of trainers')
parser.add_argument(
'--entry', default="python train.py", help='command to run')
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
'--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",
type=str,
choices=['pserver', 'nccl2', 'local'],
help='pserver or nccl2 or local')
args = parser.parse_args()
return args
def gen_job():
ps = pserver
tn = trainer
args = parse_args()
ps_container = ps["spec"]["template"]["spec"]["containers"][0]
tn_container = tn["spec"]["template"]["spec"]["containers"][0]
if args.fluid == 1:
ps_container["command"] = \
["paddle_k8s", "start_fluid"]
tn_container["command"] = \
["paddle_k8s", "start_fluid"]
ps["metadata"]["name"] = args.jobname + "-pserver"
ps["spec"]["template"]["metadata"]["labels"][
"paddle-job-pserver"] = args.jobname
tn["metadata"]["name"] = args.jobname + "-trainer"
tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
ps_container["image"] = args.image
tn_container["image"] = args.image
ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
if args.gpu > 0:
tn_container["resources"]["requests"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
tn_container["resources"]["limits"][
"alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
ps["spec"]["replicas"] = int(args.pservers)
tn["spec"]["parallelism"] = int(args.trainers)
tn["spec"]["completions"] = int(args.trainers)
ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
ps_container["ports"][0]["containerPort"] = args.port
spreadport = random.randint(40000, 60000)
tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
tn_container["ports"][0]["containerPort"] = spreadport
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
envs.append({"name": "ENTRY", "value": args.entry})
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
# NOTE: these directories below are cluster specific, please modify
# this settings before you run on your own cluster.
envs.append({
"name": "LD_LIBRARY_PATH",
"value":
"/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
})
volumes = [{
"name": "nvidia-driver",
"hostPath": {
"path": "/usr/local/nvidia/lib64"
}
}]
volumeMounts = [{
"mountPath": "/usr/local/nvidia/lib64",
"name": "nvidia-driver"
}]
if args.rdma:
volumes.extend([{
"name": "ibetc",
"hostPath": {
"path": "/etc/libibverbs.d"
}
}, {
"name": "iblibs",
"hostPath": {
"path": "/usr/local/rdma"
}
}, {
"name": "valgrind",
"hostPath": {
"path": "/usr/lib64/mlnx_ofed/valgrind"
}
}])
volumeMounts.extend([{
"mountPath": "/etc/libibverbs.d",
"name": "ibetc"
}, {
"mountPath": "/usr/local/rdma",
"name": "iblibs"
}, {
"mountPath": "/usr/lib64/mlnx_ofed/valgrind",
"name": "valgrind"
}])
# append shm for NCCL2
volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
# add ceph volumes
volumes.append({
"name": "ceph-data",
"cephfs": {
"monitors": ["192.168.16.23:6789"],
"secretRef": {
"name": "ceph-secret"
},
"user": "admin",
}
})
volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"})
tn["spec"]["template"]["spec"]["volumes"] = volumes
tn_container["volumeMounts"] = volumeMounts
ps_container["env"] = copy.deepcopy(envs)
ps_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "PSERVER"
})
tn_container["env"] = envs
if args.disttype == "pserver":
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "TRAINER"
})
elif args.disttype == "nccl2" or args.disttype == "local":
# NCCL2 have no training role, set to plain WORKER
tn_container["env"].append({
"name": "PADDLE_TRAINING_ROLE",
"value": "WORKER"
})
os.mkdir(args.jobname)
if args.disttype == "pserver":
with open("%s/pserver.yaml" % args.jobname, "w") as fn:
yaml.dump(ps, fn)
with open("%s/trainer.yaml" % args.jobname, "w") as fn:
yaml.dump(tn, fn)
if __name__ == "__main__":
gen_job()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pserver import pserver
from trainer import trainer
__all__ = ["pserver", "trainer", "envs"]
envs = [
# envs that don't need to change
{
"name": "GLOG_v",
"value": "0"
},
{
"name": "GLOG_logtostderr",
"value": "1"
},
{
"name": "TOPOLOGY",
"value": ""
},
{
"name": "TRAINER_PACKAGE",
"value": "/workspace"
},
{
"name": "PADDLE_INIT_NICS",
"value": "eth2"
},
{
"name": "NAMESPACE",
"valueFrom": {
"fieldRef": {
"fieldPath": "metadata.namespace"
}
}
},
{
"name": "POD_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
},
{
"name": "PADDLE_CURRENT_IP",
"valueFrom": {
"fieldRef": {
"fieldPath": "status.podIP"
}
}
}
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pserver = {
"apiVersion": "extensions/v1beta1",
"kind": "ReplicaSet",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"replicas": 1,
"template": {
"metadata": {
"labels": {
"paddle-job-pserver": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"containers": [{
"name": "pserver",
"image": "",
"imagePullPolicy": "Always",
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_pserver"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4"
},
"limits": {
"memory": "10Gi",
"cpu": "4"
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
trainer = {
"apiVersion": "batch/v1",
"kind": "Job",
"metadata": {
"name": "jobname-pserver"
},
"spec": {
"parallelism": 4,
"completions": 4,
"template": {
"metadata": {
"labels": {
"paddle-job": "jobname"
}
},
"spec": {
"hostNetwork": True,
"imagePullSecrets": [{
"name": "job-registry-secret"
}],
"restartPolicy": "Never",
"containers": [{
"name": "trainer",
"image": "",
"imagePullPolicy": "Always",
# to let container set rlimit
"securityContext": {
"privileged": True
# TODO(wuyi): use below specific cap instead of privileged,
# using privileged will cause all GPU device are visible
# in the container.
# "capabilities": {
# "add": ["SYS_RESOURCE"]
# }
},
"ports": [{
"name": "jobport-1",
"containerPort": 1
}],
"env": [],
"command": ["paddle_k8s", "start_trainer", "v2"],
"resources": {
"requests": {
"memory": "10Gi",
"cpu": "4",
},
"limits": {
"memory": "10Gi",
"cpu": "4",
}
}
}]
}
}
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
"resnet_with_preprocess"
]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""seq2seq model for fluid."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import distutils.util
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
x=input_gate, y=cell_tilde)
])
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t))
return hidden_t, cell_t
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
"""Construct a seq2seq network."""
def bi_lstm_encoder(input_seq, gate_size):
# Linear transformation part for input gate, output gate, forget gate
# and cell activation vectors need be done outside of dynamic_lstm.
# So the output size is 4 times of gate_size.
input_forward_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
forward, _ = fluid.layers.dynamic_lstm(
input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
input_reversed_proj = fluid.layers.fc(input=input_seq,
size=gate_size * 4,
act=None,
bias_attr=False)
reversed, _ = fluid.layers.dynamic_lstm(
input=input_reversed_proj,
size=gate_size * 4,
is_reverse=True,
use_peepholes=False)
return forward, reversed
src_word_idx = fluid.layers.data(
name='source_sequence', shape=[1], dtype='int64', lod_level=1)
src_embedding = fluid.layers.embedding(
input=src_word_idx,
size=[source_dict_dim, embedding_dim],
dtype='float32')
src_forward, src_reversed = bi_lstm_encoder(
input_seq=src_embedding, gate_size=encoder_size)
encoded_vector = fluid.layers.concat(
input=[src_forward, src_reversed], axis=1)
encoded_proj = fluid.layers.fc(input=encoded_vector,
size=decoder_size,
bias_attr=False)
backward_first = fluid.layers.sequence_pool(
input=src_reversed, pool_type='first')
decoder_boot = fluid.layers.fc(input=backward_first,
size=decoder_size,
bias_attr=False,
act='tanh')
def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
decoder_boot, decoder_size):
def simple_attention(encoder_vec, encoder_proj, decoder_state):
decoder_state_proj = fluid.layers.fc(input=decoder_state,
size=decoder_size,
bias_attr=False)
decoder_state_expand = fluid.layers.sequence_expand(
x=decoder_state_proj, y=encoder_proj)
concated = fluid.layers.concat(
input=[encoder_proj, decoder_state_expand], axis=1)
attention_weights = fluid.layers.fc(input=concated,
size=1,
act='tanh',
bias_attr=False)
attention_weights = fluid.layers.sequence_softmax(
input=attention_weights)
weigths_reshape = fluid.layers.reshape(
x=attention_weights, shape=[-1])
scaled = fluid.layers.elementwise_mul(
x=encoder_vec, y=weigths_reshape, axis=0)
context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
return context
rnn = fluid.layers.DynamicRNN()
cell_init = fluid.layers.fill_constant_batch_size_like(
input=decoder_boot,
value=0.0,
shape=[-1, decoder_size],
dtype='float32')
cell_init.stop_gradient = False
with rnn.block():
current_word = rnn.step_input(target_embedding)
encoder_vec = rnn.static_input(encoder_vec)
encoder_proj = rnn.static_input(encoder_proj)
hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
cell_mem = rnn.memory(init=cell_init)
context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
decoder_inputs = fluid.layers.concat(
input=[context, current_word], axis=1)
h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
rnn.update_memory(hidden_mem, h)
rnn.update_memory(cell_mem, c)
out = fluid.layers.fc(input=h,
size=target_dict_dim,
bias_attr=True,
act='softmax')
rnn.output(out)
return rnn()
if not is_generating:
trg_word_idx = fluid.layers.data(
name='target_sequence', shape=[1], dtype='int64', lod_level=1)
trg_embedding = fluid.layers.embedding(
input=trg_word_idx,
size=[target_dict_dim, embedding_dim],
dtype='float32')
prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
encoded_proj, decoder_boot,
decoder_size)
label = fluid.layers.data(
name='label_sequence', shape=[1], dtype='int64', lod_level=1)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
return avg_cost, feeding_list
def lodtensor_to_ndarray(lod_tensor):
dims = lod_tensor.get_dims()
ndarray = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ndarray.ravel()[i] = lod_tensor.get_float_element(i)
return ndarray
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception("machine_translation do not support reader op for now.")
embedding_dim = 512
encoder_size = 512
decoder_size = 512
dict_size = 30000
beam_size = 3
max_length = 250
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
avg_cost, feeding_list = seq_to_seq_net(
embedding_dim,
encoder_size,
decoder_size,
dict_size,
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
if is_train:
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size)
if is_train else paddle.dataset.wmt14.test(dict_size),
buf_size=1000),
batch_size=args.batch_size * args.gpus)
return avg_cost, optimizer, [], batch_generator, None
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import cProfile
import os
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
SEED = 1
DTYPE = "float32"
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def cnn_model(data):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
num_filters=20,
pool_size=2,
pool_stride=2,
act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
pool_size=2,
pool_stride=2,
act="relu")
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE = 10
input_shape = conv_pool_2.shape
param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
predict = fluid.layers.fc(
input=conv_pool_2,
size=SIZE,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)))
return predict
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
input, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_acc = fluid.layers.accuracy(input=predict, label=label)
# Optimization
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# Reader
if is_train:
reader = paddle.dataset.mnist.train()
else:
reader = paddle.dataset.mnist.test()
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus)
return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import math
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from imagenet_reader import train, val
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ResNet():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
short = self.shortcut(input, num_filters * 4, stride)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def _model_reader_dshape_classdim(args, is_train):
model = None
reader = None
if args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train()
else:
reader = val()
else:
if is_train:
reader = train(xmap=False)
else:
reader = val(xmap=False)
return reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
model = ResNet(is_train=is_train)
predict = model.net(input, class_dim=class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import numpy as np
import time
import os
import cProfile, pstats, StringIO
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train_raw, val
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
reader = paddle.dataset.flowers.test()
elif args.data_set == "imagenet":
class_dim = 1000
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
if not args.use_reader_op:
if is_train:
reader = train_raw()
else:
reader = val()
else:
if is_train:
reader = train_raw()
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=args.batch_size * args.gpus,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('uint8', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='uint8')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# add imagenet preprocessors
random_crop = fluid.layers.random_crop(input, dshape)
casted = fluid.layers.cast(random_crop, 'float32')
# input is HWC
trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
img_mean = fluid.layers.tensor.assign(
np.array([0.485, 0.456, 0.406]).astype('float32').reshape((3, 1,
1)))
img_std = fluid.layers.tensor.assign(
np.array([0.229, 0.224, 0.225]).astype('float32').reshape((3, 1,
1)))
h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)
# pre_out = (trans - img_mean) / img_std
predict = model(h2, class_dim, is_train=is_train)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)
# configure optimize
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if not args.use_reader_op:
batched_reader = paddle.batch(
reader if args.no_random else paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus,
drop_last=True)
else:
batched_reader = None
pyreader.decorate_paddle_reader(
paddle.batch(
# reader if args.no_random else paddle.reader.shuffle(
# reader, buf_size=5120),
reader,
batch_size=args.batch_size))
return avg_cost, optimizer, [batch_acc1,
batch_acc5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import math
import os
from imagenet_reader import train, val
__all__ = [
"SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d",
"SE_ResNeXt152_32x4d", "get_model"
]
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.5)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
reduction_ratio):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
groups=cardinality,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
scale = self.squeeze_excitation(
input=conv2,
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
short = self.shortcut(input, num_filters * 2, stride)
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) / 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
pool = fluid.layers.pool2d(
input=input, pool_size=0, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
squeeze = fluid.layers.fc(input=pool,
size=num_channels / reduction_ratio,
act='relu',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0)
excitation = fluid.layers.fc(input=squeeze,
size=num_channels,
act='sigmoid',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(
-stdv, stdv)))
scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
return scale
def SE_ResNeXt50_32x4d():
model = SE_ResNeXt(layers=50)
return model
def SE_ResNeXt101_32x4d():
model = SE_ResNeXt(layers=101)
return model
def SE_ResNeXt152_32x4d():
model = SE_ResNeXt(layers=152)
return model
def get_model(args, is_train, main_prog, startup_prog):
model = SE_ResNeXt(layers=50)
batched_reader = None
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
dshape = train_parameters["input_size"]
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
if args.use_reader_op:
pyreader = fluid.layers.py_reader(
capacity=10,
shapes=([-1] + dshape, (-1, 1)),
dtypes=('float32', 'int64'),
name="train_reader" if is_train else "test_reader",
use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader)
else:
input = fluid.layers.data(
name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
out = model.net(input=input)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
optimizer = None
if is_train:
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [40, 80, 100]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
# learning_rate=base_lr,
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
# config readers
if is_train:
reader = train()
else:
reader = val()
if not args.use_reader_op:
batched_reader = paddle.batch(
reader, batch_size=args.batch_size * args.gpus, drop_last=True)
else:
pyreader.decorate_paddle_reader(
paddle.batch(
reader, batch_size=args.batch_size))
return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import cPickle
import os
import random
import time
import numpy
import paddle
import paddle.dataset.imdb as imdb
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
word_dict = imdb.word_dict()
def crop_sentence(reader, crop_size):
unk_value = word_dict['<unk>']
def __impl__():
for item in reader():
if len([x for x in item[0] if x != unk_value]) < crop_size:
yield item
return __impl__
def lstm_net(sentence, lstm_size):
sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
rnn = fluid.layers.DynamicRNN()
with rnn.block():
word = rnn.step_input(sentence)
prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
def gate_common(
ipt,
hidden,
size, ):
gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
gate = fluid.layers.sums(input=[gate0, gate1])
return gate
forget_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
input_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
output_gate = fluid.layers.sigmoid(
x=gate_common(word, prev_hidden, lstm_size))
cell_gate = fluid.layers.tanh(
x=gate_common(word, prev_hidden, lstm_size))
cell = fluid.layers.sums(input=[
fluid.layers.elementwise_mul(
x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
x=input_gate, y=cell_gate)
])
hidden = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell))
rnn.update_memory(prev_cell, cell)
rnn.update_memory(prev_hidden, hidden)
rnn.output(hidden)
last = fluid.layers.sequence_pool(rnn(), 'last')
logit = fluid.layers.fc(input=last, size=2, act='softmax')
return logit
def get_model(args, is_train, main_prog, startup_prog):
if args.use_reader_op:
raise Exception(
"stacked_dynamic_lstm do not support reader op for now.")
lstm_size = 512
emb_dim = 512
crop_size = 1500
with fluid.program_guard(main_prog, startup_prog):
with fluid.unique_name.guard():
data = fluid.layers.data(
name="words", shape=[1], lod_level=1, dtype='int64')
sentence = fluid.layers.embedding(
input=data, size=[len(word_dict), emb_dim])
logit = lstm_net(sentence, lstm_size)
loss = fluid.layers.cross_entropy(
input=logit,
label=fluid.layers.data(
name='label', shape=[1], dtype='int64'))
loss = fluid.layers.mean(x=loss)
# add acc
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
shape=[1], dtype='int64'), total=batch_size_tensor)
if is_train:
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if is_train:
reader = crop_sentence(imdb.train(word_dict), crop_size)
else:
reader = crop_sentence(imdb.test(word_dict), crop_size)
batched_reader = paddle.batch(
paddle.reader.shuffle(
reader, buf_size=25000),
batch_size=args.batch_size * args.gpus)
return loss, adam, [batch_acc], batched_reader, None
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in Fluid"""
from __future__ import print_function
import sys
import time
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import argparse
import functools
import os
def vgg16_bn_drop(input, is_train=True):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act='relu',
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type='max')
conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
fc1 = fluid.layers.fc(input=drop, size=512, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train)
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
return fc2
def get_model(args, is_train, main_prog, startup_prog):
if args.data_set == "cifar10":
classdim = 10
if args.data_format == 'NCHW':
data_shape = [3, 32, 32]
else:
data_shape = [32, 32, 3]
else:
classdim = 102
if args.data_format == 'NCHW':
data_shape = [3, 224, 224]
else:
data_shape = [224, 224, 3]
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1] + data_shape, (-1, 1)],
lod_levels=[0, 0],
dtypes=["float32", "int64"],
thread_num=1,
pass_num=1)
data_file = fluid.layers.double_buffer(
fluid.layers.batch(
data_file_handle, batch_size=args.batch_size))
with fluid.unique_name.guard():
if args.use_reader_op:
images, label = fluid.layers.read_file(data_file)
else:
images = fluid.layers.data(
name='data', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
# Train program
net = vgg16_bn_drop(images, is_train=is_train)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
# Optimization
if is_train:
optimizer = fluid.optimizer.Adam(
learning_rate=args.learning_rate)
optimizer.minimize(avg_cost)
# data reader
if is_train:
reader = paddle.dataset.cifar.train10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.train()
else:
reader = paddle.dataset.cifar.test10() \
if args.data_set == 'cifar10' else paddle.dataset.flowers.test()
batched_reader = paddle.batch(
paddle.reader.shuffle(
reader, buf_size=5120),
batch_size=args.batch_size * args.gpus)
return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.dataset import mnist, cifar, flowers, image
def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
shape_label):
num_batches = 0
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(py_reader(), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=shape_data),
fluid.layers.data(
name='label', shape=shape_label, dtype='int64'),
],
place=fluid.CPUPlace())
num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
outfilepath, reader, feeder)
return num_batches
def prepare_mnist(outpath, batch_size):
outfilepath = os.path.join(outpath, "mnist.recordio")
convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
def prepare_cifar10(outpath, batch_size):
outfilepath = os.path.join(outpath, "cifar.recordio")
convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
def prepare_flowers(outpath, batch_size):
outfilepath = os.path.join(outpath, "flowers.recordio")
convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
[1])
def default_mapper(sample):
img, label = sample
img = image.simple_transform(
img, 256, 224, True, mean=[103.94, 116.78, 123.68])
return img.flatten().astype('float32'), label
def imagenet_train(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "train.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
# shuffle all, this is slow
random.shuffle(imgfilelist)
def train_reader():
for idx, imgfile in enumerate(imgfilelist):
data = image.load_image(
os.path.join(data_dir, "train", imgfile.lower()))
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, train_reader)
def imagenet_test(data_dir):
contents = os.listdir(data_dir)
if set(contents) != set(
["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
raise Exception("Imagenet data contents error!")
img2label = dict()
imgfilelist = []
with open(os.path.join(data_dir, "val.txt")) as fn:
while 1:
l = fn.readline()
if not l:
break
img, lbl = l[:-1].split(" ")
img2label[img] = int(lbl)
imgfilelist.append(img)
def test_reader():
for idx, imgfile in enumerate(imgfilelist):
base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
image_path = ".".join([base_path, "jpeg"])
data = image.load_image(image_path)
label = [img2label[imgfile], ]
yield [data, label]
return paddle.reader.map_readers(default_mapper, test_reader)
# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
def convert_reader_to_recordio_files(
filename,
batch_per_file,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
f_name, f_ext = os.path.splitext(filename)
assert (f_ext == ".recordio")
lines = []
f_idx = 0
counter = 0
for idx, batch in enumerate(reader_creator()):
lines.append(batch)
if idx >= batch_per_file and idx % batch_per_file == 0:
filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
with fluid.recordio_writer.create_recordio_writer(
filename, compressor, max_num_records) as writer:
for l in lines:
res = feeder.feed(l)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
lines = []
f_idx += 1
print("written file: ", filename)
return counter
def prepare_imagenet(inpath, outpath, batch_size):
r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
feeder = fluid.DataFeeder(
feed_list=[
fluid.layers.data(
name="image", shape=[3, 224, 224]), fluid.layers.data(
name="label", shape=[1], dtype='int64')
],
place=fluid.CPUPlace())
outpath = os.path.join(outpath, "imagenet.recordio")
convert_reader_to_recordio_files(outpath, 10000, r, feeder)
#!/bin/bash
# This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU.
mkdir -p logs
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
# disable openmp and mkl parallel
#https://github.com/PaddlePaddle/Paddle/issues/7199
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
if [ $ht -eq 1 ]; then # HT is OFF
if [ -z "$KMP_AFFINITY" ]; then
export KMP_AFFINITY="granularity=fine,compact,0,0"
fi
if [ -z "$OMP_DYNAMIC" ]; then
export OMP_DYNAMIC="FALSE"
fi
else # HT is ON
if [ -z "$KMP_AFFINITY" ]; then
export KMP_AFFINITY="granularity=fine,compact,1,0"
fi
fi
# disable multi-gpu if have more than one
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
# only query the gpu used
nohup stdbuf -oL nvidia-smi \
--id=${CUDA_VISIBLE_DEVICES} \
--query-gpu=timestamp \
--query-compute-apps=pid,process_name,used_memory \
--format=csv \
--filename=mem.log \
-l 1 &
# mnist
# mnist gpu mnist 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
2>&1 | tee -a logs/mnist_gpu_128.log
# vgg16
# gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/vgg16_gpu_128.log
# flowers gpu 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/resnet50_gpu_128.log
# resnet50 gpu flowers 64
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=resnet \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
--model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a logs/lstm_gpu_128.log
#!/bin/bash
PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
sleep 15
CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
output = tf.nn.dropout(affine1, drop) if drop else affine1
return output
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5)
affn2 = _affine('fc7', affn1, 4096, 4096, 0.5)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def _add_loss_summaries(total_loss):
"""
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name + ' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def run_benchmark():
with tf.Graph().as_default():
with tf.device('/gpu:0'):
# Generate some dummy images.
image_size = 224
# Note that our padding definition is slightly different the cuda-convnet.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [
FLAGS.batch_size, 3, image_size + 3, image_size + 3
]
else:
image_shape = [
FLAGS.batch_size, image_size + 3, image_size + 3, 3
]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute the gradient with respect to all the parameters.
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9,
global_step)
variables_averages_op = variable_averages.apply(
tf.trainable_variables())
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective],
"Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
affn2 = _affine('fc7', affn1, 4096, 4096)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
else:
image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
print(format_str %
(datetime.now(), i - num_steps_burn_in, loss_value,
duration, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
return incept
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 1000]), 1.0, 0.0)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def inference(images):
# stage 1
conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
drop = tf.nn.dropout(resh1, 0.4)
affn1 = _affine(resh1, 1024, 1000, act=False)
return affn1
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in range(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(name, inpOp, kH, kW, dH, dW, padding):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
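    # Illustrative example (hypothetical collection contents): if 'losses'
    # holds [conv1_weight_loss, fc_out_weight_loss, cross_entropy], then
    # total_loss = conv1_weight_loss + fc_out_weight_loss + cross_entropy.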
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
    conv5 = _conv(name + '_5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
    pool_ = _mpool(name + '_pool', inp, o4s1, o4s1, 1, 1, 'SAME')
    pool = _conv(name + '_proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
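    # e.g. 'ince3a' below concatenates 64 + 128 + 32 + 32 = 256 channels,
    # which is the inSize of 256 passed to 'ince3b'.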
return incept
def inference(images):
# stage 1
conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3,
128)
pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3,
128)
pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
drop = tf.nn.dropout(resh1, 0.4)
    affn1 = _affine('fc_out', drop, 1024, 1000, act=False)
return affn1
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
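    # Usage sketch (hypothetical tensors, two towers sharing one variable v):
    #   tower_grads = [[(g_gpu0, v)], [(g_gpu1, v)]]
    #   average_gradients(tower_grads) -> [((g_gpu0 + g_gpu1) / 2, v)]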
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
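    # Mean and variance over the timed steps via the shortcut
    # Var[d] = E[d^2] - (E[d])^2.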
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Group all updates into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/bin/bash
set -e
function test() {
cfg=$1
batch_size=$2
prefix=$3
python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.py 64 alexnet
test alexnet.py 128 alexnet
test alexnet.py 256 alexnet
test alexnet.py 512 alexnet
# googlenet
test googlenet.py 64 googlenet
test googlenet.py 128 googlenet
# smallnet
test smallnet_mnist_cifar.py 64 smallnet
test smallnet_mnist_cifar.py 128 smallnet
test smallnet_mnist_cifar.py 256 smallnet
test smallnet_mnist_cifar.py 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
num_gpu=$2
batch_size=$3
batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
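  # e.g. batch_size=512 with num_gpu=4 gives batch_per_gpu=128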
prefix=$4
python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet_multi_gpu.py 4 512 alexnet
test alexnet_multi_gpu.py 4 1024 alexnet
# googlenet
test googlenet_multi_gpu.py 4 512 googlenet
test googlenet_multi_gpu.py 4 1024 googlenet
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import numpy as np
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope) if act else bias
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, wd=None, act=True):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 10]), 1.0, 0.0)
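    # e.g. with batch_size=2 and labels=[2, 0], 'concated' is [[0, 2], [1, 0]]
    # and 'onehot_labels' is [[0, 0, 1, 0, ...], [1, 0, 0, ...]] of shape [2, 10].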
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME')
pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME')
conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME')
pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME')
resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4])
affn1 = _affine(resh1, 64 * 4 * 4, 64)
affn2 = _affine(affn1, 64, 10, act=False)
print('conv1:', get_incoming_shape(conv1))
print('pool1:', get_incoming_shape(pool1))
print('conv2:', get_incoming_shape(conv2))
print('pool2:', get_incoming_shape(pool2))
print('conv3:', get_incoming_shape(conv3))
print('pool3:', get_incoming_shape(pool3))
return affn2
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 32
        # Note that our padding definition is slightly different from
        # cuda-convnet's.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.framework import dtypes
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple, _zero_state_tensors
from tensorflow.contrib.rnn.python.ops import core_rnn_cell
from tensorflow.python.ops import array_ops
from tensorflow.python.util import nest
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
import numpy as np
import os
import argparse
import logging
import time
import paddle
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
type=int,
default=512,
help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
"--encoder_size",
type=int,
default=512,
help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
"--decoder_size",
type=int,
default=512,
help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
"--batch_size",
type=int,
default=128,
help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
"--dict_size",
type=int,
default=30000,
help="The dictionary capacity. Dictionaries of source sequence and "
"target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
"--max_time_steps",
type=int,
default=81,
help="Max number of time steps for sequence. (default: %(default)d)")
parser.add_argument(
"--pass_num",
type=int,
default=10,
help="The pass number to train. (default: %(default)d)")
parser.add_argument(
"--learning_rate",
type=float,
default=0.0002,
help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
"--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
"--beam_size",
type=int,
default=3,
help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
"--max_generation_length",
type=int,
default=250,
help="The maximum length of sequence when doing generation. "
"(default: %(default)d)")
parser.add_argument(
"--save_freq",
type=int,
default=500,
help="Save model checkpoint every this interation. (default: %(default)d)")
parser.add_argument(
"--model_dir",
type=str,
default='./checkpoint',
help="Path to save model checkpoints. (default: %(default)d)")
_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name
START_TOKEN_IDX = 0
END_TOKEN_IDX = 1
class LSTMCellWithSimpleAttention(RNNCell):
"""Add attention mechanism to BasicLSTMCell.
This class is a wrapper based on tensorflow's `BasicLSTMCell`.
"""
def __init__(self,
num_units,
encoder_vector,
encoder_proj,
source_sequence_length,
forget_bias=1.0,
state_is_tuple=True,
activation=None,
reuse=None):
super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
if not state_is_tuple:
logging.warn("%s: Using a concatenated state is slower and will "
"soon be deprecated. Use state_is_tuple=True.", self)
self._num_units = num_units
# set padding part to 0
self._encoder_vector = self._reset_padding(encoder_vector,
source_sequence_length)
self._encoder_proj = self._reset_padding(encoder_proj,
source_sequence_length)
self._forget_bias = forget_bias
self._state_is_tuple = state_is_tuple
self._activation = activation or math_ops.tanh
self._linear = None
@property
def state_size(self):
return (LSTMStateTuple(self._num_units, self._num_units) \
if self._state_is_tuple else 2 * self._num_units)
@property
def output_size(self):
return self._num_units
def zero_state(self, batch_size, dtype):
state_size = self.state_size
if hasattr(self, "_last_zero_state"):
(last_state_size, last_batch_size, last_dtype,
last_output) = getattr(self, "_last_zero_state")
if (last_batch_size == batch_size and last_dtype == dtype and
last_state_size == state_size):
return last_output
with ops.name_scope(
type(self).__name__ + "ZeroState", values=[batch_size]):
output = _zero_state_tensors(state_size, batch_size, dtype)
self._last_zero_state = (state_size, batch_size, dtype, output)
return output
def call(self, inputs, state):
sigmoid = math_ops.sigmoid
# Parameters of gates are concatenated into one multiply for efficiency.
if self._state_is_tuple:
c, h = state
else:
c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
# get context from encoder outputs
context = self._simple_attention(self._encoder_vector,
self._encoder_proj, h)
if self._linear is None:
self._linear = _Linear([inputs, context, h], 4 * self._num_units,
True)
# i = input_gate, j = new_input, f = forget_gate, o = output_gate
i, j, f, o = array_ops.split(
value=self._linear([inputs, context, h]),
num_or_size_splits=4,
axis=1)
new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
self._activation(j))
new_h = self._activation(new_c) * sigmoid(o)
if self._state_is_tuple:
new_state = LSTMStateTuple(new_c, new_h)
else:
new_state = array_ops.concat([new_c, new_h], 1)
return new_h, new_state
def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
"""Implement the attention function.
        The implementation follows the same logic as the fluid decoder.
"""
decoder_state_proj = tf.contrib.layers.fully_connected(
inputs=decoder_state,
num_outputs=self._num_units,
activation_fn=None,
biases_initializer=None)
decoder_state_expand = tf.tile(
tf.expand_dims(
input=decoder_state_proj, axis=1),
[1, tf.shape(encoder_proj)[1], 1])
concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
        # project each concatenated (state, encoder_proj) pair to a scalar score
attention_weights = tf.contrib.layers.fully_connected(
inputs=tf.reshape(
concated, shape=[-1, self._num_units * 2]),
num_outputs=1,
activation_fn=tf.nn.tanh,
biases_initializer=None)
attention_weights_reshaped = tf.reshape(
attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
# normalize the attention weights using softmax
attention_weights_normed = tf.nn.softmax(
attention_weights_reshaped, dim=1)
scaled = tf.multiply(attention_weights_normed, encoder_vec)
context = tf.reduce_sum(scaled, axis=1)
return context
def _reset_padding(self,
memory,
memory_sequence_length,
check_inner_dims_defined=True):
"""Reset the padding part for encoder inputs.
        This function is adapted from TensorFlow's `_prepare_memory` function.
"""
memory = nest.map_structure(
lambda m: ops.convert_to_tensor(m, name="memory"), memory)
if memory_sequence_length is not None:
memory_sequence_length = ops.convert_to_tensor(
memory_sequence_length, name="memory_sequence_length")
if check_inner_dims_defined:
def _check_dims(m):
if not m.get_shape()[2:].is_fully_defined():
raise ValueError(
"Expected memory %s to have fully defined inner dims, "
"but saw shape: %s" % (m.name, m.get_shape()))
nest.map_structure(_check_dims, memory)
if memory_sequence_length is None:
seq_len_mask = None
else:
seq_len_mask = array_ops.sequence_mask(
memory_sequence_length,
maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
dtype=nest.flatten(memory)[0].dtype)
seq_len_batch_size = (memory_sequence_length.shape[0].value or
array_ops.shape(memory_sequence_length)[0])
def _maybe_mask(m, seq_len_mask):
rank = m.get_shape().ndims
rank = rank if rank is not None else array_ops.rank(m)
extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
if memory_sequence_length is not None:
message = ("memory_sequence_length and memory tensor "
"batch sizes do not match.")
with ops.control_dependencies([
check_ops.assert_equal(
seq_len_batch_size, m_batch_size, message=message)
]):
seq_len_mask = array_ops.reshape(
seq_len_mask,
array_ops.concat(
(array_ops.shape(seq_len_mask), extra_ones), 0))
return m * seq_len_mask
else:
return m
return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
memory)
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size,
max_generation_length):
src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
src_embedding_weights = tf.get_variable("source_word_embeddings",
[source_dict_dim, embedding_dim])
src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
# no peephole
encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw=src_forward_cell,
cell_bw=src_reversed_cell,
inputs=src_embedding,
sequence_length=src_sequence_length,
dtype=tf.float32)
# concat the forward outputs and backward outputs
encoded_vec = tf.concat(encoder_outputs, axis=2)
# project the encoder outputs to size of decoder lstm
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
encoded_proj_reshape = tf.reshape(
encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
# get init state for decoder lstm's H
    backward_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(
            backward_first, shape=[-1, encoder_size]),
num_outputs=decoder_size,
activation_fn=tf.nn.tanh,
biases_initializer=None)
# prepare the initial state for decoder lstm
cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
initial_state = LSTMStateTuple(cell_init, decoder_boot)
# create decoder lstm cell
decoder_cell = LSTMCellWithSimpleAttention(
decoder_size,
encoded_vec
if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
encoded_proj_reshape if not is_generating else
seq2seq.tile_batch(encoded_proj_reshape, beam_size),
src_sequence_length if not is_generating else
seq2seq.tile_batch(src_sequence_length, beam_size),
forget_bias=0.0)
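    # When generating, the encoder memories and source lengths are tiled
    # beam_size times with seq2seq.tile_batch so each beam hypothesis attends
    # to its own copy of the source; e.g. batch_size=2 with beam_size=3
    # yields 6 memory rows.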
output_layer = Dense(target_dict_dim, name='output_projection')
if not is_generating:
trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
trg_word_idx)
training_helper = seq2seq.TrainingHelper(
inputs=trg_embedding,
sequence_length=trg_sequence_length,
time_major=False,
name='training_helper')
training_decoder = seq2seq.BasicDecoder(
cell=decoder_cell,
helper=training_helper,
initial_state=initial_state,
output_layer=output_layer)
# get the max length of target sequence
max_decoder_length = tf.reduce_max(trg_sequence_length)
decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
decoder=training_decoder,
output_time_major=False,
impute_finished=True,
maximum_iterations=max_decoder_length)
decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
decoder_pred_train = tf.argmax(
decoder_logits_train, axis=-1, name='decoder_pred_train')
masks = tf.sequence_mask(
lengths=trg_sequence_length,
maxlen=max_decoder_length,
dtype=tf.float32,
name='masks')
# place holder of label sequence
lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
# compute the loss
loss = seq2seq.sequence_loss(
logits=decoder_logits_train,
targets=lbl_word_idx,
weights=masks,
average_across_timesteps=True,
average_across_batch=True)
# return feeding list and loss operator
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_word_idx,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_word_idx
}, loss
else:
start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
tf.int32) * START_TOKEN_IDX
# share the same embedding weights with target word
trg_embedding_weights = tf.get_variable(
"target_word_embeddings", [target_dict_dim, embedding_dim])
inference_decoder = beam_search_decoder.BeamSearchDecoder(
cell=decoder_cell,
embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
start_tokens=start_tokens,
end_token=END_TOKEN_IDX,
initial_state=tf.nn.rnn_cell.LSTMStateTuple(
tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
beam_width=beam_size,
output_layer=output_layer)
decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
decoder=inference_decoder,
output_time_major=False,
            # impute_finished=True,  # enabling this raises an error with BeamSearchDecoder
maximum_iterations=max_generation_length)
predicted_ids = decoder_outputs_decode.predicted_ids
return {
'src_word_idx': src_word_idx,
'src_sequence_length': src_sequence_length
}, predicted_ids
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in vars(args).iteritems():
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def padding_data(data, padding_size, value):
data = data + [value] * padding_size
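    # e.g. padding_data([3, 5], 4, END_TOKEN_IDX) -> [3, 5, 1, 1], while
    # padding_data([3, 5, 7, 9, 11], 4, END_TOKEN_IDX) truncates to [3, 5, 7, 9].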
return data[:padding_size]
def save(sess, path, var_list=None, global_step=None):
saver = tf.train.Saver(var_list)
save_path = saver.save(sess, save_path=path, global_step=global_step)
    print('Model saved at %s' % save_path)
def restore(sess, path, var_list=None):
# var_list = None returns the list of all saveable variables
saver = tf.train.Saver(var_list)
saver.restore(sess, save_path=path)
print('model restored from %s' % path)
def adapt_batch_data(data):
src_seq = map(lambda x: x[0], data)
trg_seq = map(lambda x: x[1], data)
lbl_seq = map(lambda x: x[2], data)
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
trg_sequence_length = np.array(
[len(seq) for seq in trg_seq]).astype('int32')
trg_seq_maxlen = np.max(trg_sequence_length)
src_seq = np.array(
[padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq]).astype('int32')
trg_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in trg_seq]).astype('int32')
lbl_seq = np.array(
[padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
for seq in lbl_seq]).astype('int32')
return {
'src_word_idx': src_seq,
'src_sequence_length': src_sequence_length,
'trg_word_idx': trg_seq,
'trg_sequence_length': trg_sequence_length,
'lbl_word_idx': lbl_seq
}
def train():
feeding_dict, loss = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=False,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
global_step = tf.Variable(0, trainable=False, name='global_step')
trainable_params = tf.trainable_variables()
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
gradients = tf.gradients(loss, trainable_params)
    # clip gradients by global norm to stabilize training
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
    updates = optimizer.apply_gradients(
        zip(clipped_gradients, trainable_params), global_step=global_step)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
train_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
    def do_validation():
total_loss = 0.0
count = 0
for batch_id, data in enumerate(test_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
outputs = sess.run([loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
total_loss += outputs[0]
count += 1
return total_loss / count
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_batch_generator()):
adapted_batch_data = adapt_batch_data(data)
words_seen += np.sum(adapted_batch_data['src_sequence_length'])
words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
outputs = sess.run([updates, loss],
feed_dict={
item[1]: adapted_batch_data[item[0]]
for item in feeding_dict.items()
})
print("pass_id=%d, batch_id=%d, train_loss: %f" %
(pass_id, batch_id, outputs[1]))
pass_end_time = time.time()
            test_loss = do_validation()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_loss, words_per_sec, time_consumed))
def infer():
feeding_dict, predicted_ids = seq_to_seq_net(
embedding_dim=args.embedding_dim,
encoder_size=args.encoder_size,
decoder_size=args.decoder_size,
source_dict_dim=args.dict_size,
target_dict_dim=args.dict_size,
is_generating=True,
beam_size=args.beam_size,
max_generation_length=args.max_generation_length)
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
test_batch_generator = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
batch_size=args.batch_size)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
with tf.Session(config=config) as sess:
restore(sess, './checkpoint/tf_seq2seq-1500')
for batch_id, data in enumerate(test_batch_generator()):
src_seq = map(lambda x: x[0], data)
source_language_seq = [
src_dict[item] for seq in src_seq for item in seq
]
src_sequence_length = np.array(
[len(seq) for seq in src_seq]).astype('int32')
src_seq_maxlen = np.max(src_sequence_length)
src_seq = np.array([
padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
for seq in src_seq
]).astype('int32')
outputs = sess.run([predicted_ids],
feed_dict={
feeding_dict['src_word_idx']: src_seq,
feeding_dict['src_sequence_length']:
src_sequence_length
})
print("\nDecoder result comparison: ")
source_language_seq = ' '.join(source_language_seq).lstrip(
'<s>').rstrip('<e>').strip()
inference_seq = ''
print(" --> source: " + source_language_seq)
for item in outputs[0][0]:
if item[0] == END_TOKEN_IDX: break
inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
print(" --> inference: " + inference_seq)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
if args.infer_only:
infer()
else:
train()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser("mnist model benchmark.")
parser.add_argument(
'--batch_size', type=int, default=128, help='The minibatch size.')
parser.add_argument(
'--iterations', type=int, default=35, help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=5, help='The number of passes.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
args = parser.parse_args()
return args
def run_benchmark(args):
def weight_variable(dtype, shape):
initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
return tf.Variable(initial)
def bias_variable(dtype, shape):
initial = tf.constant(0.1, shape=shape, dtype=dtype)
return tf.Variable(initial)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
labels = tf.placeholder(tf.int64, shape=(None, ))
# conv1, relu, pool1
conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
conv1_bias = bias_variable(DTYPE, [20])
conv1 = tf.nn.conv2d(
images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
pool1 = tf.nn.max_pool(
relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# conv2, relu, pool2
conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
conv2_bias = bias_variable(DTYPE, [50])
conv2 = tf.nn.conv2d(
pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
pool2 = tf.nn.max_pool(
relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
# FC
pool_shape = pool2.get_shape().as_list()
hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
fc_bias = bias_variable(DTYPE, [10])
logits = tf.matmul(reshape, fc_weights) + fc_bias
# Get prediction
prediction = tf.nn.softmax(logits)
# Loss
one_hot_labels = tf.one_hot(labels, depth=10)
cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
avg_cost = tf.reduce_mean(cost)
# Get accuracy
correct = tf.equal(tf.argmax(prediction, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# metrics, g_accuracy
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_accuracy = tf.metrics.accuracy(
labels, tf.argmax(
prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
g_accuracy_reset_op = tf.variables_initializer(vars)
# Optimizer
opt = tf.train.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
train_op = opt.minimize(avg_cost)
# train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=args.batch_size)
def eval_test():
sess.run(g_accuracy_reset_op)
for batch_id, data in enumerate(test_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
loss, acc, g_acc = sess.run(
[avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
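        # tf.metrics.accuracy returns (accuracy, update_op); g_acc[1] is the
        # updated running accuracy accumulated over all test batches since
        # the last g_accuracy_reset_op.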
return g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
for pass_id in range(args.pass_num):
sess.run(g_accuracy_reset_op)
pass_start = time.time()
for batch_id, data in enumerate(train_reader()):
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
"int64")
start = time.time()
_, loss, acc, g_acc = sess.run(
[train_op, avg_cost, accuracy, g_accuracy],
feed_dict={images: images_data,
labels: labels_data})
end = time.time()
print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
(pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
pass_end = time.time()
test_avg_acc = eval_test()
print(
"pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc,
                   pass_end - pass_start))
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
run_benchmark(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
Get help: python resnet.py --help
See performance on flowers: python resnet.py
Train on cifar10: python resnet.py --data=cifar10 --with_test
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import time
import numpy as np
import paddle
import tensorflow as tf
DTYPE = tf.float32
def parse_args():
parser = argparse.ArgumentParser('Convolution model benchmark.')
parser.add_argument(
'--model',
type=str,
choices=['resnet'],
default='resnet',
help='The model architecture.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
'--use_fake_data',
action='store_true',
help='use real data or fake data')
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
        help='The number of initial minibatches to skip, for more stable '
        'performance measurement.')
parser.add_argument(
'--iterations',
type=int,
default=105,
help='The number of minibatches.')
parser.add_argument(
'--pass_num', type=int, default=300, help='The number of passes.')
parser.add_argument(
'--order',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
        help='The data order. CPU mode supports only NHWC.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type.')
parser.add_argument(
'--data',
type=str,
default='flowers102',
choices=['flowers102', 'cifar10'],
help='The kinds of data.')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--with_test',
action='store_true',
help='If set, test the testset during training.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
'with_test'] else vars(args)['iterations']
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def fixed_padding(inputs, kernel_size, data_format):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: A tensor of size [batch, channels, height_in, width_in] or
[batch, height_in, width_in, channels] depending on data_format.
kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
Should be a positive integer.
data_format: The input format ('channels_last' or 'channels_first').
Returns:
A tensor with the same format as the input with the data either intact
(if kernel_size == 1) or padded (if kernel_size > 1).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
[pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
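    # e.g. kernel_size=3 pads 1 pixel on each side and kernel_size=7 pads 3,
    # matching what SAME padding would add at stride 1.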
return padded_inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
"""Strided 2-D convolution with explicit padding."""
# The padding is consistent and is based only on `kernel_size`, not on the
# dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
# This is consistent with PaddlePaddle.
# In addition, the calculation for output size in TensorFlow can refer:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format)
return tf.layers.conv2d(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
padding=('SAME' if strides == 1 else 'VALID'),
use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def conv_bn(inputs,
filters,
kernel_size,
strides,
is_training,
data_format,
act=True):
# def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
# set fused=True for a significant performance boost. See
# https://www.tensorflow.org/performance/performance_guide#common_fused_ops
inputs = conv2d_fixed_padding(
inputs=inputs,
filters=filters,
kernel_size=kernel_size,
strides=strides,
data_format=data_format)
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if data_format == 'channels_first' else 3,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
if act:
inputs = tf.nn.relu(inputs)
return inputs
def basicblock(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
data_format):
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
inputs = conv_bn(
inputs, filters * 4, 1, 1, is_training, data_format, act=False)
inputs = inputs + shortcut
inputs = tf.nn.relu(inputs)
return inputs
def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format):
# Bottleneck blocks end with 4x the number of filters as they start with
filters_out = 4 * filters if block_fn is bottleneck else filters
def projection_shortcut(inputs):
return conv2d_fixed_padding(
inputs=inputs,
filters=filters_out,
kernel_size=1,
strides=strides,
data_format=data_format)
# Only the first block per block_layer uses projection_shortcut and strides
inputs = block_fn(inputs, filters, is_training, projection_shortcut,
strides, data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
return tf.identity(inputs, name)
def resnet_imagenet(depth, class_dim, data_format):
"""Returns the ResNet model for a given size and number of output classes."""
def resnet_generator(block_fn,
layers,
num_classes,
data_format='channels_last'):
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
"""Constructs the ResNet model given the inputs."""
if data_format == 'channels_first':
# Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
# This provides a large performance boost on GPU. See
# https://www.tensorflow.org/performance/performance_guide#data_formats
inputs = tf.transpose(inputs, [0, 3, 1, 2])
inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = tf.layers.max_pooling2d(
inputs=inputs,
pool_size=3,
strides=2,
padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
is_training, 'block_layer1', data_format)
inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
is_training, 'block_layer2', data_format)
inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
is_training, 'block_layer3', data_format)
inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
is_training, 'block_layer4', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=7,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs,
[-1, 512 if block_fn is basicblock else 2048])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
model_params = {
18: {
'block': basicblock,
'layers': [2, 2, 2, 2]
},
34: {
'block': basicblock,
'layers': [3, 4, 6, 3]
},
50: {
'block': bottleneck,
'layers': [3, 4, 6, 3]
},
101: {
'block': bottleneck,
'layers': [3, 4, 23, 3]
},
152: {
'block': bottleneck,
'layers': [3, 8, 36, 3]
},
200: {
'block': bottleneck,
'layers': [3, 24, 36, 3]
}
}
if depth not in model_params:
raise ValueError('Not a valid depth:', depth)
params = model_params[depth]
return resnet_generator(params['block'], params['layers'], class_dim,
data_format)
def resnet_cifar10(depth, num_classes, data_format):
if depth % 6 != 2:
raise ValueError('depth must be 6n + 2:', depth)
num_blocks = (depth - 2) // 6
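    # e.g. depth=32 gives num_blocks=5: 3 block layers x 5 blocks x 2 convs,
    # plus the initial conv and the final dense layer, totals 32 layers.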
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
def model(inputs, is_training):
inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
'block_layer1', data_format)
inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
'block_layer2', data_format)
inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
'block_layer3', data_format)
inputs = tf.layers.average_pooling2d(
inputs=inputs,
pool_size=8,
strides=1,
padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(inputs, [-1, 64])
inputs = tf.layers.dense(inputs=inputs, units=num_classes)
inputs = tf.identity(inputs, 'final_dense')
return inputs
return model
def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
"""Our model_fn for ResNet to be used with our Estimator."""
class_dim = 1000
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
if args.data == 'flowers102':
class_dim = 102
dshape = (None, 224, 224, 3)
pdshape = (3, 224, 224)
elif args.data == 'cifar10':
class_dim = 10
dshape = (None, 32, 32, 3)
pdshape = (3, 32, 32)
with tf.device(device):
images = tf.placeholder(DTYPE, shape=dshape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
network = resnet_cifar10(
32, class_dim,
data_format) if args.data == 'cifar10' else resnet_imagenet(
50, class_dim, data_format)
logits = network(inputs=images, is_training=is_training)
cross_entropy = tf.losses.softmax_cross_entropy(
logits=logits, onehot_labels=onehot_labels)
avg_cost = tf.reduce_mean(cross_entropy)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
lr = 0.1 if args.data == 'cifar10' else 0.01
optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
# Batch norm requires update_ops to be added as a train_op dependency.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.dataset.cifar.test10()
if args.data == 'cifar10' else paddle.dataset.flowers.test(),
batch_size=100)
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, np.mean(test_accs)))
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
if args.use_fake_data:
data = train_reader().next()
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
iters, num_samples, start_time = 0, 0, 0.0
for pass_id in range(args.pass_num):
if iters == args.iterations:
break
train_accs = []
train_losses = []
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
if not args.use_fake_data:
images_data = np.array(
map(lambda x: np.transpose(x[0].reshape(pdshape),
axes=[1, 2, 0]), data)).astype("float32")
labels_data = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_cost, accuracy],
feed_dict={
images: images_data,
labels: labels_data,
is_training: True
})
iters += 1
train_accs.append(acc)
train_losses.append(loss)
num_samples += len(data)
print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
print("Pass=%d, Loss=%f, Accuray=%f\n" %
(pass_id, np.mean(train_losses), np.mean(train_accs)))
# evaluation
if args.with_test:
test()
if not args.with_test:
duration = time.time() - start_time
examples_per_sec = num_samples / duration
sec_per_batch = duration / (iters - args.skip_batch_num)
print('Total examples: %d, total time: %.5f' %
(num_samples, duration))
print('%.5f examples/sec, %.5f sec/batch' %
(examples_per_sec, sec_per_batch))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if tf.test.is_built_with_cuda():
device = '/device:GPU:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
data_format = 'channels_first'
else:
device = '/cpu:0'
if args.order == 'NHWC':
data_format = 'channels_last'
else:
            raise ValueError('Only NHWC order is supported in CPU mode')
run_benchmark(args, data_format, device)
You should also install tflearn:
```bash
pip install -r requirements.txt
```
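A quick sanity check that the dependencies import cleanly (a minimal sketch, assuming a Python 2 environment with TensorFlow and tflearn on the path):
```bash
python -c "import tensorflow; import tflearn; print(tflearn.__version__)"
```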
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os.path
import io
import numpy as np
import tensorflow as tf
# tflearn
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
FLAGS = tf.app.flags.FLAGS
class DataSet(object):
def __init__(self, data, labels):
assert data.shape[0] == labels.shape[0], (
'data.shape: %s labels.shape: %s' % (data.shape, labels.shape))
self._num_examples = data.shape[0]
self._data = data
self._labels = labels
self._epochs_completed = 0
self._index_in_epoch = 0
@property
def data(self):
return self._data
@property
def labels(self):
return self._labels
@property
def num_examples(self):
return self._num_examples
@property
def epochs_completed(self):
return self._epochs_completed
def next_batch(self, batch_size):
assert batch_size <= self._num_examples
start = self._index_in_epoch
self._index_in_epoch += batch_size
if self._index_in_epoch > self._num_examples:
# Finished epoch
self._epochs_completed += 1
# Shuffle the data
perm = np.arange(self._num_examples)
np.random.shuffle(perm)
self._data = self._data[perm]
self._labels = self._labels[perm]
# Start next epoch
start = 0
self._index_in_epoch = batch_size
end = self._index_in_epoch
return self._data[start:end], self._labels[start:end]
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
# IMDB Dataset loading
train, test, _ = imdb.load_data(
path=file_path,
n_words=vocab_size,
valid_portion=val_fraction,
sort_by_len=False)
trainX, trainY = train
testX, testY = test
# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)
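    # Note: the test split is padded above but not returned; this reader
    # only exposes the training set.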
train_dataset = DataSet(trainX, trainY)
return train_dataset
def main():
create_datasets('imdb.pkl')
if __name__ == "__main__":
main()
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers to stack.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden size of the LSTM unit.""")
tf.app.flags.DEFINE_integer('emb_size', 128, """Dimension of the embedding table.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
def get_feed_dict(x_data, y_data=None):
feed_dict = {}
if y_data is not None:
feed_dict[y_input] = y_data
for i in xrange(x_data.shape[0]):
feed_dict[x_input[i]] = x_data[i, :, :]
return feed_dict
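# Note: x_input and y_input are assumed to be module-level placeholders;
# time_tensorflow_run below builds its feed_dict directly, so this helper
# is effectively unused in this script.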
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
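# Note: rnn.rnn is the legacy static-unroll API; `incoming` must be a Python
# list of [batch_size, input_size] tensors, one per time step, which is why
# lstm() below splits a [batch, max_len, emb_size] tensor along the time axis.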
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
# one label index for one sample
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def time_tensorflow_run(session, target, x_input, y_input, info_string):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
data, label = train_dataset.next_batch(FLAGS.batch_size)
_ = session.run(target_op, feed_dict={x_input: data, y_input: label})
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default():
global_step = 0
with tf.device('/cpu:0'):
global_step = tf.Variable(0, trainable=False)
with tf.device('/gpu:0'):
#x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
#y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
x_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
y_input = tf.placeholder(
tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
            # Generate some dummy sequence.
last_layer = inference(x_input)
objective = loss(last_layer, y_input)
opt = tf.train.AdamOptimizer(0.001)
grads = opt.compute_gradients(objective)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
init = tf.initialize_all_variables()
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, x_input, y_input,
"Forward")
if run_forward_backward:
with tf.control_dependencies([apply_gradient_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], x_input,
y_input, "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/usr/bin/env python
from six.moves import xrange # pylint: disable=redefined-builtin
import re
import math
import time
import numpy as np
from datetime import datetime
import reader
import tensorflow as tf
from tensorflow.python.ops import rnn
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers to stack.""")
tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden size of the LSTM unit.""")
tf.app.flags.DEFINE_integer('emb_size', 64, """Dimension of the embedding table.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
VOCAB_SIZE = 30000
NUM_CLASS = 2
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
    elif isinstance(incoming, (np.ndarray, list, tuple)):
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
# Note input * W is done in LSTMCell,
# which is different from PaddlePaddle
def single_lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False):
with tf.name_scope(name) as scope:
cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
out = output if return_seq else output[-1]
return (out, _cell_state) if return_state else out
def lstm(name,
incoming,
n_units,
use_peepholes=True,
return_seq=False,
return_state=False,
num_layers=1):
with tf.name_scope(name) as scope:
lstm_cell = tf.nn.rnn_cell.LSTMCell(
n_units, use_peepholes=use_peepholes)
cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
if not isinstance(incoming, list):
            # if the input is an embedding, the tensor shape is [None, time_step, emb_size]
incoming = [
tf.squeeze(input_, [1])
for input_ in tf.split(1, FLAGS.max_len, incoming)
]
outputs, state = tf.nn.rnn(cell,
incoming,
initial_state=initial_state,
dtype=tf.float32)
out = outputs if return_seq else outputs[-1]
        return (out, state) if return_state else out
def embedding(name, incoming, vocab_size, emb_size):
with tf.name_scope(name) as scope:
#with tf.device("/cpu:0"):
embedding = tf.get_variable(
name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
out = tf.nn.embedding_lookup(embedding, incoming)
return out
def fc(name, inpOp, nIn, nOut, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return net
def inference(seq):
net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
print "emb:", get_incoming_shape(net)
net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
print "lstm:", get_incoming_shape(net)
net = fc('fc1', net, FLAGS.hidden_size, 2)
return net
def loss(logits, labels):
# one label index for one sample
#labels = tf.cast(labels, tf.int64)
# cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
# logits, labels, name='cross_entropy_per_example')
labels = tf.cast(labels, tf.float32)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
data, label = train_dataset.next_batch(FLAGS.batch_size)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(data)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
    _ = loss(last_layer, label)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
#tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 80
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
        # Data is fed inside tower_loss(), so no feed_dict is needed here.
        _, loss_value = session.run(target)
duration = time.time() - start_time
        if i >= num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
# sec_per_batch = duration / FLAGS.num_gpus
sec_per_batch = duration
format_str = (
'%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size= %d)')
                print(format_str %
                      (datetime.now(), i - num_steps_burn_in, loss_value,
                       examples_per_sec, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Create an optimizer that performs gradient descent.
opt = tf.train.AdamOptimizer(0.001)
#train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
# summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Group all updates into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
#!/bin/bash
set -e
function test() {
lstm_num=$1
batch_size=$2
hid_size=$3
prefix=$4
python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--lstm_num--batch_size--hidden_size--#
test 2 64 256
test 2 64 512
test 2 64 1280
test 2 128 256
test 2 128 512
test 2 128 1280
test 2 256 256
test 2 256 512
test 2 256 1280
#!/bin/bash
set -e
function test() {
num_gpu=$1
lstm_num=$2
hid_size=$3
    batch_size=$4
    batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
--num_gpus=${num_gpu} \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--num_gpus--lstm_num--hidden_size--batch_size--#
test 4 2 256 128
test 4 2 256 256
test 4 2 256 512
test 4 2 512 128
test 4 2 512 256
test 4 2 512 512
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import tensorflow as tf
import paddle  # provides the paddle.batch / paddle.reader / paddle.dataset readers used below
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
        help='Number of sequences in a batch of data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--embedding_dim',
type=int,
default=512,
help='Dimension of embedding table. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=512,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=10,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.0002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
args = parser.parse_args()
return args
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstm_model(dict_size,
embedding_dim,
hidden_dim,
stacked_num,
class_num=2,
is_train=True):
word_idx = tf.placeholder(tf.int64, shape=[None, None])
sequence_length = tf.placeholder(tf.int64, shape=[None, ])
embedding_weights = tf.get_variable('word_embeddings',
[dict_size, embedding_dim])
embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
lstm_cell = tf.nn.rnn_cell.LSTMCell(
num_units=hidden_dim, use_peepholes=False)
stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
# final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
_, final_state = tf.nn.dynamic_rnn(
cell=stacked_cell,
inputs=embedding,
dtype=tf.float32,
sequence_length=sequence_length)
w = tf.Variable(
tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
bias = tf.Variable(
tf.constant(
value=0.0, shape=[class_num], dtype=tf.float32))
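    # final_state[-1] is the topmost layer's LSTMStateTuple (c, h);
    # index 1 selects the hidden state h used for classification.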
prediction = tf.matmul(final_state[-1][1], w) + bias
if not is_train:
return (word_idx, sequence_length), tf.nn.softmax(prediction)
label = tf.placeholder(tf.int64, shape=[None, ])
loss = tf.nn.softmax_cross_entropy_with_logits(
labels=tf.one_hot(label, 2), logits=prediction)
avg_loss = tf.reduce_mean(loss)
correct_count = tf.equal(tf.argmax(prediction, 1), label)
acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
vars = tf.contrib.framework.get_variables(
scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
reset_op = tf.variables_initializer(vars)
return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
def padding_data(data, padding_size, value):
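    # Pad with `value`, then truncate, so every sequence has length exactly
    # padding_size, e.g. padding_data([3, 1], 4, 0) -> [3, 1, 0, 0].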
data = data + [value] * padding_size
return data[:padding_size]
def train(args):
word_dict = paddle.dataset.imdb.word_dict()
dict_size = len(word_dict)
feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
train_op = adam_optimizer.minimize(avg_loss)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=25000),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.test(word_dict), buf_size=25000),
batch_size=args.batch_size)
def do_validation(sess):
sess.run(reset_op)
for batch_id, data in enumerate(test_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
            # Validation must not run train_op; only fetch the metrics.
            loss, fetch_acc, fetch_g_acc = sess.run(
                [avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
return fetch_g_acc[1]
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_l)
sess.run(init_g)
for pass_id in xrange(args.pass_num):
# clear accuracy local variable
sess.run(reset_op)
pass_start_time = time.time()
words_seen = 0
for batch_id, data in enumerate(train_reader()):
word_idx = map(lambda x: x[0], data)
sequence_length = np.array(
[len(seq) for seq in word_idx]).astype('int64')
words_seen += np.sum(sequence_length)
maxlen = np.max(sequence_length)
word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
word_idx = np.array(word_idx).astype('int64')
label = np.array(map(lambda x: x[1], data)).astype('int64')
_, loss, fetch_acc, fetch_g_acc = sess.run(
[train_op, avg_loss, acc, g_acc],
feed_dict={
feeding_list[0]: word_idx,
feeding_list[1]: sequence_length,
feeding_list[2]: label
})
print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
% (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
test_acc = do_validation(sess)
print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
(pass_id, test_acc, words_per_sec, time_consumed))
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
train(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
import numpy as np
import argparse
import time
import paddle  # provides the paddle.batch / paddle.reader / paddle.dataset readers used below
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
'--batch_size', type=int, default=128, help="Batch size for training.")
parser.add_argument(
'--skip_batch_num',
type=int,
default=5,
    help='Number of initial minibatches to skip, for a fairer performance test.')
parser.add_argument(
'--iterations', type=int, default=80, help='The number of minibatches.')
parser.add_argument(
'--learning_rate',
type=float,
default=1e-3,
help="Learning rate for training.")
parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--data_format',
type=str,
default='NHWC',
choices=['NCHW', 'NHWC'],
    help='The data order, NCHW=[batch, channels, height, width]. '
    'Only NHWC is supported right now.')
parser.add_argument(
'--data_set',
type=str,
default='cifar10',
choices=['cifar10', 'flowers'],
help='Optional dataset for benchmark.')
args = parser.parse_args()
class VGG16Model(object):
def __init__(self):
self.parameters = []
def batch_norm_relu(self, inputs, is_training):
"""Performs a batch normalization followed by a ReLU."""
# We set fused=True for a significant speed boost. See
# https://www.tensorflow.org/speed/speed_guide#common_fused_ops
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=1 if args.data_format == 'NCHW' else -1,
momentum=0.9,
epsilon=1e-05,
center=True,
scale=True,
training=is_training,
fused=True)
inputs = tf.nn.relu(inputs)
return inputs
def conv_bn_layer(self,
name,
images,
kernel_shape,
is_training,
drop_rate=0.0):
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
kernel_shape, dtype=tf.float32, stddev=1e-1),
name='weights')
conv = tf.nn.conv2d(
images,
kernel, [1, 1, 1, 1],
data_format=args.data_format,
padding='SAME')
biases = tf.Variable(
tf.constant(
0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(conv, biases)
out = self.batch_norm_relu(out, is_training)
out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
return out
def fc_layer(self, name, inputs, shape):
with tf.name_scope(name) as scope:
fc_w = tf.Variable(
tf.truncated_normal(
shape, dtype=tf.float32, stddev=1e-1),
name='weights')
fc_b = tf.Variable(
tf.constant(
0.0, shape=[shape[-1]], dtype=tf.float32),
trainable=True,
name='biases')
out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
return out
def network(self, images, class_dim, is_training):
""" VGG16 model structure.
TODO(kuke): enable this network to support the 'NCHW' data format
"""
# conv1
conv1_1 = self.conv_bn_layer(
'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
conv1_2 = self.conv_bn_layer(
'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
# pool1
pool1 = tf.nn.max_pool(
conv1_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool1')
# conv2
conv2_1 = self.conv_bn_layer(
'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
conv2_2 = self.conv_bn_layer(
'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
# pool2
pool2 = tf.nn.max_pool(
conv2_2,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool2')
# conv3
conv3_1 = self.conv_bn_layer(
'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
conv3_2 = self.conv_bn_layer(
'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
conv3_3 = self.conv_bn_layer(
'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
# pool3
pool3 = tf.nn.max_pool(
conv3_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool3')
# conv4
conv4_1 = self.conv_bn_layer(
'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
conv4_2 = self.conv_bn_layer(
'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv4_3 = self.conv_bn_layer(
'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool4
pool4 = tf.nn.max_pool(
conv4_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
name='pool4')
# conv5
conv5_1 = self.conv_bn_layer(
'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_2 = self.conv_bn_layer(
'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
conv5_3 = self.conv_bn_layer(
'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
# pool5
pool5 = tf.nn.max_pool(
conv5_3,
ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1],
padding='SAME',
            name='pool5')
# flatten
shape = int(np.prod(pool5.get_shape()[1:]))
pool5_flat = tf.reshape(pool5, [-1, shape])
# fc1
drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
fc1 = self.fc_layer('fc1', drop, [shape, 512])
# fc2
bn = self.batch_norm_relu(fc1, is_training)
drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
fc2 = self.fc_layer('fc2', drop, [512, 512])
fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
return fc3
def run_benchmark():
"""Run benchmark on cifar10 or flowers."""
if args.data_set == "cifar10":
class_dim = 10
raw_shape = (3, 32, 32)
dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
None, 3, 32, 32)
else:
class_dim = 102
raw_shape = (3, 224, 224)
dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
None, 3, 224, 224)
device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
with tf.device(device):
images = tf.placeholder(tf.float32, shape=dat_shape)
labels = tf.placeholder(tf.int64, shape=(None, ))
is_training = tf.placeholder('bool')
onehot_labels = tf.one_hot(labels, depth=class_dim)
vgg16 = VGG16Model()
logits = vgg16.network(images, class_dim, is_training)
loss = tf.losses.softmax_cross_entropy(
onehot_labels=onehot_labels, logits=logits)
avg_loss = tf.reduce_mean(loss)
correct = tf.equal(tf.argmax(logits, 1), labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
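        # As in the ResNet script above, batch norm's moving-average updates
        # must run as a dependency of the train op.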
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
train_op = optimizer.minimize(avg_loss)
# data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
buf_size=5120),
batch_size=args.batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.test10()
if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
buf_size=5120),
batch_size=args.batch_size)
# test
def test():
test_accs = []
for batch_id, data in enumerate(test_reader()):
test_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
test_accs.append(
accuracy.eval(feed_dict={
images: test_images,
labels: test_labels,
is_training: False
}))
return np.mean(test_accs)
config = tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.num_passes):
# train
num_samples = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
if iters == args.iterations:
break
train_images = np.array(
map(lambda x: np.transpose(x[0].reshape(raw_shape),
axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
train_labels = np.array(map(lambda x: x[1], data)).astype(
'int64')
_, loss, acc = sess.run([train_op, avg_loss, accuracy],
feed_dict={
images: train_images,
labels: train_labels,
is_training: True
})
iters += 1
num_samples += len(data)
print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
(pass_id, iters, loss, acc))
train_elapsed = time.time() - start_time
# test
pass_test_acc = test()
print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
(pass_id, num_samples / train_elapsed, pass_test_acc))
def print_arguments():
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == '__main__':
print_arguments()
run_benchmark()