diff --git a/benchmark/.gitignore b/benchmark/.gitignore deleted file mode 100644 index fb4114356d4f37efc8ad672316fd4f99443d9fcd..0000000000000000000000000000000000000000 --- a/benchmark/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -paddle/image/logs -paddle/image/*.pyc -paddle/image/train.list -paddle/rnn/logs -paddle/rnn/*.pyc -paddle/rnn/imdb.pkl -caffe/image/logs -tensorflow/image/logs -tensorflow/rnn/logs -fluid/models/*.pyc -fluid/logs -fluid/nohup.out diff --git a/benchmark/caffe/image/alexnet.prototxt b/benchmark/caffe/image/alexnet.prototxt deleted file mode 100644 index aca184ddaf2ca2b5e2bea17d131055e0621b8271..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/alexnet.prototxt +++ /dev/null @@ -1,347 +0,0 @@ -name: "alexnet" -input: "data" -input_dim: 64 -input_dim: 3 -input_dim: 227 -input_dim: 227 -input: "label" -input_dim: 64 -input_dim: 1 -input_dim: 1 -input_dim: 1 -force_backward: true -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 11 - stride: 4 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "conv1" - top: "conv1" -} -layer { - name: "norm1" - type: "LRN" - bottom: "conv1" - top: "norm1" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "norm1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 2 - kernel_size: 5 - group: 1 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - 
name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "norm2" - type: "LRN" - bottom: "conv2" - top: "norm2" - lrn_param { - local_size: 5 - alpha: 0.0001 - beta: 0.75 - } -} -layer { - name: "pool2" - type: "Pooling" - bottom: "norm2" - top: "pool2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "conv4" - type: "Convolution" - bottom: "conv3" - top: "conv4" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - group: 1 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu4" - type: "ReLU" - bottom: "conv4" - top: "conv4" -} -layer { - name: "conv5" - type: "Convolution" - bottom: "conv4" - top: "conv5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - group: 1 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu5" - type: "ReLU" - bottom: "conv5" - top: "conv5" -} -layer { - name: "pool5" - type: "Pooling" - bottom: "conv5" - top: "pool5" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "fc6" - type: "InnerProduct" - bottom: "pool5" - top: "fc6" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: 
"gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu6" - type: "ReLU" - bottom: "fc6" - top: "fc6" -} -layer { - name: "drop6" - type: "Dropout" - bottom: "fc6" - top: "fc6" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc7" - type: "InnerProduct" - bottom: "fc6" - top: "fc7" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 4096 - weight_filler { - type: "gaussian" - std: 0.005 - } - bias_filler { - type: "constant" - value: 0.1 - } - } -} -layer { - name: "relu7" - type: "ReLU" - bottom: "fc7" - top: "fc7" -} -layer { - name: "drop7" - type: "Dropout" - bottom: "fc7" - top: "fc7" - dropout_param { - dropout_ratio: 0.5 - } -} -layer { - name: "fc8" - type: "InnerProduct" - bottom: "fc7" - top: "fc8" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "fc8" - bottom: "label" - top: "loss" -} diff --git a/benchmark/caffe/image/googlenet.prototxt b/benchmark/caffe/image/googlenet.prototxt deleted file mode 100644 index c5f3b4fe3efcb6f7397031c086997fa914c67b7f..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/googlenet.prototxt +++ /dev/null @@ -1,2334 +0,0 @@ -name: "googlenet" -input: "data" -input_dim: 128 -input_dim: 3 -input_dim: 224 -input_dim: 224 -input: "label" -input_dim: 128 -input_dim: 1 -input_dim: 1 -input_dim: 1 -layer { - name: "conv1/7x7_s2" - type: "Convolution" - bottom: "data" - top: "conv1/7x7_s2" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 3 - kernel_size: 7 - stride: 2 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 
0.2 - } - } -} -layer { - name: "conv1/relu_7x7" - type: "ReLU" - bottom: "conv1/7x7_s2" - top: "conv1/7x7_s2" -} -layer { - name: "pool1/3x3_s2" - type: "Pooling" - bottom: "conv1/7x7_s2" - top: "pool1/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -#layer { -# name: "pool1/norm1" -# type: "LRN" -# bottom: "pool1/3x3_s2" -# top: "pool1/norm1" -# lrn_param { -# local_size: 5 -# alpha: 0.0001 -# beta: 0.75 -# } -#} -layer { - name: "conv2/3x3_reduce" - type: "Convolution" -# bottom: "pool1/norm1" - bottom: "pool1/3x3_s2" - top: "conv2/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "conv2/relu_3x3_reduce" - type: "ReLU" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3_reduce" -} -layer { - name: "conv2/3x3" - type: "Convolution" - bottom: "conv2/3x3_reduce" - top: "conv2/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "conv2/relu_3x3" - type: "ReLU" - bottom: "conv2/3x3" - top: "conv2/3x3" -} -#layer { -# name: "conv2/norm2" -# type: "LRN" -# bottom: "conv2/3x3" -# top: "conv2/norm2" -# lrn_param { -# local_size: 5 -# alpha: 0.0001 -# beta: 0.75 -# } -#} -layer { - name: "pool2/3x3_s2" - type: "Pooling" -# bottom: "conv2/norm2" - bottom: "conv2/3x3" - top: "pool2/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_3a/1x1" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - 
type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_1x1" - type: "ReLU" - bottom: "inception_3a/1x1" - top: "inception_3a/1x1" -} -layer { - name: "inception_3a/3x3_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3_reduce" -} -layer { - name: "inception_3a/3x3" - type: "Convolution" - bottom: "inception_3a/3x3_reduce" - top: "inception_3a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_3x3" - type: "ReLU" - bottom: "inception_3a/3x3" - top: "inception_3a/3x3" -} -layer { - name: "inception_3a/5x5_reduce" - type: "Convolution" - bottom: "pool2/3x3_s2" - top: "inception_3a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5_reduce" -} -layer { - name: "inception_3a/5x5" - type: "Convolution" - bottom: "inception_3a/5x5_reduce" - top: "inception_3a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - 
type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_5x5" - type: "ReLU" - bottom: "inception_3a/5x5" - top: "inception_3a/5x5" -} -layer { - name: "inception_3a/pool" - type: "Pooling" - bottom: "pool2/3x3_s2" - top: "inception_3a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3a/pool_proj" - type: "Convolution" - bottom: "inception_3a/pool" - top: "inception_3a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3a/relu_pool_proj" - type: "ReLU" - bottom: "inception_3a/pool_proj" - top: "inception_3a/pool_proj" -} -layer { - name: "inception_3a/output" - type: "Concat" - bottom: "inception_3a/1x1" - bottom: "inception_3a/3x3" - bottom: "inception_3a/5x5" - bottom: "inception_3a/pool_proj" - top: "inception_3a/output" -} -layer { - name: "inception_3b/1x1" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_1x1" - type: "ReLU" - bottom: "inception_3b/1x1" - top: "inception_3b/1x1" -} -layer { - name: "inception_3b/3x3_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_3b/3x3_reduce" - top: 
"inception_3b/3x3_reduce" -} -layer { - name: "inception_3b/3x3" - type: "Convolution" - bottom: "inception_3b/3x3_reduce" - top: "inception_3b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_3x3" - type: "ReLU" - bottom: "inception_3b/3x3" - top: "inception_3b/3x3" -} -layer { - name: "inception_3b/5x5_reduce" - type: "Convolution" - bottom: "inception_3a/output" - top: "inception_3b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5_reduce" -} -layer { - name: "inception_3b/5x5" - type: "Convolution" - bottom: "inception_3b/5x5_reduce" - top: "inception_3b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_5x5" - type: "ReLU" - bottom: "inception_3b/5x5" - top: "inception_3b/5x5" -} -layer { - name: "inception_3b/pool" - type: "Pooling" - bottom: "inception_3a/output" - top: "inception_3b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_3b/pool_proj" - type: "Convolution" - bottom: "inception_3b/pool" - top: "inception_3b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - 
bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_3b/relu_pool_proj" - type: "ReLU" - bottom: "inception_3b/pool_proj" - top: "inception_3b/pool_proj" -} -layer { - name: "inception_3b/output" - type: "Concat" - bottom: "inception_3b/1x1" - bottom: "inception_3b/3x3" - bottom: "inception_3b/5x5" - bottom: "inception_3b/pool_proj" - top: "inception_3b/output" -} -layer { - name: "pool3/3x3_s2" - type: "Pooling" - bottom: "inception_3b/output" - top: "pool3/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_4a/1x1" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_1x1" - type: "ReLU" - bottom: "inception_4a/1x1" - top: "inception_4a/1x1" -} -layer { - name: "inception_4a/3x3_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 96 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3_reduce" -} -layer { - name: "inception_4a/3x3" - type: "Convolution" - bottom: "inception_4a/3x3_reduce" - top: "inception_4a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 208 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_3x3" - type: "ReLU" - bottom: "inception_4a/3x3" - top: 
"inception_4a/3x3" -} -layer { - name: "inception_4a/5x5_reduce" - type: "Convolution" - bottom: "pool3/3x3_s2" - top: "inception_4a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 16 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5_reduce" -} -layer { - name: "inception_4a/5x5" - type: "Convolution" - bottom: "inception_4a/5x5_reduce" - top: "inception_4a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 48 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_5x5" - type: "ReLU" - bottom: "inception_4a/5x5" - top: "inception_4a/5x5" -} -layer { - name: "inception_4a/pool" - type: "Pooling" - bottom: "pool3/3x3_s2" - top: "inception_4a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4a/pool_proj" - type: "Convolution" - bottom: "inception_4a/pool" - top: "inception_4a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4a/relu_pool_proj" - type: "ReLU" - bottom: "inception_4a/pool_proj" - top: "inception_4a/pool_proj" -} -layer { - name: "inception_4a/output" - type: "Concat" - bottom: "inception_4a/1x1" - bottom: "inception_4a/3x3" - bottom: "inception_4a/5x5" - bottom: "inception_4a/pool_proj" - top: "inception_4a/output" -} -#layer { -# name: "loss1/ave_pool" -# type: "Pooling" -# bottom: "inception_4a/output" -# top: 
"loss1/ave_pool" -# pooling_param { -# pool: AVE -# kernel_size: 5 -# stride: 3 -# } -#} -#layer { -# name: "loss1/conv" -# type: "Convolution" -# bottom: "loss1/ave_pool" -# top: "loss1/conv" -# param { -# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# convolution_param { -# num_output: 128 -# kernel_size: 1 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0.2 -# } -# } -#} -#layer { -# name: "loss1/relu_conv" -# type: "ReLU" -# bottom: "loss1/conv" -# top: "loss1/conv" -#} -#layer { -# name: "loss1/fc" -# type: "InnerProduct" -# bottom: "loss1/conv" -# top: "loss1/fc" -# param { -# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# inner_product_param { -# num_output: 1024 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0.2 -# } -# } -#} -#layer { -# name: "loss1/relu_fc" -# type: "ReLU" -# bottom: "loss1/fc" -# top: "loss1/fc" -#} -#layer { -# name: "loss1/drop_fc" -# type: "Dropout" -# bottom: "loss1/fc" -# top: "loss1/fc" -# dropout_param { -# dropout_ratio: 0.7 -# } -#} -#layer { -# name: "loss1/classifier" -# type: "InnerProduct" -# bottom: "loss1/fc" -# top: "loss1/classifier" -# param { -# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# inner_product_param { -# num_output: 1000 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0 -# } -# } -#} -#layer { -# name: "loss1/loss" -# type: "SoftmaxWithLoss" -# bottom: "loss1/classifier" -# bottom: "label" -# top: "loss1/loss1" -# loss_weight: 0.3 -#} -layer { - name: "inception_4b/1x1" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } 
- } -} -layer { - name: "inception_4b/relu_1x1" - type: "ReLU" - bottom: "inception_4b/1x1" - top: "inception_4b/1x1" -} -layer { - name: "inception_4b/3x3_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3_reduce" -} -layer { - name: "inception_4b/3x3" - type: "Convolution" - bottom: "inception_4b/3x3_reduce" - top: "inception_4b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 224 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4b/relu_3x3" - type: "ReLU" - bottom: "inception_4b/3x3" - top: "inception_4b/3x3" -} -layer { - name: "inception_4b/5x5_reduce" - type: "Convolution" - bottom: "inception_4a/output" - top: "inception_4b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4b/5x5_reduce" - top: "inception_4b/5x5_reduce" -} -layer { - name: "inception_4b/5x5" - type: "Convolution" - bottom: "inception_4b/5x5_reduce" - top: "inception_4b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - 
name: "inception_4b/relu_5x5" - type: "ReLU" - bottom: "inception_4b/5x5" - top: "inception_4b/5x5" -} -layer { - name: "inception_4b/pool" - type: "Pooling" - bottom: "inception_4a/output" - top: "inception_4b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4b/pool_proj" - type: "Convolution" - bottom: "inception_4b/pool" - top: "inception_4b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4b/relu_pool_proj" - type: "ReLU" - bottom: "inception_4b/pool_proj" - top: "inception_4b/pool_proj" -} -layer { - name: "inception_4b/output" - type: "Concat" - bottom: "inception_4b/1x1" - bottom: "inception_4b/3x3" - bottom: "inception_4b/5x5" - bottom: "inception_4b/pool_proj" - top: "inception_4b/output" -} -layer { - name: "inception_4c/1x1" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4c/relu_1x1" - type: "ReLU" - bottom: "inception_4c/1x1" - top: "inception_4c/1x1" -} -layer { - name: "inception_4c/3x3_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4c/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3_reduce" -} -layer { - name: 
"inception_4c/3x3" - type: "Convolution" - bottom: "inception_4c/3x3_reduce" - top: "inception_4c/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4c/relu_3x3" - type: "ReLU" - bottom: "inception_4c/3x3" - top: "inception_4c/3x3" -} -layer { - name: "inception_4c/5x5_reduce" - type: "Convolution" - bottom: "inception_4b/output" - top: "inception_4c/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 24 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4c/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5_reduce" -} -layer { - name: "inception_4c/5x5" - type: "Convolution" - bottom: "inception_4c/5x5_reduce" - top: "inception_4c/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4c/relu_5x5" - type: "ReLU" - bottom: "inception_4c/5x5" - top: "inception_4c/5x5" -} -layer { - name: "inception_4c/pool" - type: "Pooling" - bottom: "inception_4b/output" - top: "inception_4c/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4c/pool_proj" - type: "Convolution" - bottom: "inception_4c/pool" - top: "inception_4c/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 
- } - } -} -layer { - name: "inception_4c/relu_pool_proj" - type: "ReLU" - bottom: "inception_4c/pool_proj" - top: "inception_4c/pool_proj" -} -layer { - name: "inception_4c/output" - type: "Concat" - bottom: "inception_4c/1x1" - bottom: "inception_4c/3x3" - bottom: "inception_4c/5x5" - bottom: "inception_4c/pool_proj" - top: "inception_4c/output" -} -layer { - name: "inception_4d/1x1" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 112 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_1x1" - type: "ReLU" - bottom: "inception_4d/1x1" - top: "inception_4d/1x1" -} -layer { - name: "inception_4d/3x3_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 144 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3_reduce" -} -layer { - name: "inception_4d/3x3" - type: "Convolution" - bottom: "inception_4d/3x3_reduce" - top: "inception_4d/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 288 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_3x3" - type: "ReLU" - bottom: "inception_4d/3x3" - top: "inception_4d/3x3" -} -layer { - name: "inception_4d/5x5_reduce" - type: "Convolution" - bottom: "inception_4c/output" - top: "inception_4d/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - 
lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5_reduce" -} -layer { - name: "inception_4d/5x5" - type: "Convolution" - bottom: "inception_4d/5x5_reduce" - top: "inception_4d/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_5x5" - type: "ReLU" - bottom: "inception_4d/5x5" - top: "inception_4d/5x5" -} -layer { - name: "inception_4d/pool" - type: "Pooling" - bottom: "inception_4c/output" - top: "inception_4d/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4d/pool_proj" - type: "Convolution" - bottom: "inception_4d/pool" - top: "inception_4d/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 64 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4d/relu_pool_proj" - type: "ReLU" - bottom: "inception_4d/pool_proj" - top: "inception_4d/pool_proj" -} -layer { - name: "inception_4d/output" - type: "Concat" - bottom: "inception_4d/1x1" - bottom: "inception_4d/3x3" - bottom: "inception_4d/5x5" - bottom: "inception_4d/pool_proj" - top: "inception_4d/output" -} -#layer { -# name: "loss2/ave_pool" -# type: "Pooling" -# bottom: "inception_4d/output" -# top: "loss2/ave_pool" -# pooling_param { -# pool: AVE -# kernel_size: 5 -# stride: 3 -# } -#} -#layer { -# name: "loss2/conv" -# type: "Convolution" -# bottom: "loss2/ave_pool" -# top: "loss2/conv" -# param { 
-# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# convolution_param { -# num_output: 128 -# kernel_size: 1 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0.2 -# } -# } -#} -#layer { -# name: "loss2/relu_conv" -# type: "ReLU" -# bottom: "loss2/conv" -# top: "loss2/conv" -#} -#layer { -# name: "loss2/fc" -# type: "InnerProduct" -# bottom: "loss2/conv" -# top: "loss2/fc" -# param { -# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# inner_product_param { -# num_output: 1024 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0.2 -# } -# } -#} -#layer { -# name: "loss2/relu_fc" -# type: "ReLU" -# bottom: "loss2/fc" -# top: "loss2/fc" -#} -#layer { -# name: "loss2/drop_fc" -# type: "Dropout" -# bottom: "loss2/fc" -# top: "loss2/fc" -# dropout_param { -# dropout_ratio: 0.7 -# } -#} -#layer { -# name: "loss2/classifier" -# type: "InnerProduct" -# bottom: "loss2/fc" -# top: "loss2/classifier" -# param { -# lr_mult: 1 -# decay_mult: 1 -# } -# param { -# lr_mult: 2 -# decay_mult: 0 -# } -# inner_product_param { -# num_output: 1000 -# weight_filler { -# type: "xavier" -# } -# bias_filler { -# type: "constant" -# value: 0 -# } -# } -#} -#layer { -# name: "loss2/loss" -# type: "SoftmaxWithLoss" -# bottom: "loss2/classifier" -# bottom: "label" -# top: "loss2/loss1" -# loss_weight: 0.3 -#} -layer { - name: "inception_4e/1x1" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_1x1" - type: "ReLU" - bottom: "inception_4e/1x1" - top: "inception_4e/1x1" -} -layer { - name: "inception_4e/3x3_reduce" - type: "Convolution" - bottom: 
"inception_4d/output" - top: "inception_4e/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3_reduce" -} -layer { - name: "inception_4e/3x3" - type: "Convolution" - bottom: "inception_4e/3x3_reduce" - top: "inception_4e/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_3x3" - type: "ReLU" - bottom: "inception_4e/3x3" - top: "inception_4e/3x3" -} -layer { - name: "inception_4e/5x5_reduce" - type: "Convolution" - bottom: "inception_4d/output" - top: "inception_4e/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5_reduce" -} -layer { - name: "inception_4e/5x5" - type: "Convolution" - bottom: "inception_4e/5x5_reduce" - top: "inception_4e/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_5x5" - type: "ReLU" - bottom: "inception_4e/5x5" - top: "inception_4e/5x5" -} -layer { - name: "inception_4e/pool" - type: "Pooling" - bottom: "inception_4d/output" - top: 
"inception_4e/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_4e/pool_proj" - type: "Convolution" - bottom: "inception_4e/pool" - top: "inception_4e/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_4e/relu_pool_proj" - type: "ReLU" - bottom: "inception_4e/pool_proj" - top: "inception_4e/pool_proj" -} -layer { - name: "inception_4e/output" - type: "Concat" - bottom: "inception_4e/1x1" - bottom: "inception_4e/3x3" - bottom: "inception_4e/5x5" - bottom: "inception_4e/pool_proj" - top: "inception_4e/output" -} -layer { - name: "pool4/3x3_s2" - type: "Pooling" - bottom: "inception_4e/output" - top: "pool4/3x3_s2" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "inception_5a/1x1" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 256 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5a/relu_1x1" - type: "ReLU" - bottom: "inception_5a/1x1" - top: "inception_5a/1x1" -} -layer { - name: "inception_5a/3x3_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 160 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5a/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3_reduce" -} -layer { - name: "inception_5a/3x3" - type: "Convolution" - 
bottom: "inception_5a/3x3_reduce" - top: "inception_5a/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 320 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5a/relu_3x3" - type: "ReLU" - bottom: "inception_5a/3x3" - top: "inception_5a/3x3" -} -layer { - name: "inception_5a/5x5_reduce" - type: "Convolution" - bottom: "pool4/3x3_s2" - top: "inception_5a/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 32 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5a/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5_reduce" -} -layer { - name: "inception_5a/5x5" - type: "Convolution" - bottom: "inception_5a/5x5_reduce" - top: "inception_5a/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5a/relu_5x5" - type: "ReLU" - bottom: "inception_5a/5x5" - top: "inception_5a/5x5" -} -layer { - name: "inception_5a/pool" - type: "Pooling" - bottom: "pool4/3x3_s2" - top: "inception_5a/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5a/pool_proj" - type: "Convolution" - bottom: "inception_5a/pool" - top: "inception_5a/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: 
"inception_5a/relu_pool_proj" - type: "ReLU" - bottom: "inception_5a/pool_proj" - top: "inception_5a/pool_proj" -} -layer { - name: "inception_5a/output" - type: "Concat" - bottom: "inception_5a/1x1" - bottom: "inception_5a/3x3" - bottom: "inception_5a/5x5" - bottom: "inception_5a/pool_proj" - top: "inception_5a/output" -} -layer { - name: "inception_5b/1x1" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/1x1" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_1x1" - type: "ReLU" - bottom: "inception_5b/1x1" - top: "inception_5b/1x1" -} -layer { - name: "inception_5b/3x3_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/3x3_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 192 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_3x3_reduce" - type: "ReLU" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3_reduce" -} -layer { - name: "inception_5b/3x3" - type: "Convolution" - bottom: "inception_5b/3x3_reduce" - top: "inception_5b/3x3" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 384 - pad: 1 - kernel_size: 3 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_3x3" - type: "ReLU" - bottom: "inception_5b/3x3" - top: "inception_5b/3x3" -} -layer { - name: "inception_5b/5x5_reduce" - type: "Convolution" - bottom: "inception_5a/output" - top: "inception_5b/5x5_reduce" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - 
} - convolution_param { - num_output: 48 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_5x5_reduce" - type: "ReLU" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5_reduce" -} -layer { - name: "inception_5b/5x5" - type: "Convolution" - bottom: "inception_5b/5x5_reduce" - top: "inception_5b/5x5" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - pad: 2 - kernel_size: 5 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_5x5" - type: "ReLU" - bottom: "inception_5b/5x5" - top: "inception_5b/5x5" -} -layer { - name: "inception_5b/pool" - type: "Pooling" - bottom: "inception_5a/output" - top: "inception_5b/pool" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 1 - pad: 1 - } -} -layer { - name: "inception_5b/pool_proj" - type: "Convolution" - bottom: "inception_5b/pool" - top: "inception_5b/pool_proj" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - convolution_param { - num_output: 128 - kernel_size: 1 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0.2 - } - } -} -layer { - name: "inception_5b/relu_pool_proj" - type: "ReLU" - bottom: "inception_5b/pool_proj" - top: "inception_5b/pool_proj" -} -layer { - name: "inception_5b/output" - type: "Concat" - bottom: "inception_5b/1x1" - bottom: "inception_5b/3x3" - bottom: "inception_5b/5x5" - bottom: "inception_5b/pool_proj" - top: "inception_5b/output" -} -layer { - name: "pool5/7x7_s1" - type: "Pooling" - bottom: "inception_5b/output" - top: "pool5/7x7_s1" - pooling_param { - pool: AVE - kernel_size: 7 - stride: 1 - } -} -layer { - name: "pool5/drop_7x7_s1" - type: "Dropout" - bottom: "pool5/7x7_s1" - top: "pool5/7x7_s1" - dropout_param { - dropout_ratio: 0.4 - } -} -layer { 
- name: "loss3/classifier" - type: "InnerProduct" - bottom: "pool5/7x7_s1" - top: "loss3/classifier" - param { - lr_mult: 1 - decay_mult: 1 - } - param { - lr_mult: 2 - decay_mult: 0 - } - inner_product_param { - num_output: 1000 - weight_filler { - type: "xavier" - } - bias_filler { - type: "constant" - value: 0 - } - } -} -layer { - name: "loss3/loss3" - type: "SoftmaxWithLoss" - bottom: "loss3/classifier" - bottom: "label" - top: "loss3/loss3" - loss_weight: 1 -} diff --git a/benchmark/caffe/image/run.sh b/benchmark/caffe/image/run.sh deleted file mode 100755 index aa9ac20ca5cc1d48a07ce39f7d6c6d70ad4121ab..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -set -e - -function test() { - cfg=$1 - batch=$2 - prefix=$3 - sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg - sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg - caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1 -} - -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -# alexnet -test alexnet.prototxt 64 alexnet -test alexnet.prototxt 128 alexnet -test alexnet.prototxt 256 alexnet -test alexnet.prototxt 512 alexnet - -# googlenet -test googlenet.prototxt 64 googlenet -test googlenet.prototxt 128 googlenet - -# small net -test smallnet_mnist_cifar.prototxt 64 smallnet -test smallnet_mnist_cifar.prototxt 128 smallnet -test smallnet_mnist_cifar.prototxt 256 smallnet -test smallnet_mnist_cifar.prototxt 512 smallnet diff --git a/benchmark/caffe/image/run_multi.sh b/benchmark/caffe/image/run_multi.sh deleted file mode 100755 index 9a0a71bc185a421842265ea6d2310429adb86913..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/run_multi.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -e - -function test() { - cfg=$1 - batch=$2 - prefix=$3 - batch_per_gpu=`expr ${batch} / 4` - sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg - sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg - sed -i "1c\net : \"${cfg}\"" solver.prototxt - caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1 -} - -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -# alexnet -test alexnet.prototxt 512 alexnet -test alexnet.prototxt 1024 alexnet - -# googlnet -test googlenet.prototxt 512 googlenet diff --git a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt deleted file mode 100644 index 3cb0e32bbfb9f785ece6d428356987e5503dd25d..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt +++ /dev/null @@ -1,198 +0,0 @@ -name: "mnist/cifar" -input: "data" -input_dim: 128 -input_dim: 3 -input_dim: 32 -input_dim: 32 -input: "label" -input_dim: 128 -input_dim: 1 -input_dim: 1 -input_dim: 1 -layer { - name: "conv1" - type: "Convolution" - bottom: "data" - top: "conv1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 32 - pad: 2 - kernel_size: 5 - stride: 1 - weight_filler { - type: "gaussian" - std: 0.0001 - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "pool1" - type: "Pooling" - bottom: "conv1" - top: "pool1" - pooling_param { - pool: MAX - kernel_size: 3 - stride: 2 - } -} -layer { - name: "relu1" - type: "ReLU" - bottom: "pool1" - top: "pool1" -} -layer { - name: "conv2" - type: "Convolution" - bottom: "pool1" - top: "conv2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 32 - pad: 2 - kernel_size: 5 - stride: 1 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "relu2" - type: "ReLU" - bottom: "conv2" - top: "conv2" -} -layer { - name: "pool2" - type: "Pooling" - bottom: "conv2" - top: "pool2" - pooling_param { - pool: AVE - kernel_size: 3 - stride: 2 - } -} -layer { - name: "conv3" - type: "Convolution" - bottom: "pool2" - top: "conv3" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - convolution_param { - num_output: 64 - pad: 2 - kernel_size: 5 - stride: 1 - weight_filler { - type: "gaussian" - std: 0.01 - } - bias_filler 
{ - type: "constant" - } - } -} -layer { - name: "relu3" - type: "ReLU" - bottom: "conv3" - top: "conv3" -} -layer { - name: "pool3" - type: "Pooling" - bottom: "conv3" - top: "pool3" - pooling_param { - pool: AVE - kernel_size: 3 - stride: 2 - } -} -layer { - name: "ip1" - type: "InnerProduct" - bottom: "pool3" - top: "ip1" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 64 - weight_filler { - type: "gaussian" - std: 0.1 - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "ip2" - type: "InnerProduct" - bottom: "ip1" - top: "ip2" - param { - lr_mult: 1 - } - param { - lr_mult: 2 - } - inner_product_param { - num_output: 10 - weight_filler { - type: "gaussian" - std: 0.1 - } - bias_filler { - type: "constant" - } - } -} -layer { - name: "accuracy" - type: "Accuracy" - bottom: "ip2" - bottom: "label" - top: "accuracy" - include { - phase: TEST - } -} -layer { - name: "loss" - type: "SoftmaxWithLoss" - bottom: "ip2" - bottom: "label" - top: "loss" -} diff --git a/benchmark/caffe/image/solver.prototxt b/benchmark/caffe/image/solver.prototxt deleted file mode 100644 index 61c10284e6027b4cc0b3d4c8fcf949e0a5a22a85..0000000000000000000000000000000000000000 --- a/benchmark/caffe/image/solver.prototxt +++ /dev/null @@ -1,10 +0,0 @@ -net: "alexnet.prototxt" -base_lr: 0.01 -lr_policy: "fixed" -display: 20 -max_iter: 200 -momentum: 0.9 -weight_decay: 0.0005 -snapshot: 10000 -snapshot_prefix: "models/caffe_alexnet_train" -solver_mode: GPU diff --git a/benchmark/figs/alexnet-4gpu.png b/benchmark/figs/alexnet-4gpu.png deleted file mode 100644 index 28b95a44508f0ee7ad270c9ccdf8659009406b03..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/alexnet-4gpu.png and /dev/null differ diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png deleted file mode 100644 index 6215ae4e4288f969a909c258ddd5b5f51e6abb3f..0000000000000000000000000000000000000000 Binary files 
a/benchmark/figs/alexnet-cpu-infer.png and /dev/null differ diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png deleted file mode 100644 index b3200bbc049a9d75857fb5692902d7b475aa8f68..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/alexnet-cpu-train.png and /dev/null differ diff --git a/benchmark/figs/googlenet-4gpu.png b/benchmark/figs/googlenet-4gpu.png deleted file mode 100644 index 9b5331f05a3e54cacf949f10b6603bf627a6d106..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/googlenet-4gpu.png and /dev/null differ diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png deleted file mode 100644 index 19478d433bae651f4506153ded11a96d5137b409..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/googlenet-cpu-infer.png and /dev/null differ diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png deleted file mode 100644 index 4e86e058d0654d02c898bf7f5fe73aa1c7614e20..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/googlenet-cpu-train.png and /dev/null differ diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png deleted file mode 100644 index bc43d4b8d20c600d6f1046a5986a6c62adfa6b44..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/resnet-cpu-infer.png and /dev/null differ diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png deleted file mode 100644 index 96746b1759fa17d25ac5f40ed3678e16086364ba..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/resnet-cpu-train.png and /dev/null differ diff --git a/benchmark/figs/rnn_lstm_4gpus.png b/benchmark/figs/rnn_lstm_4gpus.png deleted file mode 100644 index 973ce2fa5f65e9681c972d4f5bd5776b5c4aa264..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/rnn_lstm_4gpus.png and /dev/null differ diff --git 
a/benchmark/figs/rnn_lstm_cls.png b/benchmark/figs/rnn_lstm_cls.png deleted file mode 100644 index 26d05cac11aa7ae8cdfbcd8c4401f6547a9404f6..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/rnn_lstm_cls.png and /dev/null differ diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png deleted file mode 100644 index 3a51ec6c474f0e0f0c4384c8ccd1e08c4382230b..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/vgg-cpu-infer.png and /dev/null differ diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png deleted file mode 100644 index 6d548cfd59f86f8166c011d71ebde4e4b33ef644..0000000000000000000000000000000000000000 Binary files a/benchmark/figs/vgg-cpu-train.png and /dev/null differ diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile deleted file mode 100644 index 81ea870050fe5db4a60fee40221991e38de6bd2e..0000000000000000000000000000000000000000 --- a/benchmark/fluid/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -# Use UBUNTU_MIRROR can speed up apt-get speed. -# ARG UBUNTU_MIRROR -# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv -RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so - -# IMPORTANT: -# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime. -# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ... 
- - -RUN pip install -U pip -RUN pip install -U kubernetes paddlepaddle - -RUN pip uninstall -y paddlepaddle && mkdir /workspace - -ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin -ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root -RUN chmod +x /usr/bin/paddle_k8s - -ADD *.whl / -RUN pip install /*.whl && rm -f /*.whl - -ENV LD_LIBRARY_PATH=/usr/local/lib -ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh imagenet_reader.py /workspace/ -ADD models/ /workspace/models/ - diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md deleted file mode 100644 index 0a18d9fbd93e509bbffb6220acd2f310b2c66ced..0000000000000000000000000000000000000000 --- a/benchmark/fluid/README.md +++ /dev/null @@ -1,99 +0,0 @@ -# Fluid Benchmark - -This directory contains several models configurations and tools that used to run -Fluid benchmarks for local and distributed training. - - -## Run the Benchmark - -To start, run the following command to get the full help message: - -```bash -python fluid_benchmark.py --help -``` - -Currently supported `--model` argument include: - -* mnist -* resnet - * you can chose to use different dataset using `--data_set cifar10` or - `--data_set flowers`. -* vgg -* stacked_dynamic_lstm -* machine_translation - -* Run the following command to start a benchmark job locally: - ```bash - python fluid_benchmark.py --model mnist --device GPU - ``` - You can choose to use GPU/CPU training. With GPU training, you can specify - `--gpus ` to run multi GPU training. - You can set async mode parameter server. With async mode, you can specify - `--async_mode` to train model asynchronous. -* Run distributed training with parameter servers: - * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. 
- * start parameter servers: - ```bash - PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver - sleep 15 - ``` - * start trainers: - ```bash - PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method pserver - ``` -* Run distributed training using NCCL2 - ```bash - PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2 - ``` - -## Prepare the RecordIO file to Achieve Better Performance - -Run the following command will generate RecordIO files like "mnist.recordio" under the path -and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size -at any time using `fluid.batch`. - -```bash -python -c 'from recordio_converter import *; prepare_mnist("data", 1)' -``` - -## Run Distributed Benchmark on Kubernetes Cluster - -You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will -have to start all those processes manually on each node, which is not recommended. - -To build the Docker image, you need to choose a paddle "whl" package to run with, you may either -download it from -http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or -build it by your own. Once you've got the "whl" package, put it under the current directory and run: - -```bash -docker build -t [your docker image name]:[your docker image tag] . -``` - -Then push the image to a Docker registry that your Kubernetes cluster can reach. 
- -We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit -distributed benchmark jobs to your cluster. To generate a job yaml, just run: - -```bash -python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver -``` - -Then the yaml files are generated under directory `myjob`, you can run: - -```bash -kubectl create -f myjob/ -``` - -The job shall start. - - -## Notes for Run Fluid Distributed with NCCL2 and RDMA - -Before running NCCL2 distributed jobs, please check that whether your node has multiple network -interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual -network device. - -To run high-performance distributed training, you must prepare your hardware environment to be -able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md) -note for details. diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py deleted file mode 100644 index ff616ddbb2cb1cb7f348d6d164815823b08b7629..0000000000000000000000000000000000000000 --- a/benchmark/fluid/args.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse - -__all__ = ['parse_args', ] - -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "se_resnext", "vgg", "mnist", - "stacked_dynamic_lstm", "resnet_with_preprocess" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - # args related to learning rate - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - # TODO(wuyi): add "--use_fake_data" option back. - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - # this option is available only for vgg and resnet. 
- parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will set ParallelExecutor to use multiple threads.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers', 'imagenet'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--no_split_var', - action='store_true', - default=False, - help='Whether split variables into blocks when update_method is pserver') - parser.add_argument( - '--async_mode', - action='store_true', - default=False, - help='Whether start pserver in async mode to support ASGD') - parser.add_argument( - '--use_reader_op', - action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' 
- ) - parser.add_argument( - '--data_path', - type=str, - default="", - help='Directory that contains all the training recordio files.') - parser.add_argument( - '--test_data_path', - type=str, - default="", - help='Directory that contains all the test data (NOT recordio).') - parser.add_argument( - '--use_inference_transpiler', - action='store_true', - help='If set, use inference transpiler to optimize the program.') - parser.add_argument( - '--no_random', - action='store_true', - help='If set, keep the random seed and do not shuffle the data.') - parser.add_argument( - '--reduce_strategy', - type=str, - choices=['reduce', 'all_reduce'], - default='all_reduce', - help='Specify the reduce strategy, can be reduce, all_reduce') - parser.add_argument( - '--fuse_broadcast_op', - action='store_true', - help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.' - ) - args = parser.parse_args() - return args diff --git a/benchmark/fluid/check_env.sh b/benchmark/fluid/check_env.sh deleted file mode 100755 index af16b84ca8a18151f0fa36d39fd201d3cab21a5f..0000000000000000000000000000000000000000 --- a/benchmark/fluid/check_env.sh +++ /dev/null @@ -1,261 +0,0 @@ -#!/bin/bash - -if [ "`uname -s`" != "Linux" ]; then - echo "Current scenario only support in Linux yet!" 
- exit 0 -fi - -echo "========================= Hardware Information =========================" -sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l` -cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l` -ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` -physical_cores=$((sockets * cores_per_socket)) -virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` -numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` -echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`" -echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" -echo "Socket Number : $sockets" -echo "Cores Per Socket : $cores_per_socket" -echo "Total Physical Cores : $physical_cores" -echo "Total Virtual Cores : $virtual_cores" -if [ $ht -eq 1 ]; then - echo "Hyper Threading : OFF" - if [ $physical_cores -ne $virtual_cores ]; then - echo "Error: HT logical error" - fi -else - echo "Hyper Threading : ON" - if [ $physical_cores -ge $virtual_cores ]; then - echo "Error: HT logical error" - fi -fi -echo "NUMA Nodes : $numa_nodes" -if [ $numa_nodes -lt $sockets ]; then - echo "Warning: NUMA node is not enough for the best performance,\ - at least $sockets" -fi - -echo "-------------------------- Memory Information --------------------------" -# dmidecode support start from 2.11 -dmi_ver=`dmidecode --version|awk -F '.' 
'{print $1}'|xargs` -if [ $dmi_ver -lt 2 ]; then - echo "Error: dmidecode unknown or version is too old" - exit 0 -fi -if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then - echo "Error: need root to run dmidecode" - exit 0 -fi -max_dimms=0 -num_dimms_installed=0 -for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do - num_refered=`dmidecode |grep -wc "$dimm_id"` - # the actual dimm id should be refered only once - if [ $num_refered -eq 1 ]; then - num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0}; - /Unknown/ {f=1}; - /Manufacturer/ {if (s==1) {print f; exit 0;}};'` - if [ $num_unknown -eq 0 ]; then - dimms_installed="$dimms_installed \n $dimm_id" - ((num_dimms_installed++)) - else - dimms_uninstalled="$dimms_uninstalled \n $dimm_id" - fi - ((max_dimms++)) - fi -done -echo "Installed DIMM number : $num_dimms_installed" -num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l` -if [ $num_dimms_installed -ne $num_dimms_mapped ]; then - echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped" -fi -num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"` -if [ $num_dimms_installed -ne $num_clock_configed ]; then - echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed" -fi -echo -e "Installed DIMMs Locator: $dimms_installed" -echo -e "Not installed DIMMs : $dimms_uninstalled" -max_dimm_slots=`dmidecode | grep -c "Bank Locator"` -echo "DIMMs max slots : $max_dimm_slots" -if [ $max_dimms -ne $max_dimm_slots ]; then - echo "Error: The max dimm slots do not match the max dimms: $max_dimms" -fi -free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'` -free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' 
'{print $2}'` -if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then - mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs` - swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs` - total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` - mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" - swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB" - total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB" -else - mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs` - swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs` - total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` -fi -echo "Memory Size : $mem_sz" -echo "Swap Memory Size : $swap_sz" -echo "Total Memory Size : $total_sz" -echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`" -# DIMMs fequency -clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs` -echo "Configed Clock Speed : $clock_speeds" -num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l` -if [ $num_clock_type -ne 1 ]; then - echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds" -fi - -echo "-------------------------- Turbo Information --------------------------" -scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver` -echo "Scaling Driver : $scaling_drive" -if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then - turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo` - if [ $turbo -eq 1 ]; then - echo "Turbo Status : OFF" - else - echo "Turbo Status : ON" - fi -else - echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS" - echo "Turbo Status : Unknown" -fi -# cpu frequency -num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u 
|wc -l` -num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l` -if [ $num_max_freq -ne 1 ]; then - echo "Error: the max_frequency of all CPU should be equal" -fi -if [ $num_min_freq -ne 1 ]; then - echo "Error: the min_frequency of all CPU should be equal" -fi -max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz -max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz -min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz -min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz -echo "CPU Max Frequency : $max_freq GHz" -echo "CPU Min Frequency : $min_freq GHz" -# cpu governor -num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l` -if [ $num_governor -ne 1 ]; then - echo "Error: the governor of all CPU should be the same" -fi -governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq` -echo "CPU Freq Governor : $governor" - - -echo "========================= Software Information =========================" -echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`" -echo "OS Version : `cat /etc/redhat-release`" -echo "Kernel Release Version : `uname -r`" -echo "Kernel Patch Version : `uname -v`" -echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" -if command -v cmake >/dev/null 2>&1; then - cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` -else - cmake_ver=" Not installed" -fi -echo "CMake Version :$cmake_ver" -echo "------------------ Environment Variables Information -------------------" -kmp_affinity=`env | grep KMP_AFFINITY` -omp_dynamic=`env | grep OMP_DYNAMIC` -omp_nested=`env | grep OMP_NESTED` -omp_num_threads=`env | grep OMP_NUM_THREADS` -mkl_num_threads=`env | grep MKL_NUM_THREADS` -mkl_dynamic=`env | grep MKL_DYNAMIC` -if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi -if [ ! 
$omp_dynamic ]; then omp_dynamic="unset"; fi -if [ ! $omp_nested ]; then omp_nested="unset"; fi -if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi -if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi -if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi -echo "KMP_AFFINITY : $kmp_affinity" -echo "OMP_DYNAMIC : $omp_dynamic" -echo "OMP_NESTED : $omp_nested" -echo "OMP_NUM_THREADS : $omp_num_threads" -echo "MKL_NUM_THREADS : $mkl_num_threads" -echo "MKL_DYNAMIC : $mkl_dynamic" -# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH -for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do - mkldnn_found=`find $path -name "libmkldnn.so"` - if [ "$mkldnn_found" ]; then - echo "Found MKL-DNN : $mkldnn_found" - fi - mklml_found=`find $path -name "libmklml_intel.so"` - if [ "$mklml_found" ]; then - echo "Found MKLML : $mklml_found" - fi - iomp_found=`find $path -name "libiomp5.so"` - if [ "$iomp_found" ]; then - echo "Found IOMP : $iomp_found" - fi -done - -# dump all details for fully check -lscpu > lscpu.dump -dmidecode > dmidecode.dump - -# The expected result would be like: -# ========================= Hardware Information ========================= -# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz -# CPU Family : 6 -# Socket Number : 2 -# Cores Per Socket : 20 -# Total Physical Cores : 40 -# Total Virtual Cores : 40 -# Hyper Threading : OFF -# NUMA Nodes : 2 -# -------------------------- Memory Information -------------------------- -# Installed DIMM number : 12 -# Installed DIMMs Locator: -# CPU1_DIMM_A1 -# CPU1_DIMM_B1 -# CPU1_DIMM_C1 -# CPU1_DIMM_D1 -# CPU1_DIMM_E1 -# CPU1_DIMM_F1 -# CPU2_DIMM_A1 -# CPU2_DIMM_B1 -# CPU2_DIMM_C1 -# CPU2_DIMM_D1 -# CPU2_DIMM_E1 -# CPU2_DIMM_F1 -# Not installed DIMMs : -# CPU1_DIMM_A2 -# CPU1_DIMM_B2 -# CPU1_DIMM_C2 -# CPU1_DIMM_D2 -# CPU1_DIMM_E2 -# CPU1_DIMM_F2 -# CPU2_DIMM_A2 -# CPU2_DIMM_B2 -# CPU2_DIMM_C2 -# CPU2_DIMM_D2 -# CPU2_DIMM_E2 -# CPU2_DIMM_F2 -# 
DIMMs max slots : 24 -# Memory Size : 376G -# Swap Memory Size : 4.0G -# Total Memory Size : 380G -# Max Memory Capacity : 2304 GB -# Configed Clock Speed : 2666 MHz -# -------------------------- Turbo Information -------------------------- -# Scaling Driver : intel_pstate -# Turbo Status : ON -# CPU Max Frequency : 3.70 GHz -# CPU Min Frequency : 1.00 GHz -# CPU Freq Governor : performance -# ========================= Software Information ========================= -# BIOS Release Date : 03/10/2017 -# OS Version : CentOS Linux release 7.3.1611 (Core) -# Kernel Release Version : 3.10.0-514.el7.x86_64 -# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016 -# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11) -# CMake Version : 3.5.2 -# ------------------ Environment Variables Information ------------------- -# KMP_AFFINITY : unset -# OMP_DYNAMIC : unset -# OMP_NESTED : unset -# OMP_NUM_THREADS : unset -# MKL_NUM_THREADS : unset -# MKL_DYNAMIC : unset diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py deleted file mode 100644 index df159a334e86d62e175bce3b363b74ec78c1fd64..0000000000000000000000000000000000000000 --- a/benchmark/fluid/fluid_benchmark.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import cProfile -import time -import os -import traceback - -import numpy as np - -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.profiler as profiler -import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler - -from args import * - - -def append_nccl2_prepare(trainer_id, startup_prog): - if trainer_id >= 0: - # append gen_nccl_id at the end of startup program - trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - port = os.getenv("PADDLE_PSERVER_PORT") - worker_ips = os.getenv("PADDLE_TRAINER_IPS") - worker_endpoints = [] - for ip in worker_ips.split(","): - worker_endpoints.append(':'.join([ip, port])) - num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port - worker_endpoints.remove(current_endpoint) - - nccl_id_var = startup_prog.global_block().create_var( - name="NCCLID", - persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - startup_prog.global_block().append_op( - type="gen_nccl_id", - inputs={}, - outputs={"NCCLID": nccl_id_var}, - attrs={ - "endpoint": current_endpoint, - "endpoint_list": worker_endpoints, - "trainer_id": trainer_id - }) - return nccl_id_var, num_trainers, trainer_id - else: - raise Exception("must set positive PADDLE_TRAINER_ID env variables for " - "nccl-based dist train.") - - -def dist_transpile(trainer_id, args, train_prog, startup_prog): - if trainer_id < 0: - return None, None - - # the port of all pservers, needed by both trainer and pserver - port = os.getenv("PADDLE_PSERVER_PORT", "6174") - # comma separated ips of all pservers, needed by trainer and - # pserver - pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - pserver_endpoints = ",".join(eplist) - # total number of workers/trainers in the job, needed by - # trainer and pserver - trainers = int(os.getenv("PADDLE_TRAINERS")) - # the IP of the local machine, 
needed by pserver only - current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port - # the role, should be either PSERVER or TRAINER - training_role = os.getenv("PADDLE_TRAINING_ROLE") - - config = fluid.DistributeTranspilerConfig() - config.slice_var_up = not args.no_split_var - config.min_block_size = 1048576 - t = distribute_transpiler.DistributeTranspiler(config=config) - - t.transpile( - trainer_id, - # NOTE: *MUST* use train_prog, for we are using with guard to - # generate different program for train and test. - program=train_prog, - pservers=pserver_endpoints, - trainers=trainers, - sync_mode=not args.async_mode, - startup_program=startup_prog) - if training_role == "PSERVER": - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program( - current_endpoint, pserver_program, startup_program=startup_prog) - return pserver_program, pserver_startup_program - elif training_role == "TRAINER": - train_program = t.get_trainer_program() - return train_program, startup_prog - else: - raise ValueError( - 'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER' - ) - - -def test_parallel(exe, test_args, args, test_prog, feeder): - acc_evaluators = [] - for i in xrange(len(test_args[2])): - acc_evaluators.append(fluid.metrics.Accuracy()) - - to_fetch = [v.name for v in test_args[2]] - if args.use_reader_op: - test_args[4].start() - while True: - try: - acc_rets = exe.run(fetch_list=to_fetch) - for i, e in enumerate(acc_evaluators): - e.update( - value=np.array(acc_rets[i]), weight=args.batch_size) - except fluid.core.EOFException as eof: - test_args[4].reset() - break - else: - for batch_id, data in enumerate(test_args[3]()): - acc_rets = exe.run(feed=feeder.feed(data), fetch_list=to_fetch) - for i, e in enumerate(acc_evaluators): - e.update(value=np.array(acc_rets[i]), weight=len(data)) - - return [e.eval() for e in acc_evaluators] - - -# NOTE: only need to benchmark using parallelexe -def 
train_parallel(train_args, test_args, args, train_prog, test_prog, - startup_prog, nccl_id_var, num_trainers, trainer_id): - over_all_start = time.time() - place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) - feeder = None - if not args.use_reader_op: - feed_var_list = [ - var for var in train_prog.global_block().vars.itervalues() - if var.is_data - ] - feeder = fluid.DataFeeder(feed_var_list, place) - # generate fake: - if args.use_fake_data: - for var in feed_var_list: - v = startup_prog.global_block()._clone_variable(var) - var.persistable = True - v.persistable = True - - real_shape = list(var.shape) - real_shape[0] = args.batch_size / args.gpus - startup_prog.global_block().append_op( - outputs={"Out": v}, - type="fill_constant", - attrs={"shape": real_shape, - "value": 1.0, - "dtype": var.dtype}) - - if nccl_id_var and trainer_id == 0: - #FIXME(wuyi): wait other trainer to start listening - time.sleep(30) - - startup_exe = fluid.Executor(place) - startup_exe.run(startup_prog) - strategy = fluid.ExecutionStrategy() - strategy.num_threads = args.cpus - strategy.allow_op_delay = False - build_strategy = fluid.BuildStrategy() - if args.reduce_strategy == "reduce": - build_strategy.reduce_strategy = fluid.BuildStrategy( - ).ReduceStrategy.Reduce - else: - build_strategy.reduce_strategy = fluid.BuildStrategy( - ).ReduceStrategy.AllReduce - - avg_loss = train_args[0] - - if args.update_method == "pserver": - # parameter server mode distributed training, merge - # gradients on local server, do not initialize - # ParallelExecutor with multi server all-reduce mode. 
- num_trainers = 1 - trainer_id = 0 - - exe = fluid.ParallelExecutor( - True, - avg_loss.name, - main_program=train_prog, - exec_strategy=strategy, - build_strategy=build_strategy, - num_trainers=num_trainers, - trainer_id=trainer_id) - - if not args.no_test: - if args.update_method == "pserver": - test_scope = None - else: - # NOTE: use an empty scope to avoid test exe using NCCLID - test_scope = fluid.Scope() - test_exe = fluid.ParallelExecutor( - True, main_program=test_prog, share_vars_from=exe) - - for pass_id in range(args.pass_num): - num_samples = 0 - iters = 0 - start_time = time.time() - if not args.use_reader_op: - reader_generator = train_args[3]() #train_reader - batch_id = 0 - data = None - if args.use_reader_op: - train_args[4].start() - while True: - if not args.use_reader_op: - data = next(reader_generator, None) - if data == None: - break - if args.profile and batch_id == 5: - profiler.start_profiler("All") - profiler.reset_profiler() - elif args.profile and batch_id == 10: - print("profiling total time: ", time.time() - start_time) - profiler.stop_profiler("total", "/tmp/profile_%d_pass%d" % - (trainer_id, pass_id)) - if iters == args.iterations: - reader_generator.close() - break - - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - fetch_list = [avg_loss.name] - acc_name_list = [v.name for v in train_args[2]] - fetch_list.extend(acc_name_list) - - if args.use_fake_data or args.use_reader_op: - try: - fetch_ret = exe.run(fetch_list) - except fluid.core.EOFException as eof: - break - except fluid.core.EnforceNotMet as ex: - traceback.print_exc() - break - else: - fetch_ret = exe.run(fetch_list, feed=feeder.feed(data)) - if args.use_reader_op: - num_samples += args.batch_size * args.gpus - else: - num_samples += len(data) - - iters += 1 - if batch_id % 1 == 0: - fetched_data = [np.mean(np.array(d)) for d in fetch_ret] - print("Pass %d, batch %d, loss %s, accucacys: %s" % - (pass_id, batch_id, fetched_data[0], 
fetched_data[1:])) - batch_id += 1 - - print_train_time(start_time, time.time(), num_samples) - if args.use_reader_op: - train_args[4].reset() # reset reader handle - else: - del reader_generator - - if not args.no_test and test_args[2]: - test_feeder = None - if not args.use_reader_op: - test_feed_var_list = [ - var for var in test_prog.global_block().vars.itervalues() - if var.is_data - ] - test_feeder = fluid.DataFeeder(test_feed_var_list, place) - test_ret = test_parallel(test_exe, test_args, args, test_prog, - test_feeder) - print("Pass: %d, Test Accuracy: %s\n" % - (pass_id, [np.mean(np.array(v)) for v in test_ret])) - - print("total train time: ", time.time() - over_all_start) - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def print_train_time(start_time, end_time, num_samples): - train_elapsed = end_time - start_time - examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % - (num_samples, train_elapsed, examples_per_sec)) - - -def print_paddle_envs(): - print('----------- Configuration envs -----------') - for k in os.environ: - if "PADDLE_" in k: - print "ENV %s:%s" % (k, os.environ[k]) - print('------------------------------------------------') - - -def main(): - args = parse_args() - print_arguments(args) - print_paddle_envs() - if args.no_random: - fluid.default_startup_program().random_seed = 1 - - # the unique trainer id, starting from 0, needed by trainer - # only - nccl_id_var, num_trainers, trainer_id = ( - None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0"))) - - if args.use_cprof: - pr = cProfile.Profile() - pr.enable() - - model_def = __import__("models.%s" % args.model, fromlist=["models"]) - - 
train_prog = fluid.Program() - test_prog = fluid.Program() - startup_prog = fluid.Program() - - train_args = list(model_def.get_model(args, True, train_prog, startup_prog)) - test_args = list(model_def.get_model(args, False, test_prog, startup_prog)) - - all_args = [train_args, test_args, args] - - if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id, args, train_prog, - startup_prog) - if not train_prog: - raise Exception( - "Must configure correct environments to run dist train.") - all_args.extend([train_prog, test_prog, startup_prog]) - if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER": - all_args.extend([nccl_id_var, num_trainers, trainer_id]) - train_parallel(*all_args) - elif os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER": - # start pserver with Executor - server_exe = fluid.Executor(fluid.CPUPlace()) - server_exe.run(startup_prog) - server_exe.run(train_prog) - exit(0) - - # for other update methods, use default programs - all_args.extend([train_prog, test_prog, startup_prog]) - - if args.update_method == "nccl2": - nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare( - trainer_id, startup_prog) - - if args.device == "CPU": - raise Exception("Only support GPU perf with parallel exe") - all_args.extend([nccl_id_var, num_trainers, trainer_id]) - train_parallel(*all_args) - - -if __name__ == "__main__": - main() diff --git a/benchmark/fluid/imagenet_reader.py b/benchmark/fluid/imagenet_reader.py deleted file mode 100644 index a39485a61f12417fbdb512fc81e90ec49c310bf5..0000000000000000000000000000000000000000 --- a/benchmark/fluid/imagenet_reader.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import random -import functools -import numpy as np -from threading import Thread -import subprocess -import time - -from Queue import Queue -import paddle -from PIL import Image, ImageEnhance - -random.seed(0) - -DATA_DIM = 224 - -THREAD = int(os.getenv("PREPROCESS_THREADS", "10")) -BUF_SIZE = 5120 - -DATA_DIR = '/mnt/ImageNet' -TRAIN_LIST = '/mnt/ImageNet/train.txt' -TEST_LIST = '/mnt/ImageNet/val.txt' - -img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) -img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) - - -def resize_short(img, target_size): - percent = float(target_size) / min(img.size[0], img.size[1]) - resized_width = int(round(img.size[0] * percent)) - resized_height = int(round(img.size[1] * percent)) - img = img.resize((resized_width, resized_height), Image.LANCZOS) - return img - - -def crop_image(img, target_size, center): - width, height = img.size - size = target_size - if center == True: - w_start = (width - size) / 2 - h_start = (height - size) / 2 - else: - w_start = random.randint(0, width - size) - h_start = random.randint(0, height - size) - w_end = w_start + size - h_end = h_start + size - img = img.crop((w_start, h_start, w_end, h_end)) - return img - - -def random_crop(img, size, scale=[0.08, 1.0], ratio=[3. / 4., 4. / 3.]): - aspect_ratio = math.sqrt(random.uniform(*ratio)) - w = 1. * aspect_ratio - h = 1. 
/ aspect_ratio - - bound = min((float(img.size[0]) / img.size[1]) / (w**2), - (float(img.size[1]) / img.size[0]) / (h**2)) - scale_max = min(scale[1], bound) - scale_min = min(scale[0], bound) - - target_area = img.size[0] * img.size[1] * random.uniform(scale_min, - scale_max) - target_size = math.sqrt(target_area) - w = int(target_size * w) - h = int(target_size * h) - - i = random.randint(0, img.size[0] - w) - j = random.randint(0, img.size[1] - h) - - img = img.crop((i, j, i + w, j + h)) - img = img.resize((size, size), Image.LANCZOS) - return img - - -def rotate_image(img): - angle = random.randint(-10, 10) - img = img.rotate(angle) - return img - - -def distort_color(img): - def random_brightness(img, lower=0.5, upper=1.5): - e = random.uniform(lower, upper) - return ImageEnhance.Brightness(img).enhance(e) - - def random_contrast(img, lower=0.5, upper=1.5): - e = random.uniform(lower, upper) - return ImageEnhance.Contrast(img).enhance(e) - - def random_color(img, lower=0.5, upper=1.5): - e = random.uniform(lower, upper) - return ImageEnhance.Color(img).enhance(e) - - ops = [random_brightness, random_contrast, random_color] - random.shuffle(ops) - - img = ops[0](img) - img = ops[1](img) - img = ops[2](img) - - return img - - -def process_image(sample, mode, color_jitter, rotate): - img_path = sample[0] - - img = Image.open(img_path) - if mode == 'train': - if rotate: img = rotate_image(img) - img = random_crop(img, DATA_DIM) - else: - img = resize_short(img, target_size=256) - img = crop_image(img, target_size=DATA_DIM, center=True) - if mode == 'train': - if color_jitter: - img = distort_color(img) - if random.randint(0, 1) == 1: - img = img.transpose(Image.FLIP_LEFT_RIGHT) - - if img.mode != 'RGB': - img = img.convert('RGB') - - img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 - img -= img_mean - img /= img_std - - if mode == 'train' or mode == 'val': - return img, sample[1] - elif mode == 'test': - return [img] - - -class XmapEndSignal(): - 
pass - - -def xmap_readers(mapper, - reader, - process_num, - buffer_size, - order=False, - print_queue_state=True): - end = XmapEndSignal() - - # define a worker to read samples from reader to in_queue - def read_worker(reader, in_queue): - for i in reader(): - in_queue.put(i) - in_queue.put(end) - - # define a worker to read samples from reader to in_queue with order flag - def order_read_worker(reader, in_queue, file_queue): - in_order = 0 - for i in reader(): - in_queue.put((in_order, i)) - in_order += 1 - in_queue.put(end) - - # define a worker to handle samples from in_queue by mapper - # and put mapped samples into out_queue - def handle_worker(in_queue, out_queue, mapper): - sample = in_queue.get() - while not isinstance(sample, XmapEndSignal): - r = mapper(sample) - out_queue.put(r) - sample = in_queue.get() - in_queue.put(end) - out_queue.put(end) - - # define a worker to handle samples from in_queue by mapper - # and put mapped samples into out_queue by order - def order_handle_worker(in_queue, out_queue, mapper, out_order): - ins = in_queue.get() - while not isinstance(ins, XmapEndSignal): - order, sample = ins - r = mapper(sample) - while order != out_order[0]: - pass - out_queue.put(r) - out_order[0] += 1 - ins = in_queue.get() - in_queue.put(end) - out_queue.put(end) - - def xreader(): - file_queue = Queue() - in_queue = Queue(buffer_size) - out_queue = Queue(buffer_size) - out_order = [0] - # start a read worker in a thread - target = order_read_worker if order else read_worker - t = Thread(target=target, args=(reader, in_queue)) - t.daemon = True - t.start() - # start several handle_workers - target = order_handle_worker if order else handle_worker - args = (in_queue, out_queue, mapper, out_order) if order else ( - in_queue, out_queue, mapper) - workers = [] - for i in xrange(process_num): - worker = Thread(target=target, args=args) - worker.daemon = True - workers.append(worker) - for w in workers: - w.start() - - sample = out_queue.get() - 
start_t = time.time() - while not isinstance(sample, XmapEndSignal): - yield sample - sample = out_queue.get() - if time.time() - start_t > 3: - if print_queue_state: - print("queue sizes: ", in_queue.qsize(), out_queue.qsize()) - start_t = time.time() - finish = 1 - while finish < process_num: - sample = out_queue.get() - if isinstance(sample, XmapEndSignal): - finish += 1 - else: - yield sample - - return xreader - - -def _reader_creator(file_list, - mode, - shuffle=False, - color_jitter=False, - rotate=False, - xmap=True): - def reader(): - with open(file_list) as flist: - full_lines = [line.strip() for line in flist] - if shuffle: - random.shuffle(full_lines) - if mode == 'train': - trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - trainer_count = int(os.getenv("PADDLE_TRAINERS")) - per_node_lines = len(full_lines) / trainer_count - lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) - * per_node_lines] - print( - "read images from %d, length: %d, lines length: %d, total: %d" - % (trainer_id * per_node_lines, per_node_lines, len(lines), - len(full_lines))) - else: - lines = full_lines - - for line in lines: - if mode == 'train': - img_path, label = line.split() - img_path = img_path.replace("JPEG", "jpeg") - img_path = os.path.join(DATA_DIR, "train", img_path) - yield (img_path, int(label)) - elif mode == 'val': - img_path, label = line.split() - img_path = img_path.replace("JPEG", "jpeg") - img_path = os.path.join(DATA_DIR, "val", img_path) - yield (img_path, int(label)) - elif mode == 'test': - img_path = os.path.join(DATA_DIR, line) - yield [img_path] - - mapper = functools.partial( - process_image, mode=mode, color_jitter=color_jitter, rotate=rotate) - - return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) - - -def load_raw_image_uint8(sample): - img_arr = np.array(Image.open(sample[0])).astype('int64') - return img_arr, int(sample[1]) - - -def train_raw(file_list=TRAIN_LIST, shuffle=True): - def reader(): - with open(file_list) 
as flist: - full_lines = [line.strip() for line in flist] - if shuffle: - random.shuffle(full_lines) - - trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - trainer_count = int(os.getenv("PADDLE_TRAINERS")) - per_node_lines = len(full_lines) / trainer_count - lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) * - per_node_lines] - print("read images from %d, length: %d, lines length: %d, total: %d" - % (trainer_id * per_node_lines, per_node_lines, len(lines), - len(full_lines))) - - for line in lines: - img_path, label = line.split() - img_path = img_path.replace("JPEG", "jpeg") - img_path = os.path.join(DATA_DIR, "train", img_path) - yield (img_path, int(label)) - - return paddle.reader.xmap_readers(load_raw_image_uint8, reader, THREAD, - BUF_SIZE) - - -def train(file_list=TRAIN_LIST, xmap=True): - return _reader_creator( - file_list, - 'train', - shuffle=True, - color_jitter=False, - rotate=False, - xmap=xmap) - - -def val(file_list=TEST_LIST, xmap=True): - return _reader_creator(file_list, 'val', shuffle=False, xmap=xmap) - - -def test(file_list=TEST_LIST): - return _reader_creator(file_list, 'test', shuffle=False) - - -if __name__ == "__main__": - c = 0 - start_t = time.time() - for d in train()(): - c += 1 - if c >= 10000: - break - spent = time.time() - start_t - print("read 10000 speed: ", 10000 / spent, spent) diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py deleted file mode 100644 index c1f22f1bfa02dd409edc8e1c39a72524240f4088..0000000000000000000000000000000000000000 --- a/benchmark/fluid/kube_gen_job.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import yaml -import copy -import argparse -import random -import os -import copy -from kube_templates import pserver, trainer, envs - - -def parse_args(): - parser = argparse.ArgumentParser(description='Generate dist job yamls.') - - parser.add_argument( - '--jobname', default="paddlejob", help='unique job name') - parser.add_argument( - '--cpu', default=1, type=int, help='CPU cores per trainer node') - parser.add_argument( - '--pscpu', default=1, type=int, help='CPU cores per pserver node') - parser.add_argument( - '--gpu', default=0, type=int, help='num of GPUs per node') - parser.add_argument( - '--image', - default="bootstrapper:5000/fluid_benchmark:gpu", - help='num of GPUs per node') - parser.add_argument( - '--pservers', default=1, type=int, help='num of pservers') - parser.add_argument( - '--trainers', default=1, type=int, help='num of trainers') - parser.add_argument('--memory', default=1, type=int, help='trainer memory') - parser.add_argument( - '--psmemory', default=1, type=int, help='pserver memory') - parser.add_argument( - '--port', default=30236, type=int, help='num of trainers') - parser.add_argument( - '--entry', default="python train.py", help='command to run') - parser.add_argument( - '--fluid', default=1, type=int, help='whether is fluid job') - parser.add_argument( - '--rdma', action='store_true', help='whether mount rdma libs') - parser.add_argument( - '--disttype', - default="pserver", - type=str, - choices=['pserver', 'nccl2', 'local'], - help='pserver or nccl2 or local') - - args = parser.parse_args() - return args - - -def 
gen_job(): - ps = pserver - tn = trainer - args = parse_args() - - ps_container = ps["spec"]["template"]["spec"]["containers"][0] - tn_container = tn["spec"]["template"]["spec"]["containers"][0] - - if args.fluid == 1: - ps_container["command"] = \ - ["paddle_k8s", "start_fluid"] - tn_container["command"] = \ - ["paddle_k8s", "start_fluid"] - ps["metadata"]["name"] = args.jobname + "-pserver" - ps["spec"]["template"]["metadata"]["labels"][ - "paddle-job-pserver"] = args.jobname - tn["metadata"]["name"] = args.jobname + "-trainer" - tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname - - ps_container["image"] = args.image - tn_container["image"] = args.image - - ps_container["resources"]["requests"]["cpu"] = str(args.pscpu) - ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi" - ps_container["resources"]["limits"]["cpu"] = str(args.pscpu) - ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi" - - tn_container["resources"]["requests"]["cpu"] = str(args.cpu) - tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi" - tn_container["resources"]["limits"]["cpu"] = str(args.cpu) - tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi" - if args.gpu > 0: - tn_container["resources"]["requests"][ - "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) - tn_container["resources"]["limits"][ - "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu) - - ps["spec"]["replicas"] = int(args.pservers) - tn["spec"]["parallelism"] = int(args.trainers) - tn["spec"]["completions"] = int(args.trainers) - ps_container["ports"][0]["name"] = "jobport-" + str(args.port) - ps_container["ports"][0]["containerPort"] = args.port - spreadport = random.randint(40000, 60000) - tn_container["ports"][0]["name"] = "spr-" + str(spreadport) - tn_container["ports"][0]["containerPort"] = spreadport - - envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname}) - envs.append({"name": "PADDLE_TRAINERS", "value": 
str(args.trainers)}) - envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)}) - envs.append({"name": "ENTRY", "value": args.entry}) - envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)}) - # NOTE: these directories below are cluster specific, please modify - # this settings before you run on your own cluster. - envs.append({ - "name": "LD_LIBRARY_PATH", - "value": - "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind" - }) - - volumes = [{ - "name": "nvidia-driver", - "hostPath": { - "path": "/usr/local/nvidia/lib64" - } - }] - volumeMounts = [{ - "mountPath": "/usr/local/nvidia/lib64", - "name": "nvidia-driver" - }] - - if args.rdma: - volumes.extend([{ - "name": "ibetc", - "hostPath": { - "path": "/etc/libibverbs.d" - } - }, { - "name": "iblibs", - "hostPath": { - "path": "/usr/local/rdma" - } - }, { - "name": "valgrind", - "hostPath": { - "path": "/usr/lib64/mlnx_ofed/valgrind" - } - }]) - volumeMounts.extend([{ - "mountPath": "/etc/libibverbs.d", - "name": "ibetc" - }, { - "mountPath": "/usr/local/rdma", - "name": "iblibs" - }, { - "mountPath": "/usr/lib64/mlnx_ofed/valgrind", - "name": "valgrind" - }]) - # append shm for NCCL2 - volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}}) - volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"}) - - # add ceph volumes - volumes.append({ - "name": "ceph-data", - "cephfs": { - "monitors": ["192.168.16.23:6789"], - "secretRef": { - "name": "ceph-secret" - }, - "user": "admin", - } - }) - volumeMounts.append({"mountPath": "/mnt/data", "name": "ceph-data"}) - - tn["spec"]["template"]["spec"]["volumes"] = volumes - tn_container["volumeMounts"] = volumeMounts - - ps_container["env"] = copy.deepcopy(envs) - ps_container["env"].append({ - "name": "PADDLE_TRAINING_ROLE", - "value": "PSERVER" - }) - tn_container["env"] = envs - if args.disttype == "pserver": - tn_container["env"].append({ - "name": "PADDLE_TRAINING_ROLE", - "value": 
"TRAINER" - }) - elif args.disttype == "nccl2" or args.disttype == "local": - # NCCL2 have no training role, set to plain WORKER - tn_container["env"].append({ - "name": "PADDLE_TRAINING_ROLE", - "value": "WORKER" - }) - - os.mkdir(args.jobname) - if args.disttype == "pserver": - with open("%s/pserver.yaml" % args.jobname, "w") as fn: - yaml.dump(ps, fn) - - with open("%s/trainer.yaml" % args.jobname, "w") as fn: - yaml.dump(tn, fn) - - -if __name__ == "__main__": - gen_job() diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py deleted file mode 100644 index 2d09d940a5ee638e4b55405d05924e2d76006cfc..0000000000000000000000000000000000000000 --- a/benchmark/fluid/kube_templates/__init__.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
def _literal_env(name, value):
    """Return a container env entry that carries a fixed string value."""
    return {"name": name, "value": value}


def _field_ref_env(name, field_path):
    """Return a container env entry resolved from a pod field reference.

    A new nested dict is built on every call so that no two entries share
    an object; the job generator mutates and deep-copies this list.
    """
    return {"name": name, "valueFrom": {"fieldRef": {"fieldPath": field_path}}}


# Environment variables common to every generated container. They do not
# depend on the job being generated; job-specific entries are appended to
# this list by the generator script.
envs = [
    _literal_env("GLOG_v", "0"),
    _literal_env("GLOG_logtostderr", "1"),
    _literal_env("TOPOLOGY", ""),
    _literal_env("TRAINER_PACKAGE", "/workspace"),
    _literal_env("PADDLE_INIT_NICS", "eth2"),
    _field_ref_env("NAMESPACE", "metadata.namespace"),
    _field_ref_env("POD_IP", "status.podIP"),
    _field_ref_env("PADDLE_CURRENT_IP", "status.podIP"),
]
# Single container run by every parameter-server pod. "image", "env",
# "ports", "command" and "resources" are placeholders that the job
# generator overwrites. Requests and limits are separate dict objects
# because the generator assigns into each of them individually.
_pserver_container = {
    "name": "pserver",
    "image": "",
    "imagePullPolicy": "Always",
    "ports": [{"name": "jobport-1", "containerPort": 1}],
    "env": [],
    "command": ["paddle_k8s", "start_pserver"],
    "resources": {
        "requests": {"memory": "10Gi", "cpu": "4"},
        "limits": {"memory": "10Gi", "cpu": "4"},
    },
}

# Pod-level settings wrapped around the container above.
_pserver_pod_spec = {
    "hostNetwork": True,
    "imagePullSecrets": [{"name": "job-registry-secret"}],
    "containers": [_pserver_container],
}

# Kubernetes ReplicaSet manifest template for the parameter servers,
# expressed as a plain dict so that it can be patched and dumped to YAML.
pserver = {
    "apiVersion": "extensions/v1beta1",
    "kind": "ReplicaSet",
    "metadata": {"name": "jobname-pserver"},
    "spec": {
        "replicas": 1,
        "template": {
            "metadata": {"labels": {"paddle-job-pserver": "jobname"}},
            "spec": _pserver_pod_spec,
        },
    },
}
# Kubernetes batch Job manifest template for the trainer pods, expressed
# as a plain dict so that it can be patched and dumped to YAML. Name,
# labels, image, ports, env, command and resources are placeholders that
# the job generator overwrites.
trainer = {
    "apiVersion": "batch/v1",
    "kind": "Job",
    "metadata": {
        # Placeholder follows the "<job>-trainer" convention used by the
        # generator (it previously read "jobname-pserver", a copy-paste
        # slip from the pserver template).
        "name": "jobname-trainer"
    },
    "spec": {
        "parallelism": 4,
        "completions": 4,
        "template": {
            "metadata": {
                "labels": {
                    "paddle-job": "jobname"
                }
            },
            "spec": {
                "hostNetwork": True,
                "imagePullSecrets": [{
                    "name": "job-registry-secret"
                }],
                "restartPolicy": "Never",
                "containers": [{
                    "name": "trainer",
                    "image": "",
                    "imagePullPolicy": "Always",
                    # to let container set rlimit
                    "securityContext": {
                        "privileged": True
                        # TODO(wuyi): use below specific cap instead of privileged,
                        # using privileged will cause all GPU device are visible
                        # in the container.
                        # "capabilities": {
                        #     "add": ["SYS_RESOURCE"]
                        # }
                    },
                    "ports": [{
                        "name": "jobport-1",
                        "containerPort": 1
                    }],
                    "env": [],
                    "command": ["paddle_k8s", "start_trainer", "v2"],
                    "resources": {
                        "requests": {
                            "memory": "10Gi",
                            "cpu": "4",
                        },
                        "limits": {
                            "memory": "10Gi",
                            "cpu": "4",
                        }
                    }
                }]
            }
        }
    }
}
# Benchmark model modules exported by this package. Each listed module is
# expected to expose a ``get_model`` entry point.
# NOTE(review): "se_resnext" was added because se_resnext.py lives in this
# package and defines get_model, yet was missing from the list -- confirm
# that wildcard imports of this package should pull it in.
__all__ = [
    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm",
    "resnet_with_preprocess", "se_resnext"
]
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
    """Build the layers of one LSTM time step from primitive fluid ops.

    Args:
        x_t: input of the current step.
        hidden_t_prev: hidden state of the previous step.
        cell_t_prev: cell state of the previous step.
        size: output width of every gate projection.

    Returns:
        A ``(hidden_t, cell_t)`` tuple with the new hidden and cell state.
    """

    def linear(inputs):
        # Every call adds a separate fc layer, so the four projections
        # below do not share parameters.
        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)

    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))

    # New cell state: retained part of the old cell plus gated candidate.
    retained = fluid.layers.elementwise_mul(x=forget_gate, y=cell_t_prev)
    written = fluid.layers.elementwise_mul(x=input_gate, y=cell_tilde)
    cell_t = fluid.layers.sums(input=[retained, written])

    hidden_t = fluid.layers.elementwise_mul(
        x=output_gate, y=fluid.layers.tanh(x=cell_t))

    return hidden_t, cell_t


def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size, max_length):
    """Construct the training graph of an attention-based seq2seq network.

    Args:
        embedding_dim: width of the source and target word embeddings.
        encoder_size: gate size of each direction of the encoder LSTM.
        decoder_size: hidden size of the decoder and attention projections.
        source_dict_dim: source vocabulary size.
        target_dict_dim: target vocabulary size.
        is_generating: must be falsy; generation mode is not implemented.
        beam_size: accepted for interface compatibility, unused here.
        max_length: accepted for interface compatibility, unused here.

    Returns:
        ``(avg_cost, feeding_list)``: the mean cross-entropy variable and
        the names of the data layers that have to be fed.

    Raises:
        NotImplementedError: if ``is_generating`` is true. Only the training
            branch produces a result, so the request is rejected up front
            instead of failing later with an obscure error.
    """
    if is_generating:
        raise NotImplementedError(
            "seq_to_seq_net only builds the training network; "
            "is_generating=True is not supported.")

    def bi_lstm_encoder(input_seq, gate_size):
        # Linear transformation part for input gate, output gate, forget gate
        # and cell activation vectors need be done outside of dynamic_lstm.
        # So the output size is 4 times of gate_size.
        input_forward_proj = fluid.layers.fc(input=input_seq,
                                             size=gate_size * 4,
                                             act=None,
                                             bias_attr=False)
        forward, _ = fluid.layers.dynamic_lstm(
            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
        input_reversed_proj = fluid.layers.fc(input=input_seq,
                                              size=gate_size * 4,
                                              act=None,
                                              bias_attr=False)
        backward, _ = fluid.layers.dynamic_lstm(
            input=input_reversed_proj,
            size=gate_size * 4,
            is_reverse=True,
            use_peepholes=False)
        return forward, backward

    src_word_idx = fluid.layers.data(
        name='source_sequence', shape=[1], dtype='int64', lod_level=1)

    src_embedding = fluid.layers.embedding(
        input=src_word_idx,
        size=[source_dict_dim, embedding_dim],
        dtype='float32')

    src_forward, src_reversed = bi_lstm_encoder(
        input_seq=src_embedding, gate_size=encoder_size)

    encoded_vector = fluid.layers.concat(
        input=[src_forward, src_reversed], axis=1)

    encoded_proj = fluid.layers.fc(input=encoded_vector,
                                   size=decoder_size,
                                   bias_attr=False)

    # The decoder is initialised from the first step of the reversed pass.
    backward_first = fluid.layers.sequence_pool(
        input=src_reversed, pool_type='first')

    decoder_boot = fluid.layers.fc(input=backward_first,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act='tanh')

    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                                    decoder_boot, decoder_size):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = fluid.layers.concat(
                input=[encoder_proj, decoder_state_expand], axis=1)
            attention_weights = fluid.layers.fc(input=concated,
                                                size=1,
                                                act='tanh',
                                                bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(
                input=attention_weights)
            weights_reshape = fluid.layers.reshape(
                x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(
                x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        cell_init = fluid.layers.fill_constant_batch_size_like(
            input=decoder_boot,
            value=0.0,
            shape=[-1, decoder_size],
            dtype='float32')
        cell_init.stop_gradient = False

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            static_vec = rnn.static_input(encoder_vec)
            static_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            cell_mem = rnn.memory(init=cell_init)
            context = simple_attention(static_vec, static_proj, hidden_mem)
            decoder_inputs = fluid.layers.concat(
                input=[context, current_word], axis=1)
            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
            rnn.update_memory(hidden_mem, h)
            rnn.update_memory(cell_mem, c)
            out = fluid.layers.fc(input=h,
                                  size=target_dict_dim,
                                  bias_attr=True,
                                  act='softmax')
            rnn.output(out)
        return rnn()

    trg_word_idx = fluid.layers.data(
        name='target_sequence', shape=[1], dtype='int64', lod_level=1)

    trg_embedding = fluid.layers.embedding(
        input=trg_word_idx,
        size=[target_dict_dim, embedding_dim],
        dtype='float32')

    prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
                                             encoded_proj, decoder_boot,
                                             decoder_size)
    label = fluid.layers.data(
        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    feeding_list = ["source_sequence", "target_sequence", "label_sequence"]

    return avg_cost, feeding_list


def lodtensor_to_ndarray(lod_tensor):
    """Copy a tensor-like object into a new float32 numpy array.

    Args:
        lod_tensor: object providing ``get_dims()`` (the shape) and
            ``get_float_element(i)`` (the value at flat index ``i``).

    Returns:
        A float32 ``numpy.ndarray`` of shape ``get_dims()`` filled element
        by element in flat index order.
    """
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims).astype('float32')
    # reshape(-1) of the freshly allocated array is a view, so writes land
    # in ``ndarray``; its size replaces the removed np.product/xrange pair.
    flat = ndarray.reshape(-1)
    for i in range(flat.size):
        flat[i] = lod_tensor.get_float_element(i)
    return ndarray


def get_model(args, is_train, main_prog, startup_prog):
    """Build the machine-translation benchmark program and its data reader.

    Args:
        args: parsed benchmark options; reads ``use_reader_op``,
            ``learning_rate``, ``batch_size`` and ``gpus``.
        is_train: add an Adam optimizer and read the wmt14 training split
            when true, otherwise read the test split.
        main_prog: program that receives the network.
        startup_prog: program that receives the initialisation ops.

    Returns:
        ``(avg_cost, optimizer, [], batch_generator, None)``. ``optimizer``
        is ``None`` when ``is_train`` is false.

    Raises:
        Exception: if ``args.use_reader_op`` is set (not supported here).
    """
    if args.use_reader_op:
        raise Exception("machine_translation do not support reader op for now.")
    embedding_dim = 512
    encoder_size = 512
    decoder_size = 512
    dict_size = 30000
    beam_size = 3
    max_length = 250

    # Defined up front so that the test program (is_train false) returns
    # None instead of hitting an unbound local at the return statement.
    optimizer = None

    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, feeding_list = seq_to_seq_net(
                embedding_dim,
                encoder_size,
                decoder_size,
                dict_size,
                dict_size,
                False,
                beam_size=beam_size,
                max_length=max_length)
            if is_train:
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
                optimizer.minimize(avg_cost)

    if is_train:
        dataset = paddle.dataset.wmt14.train(dict_size)
    else:
        dataset = paddle.dataset.wmt14.test(dict_size)
    batch_generator = paddle.batch(
        paddle.reader.shuffle(dataset, buf_size=1000),
        batch_size=args.batch_size * args.gpus)

    return avg_cost, optimizer, [], batch_generator, None
def cnn_model(data):
    """Stack two conv+pool stages and a 10-way softmax classifier.

    Args:
        data: image input variable of the network.

    Returns:
        The softmax prediction variable.
    """
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu")
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu")

    # TODO(dzhwinter) : refine the initializer and random seed settting
    SIZE = 10
    # Number of inputs feeding the fc layer: product of every dimension
    # after the batch axis. Written as a loop because ``reduce`` is no
    # longer a builtin on Python 3.
    fan_in = 1
    for dim in conv_pool_2.shape[1:]:
        fan_in *= dim
    scale = (2.0 / (fan_in**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=scale)))
    return predict


def get_model(args, is_train, main_prog, startup_prog):
    """Build the MNIST benchmark program and its data reader.

    Args:
        args: parsed benchmark options; reads ``use_reader_op``,
            ``data_path``, ``batch_size``, ``gpus`` and ``memory_optimize``.
        is_train: add the Adam optimizer and read the training split when
            true, otherwise read the test split.
        main_prog: program that receives the network.
        startup_prog: program that receives the initialisation ops.

    Returns:
        ``(avg_cost, opt, [batch_acc], batched_reader, data_file_handle)``.
        ``opt`` is ``None`` when ``is_train`` is false and
        ``data_file_handle`` is ``None`` unless ``args.use_reader_op`` is set.
    """
    # NOTE: mnist is small, we don't implement data sharding yet.
    opt = None
    data_file_handle = None
    with fluid.program_guard(main_prog, startup_prog):
        if args.use_reader_op:
            filelist = [
                os.path.join(args.data_path, f)
                for f in os.listdir(args.data_path)
            ]
            data_file_handle = fluid.layers.open_files(
                filenames=filelist,
                shapes=[[-1, 1, 28, 28], (-1, 1)],
                lod_levels=[0, 0],
                dtypes=["float32", "int64"],
                thread_num=1,
                pass_num=1)
            data_file = fluid.layers.double_buffer(
                fluid.layers.batch(
                    data_file_handle, batch_size=args.batch_size))
        with fluid.unique_name.guard():
            if args.use_reader_op:
                # Bind the same name as the feed branch: the network below
                # is built from ``images`` (this branch used to bind
                # ``input`` and then failed with a NameError).
                images, label = fluid.layers.read_file(data_file)
            else:
                images = fluid.layers.data(
                    name='pixel', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')

            predict = cnn_model(images)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            # Evaluator
            batch_acc = fluid.layers.accuracy(input=predict, label=label)
            # Optimization
            if is_train:
                opt = fluid.optimizer.AdamOptimizer(
                    learning_rate=0.001, beta1=0.9, beta2=0.999)
                opt.minimize(avg_cost)
                if args.memory_optimize:
                    fluid.memory_optimize(main_prog)

    # Reader
    if is_train:
        reader = paddle.dataset.mnist.train()
    else:
        reader = paddle.dataset.mnist.test()
    batched_reader = paddle.batch(
        reader, batch_size=args.batch_size * args.gpus)
    return avg_cost, opt, [batch_acc], batched_reader, data_file_handle
# Reference training configuration; ResNet instances expose it as
# ``self.params``.
train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 256,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}


class ResNet(object):
    """Bottleneck ResNet (50/101/152 layers) built from fluid layers."""

    def __init__(self, layers=50, is_train=True):
        """Store the configuration.

        Args:
            layers: network depth; ``net`` accepts 50, 101 or 152.
            is_train: when false, batch-norm layers are created with
                ``is_test=True``.
        """
        self.params = train_parameters
        self.layers = layers
        self.is_train = is_train

    def net(self, input, class_dim=1000):
        """Build the network on ``input`` and return the softmax output.

        Args:
            input: image input variable.
            class_dim: number of output classes.

        Raises:
            AssertionError: if ``self.layers`` is not a supported depth.
        """
        layers = self.layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)

        # Number of bottleneck blocks in each of the four stages.
        blocks_per_stage = {
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }
        depth = blocks_per_stage[layers]
        num_filters = [64, 128, 256, 512]

        conv = self.conv_bn_layer(
            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        for block, block_count in enumerate(depth):
            for i in range(block_count):
                # Downsample in the first block of every stage but the first.
                conv = self.bottleneck_block(
                    input=conv,
                    num_filters=num_filters[block],
                    stride=2 if i == 0 and block != 0 else 1)

        pool = fluid.layers.pool2d(
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        out = fluid.layers.fc(input=pool,
                              size=class_dim,
                              act='softmax',
                              param_attr=fluid.param_attr.ParamAttr(
                                  initializer=fluid.initializer.Uniform(-stdv,
                                                                        stdv)))
        return out

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None):
        """Bias-free convolution followed by batch norm.

        The activation ``act`` is applied by the batch-norm layer, not by
        the convolution.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            bias_attr=False)
        return fluid.layers.batch_norm(
            input=conv, act=act, is_test=not self.is_train)

    def shortcut(self, input, ch_out, stride):
        """Return the residual branch for a block.

        A 1x1 conv+bn projection is used when the channel count or the
        stride differs; otherwise ``input`` is returned unchanged.
        """
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            return self.conv_bn_layer(input, ch_out, 1, stride)
        return input

    def bottleneck_block(self, input, num_filters, stride):
        """1x1 -> 3x3 -> 1x1 bottleneck with a residual connection.

        The block outputs ``num_filters * 4`` channels.
        """
        conv0 = self.conv_bn_layer(
            input=input, num_filters=num_filters, filter_size=1, act='relu')
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu')
        conv2 = self.conv_bn_layer(
            input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)

        short = self.shortcut(input, num_filters * 4, stride)

        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


def _model_reader_dshape_classdim(args, is_train):
    """Pick the sample reader, input shape and class count for a data set.

    Args:
        args: parsed benchmark options; reads ``data_set``, ``data_format``,
            ``data_path`` and ``use_reader_op``.
        is_train: choose the training split when true, else the
            validation/test split.

    Returns:
        ``(reader, dshape, class_dim)``.

    Raises:
        Exception: if imagenet is selected without ``args.data_path``.
        ValueError: if ``args.data_set`` is neither "flowers" nor
            "imagenet" (previously this surfaced as an unbound local).
    """
    if args.data_format == 'NCHW':
        image_shape = [3, 224, 224]
    else:
        image_shape = [224, 224, 3]

    if args.data_set == "flowers":
        class_dim = 102
        dshape = image_shape
        if is_train:
            reader = paddle.dataset.flowers.train()
        else:
            reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        dshape = image_shape
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
        if not args.use_reader_op:
            if is_train:
                reader = train()
            else:
                reader = val()
        else:
            if is_train:
                reader = train(xmap=False)
            else:
                reader = val(xmap=False)
    else:
        raise ValueError(
            "Unsupported data_set %r for resnet; expected 'flowers' or "
            "'imagenet'." % (args.data_set, ))
    return reader, dshape, class_dim


def get_model(args, is_train, main_prog, startup_prog):
    """Build the ResNet benchmark program and its data pipeline.

    Args:
        args: parsed benchmark options; reads ``data_set``, ``data_format``,
            ``data_path``, ``use_reader_op``, ``batch_size``, ``gpus``,
            ``learning_rate``, ``memory_optimize`` and ``no_random``.
        is_train: add the Momentum optimizer with piecewise LR decay.
        main_prog: program that receives the network.
        startup_prog: program that receives the initialisation ops.

    Returns:
        ``(avg_cost, optimizer, [batch_acc1, batch_acc5], batched_reader,
        pyreader)``. ``optimizer`` is ``None`` when not training;
        ``batched_reader`` is ``None`` when the py_reader is used and
        ``pyreader`` is ``None`` otherwise.
    """
    reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)

    pyreader = None
    # A missing PADDLE_TRAINERS means a single local trainer; int(None)
    # used to raise a TypeError here.
    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            if args.use_reader_op:
                pyreader = fluid.layers.py_reader(
                    capacity=args.batch_size * args.gpus,
                    shapes=([-1] + dshape, (-1, 1)),
                    dtypes=('float32', 'int64'),
                    name="train_reader" if is_train else "test_reader",
                    use_double_buffer=True)
                image, label = fluid.layers.read_file(pyreader)
            else:
                image = fluid.layers.data(
                    name='data', shape=dshape, dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')

            model = ResNet(is_train=is_train)
            predict = model.net(image, class_dim=class_dim)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)

            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)

            # configure optimize
            optimizer = None
            if is_train:
                # Images seen by one trainer per epoch (float division).
                total_images = 1281167.0 / trainer_count

                step = int(total_images / (args.batch_size * args.gpus) + 1)
                epochs = [30, 60, 90]
                bd = [step * e for e in epochs]
                base_lr = args.learning_rate
                # One value per interval: base_lr, then 10x smaller each time.
                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries=bd, values=lr),
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(1e-4))
                optimizer.minimize(avg_cost)

                if args.memory_optimize:
                    fluid.memory_optimize(main_prog)

    # config readers
    if args.no_random:
        sample_reader = reader
    else:
        sample_reader = paddle.reader.shuffle(reader, buf_size=5120)

    if not args.use_reader_op:
        batched_reader = paddle.batch(
            sample_reader,
            batch_size=args.batch_size * args.gpus,
            drop_last=True)
    else:
        batched_reader = None
        pyreader.decorate_paddle_reader(
            paddle.batch(sample_reader, batch_size=args.batch_size))

    return avg_cost, optimizer, [batch_acc1,
                                 batch_acc5], batched_reader, pyreader
def conv_bn_layer(input,
                  ch_out,
                  filter_size,
                  stride,
                  padding,
                  act='relu',
                  is_train=True):
    """Bias-free convolution followed by batch norm.

    The activation ``act`` is applied by the batch-norm layer; batch norm
    is created with ``is_test=True`` when ``is_train`` is false.
    """
    conv1 = fluid.layers.conv2d(
        input=input,
        filter_size=filter_size,
        num_filters=ch_out,
        stride=stride,
        padding=padding,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)


def shortcut(input, ch_out, stride, is_train=True):
    """Return the residual branch for a block.

    A 1x1 conv+bn projection is used when the channel count or the stride
    differs (the stride test matches the sibling resnet model); otherwise
    ``input`` is returned unchanged.
    """
    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
    if ch_in != ch_out or stride != 1:
        return conv_bn_layer(
            input, ch_out, 1, stride, 0, None, is_train=is_train)
    return input


def basicblock(input, ch_out, stride, is_train=True):
    """Two 3x3 conv+bn layers with a residual connection."""
    short = shortcut(input, ch_out, stride, is_train=is_train)
    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')


def bottleneck(input, ch_out, stride, is_train=True):
    """1x1 -> 3x3 -> 1x1 bottleneck producing ``ch_out * 4`` channels."""
    short = shortcut(input, ch_out * 4, stride, is_train=is_train)
    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
    conv3 = conv_bn_layer(
        conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')


def layer_warp(block_func, input, ch_out, count, stride, is_train=True):
    """Chain ``count`` blocks; only the first one uses ``stride``.

    ``is_train`` is forwarded to every block so that a test program gets
    batch norm in inference mode (it used to be dropped here).
    """
    res_out = block_func(input, ch_out, stride, is_train=is_train)
    for _ in range(1, count):
        res_out = block_func(res_out, ch_out, 1, is_train=is_train)
    return res_out


def resnet_imagenet(input,
                    class_dim,
                    depth=50,
                    data_format='NCHW',
                    is_train=True):
    """Build an ImageNet-style ResNet and return its softmax output.

    Args:
        input: image input variable.
        class_dim: number of output classes.
        depth: one of 18, 34, 50, 101, 152.
        data_format: accepted for interface compatibility, unused here.
        is_train: forwarded to every conv+bn layer.

    Raises:
        KeyError: if ``depth`` is not a supported value.
    """
    # depth -> (blocks per stage, block builder)
    cfg = {
        18: ([2, 2, 2, 2], basicblock),
        34: ([3, 4, 6, 3], basicblock),
        50: ([3, 4, 6, 3], bottleneck),
        101: ([3, 4, 23, 3], bottleneck),
        152: ([3, 8, 36, 3], bottleneck)
    }
    stages, block_func = cfg[depth]
    conv1 = conv_bn_layer(
        input, ch_out=64, filter_size=7, stride=2, padding=3,
        is_train=is_train)
    pool1 = fluid.layers.pool2d(
        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
    res1 = layer_warp(block_func, pool1, 64, stages[0], 1, is_train=is_train)
    res2 = layer_warp(block_func, res1, 128, stages[1], 2, is_train=is_train)
    res3 = layer_warp(block_func, res2, 256, stages[2], 2, is_train=is_train)
    res4 = layer_warp(block_func, res3, 512, stages[3], 2, is_train=is_train)
    pool2 = fluid.layers.pool2d(
        input=res4,
        pool_size=7,
        pool_type='avg',
        pool_stride=1,
        global_pooling=True)
    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
    return out


def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW',
                   is_train=True):
    """Build a CIFAR-style ResNet and return its softmax output.

    Args:
        input: image input variable.
        class_dim: number of output classes.
        depth: total depth; ``depth - 2`` must be divisible by 6.
        data_format: accepted for interface compatibility, unused here.
        is_train: forwarded to every conv+bn layer. The parameter is new:
            ``get_model`` already passes it to whichever model is selected,
            which raised a TypeError for this one.
    """
    assert (depth - 2) % 6 == 0

    n = (depth - 2) // 6

    conv1 = conv_bn_layer(
        input=input, ch_out=16, filter_size=3, stride=1, padding=1,
        is_train=is_train)
    res1 = layer_warp(basicblock, conv1, 16, n, 1, is_train=is_train)
    res2 = layer_warp(basicblock, res1, 32, n, 2, is_train=is_train)
    res3 = layer_warp(basicblock, res2, 64, n, 2, is_train=is_train)
    pool = fluid.layers.pool2d(
        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
    return out


def _model_reader_dshape_classdim(args, is_train):
    """Pick model builder, sample reader, input shape and class count.

    Args:
        args: parsed benchmark options; reads ``data_set``, ``data_format``,
            ``data_path`` and ``use_reader_op``.
        is_train: choose the training split when true.

    Returns:
        ``(model, reader, dshape, class_dim)``.

    Raises:
        Exception: if imagenet is selected without ``args.data_path``.
        ValueError: if ``args.data_set`` is not "cifar10", "flowers" or
            "imagenet" (previously this surfaced as an unbound local).
    """
    channels_first = args.data_format == 'NCHW'

    if args.data_set == "cifar10":
        class_dim = 10
        dshape = [3, 32, 32] if channels_first else [32, 32, 3]
        model = resnet_cifar10
        if is_train:
            reader = paddle.dataset.cifar.train10()
        else:
            reader = paddle.dataset.cifar.test10()
    elif args.data_set == "flowers":
        class_dim = 102
        dshape = [3, 224, 224] if channels_first else [224, 224, 3]
        model = resnet_imagenet
        if is_train:
            reader = paddle.dataset.flowers.train()
        else:
            reader = paddle.dataset.flowers.test()
    elif args.data_set == "imagenet":
        class_dim = 1000
        dshape = [3, 224, 224] if channels_first else [224, 224, 3]
        model = resnet_imagenet
        if not args.data_path:
            raise Exception(
                "Must specify --data_path when training with imagenet")
        if is_train:
            # Raw training samples are used with and without the reader op.
            reader = train_raw()
        elif not args.use_reader_op:
            reader = val()
        else:
            reader = val(xmap=False)
    else:
        raise ValueError(
            "Unsupported data_set %r for resnet_with_preprocess; expected "
            "'cifar10', 'flowers' or 'imagenet'." % (args.data_set, ))
    return model, reader, dshape, class_dim


def get_model(args, is_train, main_prog, startup_prog):
    """Build the ResNet-with-in-graph-preprocessing benchmark program.

    The network consumes uint8 images and performs random crop, cast,
    transpose, scaling and mean/std normalisation inside the graph.

    Args:
        args: parsed benchmark options; reads ``data_set``, ``data_format``,
            ``data_path``, ``use_reader_op``, ``batch_size``, ``gpus``,
            ``learning_rate``, ``memory_optimize`` and ``no_random``.
        is_train: add the Momentum optimizer and build batch norm in
            training mode.
        main_prog: program that receives the network.
        startup_prog: program that receives the initialisation ops.

    Returns:
        ``(avg_cost, optimizer, [batch_acc1, batch_acc5], batched_reader,
        pyreader)``. ``optimizer`` is ``None`` when not training;
        ``batched_reader`` is ``None`` when the py_reader is used and
        ``pyreader`` is ``None`` otherwise.
    """
    model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
                                                                     is_train)

    pyreader = None
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            if args.use_reader_op:
                pyreader = fluid.layers.py_reader(
                    capacity=args.batch_size * args.gpus,
                    shapes=([-1] + dshape, (-1, 1)),
                    dtypes=('uint8', 'int64'),
                    name="train_reader" if is_train else "test_reader",
                    use_double_buffer=True)
                image, label = fluid.layers.read_file(pyreader)
            else:
                image = fluid.layers.data(
                    name='data', shape=dshape, dtype='uint8')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')

            # add imagenet preprocessors
            random_crop = fluid.layers.random_crop(image, dshape)
            casted = fluid.layers.cast(random_crop, 'float32')
            # input is HWC
            trans = fluid.layers.transpose(casted, [0, 3, 1, 2]) / 255.0
            img_mean = fluid.layers.tensor.assign(
                np.array([0.485, 0.456, 0.406]).astype('float32').reshape(
                    (3, 1, 1)))
            img_std = fluid.layers.tensor.assign(
                np.array([0.229, 0.224, 0.225]).astype('float32').reshape(
                    (3, 1, 1)))
            # Equivalent to (trans - img_mean) / img_std along the channel
            # axis.
            h1 = fluid.layers.elementwise_sub(trans, img_mean, axis=1)
            h2 = fluid.layers.elementwise_div(h1, img_std, axis=1)

            predict = model(h2, class_dim, is_train=is_train)
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)

            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)

            # configure optimize
            optimizer = None
            if is_train:
                # NOTE: the learning rate is constant. A piecewise decay
                # (10x drop at epochs 30/60/80/90) was disabled earlier;
                # its boundary computation and the PADDLE_TRAINERS lookup
                # feeding it were dead code and have been dropped.
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=args.learning_rate,
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(1e-4))
                optimizer.minimize(avg_cost)

                if args.memory_optimize:
                    fluid.memory_optimize(main_prog)

    # config readers
    if not args.use_reader_op:
        batched_reader = paddle.batch(
            reader if args.no_random else paddle.reader.shuffle(
                reader, buf_size=5120),
            batch_size=args.batch_size * args.gpus,
            drop_last=True)
    else:
        batched_reader = None
        # Samples are fed unshuffled on this path.
        pyreader.decorate_paddle_reader(
            paddle.batch(reader, batch_size=args.batch_size))

    return avg_cost, optimizer, [batch_acc1,
                                 batch_acc5], batched_reader, pyreader
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.fluid as fluid -import math -import os -from imagenet_reader import train, val - -__all__ = [ - "SE_ResNeXt", "SE_ResNeXt50_32x4d", "SE_ResNeXt101_32x4d", - "SE_ResNeXt152_32x4d", "get_model" -] - -train_parameters = { - "input_size": [3, 224, 224], - "input_mean": [0.485, 0.456, 0.406], - "input_std": [0.229, 0.224, 0.225], - "learning_strategy": { - "name": "piecewise_decay", - "batch_size": 256, - "epochs": [30, 60, 90], - "steps": [0.1, 0.01, 0.001, 0.0001] - } -} - - -class SE_ResNeXt(): - def __init__(self, layers=50, is_train=True): - self.params = train_parameters - self.layers = layers - self.is_train = is_train - - def net(self, input, class_dim=1000): - layers = self.layers - supported_layers = [50, 101, 152] - assert layers in supported_layers, \ - "supported layers are {} but input layer is {}".format(supported_layers, layers) - if layers == 50: - cardinality = 32 - reduction_ratio = 16 - depth = [3, 4, 6, 3] - num_filters = [128, 256, 512, 1024] - - conv = self.conv_bn_layer( - input=input, - num_filters=64, - filter_size=7, - stride=2, - act='relu') - conv = fluid.layers.pool2d( - input=conv, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - elif layers == 101: - cardinality = 32 - reduction_ratio = 16 - depth = [3, 4, 23, 3] - num_filters = [128, 256, 512, 1024] - - conv = self.conv_bn_layer( - input=input, - num_filters=64, - filter_size=7, - stride=2, - act='relu') - conv = fluid.layers.pool2d( - input=conv, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - elif 
layers == 152: - cardinality = 64 - reduction_ratio = 16 - depth = [3, 8, 36, 3] - num_filters = [128, 256, 512, 1024] - - conv = self.conv_bn_layer( - input=input, - num_filters=64, - filter_size=3, - stride=2, - act='relu') - conv = self.conv_bn_layer( - input=conv, num_filters=64, filter_size=3, stride=1, act='relu') - conv = self.conv_bn_layer( - input=conv, - num_filters=128, - filter_size=3, - stride=1, - act='relu') - conv = fluid.layers.pool2d( - input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ - pool_type='max') - - for block in range(len(depth)): - for i in range(depth[block]): - conv = self.bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - cardinality=cardinality, - reduction_ratio=reduction_ratio) - - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - drop = fluid.layers.dropout(x=pool, dropout_prob=0.5) - stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) - out = fluid.layers.fc(input=drop, - size=class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, - stdv))) - return out - - def shortcut(self, input, ch_out, stride): - ch_in = input.shape[1] - if ch_in != ch_out or stride != 1: - filter_size = 1 - return self.conv_bn_layer(input, ch_out, filter_size, stride) - else: - return input - - def bottleneck_block(self, input, num_filters, stride, cardinality, - reduction_ratio): - conv0 = self.conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = self.conv_bn_layer( - input=conv0, - num_filters=num_filters, - filter_size=3, - stride=stride, - groups=cardinality, - act='relu') - conv2 = self.conv_bn_layer( - input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) - scale = self.squeeze_excitation( - input=conv2, - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) - - short = self.shortcut(input, num_filters * 2, stride) - - 
return fluid.layers.elementwise_add(x=short, y=scale, act='relu') - - def conv_bn_layer(self, - input, - num_filters, - filter_size, - stride=1, - groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) / 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm( - input=conv, act=act, is_test=not self.is_train) - - def squeeze_excitation(self, input, num_channels, reduction_ratio): - pool = fluid.layers.pool2d( - input=input, pool_size=0, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - squeeze = fluid.layers.fc(input=pool, - size=num_channels / reduction_ratio, - act='relu', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform( - -stdv, stdv))) - stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) - excitation = fluid.layers.fc(input=squeeze, - size=num_channels, - act='sigmoid', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform( - -stdv, stdv))) - scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) - return scale - - -def SE_ResNeXt50_32x4d(): - model = SE_ResNeXt(layers=50) - return model - - -def SE_ResNeXt101_32x4d(): - model = SE_ResNeXt(layers=101) - return model - - -def SE_ResNeXt152_32x4d(): - model = SE_ResNeXt(layers=152) - return model - - -def get_model(args, is_train, main_prog, startup_prog): - model = SE_ResNeXt(layers=50) - batched_reader = None - pyreader = None - trainer_count = int(os.getenv("PADDLE_TRAINERS")) - dshape = train_parameters["input_size"] - - with fluid.program_guard(main_prog, startup_prog): - with fluid.unique_name.guard(): - if args.use_reader_op: - pyreader = fluid.layers.py_reader( - capacity=10, - shapes=([-1] + dshape, (-1, 1)), - dtypes=('float32', 'int64'), - name="train_reader" if is_train else "test_reader", - use_double_buffer=True) - input, label = 
fluid.layers.read_file(pyreader) - else: - input = fluid.layers.data( - name='data', shape=dshape, dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - - out = model.net(input=input) - cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - - optimizer = None - if is_train: - total_images = 1281167 / trainer_count - - step = int(total_images / args.batch_size + 1) - epochs = [40, 80, 100] - bd = [step * e for e in epochs] - base_lr = args.learning_rate - lr = [] - lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - optimizer = fluid.optimizer.Momentum( - # learning_rate=base_lr, - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - optimizer.minimize(avg_cost) - - if args.memory_optimize: - fluid.memory_optimize(main_prog) - - # config readers - if is_train: - reader = train() - else: - reader = val() - - if not args.use_reader_op: - batched_reader = paddle.batch( - reader, batch_size=args.batch_size * args.gpus, drop_last=True) - else: - pyreader.decorate_paddle_reader( - paddle.batch( - reader, batch_size=args.batch_size)) - - return avg_cost, optimizer, [acc_top1, acc_top5], batched_reader, pyreader diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py deleted file mode 100644 index f23bb59de9158b0481320cc409879b3b72cbd43e..0000000000000000000000000000000000000000 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import cPickle -import os -import random -import time - -import numpy -import paddle -import paddle.dataset.imdb as imdb -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler - -word_dict = imdb.word_dict() - - -def crop_sentence(reader, crop_size): - unk_value = word_dict[''] - - def __impl__(): - for item in reader(): - if len([x for x in item[0] if x != unk_value]) < crop_size: - yield item - - return __impl__ - - -def lstm_net(sentence, lstm_size): - sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh') - - rnn = fluid.layers.DynamicRNN() - with rnn.block(): - word = rnn.step_input(sentence) - prev_hidden = rnn.memory(value=0.0, shape=[lstm_size]) - prev_cell = rnn.memory(value=0.0, shape=[lstm_size]) - - def gate_common( - ipt, - hidden, - size, ): - gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True) - gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) - gate = fluid.layers.sums(input=[gate0, gate1]) - return gate - - forget_gate = fluid.layers.sigmoid( - x=gate_common(word, prev_hidden, lstm_size)) - input_gate = fluid.layers.sigmoid( - x=gate_common(word, prev_hidden, lstm_size)) - output_gate = fluid.layers.sigmoid( - x=gate_common(word, prev_hidden, lstm_size)) - cell_gate = fluid.layers.tanh( - x=gate_common(word, prev_hidden, lstm_size)) - - cell = fluid.layers.sums(input=[ - fluid.layers.elementwise_mul( - x=forget_gate, y=prev_cell), 
fluid.layers.elementwise_mul( - x=input_gate, y=cell_gate) - ]) - - hidden = fluid.layers.elementwise_mul( - x=output_gate, y=fluid.layers.tanh(x=cell)) - - rnn.update_memory(prev_cell, cell) - rnn.update_memory(prev_hidden, hidden) - rnn.output(hidden) - - last = fluid.layers.sequence_pool(rnn(), 'last') - logit = fluid.layers.fc(input=last, size=2, act='softmax') - return logit - - -def get_model(args, is_train, main_prog, startup_prog): - if args.use_reader_op: - raise Exception( - "stacked_dynamic_lstm do not support reader op for now.") - lstm_size = 512 - emb_dim = 512 - crop_size = 1500 - - with fluid.program_guard(main_prog, startup_prog): - with fluid.unique_name.guard(): - data = fluid.layers.data( - name="words", shape=[1], lod_level=1, dtype='int64') - sentence = fluid.layers.embedding( - input=data, size=[len(word_dict), emb_dim]) - logit = lstm_net(sentence, lstm_size) - loss = fluid.layers.cross_entropy( - input=logit, - label=fluid.layers.data( - name='label', shape=[1], dtype='int64')) - loss = fluid.layers.mean(x=loss) - - # add acc - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ - shape=[1], dtype='int64'), total=batch_size_tensor) - - if is_train: - adam = fluid.optimizer.Adam() - adam.minimize(loss) - - if is_train: - reader = crop_sentence(imdb.train(word_dict), crop_size) - else: - reader = crop_sentence(imdb.test(word_dict), crop_size) - - batched_reader = paddle.batch( - paddle.reader.shuffle( - reader, buf_size=25000), - batch_size=args.batch_size * args.gpus) - - return loss, adam, [batch_acc], batched_reader, None diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py deleted file mode 100644 index cf9708d500684465dc8ec1666bf269e7e1300f59..0000000000000000000000000000000000000000 --- a/benchmark/fluid/models/vgg.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""VGG16 benchmark in Fluid""" -from __future__ import print_function - -import sys -import time -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import argparse -import functools -import os - - -def vgg16_bn_drop(input, is_train=True): - def conv_block(input, num_filter, groups, dropouts): - return fluid.nets.img_conv_group( - input=input, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act='relu', - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type='max') - - conv1 = conv_block(input, 64, 2, [0.3, 0]) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = fluid.layers.fc(input=drop, size=512, act=None) - bn = fluid.layers.batch_norm(input=fc1, act='relu', is_test=not is_train) - drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) - fc2 = fluid.layers.fc(input=drop2, size=512, act=None) - return fc2 - - -def get_model(args, is_train, main_prog, startup_prog): - if args.data_set == "cifar10": - classdim = 10 - if args.data_format == 'NCHW': - data_shape = [3, 32, 32] - else: - data_shape = [32, 32, 3] - else: - classdim = 102 - if args.data_format == 'NCHW': - data_shape = [3, 224, 
224] - else: - data_shape = [224, 224, 3] - filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] - with fluid.program_guard(main_prog, startup_prog): - if args.use_reader_op: - data_file_handle = fluid.layers.open_files( - filenames=filelist, - shapes=[[-1] + data_shape, (-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int64"], - thread_num=1, - pass_num=1) - data_file = fluid.layers.double_buffer( - fluid.layers.batch( - data_file_handle, batch_size=args.batch_size)) - with fluid.unique_name.guard(): - if args.use_reader_op: - images, label = fluid.layers.read_file(data_file) - else: - images = fluid.layers.data( - name='data', shape=data_shape, dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - # Train program - net = vgg16_bn_drop(images, is_train=is_train) - predict = fluid.layers.fc(input=net, size=classdim, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) - - # Evaluator - batch_size_tensor = fluid.layers.create_tensor(dtype='int64') - batch_acc = fluid.layers.accuracy( - input=predict, label=label, total=batch_size_tensor) - # Optimization - if is_train: - optimizer = fluid.optimizer.Adam( - learning_rate=args.learning_rate) - optimizer.minimize(avg_cost) - - # data reader - if is_train: - reader = paddle.dataset.cifar.train10() \ - if args.data_set == 'cifar10' else paddle.dataset.flowers.train() - else: - reader = paddle.dataset.cifar.test10() \ - if args.data_set == 'cifar10' else paddle.dataset.flowers.test() - - batched_reader = paddle.batch( - paddle.reader.shuffle( - reader, buf_size=5120), - batch_size=args.batch_size * args.gpus) - - return avg_cost, optimizer, [batch_acc], batched_reader, data_file_handle diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py deleted file mode 100644 index 
f2dc39109bf1beaf147b046560c92fbd2416d8e6..0000000000000000000000000000000000000000 --- a/benchmark/fluid/recordio_converter.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import random -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.dataset import mnist, cifar, flowers, image - - -def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data, - shape_label): - num_batches = 0 - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch(py_reader(), batch_size=batch_size) - feeder = fluid.DataFeeder( - feed_list=[ # order is image and label - fluid.layers.data( - name='image', shape=shape_data), - fluid.layers.data( - name='label', shape=shape_label, dtype='int64'), - ], - place=fluid.CPUPlace()) - num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( - outfilepath, reader, feeder) - return num_batches - - -def prepare_mnist(outpath, batch_size): - outfilepath = os.path.join(outpath, "mnist.recordio") - convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1]) - - -def prepare_cifar10(outpath, batch_size): - outfilepath = os.path.join(outpath, "cifar.recordio") - convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1]) - - -def prepare_flowers(outpath, batch_size): - outfilepath = os.path.join(outpath, "flowers.recordio") - 
convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224], - [1]) - - -def default_mapper(sample): - img, label = sample - img = image.simple_transform( - img, 256, 224, True, mean=[103.94, 116.78, 123.68]) - return img.flatten().astype('float32'), label - - -def imagenet_train(data_dir): - contents = os.listdir(data_dir) - if set(contents) != set( - ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): - raise Exception("Imagenet data contents error!") - img2label = dict() - imgfilelist = [] - with open(os.path.join(data_dir, "train.txt")) as fn: - while 1: - l = fn.readline() - if not l: - break - img, lbl = l[:-1].split(" ") - img2label[img] = int(lbl) - imgfilelist.append(img) - # shuffle all, this is slow - random.shuffle(imgfilelist) - - def train_reader(): - for idx, imgfile in enumerate(imgfilelist): - data = image.load_image( - os.path.join(data_dir, "train", imgfile.lower())) - label = [img2label[imgfile], ] - yield [data, label] - - return paddle.reader.map_readers(default_mapper, train_reader) - - -def imagenet_test(data_dir): - contents = os.listdir(data_dir) - if set(contents) != set( - ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]): - raise Exception("Imagenet data contents error!") - img2label = dict() - imgfilelist = [] - with open(os.path.join(data_dir, "val.txt")) as fn: - while 1: - l = fn.readline() - if not l: - break - img, lbl = l[:-1].split(" ") - img2label[img] = int(lbl) - imgfilelist.append(img) - - def test_reader(): - for idx, imgfile in enumerate(imgfilelist): - base_path = os.path.join(data_dir, "val", imgfile.split(".")[0]) - image_path = ".".join([base_path, "jpeg"]) - data = image.load_image(image_path) - label = [img2label[imgfile], ] - yield [data, label] - - return paddle.reader.map_readers(default_mapper, test_reader) - - -# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged -def convert_reader_to_recordio_files( - filename, - batch_per_file, - 
reader_creator, - feeder, - compressor=core.RecordIOWriter.Compressor.Snappy, - max_num_records=1000, - feed_order=None): - if feed_order is None: - feed_order = feeder.feed_names - f_name, f_ext = os.path.splitext(filename) - assert (f_ext == ".recordio") - - lines = [] - f_idx = 0 - counter = 0 - for idx, batch in enumerate(reader_creator()): - lines.append(batch) - if idx >= batch_per_file and idx % batch_per_file == 0: - filename = "%s-%05d%s" % (f_name, f_idx, f_ext) - with fluid.recordio_writer.create_recordio_writer( - filename, compressor, max_num_records) as writer: - for l in lines: - res = feeder.feed(l) - for each in feed_order: - writer.append_tensor(res[each]) - writer.complete_append_tensor() - counter += 1 - lines = [] - f_idx += 1 - print("written file: ", filename) - return counter - - -def prepare_imagenet(inpath, outpath, batch_size): - r = paddle.batch(imagenet_train(inpath), batch_size=batch_size) - feeder = fluid.DataFeeder( - feed_list=[ - fluid.layers.data( - name="image", shape=[3, 224, 224]), fluid.layers.data( - name="label", shape=[1], dtype='int64') - ], - place=fluid.CPUPlace()) - outpath = os.path.join(outpath, "imagenet.recordio") - convert_reader_to_recordio_files(outpath, 10000, r, feeder) diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh deleted file mode 100755 index 5d9b2db87135e53470b106dcd11a6bcfdc5dbda9..0000000000000000000000000000000000000000 --- a/benchmark/fluid/run.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash -# This script benchmarking the PaddlePaddle Fluid on -# single thread single GPU. 
- -mkdir -p logs -#export FLAGS_fraction_of_gpu_memory_to_use=0.0 -export CUDNN_PATH=/paddle/cudnn_v5 - -# disable openmp and mkl parallel -#https://github.com/PaddlePaddle/Paddle/issues/7199 -export MKL_NUM_THREADS=1 -export OMP_NUM_THREADS=1 -ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` -if [ $ht -eq 1 ]; then # HT is OFF - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,0,0" - fi - if [ -z "$OMP_DYNAMIC" ]; then - export OMP_DYNAMIC="FALSE" - fi -else # HT is ON - if [ -z "$KMP_AFFINITY" ]; then - export KMP_AFFINITY="granularity=fine,compact,1,0" - fi -fi -# disable multi-gpu if have more than one -export CUDA_VISIBLE_DEVICES=0 -export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH - -# only query the gpu used -nohup stdbuf -oL nvidia-smi \ - --id=${CUDA_VISIBLE_DEVICES} \ - --query-gpu=timestamp \ - --query-compute-apps=pid,process_name,used_memory \ - --format=csv \ - --filename=mem.log \ - -l 1 & - -# mnist -# mnist gpu mnist 128 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=mnist \ - --device=GPU \ - --batch_size=128 \ - --skip_batch_num=5 \ - --iterations=500 \ - 2>&1 | tee -a logs/mnist_gpu_128.log - -# vgg16 -# gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=vgg16 \ - --device=GPU \ - --batch_size=128 \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/vgg16_gpu_128.log - -# flowers gpu 128 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=vgg16 \ - --device=GPU \ - --batch_size=32 \ - --data_set=flowers \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/vgg16_gpu_flowers_32.log - -# resnet50 -# resnet50 gpu cifar10 128 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet \ - --device=GPU \ - --batch_size=128 \ - --data_set=cifar10 \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/resnet50_gpu_128.log - -# 
resnet50 gpu flowers 64 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=resnet \ - --device=GPU \ - --batch_size=64 \ - --data_set=flowers \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/resnet50_gpu_flowers_64.log - -# lstm -# lstm gpu imdb 32 # tensorflow only support batch=32 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=stacked_dynamic_lstm \ - --device=GPU \ - --batch_size=32 \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/lstm_gpu_32.log - -# seq2seq -# seq2seq gpu wmb 128 -FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \ - --model=machine_translation \ - --device=GPU \ - --batch_size=128 \ - --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 | tee -a logs/lstm_gpu_128.log diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh deleted file mode 100644 index 4309a3126c1d72fe1eb2d5ec423075aea4d3ec88..0000000000000000000000000000000000000000 --- a/benchmark/fluid/run_fluid_benchmark.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 & - -sleep 15 - -CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & - -CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 & diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py 
deleted file mode 100644 index 95728b7a85afbc231cb4ade0b3ad835de109c980..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/alexnet.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from six.moves import xrange # pylint: disable=redefined-builtin -from datetime import datetime -import math -import time - -import tensorflow.python.platform -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('forward_only', False, - """Only run the forward pass.""") -tf.app.flags.DEFINE_boolean('forward_backward_only', False, - """Only run the forward-forward pass.""") -tf.app.flags.DEFINE_string('data_format', 'NCHW', - """The data format for Convnet operations. - Can be either NHWC or NCHW. 
- """) -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - - -def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None and wd > 0: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] - else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d( - inpOp, - kernel, - strides, - padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable( - name=name + '_b', - shape=[nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32) - - bias = tf.reshape( - tf.nn.bias_add( - conv, biases, data_format=FLAGS.data_format), - conv.get_shape()) - - conv1 = tf.nn.relu(bias, name=scope) - return conv1 - - -def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None and wd > 0: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - biases = tf.get_variable( - name + '_b', [nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32, - trainable=True) - - affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ - tf.matmul(inpOp, kernel) + biases - - output = tf.nn.dropout(affine1, drop) if drop else affine1 - - return output - - -def _mpool(name, inpOp, kH, kW, dH, dW): - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, 
kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool( - inpOp, - ksize=ksize, - strides=strides, - padding='VALID', - data_format=FLAGS.data_format, - name=name) - - -def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, - lsize, - bias=1.0, - alpha=0.001 / 9.0, - beta=0.75, - name=name) - - -def loss(logits, labels): - labels = tf.cast(labels, tf.int64) - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - - # The total loss is defined as the cross entropy loss plus all of the weight - # decay terms (L2 loss). - return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -def inference(images): - conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) - norm1 = _norm('norm1', pool1, lsize=5) - conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') - pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) - norm2 = _norm('norm2', pool2, lsize=5) - conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') - conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') - conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') - pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) - resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) - affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5) - affn2 = _affine('fc7', affn1, 4096, 4096, 0.5) - affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc - - return affn3 - - -def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - 
total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) - - -def _add_loss_summaries(total_loss): - """ - Generates moving average for all losses and associated summaries for - visualizing the performance of the network. - - Args: - total_loss: Total loss from loss(). - Returns: - loss_averages_op: op for generating moving averages of losses. - """ - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - losses = tf.get_collection('losses') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.scalar_summary(l.op.name + ' (raw)', l) - tf.scalar_summary(l.op.name, loss_averages.average(l)) - - return loss_averages_op - - -def run_benchmark(): - with tf.Graph().as_default(): - with tf.device('/gpu:0'): - # Generate some dummy images. - image_size = 224 - # Note that our padding definition is slightly different the cuda-convnet. 
- # In order to force the model to start with the same activations sizes, - # we add 3 to the image_size and employ VALID padding above. - if FLAGS.data_format == 'NCHW': - image_shape = [ - FLAGS.batch_size, 3, image_size + 3, image_size + 3 - ] - else: - image_shape = [ - FLAGS.batch_size, image_size + 3, image_size + 3, 3 - ] - images = tf.get_variable( - 'image', - image_shape, - initializer=tf.truncated_normal_initializer( - stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable( - 'label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - # Compute the gradient with respect to all the parameters. - - # Compute gradients. - # opt = tf.train.GradientDescentOptimizer(0.001) - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer( - 0.0, dtype=tf.float32), - trainable=False, - dtype=tf.float32) - apply_gradient_op = opt.apply_gradients( - grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage(0.9, - global_step) - variables_averages_op = variable_averages.apply( - tf.trainable_variables()) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. 
- sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies( - [apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], - "Forward-backward") - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py deleted file mode 100644 index 51dfe3f1cb26e394cd7df0f99e2e9f7e431c1bfb..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/alexnet_multi_gpu.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from six.moves import xrange # pylint: disable=redefined-builtin -from datetime import datetime -import math -import re -import time - -import tensorflow.python.platform -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_string('data_format', 'NCHW', - """The data format for Convnet operations. - Can be either NHWC or NCHW. - """) - -tf.app.flags.DEFINE_string('train_dir', '/train_model', - """Directory where to write event logs """ - """and checkpoint.""") -tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""") -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 -NUM_EPOCHS_PER_DECAY = 50 -INITIAL_LEARNING_RATE = 0.1 -LEARNING_RATE_DECAY_FACTOR = 0.1 -TOWER_NAME = 'tower' - - -def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] - else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d( - inpOp, - kernel, - strides, - padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable( - name=name + '_b', - shape=[nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32) - - bias = tf.reshape( - tf.nn.bias_add( - conv, biases, data_format=FLAGS.data_format), - conv.get_shape()) - - conv1 = tf.nn.relu(bias, name=scope) - return conv1 - - -def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True): - with 
tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - biases = tf.get_variable( - name + '_b', [nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32, - trainable=True) - - affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ - tf.matmul(inpOp, kernel) + biases - - return affine1 - - -def _mpool(name, inpOp, kH, kW, dH, dW): - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool( - inpOp, - ksize=ksize, - strides=strides, - padding='VALID', - data_format=FLAGS.data_format, - name=name) - - -def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, - lsize, - bias=1.0, - alpha=0.001 / 9.0, - beta=0.75, - name=name) - - -def loss(logits, labels): - labels = tf.cast(labels, tf.int64) - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - - # The total loss is defined as the cross entropy loss plus all of the weight - # decay terms (L2 loss). 
- return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -def inference(images): - conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) - norm1 = _norm('norm1', pool1, lsize=5) - conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') - pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) - norm2 = _norm('norm2', pool2, lsize=5) - conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') - conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') - conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') - pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) - resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) - affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096) - affn2 = _affine('fc7', affn1, 4096, 4096) - affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc - - return affn3 - - -def tower_loss(scope): - """Calculate the total loss on a single tower running the model. - Args: - scope: unique prefix string identifying the tower, e.g. 'tower_0' - Returns: - Tensor of shape [] containing the total loss for a batch of data - """ - image_size = 224 - if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] - else: - image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] - images = tf.get_variable( - 'image', - image_shape, - initializer=tf.truncated_normal_initializer( - stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable( - 'label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. 
- last_layer = inference(images) - - # Build the portion of the Graph calculating the losses. Note that we will - # assemble the total_loss using a custom function below. - _ = loss(last_layer, labels) - - # Assemble all of the losses for the current tower only. - losses = tf.get_collection('losses', scope) - - # Calculate the total loss for the current tower. - total_loss = tf.add_n(losses, name='total_loss') - - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. - loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.scalar_summary(loss_name + ' (raw)', l) - tf.scalar_summary(loss_name, loss_averages.average(l)) - - with tf.control_dependencies([loss_averages_op]): - total_loss = tf.identity(total_loss) - return total_loss - - -def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. - Note that this function provides a synchronization point across all towers. - Args: - tower_grads: List of lists of (gradient, variable) tuples. The outer list - is over individual gradients. The inner list is over the gradient - calculation for each tower. - Returns: - List of pairs of (gradient, variable) where the gradient has been averaged - across all towers. - """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... 
, (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -def time_tensorflow_run(session, target): - num_steps_burn_in = 50 - total_duration = 0.0 - total_duration_squared = 0.0 - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _, loss_value = session.run(target) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - sec_per_batch = duration - - format_str = ( - '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch batch_size = %d)') - print(format_str % - (datetime.now(), i - num_steps_burn_in, loss_value, - duration, sec_per_batch, num_examples_per_step)) - - total_duration += duration - total_duration_squared += duration * duration - - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. 
- global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), - trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay( - INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - - # Create an optimizer that performs gradient descent. - opt = tf.train.MomentumOptimizer(lr, 0.9) - - # Calculate the gradients for each model tower. - tower_grads = [] - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the model. This function - # constructs the entire model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. - summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. 
allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py deleted file mode 100644 index 37b2ba6911514b96bc0be8994d8d095512a1cd4a..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/googlenet.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from six.moves import xrange -from datetime import datetime -import math -import time - -import tensorflow.python.platform -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('forward_only', False, - """Only run the forward pass.""") -tf.app.flags.DEFINE_boolean('forward_backward_only', False, - """Only run the forward-forward pass.""") -tf.app.flags.DEFINE_string('data_format', 'NCHW', - """The data format for Convnet operations. - Can be either NHWC or NCHW. 
- """) -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -parameters = [] - -conv_counter = 1 -pool_counter = 1 -affine_counter = 1 - - -def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005): - global conv_counter - global parameters - name = 'conv' + str(conv_counter) - conv_counter += 1 - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1), - name='weights') - - if wd is not None and wd > 0: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] - else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d( - inpOp, - kernel, - strides, - padding=padType, - data_format=FLAGS.data_format) - biases = tf.Variable( - tf.constant( - 0.0, shape=[nOut], dtype=tf.float32), - trainable=True, - name='biases') - bias = tf.reshape( - tf.nn.bias_add( - conv, biases, data_format=FLAGS.data_format), - conv.get_shape()) - conv1 = tf.nn.relu(bias, name=scope) - parameters += [kernel, biases] - return conv1 - - -def _affine(inpOp, nIn, nOut, act=True, wd=0.0005): - global affine_counter - global parameters - name = 'affine' + str(affine_counter) - affine_counter += 1 - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - [nIn, nOut], dtype=tf.float32, stddev=1e-1), - name='weights') - - if wd is not None and wd > 0: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - biases = tf.Variable( - tf.constant( - 0.0, shape=[nOut], dtype=tf.float32), - trainable=True, - name='biases') - affine1 = tf.nn.relu_layer( - inpOp, kernel, biases, - name=name) if act else tf.matmul(inpOp, kernel) + biases - parameters += [kernel, biases] - return affine1 - - -def _mpool(inpOp, kH, kW, dH, dW, padding): - global pool_counter - global 
parameters - name = 'pool' + str(pool_counter) - pool_counter += 1 - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def _apool(inpOp, kH, kW, dH, dW, padding): - global pool_counter - global parameters - name = 'pool' + str(pool_counter) - pool_counter += 1 - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): - conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID') - - conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID') - conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME') - - conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID') - conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME') - - pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME') - pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID') - - if FLAGS.data_format == 'NCHW': - channel_dim = 1 - else: - channel_dim = 3 - incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool]) - return incept - - -def loss(logits, labels): - batch_size = tf.size(labels) - labels = tf.expand_dims(labels, 1) - indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) - concated = tf.concat(1, [indices, labels]) - onehot_labels = tf.sparse_to_dense(concated, - tf.pack([batch_size, 1000]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits( - logits, onehot_labels, name='xentropy') - loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') - return loss - - -def inference(images): - # stage 1 - conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME') - pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') 
- # stage 2 - conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID') - conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME') - pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME') - - # stage 3 - incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32) - incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64) - pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME') - - # stage 4 - incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64) - incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64) - incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64) - incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64) - incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128) - pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME') - - # stage 5 - incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128) - incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128) - pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID') - - # output 1 - resh1 = tf.reshape(pool6, [-1, 1024]) - drop = tf.nn.dropout(resh1, 0.4) - affn1 = _affine(resh1, 1024, 1000, act=False) - - return affn1 - - -def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in range(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - global parameters - 
with tf.Graph().as_default(): - # Generate some dummy images. - image_size = 224 - if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size, image_size] - else: - image_shape = [FLAGS.batch_size, image_size, image_size, 3] - - images = tf.get_variable( - 'image', - image_shape, - initializer=tf.truncated_normal_initializer( - stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable( - 'label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - - # Compute gradients. - # opt = tf.train.GradientDescentOptimizer(0.001) - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer( - 0.0, dtype=tf.float32), - trainable=False, - dtype=tf.float32) - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables( - )) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - # Run the forward benchmark. 
- time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies( - [apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], "Forward-backward") - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py deleted file mode 100644 index 7179c5301cd0dc87744c2233ff5eb402dfbd7496..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/googlenet_multi_gpu.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from six.moves import xrange # pylint: disable=redefined-builtin -from datetime import datetime -import math -import re -import time - -import tensorflow.python.platform -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_string('data_format', 'NCHW', - """The data format for Convnet operations. - Can be either NHWC or NCHW. 
- """) - -tf.app.flags.DEFINE_string('train_dir', '/train_model', - """Directory where to write event logs """ - """and checkpoint.""") -tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""") -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 -NUM_EPOCHS_PER_DECAY = 50 -INITIAL_LEARNING_RATE = 0.1 -LEARNING_RATE_DECAY_FACTOR = 0.1 -TOWER_NAME = 'tower' - - -def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] - else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d( - inpOp, - kernel, - strides, - padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable( - name=name + '_b', - shape=[nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32) - - bias = tf.reshape( - tf.nn.bias_add( - conv, biases, data_format=FLAGS.data_format), - conv.get_shape()) - - conv1 = tf.nn.relu(bias, name=scope) - return conv1 - - -def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - biases = tf.get_variable( - name + '_b', [nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32, - trainable=True) - - affine1 = 
tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ - tf.matmul(inpOp, kernel) + biases - - return affine1 - - -def _mpool(name, inpOp, kH, kW, dH, dW, padding): - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def _apool(name, inpOp, kH, kW, dH, dW, padding): - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def loss(logits, labels): - labels = tf.cast(labels, tf.int64) - cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - - # The total loss is defined as the cross entropy loss plus all of the weight - # decay terms (L2 loss). 
- return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): - conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID') - - conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID') - conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME') - - conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID') - conv5 = _conv(name + '5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME') - - pool_ = _mpool(name + 'pool', inp, o4s1, o4s1, 1, 1, 'SAME') - pool = _conv(name + 'proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID') - - if FLAGS.data_format == 'NCHW': - channel_dim = 1 - else: - channel_dim = 3 - incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool]) - return incept - - -def inference(images): - # stage 1 - conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME') - - # stage 2 - conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID') - conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME') - pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME') - - # stage 3 - incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32) - incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64) - pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME') - - # stage 4 - incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64) - incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64) - incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64) - incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64) - incept4e = 
_inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3, - 128) - pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME') - - # stage 5 - incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128) - incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3, - 128) - pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID') - - # output 1 - resh1 = tf.reshape(pool6, [-1, 1024]) - drop = tf.nn.dropout(resh1, 0.4) - affn1 = _affine('fc_out', resh1, 1024, 1000, act=False) - - return affn1 - - -def tower_loss(scope): - """Calculate the total loss on a single tower running the model. - Args: - scope: unique prefix string identifying the tower, e.g. 'tower_0' - Returns: - Tensor of shape [] containing the total loss for a batch of data - """ - image_size = 224 - if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size, image_size] - else: - image_shape = [FLAGS.batch_size, image_size, image_size, 3] - images = tf.get_variable( - 'image', - image_shape, - initializer=tf.truncated_normal_initializer( - stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable( - 'label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - # Build the portion of the Graph calculating the losses. Note that we will - # assemble the total_loss using a custom function below. - _ = loss(last_layer, labels) - - # Assemble all of the losses for the current tower only. - losses = tf.get_collection('losses', scope) - - # Calculate the total loss for the current tower. - total_loss = tf.add_n(losses, name='total_loss') - - # Compute the moving average of all individual losses and the total loss. 
- loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. This helps the clarity of presentation on tensorboard. - loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.scalar_summary(loss_name + ' (raw)', l) - tf.scalar_summary(loss_name, loss_averages.average(l)) - - with tf.control_dependencies([loss_averages_op]): - total_loss = tf.identity(total_loss) - return total_loss - - -def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. - Note that this function provides a synchronization point across all towers. - Args: - tower_grads: List of lists of (gradient, variable) tuples. The outer list - is over individual gradients. The inner list is over the gradient - calculation for each tower. - Returns: - List of pairs of (gradient, variable) where the gradient has been averaged - across all towers. - """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. 
we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -def time_tensorflow_run(session, target): - num_steps_burn_in = 50 - total_duration = 0.0 - total_duration_squared = 0.0 - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _, loss_value = session.run(target) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - sec_per_batch = duration - - format_str = ( - '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch batch_size = %d)') - print(format_str % - (datetime.now(), i - num_steps_burn_in, loss_value, - duration, sec_per_batch, num_examples_per_step)) - - total_duration += duration - total_duration_squared += duration * duration - - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), - trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay( - INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - - # Create an optimizer that performs gradient descent. 
- opt = tf.train.MomentumOptimizer(lr, 0.9) - - # Calculate the gradients for each model tower. - tower_grads = [] - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the model. This function - # constructs the entire model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. - summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. 
- sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh deleted file mode 100755 index cf894fe3f2dca24e3acf863d625b3a7008793b83..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -e - -function test() { - cfg=$1 - batch_size=$2 - prefix=$3 - python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1 -} - -if [ ! -d "logs" ]; then - mkdir logs -fi - -# alexnet -test alexnet.py 64 alexnet -test alexnet.py 128 alexnet -test alexnet.py 256 alexnet -test alexnet.py 512 alexnet - -# googlenet -test googlenet.py 64 googlenet -test googlenet.py 128 googlenet - -# smallnet -test smallnet_mnist_cifar.py 64 smallnet -test smallnet_mnist_cifar.py 128 smallnet -test smallnet_mnist_cifar.py 256 smallnet -test smallnet_mnist_cifar.py 512 smallnet diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh deleted file mode 100755 index bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/run_multi.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -e - -function test() { - cfg=$1 - num_gpu=$2 - batch_size=$3 - batch_per_gpu=`expr ${batch_size} / ${num_gpu}` - prefix=$4 - python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1 -} - -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -# alexnet -test alexnet_multi_gpu.py 4 512 alexnet -test alexnet_multi_gpu.py 4 1024 alexnet - -# googlenet -test googlenet_multi_gpu.py 4 512 alexnet -test googlenet_multi_gpu.py 4 1024 alexnet diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py deleted file mode 100644 index 2ca1623b6b47a5bbea2ff67928f0a8a6374da0d7..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from six.moves import xrange # pylint: disable=redefined-builtin -from datetime import datetime -import math -import time - -import tensorflow.python.platform -import tensorflow as tf - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('forward_only', False, - """Only run the forward pass.""") -tf.app.flags.DEFINE_boolean('forward_backward_only', False, - """Only run the forward-forward pass.""") -tf.app.flags.DEFINE_string('data_format', 'NCHW', - """The data format for Convnet operations. - Can be either NHWC or NCHW. 
- """) -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -parameters = [] - -conv_counter = 1 -pool_counter = 1 -affine_counter = 1 - - -def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True): - global conv_counter - global parameters - name = 'conv' + str(conv_counter) - conv_counter += 1 - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1), - name='weights') - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] - else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d( - inpOp, - kernel, - strides, - padding=padType, - data_format=FLAGS.data_format) - biases = tf.Variable( - tf.constant( - 0.0, shape=[nOut], dtype=tf.float32), - trainable=True, - name='biases') - bias = tf.reshape( - tf.nn.bias_add( - conv, biases, data_format=FLAGS.data_format), - conv.get_shape()) - - conv1 = tf.nn.relu(bias, name=scope) if act else bias - - parameters += [kernel, biases] - - return conv1 - - -def _affine(inpOp, nIn, nOut, wd=None, act=True): - global affine_counter - global parameters - name = 'affine' + str(affine_counter) - affine_counter += 1 - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - [nIn, nOut], dtype=tf.float32, stddev=1e-1), - name='weights') - - if wd is not None: - weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') - tf.add_to_collection('losses', weight_decay) - - biases = tf.Variable( - tf.constant( - 0.0, shape=[nOut], dtype=tf.float32), - trainable=True, - name='biases') - - affine1 = tf.nn.relu_layer( - inpOp, kernel, biases, - name=name) if act else tf.matmul(inpOp, kernel) + biases - - parameters += [kernel, biases] - - return affine1 - - -def _mpool(inpOp, kH, kW, dH, dW, padding): - global 
pool_counter - global parameters - name = 'pool' + str(pool_counter) - pool_counter += 1 - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def _apool(inpOp, kH, kW, dH, dW, padding): - global pool_counter - global parameters - name = 'pool' + str(pool_counter) - pool_counter += 1 - if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] - else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool( - inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) - - -def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, - lsize, - bias=1.0, - alpha=0.001 / 9.0, - beta=0.75, - name=name) - - -def loss(logits, labels): - batch_size = tf.size(labels) - labels = tf.expand_dims(labels, 1) - indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) - concated = tf.concat(1, [indices, labels]) - onehot_labels = tf.sparse_to_dense(concated, - tf.pack([batch_size, 10]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits( - logits, onehot_labels, name='xentropy') - loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') - return loss - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -def inference(images): - conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME') - pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') - conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME') - pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME') - conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME') - pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME') - 
resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4]) - affn1 = _affine(resh1, 64 * 4 * 4, 64) - affn2 = _affine(affn1, 64, 10, act=False) - - print('conv1:', get_incoming_shape(conv1)) - print('pool1:', get_incoming_shape(pool1)) - print('conv2:', get_incoming_shape(conv2)) - print('pool2:', get_incoming_shape(pool2)) - print('conv3:', get_incoming_shape(conv3)) - print('pool3:', get_incoming_shape(pool3)) - - return affn2 - - -def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - global parameters - with tf.Graph().as_default(): - # Generate some dummy images. - image_size = 32 - # Note that our padding definition is slightly different the cuda-convnet. - # In order to force the model to start with the same activations sizes, - # we add 3 to the image_size and employ VALID padding above. 
- if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size, image_size] - else: - image_shape = [FLAGS.batch_size, image_size, image_size, 3] - - images = tf.get_variable( - 'image', - image_shape, - initializer=tf.truncated_normal_initializer( - stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable( - 'label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - - # Compute gradients. - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer( - 0.0, dtype=tf.float32), - trainable=False, - dtype=tf.float32) - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables( - )) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - # Run the forward benchmark. 
- time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies( - [apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], "Forward-backward") - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py deleted file mode 100644 index 7837669edc7a206c03e5b9fa2989bf45b35f0605..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/machine_translation.py +++ /dev/null @@ -1,624 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflow.python.framework import dtypes -from tensorflow.python.layers.core import Dense -from tensorflow.python.ops import check_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.framework import ops -from tensorflow.python.ops import rnn_cell_impl -from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell -from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple -from tensorflow.contrib.rnn.python.ops import core_rnn_cell -from tensorflow.python.ops import array_ops -from tensorflow.python.util import nest -import tensorflow.contrib.seq2seq as seq2seq -from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder -import numpy as np -import os -import argparse -import time - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--embedding_dim", - type=int, - default=512, - help="The dimension of embedding table. (default: %(default)d)") -parser.add_argument( - "--encoder_size", - type=int, - default=512, - help="The size of encoder bi-rnn unit. (default: %(default)d)") -parser.add_argument( - "--decoder_size", - type=int, - default=512, - help="The size of decoder rnn unit. (default: %(default)d)") -parser.add_argument( - "--batch_size", - type=int, - default=128, - help="The sequence number of a mini-batch data. (default: %(default)d)") -parser.add_argument( - "--dict_size", - type=int, - default=30000, - help="The dictionary capacity. Dictionaries of source sequence and " - "target dictionary have same capacity. (default: %(default)d)") -parser.add_argument( - "--max_time_steps", - type=int, - default=81, - help="Max number of time steps for sequence. (default: %(default)d)") -parser.add_argument( - "--pass_num", - type=int, - default=10, - help="The pass number to train. 
(default: %(default)d)") -parser.add_argument( - "--learning_rate", - type=float, - default=0.0002, - help="Learning rate used to train the model. (default: %(default)f)") -parser.add_argument( - "--infer_only", action='store_true', help="If set, run forward only.") -parser.add_argument( - "--beam_size", - type=int, - default=3, - help="The width for beam searching. (default: %(default)d)") -parser.add_argument( - "--max_generation_length", - type=int, - default=250, - help="The maximum length of sequence when doing generation. " - "(default: %(default)d)") -parser.add_argument( - "--save_freq", - type=int, - default=500, - help="Save model checkpoint every this interation. (default: %(default)d)") -parser.add_argument( - "--model_dir", - type=str, - default='./checkpoint', - help="Path to save model checkpoints. (default: %(default)d)") - -_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name - -START_TOKEN_IDX = 0 -END_TOKEN_IDX = 1 - - -class LSTMCellWithSimpleAttention(RNNCell): - """Add attention mechanism to BasicLSTMCell. - This class is a wrapper based on tensorflow's `BasicLSTMCell`. - """ - - def __init__(self, - num_units, - encoder_vector, - encoder_proj, - source_sequence_length, - forget_bias=1.0, - state_is_tuple=True, - activation=None, - reuse=None): - super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse) - if not state_is_tuple: - logging.warn("%s: Using a concatenated state is slower and will " - "soon be deprecated. 
Use state_is_tuple=True.", self) - self._num_units = num_units - # set padding part to 0 - self._encoder_vector = self._reset_padding(encoder_vector, - source_sequence_length) - self._encoder_proj = self._reset_padding(encoder_proj, - source_sequence_length) - self._forget_bias = forget_bias - self._state_is_tuple = state_is_tuple - self._activation = activation or math_ops.tanh - self._linear = None - - @property - def state_size(self): - return (LSTMStateTuple(self._num_units, self._num_units) \ - if self._state_is_tuple else 2 * self._num_units) - - @property - def output_size(self): - return self._num_units - - def zero_state(self, batch_size, dtype): - state_size = self.state_size - if hasattr(self, "_last_zero_state"): - (last_state_size, last_batch_size, last_dtype, - last_output) = getattr(self, "_last_zero_state") - if (last_batch_size == batch_size and last_dtype == dtype and - last_state_size == state_size): - return last_output - with ops.name_scope( - type(self).__name__ + "ZeroState", values=[batch_size]): - output = _zero_state_tensors(state_size, batch_size, dtype) - self._last_zero_state = (state_size, batch_size, dtype, output) - return output - - def call(self, inputs, state): - sigmoid = math_ops.sigmoid - # Parameters of gates are concatenated into one multiply for efficiency. 
- if self._state_is_tuple: - c, h = state - else: - c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) - - # get context from encoder outputs - context = self._simple_attention(self._encoder_vector, - self._encoder_proj, h) - - if self._linear is None: - self._linear = _Linear([inputs, context, h], 4 * self._num_units, - True) - # i = input_gate, j = new_input, f = forget_gate, o = output_gate - i, j, f, o = array_ops.split( - value=self._linear([inputs, context, h]), - num_or_size_splits=4, - axis=1) - - new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * - self._activation(j)) - new_h = self._activation(new_c) * sigmoid(o) - - if self._state_is_tuple: - new_state = LSTMStateTuple(new_c, new_h) - else: - new_state = array_ops.concat([new_c, new_h], 1) - return new_h, new_state - - def _simple_attention(self, encoder_vec, encoder_proj, decoder_state): - """Implement the attention function. - The implementation has the same logic to the fluid decoder. - """ - decoder_state_proj = tf.contrib.layers.fully_connected( - inputs=decoder_state, - num_outputs=self._num_units, - activation_fn=None, - biases_initializer=None) - decoder_state_expand = tf.tile( - tf.expand_dims( - input=decoder_state_proj, axis=1), - [1, tf.shape(encoder_proj)[1], 1]) - concated = tf.concat([decoder_state_expand, encoder_proj], axis=2) - # need reduce the first dimension - attention_weights = tf.contrib.layers.fully_connected( - inputs=tf.reshape( - concated, shape=[-1, self._num_units * 2]), - num_outputs=1, - activation_fn=tf.nn.tanh, - biases_initializer=None) - attention_weights_reshaped = tf.reshape( - attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1]) - # normalize the attention weights using softmax - attention_weights_normed = tf.nn.softmax( - attention_weights_reshaped, dim=1) - scaled = tf.multiply(attention_weights_normed, encoder_vec) - context = tf.reduce_sum(scaled, axis=1) - return context - - def _reset_padding(self, - memory, - 
memory_sequence_length, - check_inner_dims_defined=True): - """Reset the padding part for encoder inputs. - This funtion comes from tensorflow's `_prepare_memory` function. - """ - memory = nest.map_structure( - lambda m: ops.convert_to_tensor(m, name="memory"), memory) - if memory_sequence_length is not None: - memory_sequence_length = ops.convert_to_tensor( - memory_sequence_length, name="memory_sequence_length") - if check_inner_dims_defined: - - def _check_dims(m): - if not m.get_shape()[2:].is_fully_defined(): - raise ValueError( - "Expected memory %s to have fully defined inner dims, " - "but saw shape: %s" % (m.name, m.get_shape())) - - nest.map_structure(_check_dims, memory) - if memory_sequence_length is None: - seq_len_mask = None - else: - seq_len_mask = array_ops.sequence_mask( - memory_sequence_length, - maxlen=array_ops.shape(nest.flatten(memory)[0])[1], - dtype=nest.flatten(memory)[0].dtype) - seq_len_batch_size = (memory_sequence_length.shape[0].value or - array_ops.shape(memory_sequence_length)[0]) - - def _maybe_mask(m, seq_len_mask): - rank = m.get_shape().ndims - rank = rank if rank is not None else array_ops.rank(m) - extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32) - m_batch_size = m.shape[0].value or array_ops.shape(m)[0] - if memory_sequence_length is not None: - message = ("memory_sequence_length and memory tensor " - "batch sizes do not match.") - with ops.control_dependencies([ - check_ops.assert_equal( - seq_len_batch_size, m_batch_size, message=message) - ]): - seq_len_mask = array_ops.reshape( - seq_len_mask, - array_ops.concat( - (array_ops.shape(seq_len_mask), extra_ones), 0)) - return m * seq_len_mask - else: - return m - - return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), - memory) - - -def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, - target_dict_dim, is_generating, beam_size, - max_generation_length): - src_word_idx = tf.placeholder(tf.int32, shape=[None, None]) - 
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) - - src_embedding_weights = tf.get_variable("source_word_embeddings", - [source_dict_dim, embedding_dim]) - src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx) - - src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) - src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) - # no peephole - encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn( - cell_fw=src_forward_cell, - cell_bw=src_reversed_cell, - inputs=src_embedding, - sequence_length=src_sequence_length, - dtype=tf.float32) - - # concat the forward outputs and backward outputs - encoded_vec = tf.concat(encoder_outputs, axis=2) - - # project the encoder outputs to size of decoder lstm - encoded_proj = tf.contrib.layers.fully_connected( - inputs=tf.reshape( - encoded_vec, shape=[-1, embedding_dim * 2]), - num_outputs=decoder_size, - activation_fn=None, - biases_initializer=None) - encoded_proj_reshape = tf.reshape( - encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size]) - - # get init state for decoder lstm's H - backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1]) - decoder_boot = tf.contrib.layers.fully_connected( - inputs=tf.reshape( - backword_first, shape=[-1, embedding_dim]), - num_outputs=decoder_size, - activation_fn=tf.nn.tanh, - biases_initializer=None) - - # prepare the initial state for decoder lstm - cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32) - initial_state = LSTMStateTuple(cell_init, decoder_boot) - - # create decoder lstm cell - decoder_cell = LSTMCellWithSimpleAttention( - decoder_size, - encoded_vec - if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size), - encoded_proj_reshape if not is_generating else - seq2seq.tile_batch(encoded_proj_reshape, beam_size), - src_sequence_length if not is_generating else - seq2seq.tile_batch(src_sequence_length, beam_size), - forget_bias=0.0) - - output_layer = Dense(target_dict_dim, 
name='output_projection') - - if not is_generating: - trg_word_idx = tf.placeholder(tf.int32, shape=[None, None]) - trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) - trg_embedding_weights = tf.get_variable( - "target_word_embeddings", [target_dict_dim, embedding_dim]) - trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights, - trg_word_idx) - - training_helper = seq2seq.TrainingHelper( - inputs=trg_embedding, - sequence_length=trg_sequence_length, - time_major=False, - name='training_helper') - - training_decoder = seq2seq.BasicDecoder( - cell=decoder_cell, - helper=training_helper, - initial_state=initial_state, - output_layer=output_layer) - - # get the max length of target sequence - max_decoder_length = tf.reduce_max(trg_sequence_length) - - decoder_outputs_train, _, _ = seq2seq.dynamic_decode( - decoder=training_decoder, - output_time_major=False, - impute_finished=True, - maximum_iterations=max_decoder_length) - - decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output) - decoder_pred_train = tf.argmax( - decoder_logits_train, axis=-1, name='decoder_pred_train') - masks = tf.sequence_mask( - lengths=trg_sequence_length, - maxlen=max_decoder_length, - dtype=tf.float32, - name='masks') - - # place holder of label sequence - lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None]) - - # compute the loss - loss = seq2seq.sequence_loss( - logits=decoder_logits_train, - targets=lbl_word_idx, - weights=masks, - average_across_timesteps=True, - average_across_batch=True) - - # return feeding list and loss operator - return { - 'src_word_idx': src_word_idx, - 'src_sequence_length': src_sequence_length, - 'trg_word_idx': trg_word_idx, - 'trg_sequence_length': trg_sequence_length, - 'lbl_word_idx': lbl_word_idx - }, loss - else: - start_tokens = tf.ones([tf.shape(src_word_idx)[0], ], - tf.int32) * START_TOKEN_IDX - # share the same embedding weights with target word - trg_embedding_weights = tf.get_variable( - 
"target_word_embeddings", [target_dict_dim, embedding_dim]) - - inference_decoder = beam_search_decoder.BeamSearchDecoder( - cell=decoder_cell, - embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens), - start_tokens=start_tokens, - end_token=END_TOKEN_IDX, - initial_state=tf.nn.rnn_cell.LSTMStateTuple( - tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size), - tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)), - beam_width=beam_size, - output_layer=output_layer) - - decoder_outputs_decode, _, _ = seq2seq.dynamic_decode( - decoder=inference_decoder, - output_time_major=False, - #impute_finished=True,# error occurs - maximum_iterations=max_generation_length) - - predicted_ids = decoder_outputs_decode.predicted_ids - - return { - 'src_word_idx': src_word_idx, - 'src_sequence_length': src_sequence_length - }, predicted_ids - - -def print_arguments(args): - print('----------- Configuration Arguments -----------') - for arg, value in vars(args).iteritems(): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def padding_data(data, padding_size, value): - data = data + [value] * padding_size - return data[:padding_size] - - -def save(sess, path, var_list=None, global_step=None): - saver = tf.train.Saver(var_list) - save_path = saver.save(sess, save_path=path, global_step=global_step) - print('Model save at %s' % save_path) - - -def restore(sess, path, var_list=None): - # var_list = None returns the list of all saveable variables - saver = tf.train.Saver(var_list) - saver.restore(sess, save_path=path) - print('model restored from %s' % path) - - -def adapt_batch_data(data): - src_seq = map(lambda x: x[0], data) - trg_seq = map(lambda x: x[1], data) - lbl_seq = map(lambda x: x[2], data) - - src_sequence_length = np.array( - [len(seq) for seq in src_seq]).astype('int32') - src_seq_maxlen = np.max(src_sequence_length) - - trg_sequence_length = np.array( - [len(seq) for seq in 
trg_seq]).astype('int32') - trg_seq_maxlen = np.max(trg_sequence_length) - - src_seq = np.array( - [padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) - for seq in src_seq]).astype('int32') - - trg_seq = np.array( - [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) - for seq in trg_seq]).astype('int32') - - lbl_seq = np.array( - [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) - for seq in lbl_seq]).astype('int32') - - return { - 'src_word_idx': src_seq, - 'src_sequence_length': src_sequence_length, - 'trg_word_idx': trg_seq, - 'trg_sequence_length': trg_sequence_length, - 'lbl_word_idx': lbl_seq - } - - -def train(): - feeding_dict, loss = seq_to_seq_net( - embedding_dim=args.embedding_dim, - encoder_size=args.encoder_size, - decoder_size=args.decoder_size, - source_dict_dim=args.dict_size, - target_dict_dim=args.dict_size, - is_generating=False, - beam_size=args.beam_size, - max_generation_length=args.max_generation_length) - - global_step = tf.Variable(0, trainable=False, name='global_step') - trainable_params = tf.trainable_variables() - optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) - - gradients = tf.gradients(loss, trainable_params) - # may clip the parameters - clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) - - updates = optimizer.apply_gradients( - zip(gradients, trainable_params), global_step=global_step) - - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) - - train_batch_generator = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), - batch_size=args.batch_size) - - test_batch_generator = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), - batch_size=args.batch_size) - - def do_validataion(): - total_loss = 0.0 - count = 0 - for batch_id, data in enumerate(test_batch_generator()): - adapted_batch_data = adapt_batch_data(data) - outputs = sess.run([loss], - feed_dict={ - item[1]: 
adapted_batch_data[item[0]] - for item in feeding_dict.items() - }) - total_loss += outputs[0] - count += 1 - return total_loss / count - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - - with tf.Session(config=config) as sess: - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - sess.run(init_l) - sess.run(init_g) - for pass_id in xrange(args.pass_num): - pass_start_time = time.time() - words_seen = 0 - for batch_id, data in enumerate(train_batch_generator()): - adapted_batch_data = adapt_batch_data(data) - words_seen += np.sum(adapted_batch_data['src_sequence_length']) - words_seen += np.sum(adapted_batch_data['trg_sequence_length']) - outputs = sess.run([updates, loss], - feed_dict={ - item[1]: adapted_batch_data[item[0]] - for item in feeding_dict.items() - }) - print("pass_id=%d, batch_id=%d, train_loss: %f" % - (pass_id, batch_id, outputs[1])) - pass_end_time = time.time() - test_loss = do_validataion() - time_consumed = pass_end_time - pass_start_time - words_per_sec = words_seen / time_consumed - print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % - (pass_id, test_loss, words_per_sec, time_consumed)) - - -def infer(): - feeding_dict, predicted_ids = seq_to_seq_net( - embedding_dim=args.embedding_dim, - encoder_size=args.encoder_size, - decoder_size=args.decoder_size, - source_dict_dim=args.dict_size, - target_dict_dim=args.dict_size, - is_generating=True, - beam_size=args.beam_size, - max_generation_length=args.max_generation_length) - - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) - test_batch_generator = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), - batch_size=args.batch_size) - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - with tf.Session(config=config) as sess: - restore(sess, 
'./checkpoint/tf_seq2seq-1500') - for batch_id, data in enumerate(test_batch_generator()): - src_seq = map(lambda x: x[0], data) - - source_language_seq = [ - src_dict[item] for seq in src_seq for item in seq - ] - - src_sequence_length = np.array( - [len(seq) for seq in src_seq]).astype('int32') - src_seq_maxlen = np.max(src_sequence_length) - src_seq = np.array([ - padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) - for seq in src_seq - ]).astype('int32') - - outputs = sess.run([predicted_ids], - feed_dict={ - feeding_dict['src_word_idx']: src_seq, - feeding_dict['src_sequence_length']: - src_sequence_length - }) - - print("\nDecoder result comparison: ") - source_language_seq = ' '.join(source_language_seq).lstrip( - '').rstrip('').strip() - inference_seq = '' - print(" --> source: " + source_language_seq) - for item in outputs[0][0]: - if item[0] == END_TOKEN_IDX: break - inference_seq += ' ' + trg_dict.get(item[0], '') - print(" --> inference: " + inference_seq) - - -if __name__ == '__main__': - args = parser.parse_args() - print_arguments(args) - if args.infer_only: - infer() - else: - train() diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py deleted file mode 100644 index 03d533fecfededddd3956ba83ea600456782cfc9..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/mnist.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import time -import numpy as np - -import tensorflow as tf - -DTYPE = tf.float32 - - -def parse_args(): - parser = argparse.ArgumentParser("mnist model benchmark.") - parser.add_argument( - '--batch_size', type=int, default=128, help='The minibatch size.') - parser.add_argument( - '--iterations', type=int, default=35, help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=5, help='The number of passes.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - args = parser.parse_args() - return args - - -def run_benchmark(args): - def weight_variable(dtype, shape): - initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype) - return tf.Variable(initial) - - def bias_variable(dtype, shape): - initial = tf.constant(0.1, shape=shape, dtype=dtype) - return tf.Variable(initial) - - device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' - with tf.device(device): - images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1)) - labels = tf.placeholder(tf.int64, shape=(None, )) - - # conv1, relu, pool1 - conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20]) - conv1_bias = bias_variable(DTYPE, [20]) - conv1 = tf.nn.conv2d( - images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID") - relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias)) - pool1 = tf.nn.max_pool( - relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") - - # conv2, relu, pool2 - conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50]) - conv2_bias = bias_variable(DTYPE, [50]) - conv2 = tf.nn.conv2d( - pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID") - relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias)) - pool2 = tf.nn.max_pool( - relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") - - # FC - pool_shape = 
pool2.get_shape().as_list() - hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1) - reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim)) - fc_weights = weight_variable(DTYPE, [hidden_dim, 10]) - fc_bias = bias_variable(DTYPE, [10]) - logits = tf.matmul(reshape, fc_weights) + fc_bias - - # Get prediction - prediction = tf.nn.softmax(logits) - - # Loss - one_hot_labels = tf.one_hot(labels, depth=10) - cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1]) - avg_cost = tf.reduce_mean(cost) - - # Get accuracy - correct = tf.equal(tf.argmax(prediction, 1), labels) - accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) - - # metrics, g_accuracy - with tf.variable_scope("reset_metrics_accuracy_scope") as scope: - g_accuracy = tf.metrics.accuracy( - labels, tf.argmax( - prediction, axis=1)) - vars = tf.contrib.framework.get_variables( - scope, collection=tf.GraphKeys.LOCAL_VARIABLES) - g_accuracy_reset_op = tf.variables_initializer(vars) - - # Optimizer - opt = tf.train.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999) - train_op = opt.minimize(avg_cost) - # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost) - - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=args.batch_size) - - def eval_test(): - sess.run(g_accuracy_reset_op) - for batch_id, data in enumerate(test_reader()): - images_data = np.array( - map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") - labels_data = np.array(map(lambda x: x[1], data)).astype("int64") - - loss, acc, g_acc = sess.run( - [avg_cost, accuracy, g_accuracy], - feed_dict={images: images_data, - labels: labels_data}) - return g_acc[1] - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - - with tf.Session(config=config) as sess: - init_g = 
tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - sess.run(init_g) - sess.run(init_l) - for pass_id in range(args.pass_num): - sess.run(g_accuracy_reset_op) - - pass_start = time.time() - for batch_id, data in enumerate(train_reader()): - images_data = np.array( - map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") - labels_data = np.array(map(lambda x: x[1], data)).astype( - "int64") - - start = time.time() - _, loss, acc, g_acc = sess.run( - [train_op, avg_cost, accuracy, g_accuracy], - feed_dict={images: images_data, - labels: labels_data}) - end = time.time() - - print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % - (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) - - pass_end = time.time() - test_avg_acc = eval_test() - - print( - "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f" - % (pass_id, g_acc[1], test_avg_acc, - (pass_end - pass_start) / 1000)) - - -def print_arguments(args): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - run_benchmark(args) diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py deleted file mode 100644 index fdb044195766b847e16a0cc33424a999c1d9166e..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/resnet.py +++ /dev/null @@ -1,503 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py - -Get help: python resnet.py --help -See performance on flowers: python resnet.py -Train on cifar10: python resnet.py --data=cifar10 --with_test -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import time -import numpy as np - -import tensorflow as tf - -DTYPE = tf.float32 - - -def parse_args(): - parser = argparse.ArgumentParser('Convolution model benchmark.') - parser.add_argument( - '--model', - type=str, - choices=['resnet'], - default='resnet', - help='The model architecture.') - parser.add_argument( - '--batch_size', type=int, default=32, help='The minibatch size.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='use real data or fake data') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', - type=int, - default=105, - help='The number of minibatches.') - parser.add_argument( - '--pass_num', type=int, default=300, help='The number of passes.') - parser.add_argument( - '--order', - type=str, - default='NHWC', - choices=['NCHW', 'NHWC'], - help='The data order, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--data', - type=str, - default='flowers102', - choices=['flowers102', 
'cifar10'], - help='The kinds of data.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--with_test', - action='store_true', - help='If set, test the testset during training.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - args = parser.parse_args() - return args - - -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[ - 'with_test'] else vars(args)['iterations'] - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def fixed_padding(inputs, kernel_size, data_format): - """Pads the input along the spatial dimensions independently of input size. - Args: - inputs: A tensor of size [batch, channels, height_in, width_in] or - [batch, height_in, width_in, channels] depending on data_format. - kernel_size: The kernel to be used in the conv2d or max_pool2d operation. - Should be a positive integer. - data_format: The input format ('channels_last' or 'channels_first'). - Returns: - A tensor with the same format as the input with the data either intact - (if kernel_size == 1) or padded (if kernel_size > 1). 
- """ - pad_total = kernel_size - 1 - pad_beg = pad_total // 2 - pad_end = pad_total - pad_beg - - if data_format == 'channels_first': - padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end], - [pad_beg, pad_end]]) - else: - padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end], - [pad_beg, pad_end], [0, 0]]) - return padded_inputs - - -def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): - """Strided 2-D convolution with explicit padding.""" - # The padding is consistent and is based only on `kernel_size`, not on the - # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone). - # This is consistent with PaddlePaddle. - # In addition, the calculation for output size in TensorFlow can refer: - # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc - if strides > 1: - inputs = fixed_padding(inputs, kernel_size, data_format) - - return tf.layers.conv2d( - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - padding=('SAME' if strides == 1 else 'VALID'), - use_bias=False, - kernel_initializer=tf.variance_scaling_initializer(), - data_format=data_format) - - -def conv_bn(inputs, - filters, - kernel_size, - strides, - is_training, - data_format, - act=True): - # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): - # set fused=True for a significant performance boost. 
See - # https://www.tensorflow.org/performance/performance_guide#common_fused_ops - inputs = conv2d_fixed_padding( - inputs=inputs, - filters=filters, - kernel_size=kernel_size, - strides=strides, - data_format=data_format) - inputs = tf.layers.batch_normalization( - inputs=inputs, - axis=1 if data_format == 'channels_first' else 3, - momentum=0.9, - epsilon=1e-05, - center=True, - scale=True, - training=is_training, - fused=True) - if act: - inputs = tf.nn.relu(inputs) - return inputs - - -def basicblock(inputs, filters, is_training, projection_shortcut, strides, - data_format): - shortcut = inputs - if projection_shortcut is not None: - shortcut = projection_shortcut(inputs) - inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format) - inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) - inputs = inputs + shortcut - inputs = tf.nn.relu(inputs) - return inputs - - -def bottleneck(inputs, filters, is_training, projection_shortcut, strides, - data_format): - shortcut = inputs - if projection_shortcut is not None: - shortcut = projection_shortcut(inputs) - inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format) - inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) - inputs = conv_bn( - inputs, filters * 4, 1, 1, is_training, data_format, act=False) - inputs = inputs + shortcut - inputs = tf.nn.relu(inputs) - return inputs - - -def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name, - data_format): - # Bottleneck blocks end with 4x the number of filters as they start with - filters_out = 4 * filters if block_fn is bottleneck else filters - - def projection_shortcut(inputs): - return conv2d_fixed_padding( - inputs=inputs, - filters=filters_out, - kernel_size=1, - strides=strides, - data_format=data_format) - - # Only the first block per block_layer uses projection_shortcut and strides - inputs = block_fn(inputs, filters, is_training, projection_shortcut, - strides, 
data_format) - - for _ in range(1, blocks): - inputs = block_fn(inputs, filters, is_training, None, 1, data_format) - - return tf.identity(inputs, name) - - -def resnet_imagenet(depth, class_dim, data_format): - """Returns the ResNet model for a given size and number of output classes.""" - - def resnet_generator(block_fn, - layers, - num_classes, - data_format='channels_last'): - if data_format is None: - data_format = ('channels_first' - if tf.test.is_built_with_cuda() else 'channels_last') - - def model(inputs, is_training): - """Constructs the ResNet model given the inputs.""" - if data_format == 'channels_first': - # Convert the inputs from channels_last (NHWC) to channels_first (NCHW). - # This provides a large performance boost on GPU. See - # https://www.tensorflow.org/performance/performance_guide#data_formats - inputs = tf.transpose(inputs, [0, 3, 1, 2]) - - inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format) - inputs = tf.identity(inputs, 'initial_conv') - inputs = tf.layers.max_pooling2d( - inputs=inputs, - pool_size=3, - strides=2, - padding='SAME', - data_format=data_format) - inputs = tf.identity(inputs, 'initial_max_pool') - inputs = block_layer(inputs, 64, block_fn, layers[0], 1, - is_training, 'block_layer1', data_format) - inputs = block_layer(inputs, 128, block_fn, layers[1], 2, - is_training, 'block_layer2', data_format) - inputs = block_layer(inputs, 256, block_fn, layers[2], 2, - is_training, 'block_layer3', data_format) - inputs = block_layer(inputs, 512, block_fn, layers[3], 2, - is_training, 'block_layer4', data_format) - inputs = tf.layers.average_pooling2d( - inputs=inputs, - pool_size=7, - strides=1, - padding='VALID', - data_format=data_format) - inputs = tf.identity(inputs, 'final_avg_pool') - inputs = tf.reshape(inputs, - [-1, 512 if block_fn is basicblock else 2048]) - inputs = tf.layers.dense(inputs=inputs, units=num_classes) - inputs = tf.identity(inputs, 'final_dense') - return inputs - - return model - - model_params = 
{ - 18: { - 'block': basicblock, - 'layers': [2, 2, 2, 2] - }, - 34: { - 'block': basicblock, - 'layers': [3, 4, 6, 3] - }, - 50: { - 'block': bottleneck, - 'layers': [3, 4, 6, 3] - }, - 101: { - 'block': bottleneck, - 'layers': [3, 4, 23, 3] - }, - 152: { - 'block': bottleneck, - 'layers': [3, 8, 36, 3] - }, - 200: { - 'block': bottleneck, - 'layers': [3, 24, 36, 3] - } - } - if depth not in model_params: - raise ValueError('Not a valid depth:', depth) - params = model_params[depth] - return resnet_generator(params['block'], params['layers'], class_dim, - data_format) - - -def resnet_cifar10(depth, num_classes, data_format): - if depth % 6 != 2: - raise ValueError('depth must be 6n + 2:', depth) - - num_blocks = (depth - 2) // 6 - - if data_format is None: - data_format = ('channels_first' - if tf.test.is_built_with_cuda() else 'channels_last') - - def model(inputs, is_training): - inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format) - inputs = tf.identity(inputs, 'initial_conv') - inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training, - 'block_layer1', data_format) - inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training, - 'block_layer2', data_format) - inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training, - 'block_layer3', data_format) - inputs = tf.layers.average_pooling2d( - inputs=inputs, - pool_size=8, - strides=1, - padding='VALID', - data_format=data_format) - inputs = tf.identity(inputs, 'final_avg_pool') - inputs = tf.reshape(inputs, [-1, 64]) - inputs = tf.layers.dense(inputs=inputs, units=num_classes) - inputs = tf.identity(inputs, 'final_dense') - return inputs - - return model - - -def run_benchmark(args, data_format='channels_last', device='/cpu:0'): - """Our model_fn for ResNet to be used with our Estimator.""" - - class_dim = 1000 - dshape = (None, 224, 224, 3) - - pdshape = (3, 224, 224) - if args.data == 'flowers102': - class_dim = 102 - dshape = (None, 224, 224, 3) - pdshape = (3, 
224, 224) - elif args.data == 'cifar10': - class_dim = 10 - dshape = (None, 32, 32, 3) - pdshape = (3, 32, 32) - - with tf.device(device): - images = tf.placeholder(DTYPE, shape=dshape) - labels = tf.placeholder(tf.int64, shape=(None, )) - is_training = tf.placeholder('bool') - onehot_labels = tf.one_hot(labels, depth=class_dim) - - network = resnet_cifar10( - 32, class_dim, - data_format) if args.data == 'cifar10' else resnet_imagenet( - 50, class_dim, data_format) - - logits = network(inputs=images, is_training=is_training) - - cross_entropy = tf.losses.softmax_cross_entropy( - logits=logits, onehot_labels=onehot_labels) - avg_cost = tf.reduce_mean(cross_entropy) - - correct = tf.equal(tf.argmax(logits, 1), labels) - accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) - - lr = 0.1 if args.data == 'cifar10' else 0.01 - optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9) - - # Batch norm requires update_ops to be added as a train_op dependency. - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - train_op = optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - if args.data == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.dataset.cifar.test10() - if args.data == 'cifar10' else paddle.dataset.flowers.test(), - batch_size=100) - - def test(): - test_accs = [] - for batch_id, data in enumerate(test_reader()): - test_images = np.array( - map(lambda x: np.transpose(x[0].reshape(pdshape), - axes=[1, 2, 0]), data)).astype("float32") - test_labels = np.array(map(lambda x: x[1], data)).astype('int64') - test_accs.append( - accuracy.eval(feed_dict={ - images: test_images, - labels: test_labels, - is_training: False - })) - print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" % - (pass_id, num_samples / train_elapsed, 
np.mean(test_accs))) - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - - with tf.Session(config=config) as sess: - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - sess.run(init_g) - sess.run(init_l) - - if args.use_fake_data: - data = train_reader().next() - images_data = np.array( - map(lambda x: np.transpose(x[0].reshape(pdshape), - axes=[1, 2, 0]), data)).astype("float32") - labels_data = np.array(map(lambda x: x[1], data)).astype('int64') - iters, num_samples, start_time = 0, 0, 0.0 - for pass_id in range(args.pass_num): - if iters == args.iterations: - break - train_accs = [] - train_losses = [] - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - if not args.use_fake_data: - images_data = np.array( - map(lambda x: np.transpose(x[0].reshape(pdshape), - axes=[1, 2, 0]), data)).astype("float32") - labels_data = np.array(map(lambda x: x[1], data)).astype( - 'int64') - _, loss, acc = sess.run([train_op, avg_cost, accuracy], - feed_dict={ - images: images_data, - labels: labels_data, - is_training: True - }) - iters += 1 - train_accs.append(acc) - train_losses.append(loss) - num_samples += len(data) - print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" % - (pass_id, iters, loss, acc)) - - train_elapsed = time.time() - start_time - print("Pass=%d, Loss=%f, Accuray=%f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) - - # evaluation - if args.with_test: - test() - - if not args.with_test: - duration = time.time() - start_time - examples_per_sec = num_samples / duration - sec_per_batch = duration / (iters - args.skip_batch_num) - - print('Total examples: %d, total time: %.5f' % - (num_samples, duration)) - print('%.5f examples/sec, %.5f sec/batch' % - (examples_per_sec, sec_per_batch)) - - -if __name__ == 
'__main__': - args = parse_args() - print_arguments(args) - if tf.test.is_built_with_cuda(): - device = '/device:GPU:0' - if args.order == 'NHWC': - data_format = 'channels_last' - else: - data_format = 'channels_first' - else: - device = '/cpu:0' - if args.order == 'NHWC': - data_format = 'channels_last' - else: - raise ValueError('Only support NHWC order in CPU mode') - - run_benchmark(args, data_format, device) diff --git a/benchmark/tensorflow/rnn/README.md b/benchmark/tensorflow/rnn/README.md deleted file mode 100644 index da8e7b8b07969051cbec3ac6a713eaf7fc738a55..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/README.md +++ /dev/null @@ -1,5 +0,0 @@ -You also should install tflearn: - -```bash -pip install -r requirements.txt -``` diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py deleted file mode 100755 index ac08c10a4232a39b4f7ce93a3b14c778ee0c9e2e..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/reader.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os.path -import io -import numpy as np -import tensorflow as tf - -# tflearn -import tflearn -from tflearn.data_utils import to_categorical, pad_sequences -from tflearn.datasets import imdb - -FLAGS = tf.app.flags.FLAGS - - -class DataSet(object): - def __init__(self, data, labels): - assert data.shape[0] == labels.shape[0], ( - 'data.shape: %s labels.shape: %s' % (data.shape, labels.shape)) - self._num_examples = data.shape[0] - - self._data = data - self._labels = labels - self._epochs_completed = 0 - self._index_in_epoch = 0 - - @property - def data(self): - return self._data - - @property - def labels(self): - return self._labels - - @property - def num_examples(self): - return self._num_examples - - @property - def epochs_completed(self): - return self._epochs_completed - - def next_batch(self, batch_size): - assert batch_size <= self._num_examples - - start = self._index_in_epoch - self._index_in_epoch += batch_size - if self._index_in_epoch > self._num_examples: - # Finished epoch - self._epochs_completed += 1 - # Shuffle the data - perm = np.arange(self._num_examples) - np.random.shuffle(perm) - self._data = self._data[perm] - self._labels = self._labels[perm] - # Start next epoch - start = 0 - self._index_in_epoch = batch_size - - end = self._index_in_epoch - - return self._data[start:end], self._labels[start:end] - - -def create_datasets(file_path, vocab_size=30000, val_fraction=0.0): - - # IMDB Dataset loading - train, test, _ = imdb.load_data( - path=file_path, - n_words=vocab_size, - valid_portion=val_fraction, - sort_by_len=False) - trainX, trainY = train - testX, testY = test - - # Data preprocessing - # Sequence padding - trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.) - testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.) 
- # Converting labels to binary vectors - trainY = to_categorical(trainY, nb_classes=2) - testY = to_categorical(testY, nb_classes=2) - - train_dataset = DataSet(trainX, trainY) - - return train_dataset - - -def main(): - create_datasets('imdb.pkl') - - -if __name__ == "__main__": - main() diff --git a/benchmark/tensorflow/rnn/requirements.txt b/benchmark/tensorflow/rnn/requirements.txt deleted file mode 100644 index 4242e7d24fbbeb18e8fb9a760d76fa6d5363b03f..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -tflearn diff --git a/benchmark/tensorflow/rnn/rnn.py b/benchmark/tensorflow/rnn/rnn.py deleted file mode 100755 index f288083e13656563b511980553245142efec4e65..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/rnn.py +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env python -from six.moves import xrange # pylint: disable=redefined-builtin -import math -import time -import numpy as np -from datetime import datetime - -import reader -import tensorflow as tf -from tensorflow.python.ops import rnn - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('forward_only', False, - """Only run the forward pass.""") -tf.app.flags.DEFINE_boolean('forward_backward_only', False, - """Only run the forward-forward pass.""") -tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('emb_size', 128, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") - -VOCAB_SIZE = 30000 -NUM_CLASS = 2 - - -def get_feed_dict(x_data, y_data=None): - feed_dict = {} - - if 
y_data is not None: - feed_dict[y_input] = y_data - - for i in xrange(x_data.shape[0]): - feed_dict[x_input[i]] = x_data[i, :, :] - - return feed_dict - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -# Note input * W is done in LSTMCell, -# which is different from PaddlePaddle -def single_lstm(name, - incoming, - n_units, - use_peepholes=True, - return_seq=False, - return_state=False): - with tf.name_scope(name) as scope: - cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) - output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32) - out = output if return_seq else output[-1] - return (out, _cell_state) if return_state else out - - -def lstm(name, - incoming, - n_units, - use_peepholes=True, - return_seq=False, - return_state=False, - num_layers=1): - with tf.name_scope(name) as scope: - lstm_cell = tf.nn.rnn_cell.LSTMCell( - n_units, use_peepholes=use_peepholes) - cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers) - initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32) - if not isinstance(incoming, list): - # if the input is embeding, the Tensor shape : [None, time_step, emb_size] - incoming = [ - tf.squeeze(input_, [1]) - for input_ in tf.split(1, FLAGS.max_len, incoming) - ] - outputs, state = tf.nn.rnn(cell, - incoming, - initial_state=initial_state, - dtype=tf.float32) - out = outputs if return_seq else outputs[-1] - return (out, _cell_state) if return_state else out - - -def embedding(name, incoming, vocab_size, emb_size): - with tf.name_scope(name) as scope: - #with tf.device("/cpu:0"): - embedding = tf.get_variable( - name + '_emb', [vocab_size, emb_size], dtype=tf.float32) - out = tf.nn.embedding_lookup(embedding, incoming) - return out - - -def fc(name, inpOp, nIn, 
nOut, act=True): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - biases = tf.get_variable( - name + '_b', [nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32, - trainable=True) - - net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ - tf.matmul(inpOp, kernel) + biases - - return net - - -def inference(seq): - net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size) - print "emb:", get_incoming_shape(net) - net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers) - print "lstm:", get_incoming_shape(net) - net = fc('fc1', net, FLAGS.hidden_size, 2) - return net - - -def loss(logits, labels): - # one label index for one sample - labels = tf.cast(labels, tf.float32) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def time_tensorflow_run(session, target, x_input, y_input, info_string): - num_steps_burn_in = 50 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - data, label = train_dataset.next_batch(FLAGS.batch_size) - _ = session.run(target_op, feed_dict={x_input: data, y_input: label}) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = 
total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - with tf.Graph().as_default(): - global_step = 0 - with tf.device('/cpu:0'): - global_step = tf.Variable(0, trainable=False) - with tf.device('/gpu:0'): - #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input") - #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input") - x_input = tf.placeholder( - tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input") - y_input = tf.placeholder( - tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input") - # Generate some dummy sequnce. - - last_layer = inference(x_input) - - objective = loss(last_layer, y_input) - opt = tf.train.AdamOptimizer(0.001) - grads = opt.compute_gradients(objective) - apply_gradient_op = opt.apply_gradients( - grads, global_step=global_step) - - init = tf.initialize_all_variables() - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - time_tensorflow_run(sess, last_layer, x_input, y_input, - "Forward") - - if run_forward_backward: - with tf.control_dependencies([apply_gradient_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], x_input, - y_input, "Forward-backward") - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py 
b/benchmark/tensorflow/rnn/rnn_multi_gpu.py deleted file mode 100755 index eabee4fa8fe6325212ace1c11be4862cd2720b08..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/rnn_multi_gpu.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env python -from six.moves import xrange # pylint: disable=redefined-builtin -import re -import math -import time -import numpy as np -from datetime import datetime - -import reader -import tensorflow as tf -from tensorflow.python.ops import rnn - -FLAGS = tf.app.flags.FLAGS - -tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""") -tf.app.flags.DEFINE_integer('emb_size', 64, """Number of batches to run.""") -tf.app.flags.DEFINE_boolean('log_device_placement', False, - """Whether to log device placement.""") -tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""") - -VOCAB_SIZE = 30000 -NUM_CLASS = 2 - -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 -NUM_EPOCHS_PER_DECAY = 50 -INITIAL_LEARNING_RATE = 0.1 -LEARNING_RATE_DECAY_FACTOR = 0.1 -TOWER_NAME = 'tower' - -train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) - - -def get_incoming_shape(incoming): - """ Returns the incoming data shape """ - if isinstance(incoming, tf.Tensor): - return incoming.get_shape().as_list() - elif type(incoming) in [np.array, list, tuple]: - return np.shape(incoming) - else: - raise Exception("Invalid incoming layer.") - - -# Note input * W is done in LSTMCell, -# which is different from PaddlePaddle -def single_lstm(name, - incoming, - n_units, - use_peepholes=True, - return_seq=False, - return_state=False): - with tf.name_scope(name) as scope: - cell = tf.nn.rnn_cell.LSTMCell(n_units, 
use_peepholes=use_peepholes) - output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32) - out = output if return_seq else output[-1] - return (out, _cell_state) if return_state else out - - -def lstm(name, - incoming, - n_units, - use_peepholes=True, - return_seq=False, - return_state=False, - num_layers=1): - with tf.name_scope(name) as scope: - lstm_cell = tf.nn.rnn_cell.LSTMCell( - n_units, use_peepholes=use_peepholes) - cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers) - initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32) - if not isinstance(incoming, list): - # if the input is embeding, the Tensor shape : [None, time_step, emb_size] - incoming = [ - tf.squeeze(input_, [1]) - for input_ in tf.split(1, FLAGS.max_len, incoming) - ] - outputs, state = tf.nn.rnn(cell, - incoming, - initial_state=initial_state, - dtype=tf.float32) - out = outputs if return_seq else outputs[-1] - return (out, _cell_state) if return_state else out - - -def embedding(name, incoming, vocab_size, emb_size): - with tf.name_scope(name) as scope: - #with tf.device("/cpu:0"): - embedding = tf.get_variable( - name + '_emb', [vocab_size, emb_size], dtype=tf.float32) - out = tf.nn.embedding_lookup(embedding, incoming) - return out - - -def fc(name, inpOp, nIn, nOut, act=True): - with tf.name_scope(name) as scope: - kernel = tf.get_variable( - name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer( - stddev=0.01, dtype=tf.float32), - dtype=tf.float32) - - biases = tf.get_variable( - name + '_b', [nOut], - initializer=tf.constant_initializer( - value=0.0, dtype=tf.float32), - dtype=tf.float32, - trainable=True) - - net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ - tf.matmul(inpOp, kernel) + biases - - return net - - -def inference(seq): - net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size) - print "emb:", get_incoming_shape(net) - net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers) - print "lstm:", 
get_incoming_shape(net) - net = fc('fc1', net, FLAGS.hidden_size, 2) - return net - - -def loss(logits, labels): - # one label index for one sample - #labels = tf.cast(labels, tf.int64) - # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - # logits, labels, name='cross_entropy_per_example') - labels = tf.cast(labels, tf.float32) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') - cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') - tf.add_to_collection('losses', cross_entropy_mean) - return tf.add_n(tf.get_collection('losses'), name='total_loss') - - -def tower_loss(scope): - """Calculate the total loss on a single tower running the model. - Args: - scope: unique prefix string identifying the tower, e.g. 'tower_0' - Returns: - Tensor of shape [] containing the total loss for a batch of data - """ - data, label = train_dataset.next_batch(FLAGS.batch_size) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(data) - - # Build the portion of the Graph calculating the losses. Note that we will - # assemble the total_loss using a custom function below. - #_ = loss(last_layer, label) - _ = loss(last_layer, label) - - # Assemble all of the losses for the current tower only. - losses = tf.get_collection('losses', scope) - - # Calculate the total loss for the current tower. - total_loss = tf.add_n(losses, name='total_loss') - - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training - # session. 
This helps the clarity of presentation on tensorboard. - loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.scalar_summary(loss_name + ' (raw)', l) - #tf.scalar_summary(loss_name, loss_averages.average(l)) - - with tf.control_dependencies([loss_averages_op]): - total_loss = tf.identity(total_loss) - return total_loss - - -def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. - Note that this function provides a synchronization point across all towers. - Args: - tower_grads: List of lists of (gradient, variable) tuples. The outer list - is over individual gradients. The inner list is over the gradient - calculation for each tower. - Returns: - List of pairs of (gradient, variable) where the gradient has been averaged - across all towers. - """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. 
- v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads - - -def time_tensorflow_run(session, target): - num_steps_burn_in = 80 - total_duration = 0.0 - total_duration_squared = 0.0 - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target, feed_dict={x_input: data, y_input: label}) - _, loss_value = session.run(target) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - # sec_per_batch = duration / FLAGS.num_gpus - sec_per_batch = duration - - format_str = ( - '%s: step %d, loss= %.2f (%.1f examples/sec; %.3f ' - 'sec/batch batch_size= %d)') - print(format_str % - (datetime.now(), i - num_steps_burn_in, loss_value, - duration, sec_per_batch, num_examples_per_step)) - - total_duration += duration - total_duration_squared += duration * duration - - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), FLAGS.num_batches, mn, sd)) - - -def run_benchmark(): - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), - trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Create an optimizer that performs gradient descent. - opt = tf.train.AdamOptimizer(0.001) - - #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) - - # Calculate the gradients for each model tower. 
- tower_grads = [] - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the model. This function - # constructs the entire model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. - # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. 
- sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) - - -def main(_): - run_benchmark() - - -if __name__ == '__main__': - tf.app.run() diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh deleted file mode 100755 index db10eefdea8676ad34fb84a161f0fc1309147824..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/run.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -set -e - -function test() { - lstm_num=$1 - batch_size=$2 - hid_size=$3 - prefix=$4 - python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \ - --hidden_size=${hid_size} \ - --forward_backward_only=1 \ - > logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1 -} - -if [ ! -d "logs" ]; then - mkdir logs -fi - -#--lstm_num--batch_size--hidden_size--# -test 2 64 256 -test 2 64 512 -test 2 64 1280 - -test 2 128 256 -test 2 128 512 -test 2 128 1280 - -test 2 256 256 -test 2 256 512 -test 2 256 1280 diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh deleted file mode 100755 index ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/rnn/run_multi.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -e - -function test() { - num_gpu=$1 - lstm_num=$2 - hid_size=$3 - batch_per_gpu=`expr ${batch_size} / ${num_gpu}` - batch_size=$4 - python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \ - --num_gpus=${num_gpu} \ - --hidden_size=${hid_size} \ - --forward_backward_only=1 \ - > logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1 -} - -if [ ! 
-d "logs" ]; then - mkdir logs -fi - -#--num_gpus--lstm_num--hiddne_size--batch_size--# -test 4 2 256 128 -test 4 2 256 256 -test 4 2 256 512 - -test 4 2 512 128 -test 4 2 512 256 -test 4 2 512 512 diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py deleted file mode 100644 index 1f532dc2fa082ea0f6b1da560e1a57b96d2ef1bb..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/stacked_dynamic_lstm.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import argparse -import time -import tensorflow as tf - - -def parse_args(): - parser = argparse.ArgumentParser("LSTM model benchmark.") - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The sequence number of a batch data. (default: %(default)d)') - parser.add_argument( - '--stacked_num', - type=int, - default=5, - help='Number of lstm layers to stack. (default: %(default)d)') - parser.add_argument( - '--embedding_dim', - type=int, - default=512, - help='Dimension of embedding table. (default: %(default)d)') - parser.add_argument( - '--hidden_dim', - type=int, - default=512, - help='Hidden size of lstm unit. 
(default: %(default)d)') - parser.add_argument( - '--pass_num', - type=int, - default=10, - help='Epoch number to train. (default: %(default)d)') - parser.add_argument( - '--learning_rate', - type=float, - default=0.0002, - help='Learning rate used to train. (default: %(default)f)') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - args = parser.parse_args() - return args - - -def print_arguments(args): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -def dynamic_lstm_model(dict_size, - embedding_dim, - hidden_dim, - stacked_num, - class_num=2, - is_train=True): - word_idx = tf.placeholder(tf.int64, shape=[None, None]) - sequence_length = tf.placeholder(tf.int64, shape=[None, ]) - - embedding_weights = tf.get_variable('word_embeddings', - [dict_size, embedding_dim]) - embedding = tf.nn.embedding_lookup(embedding_weights, word_idx) - - lstm_cell = tf.nn.rnn_cell.LSTMCell( - num_units=hidden_dim, use_peepholes=False) - stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num) - - # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] 
total stacked_num LSTMTuples - _, final_state = tf.nn.dynamic_rnn( - cell=stacked_cell, - inputs=embedding, - dtype=tf.float32, - sequence_length=sequence_length) - - w = tf.Variable( - tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32) - bias = tf.Variable( - tf.constant( - value=0.0, shape=[class_num], dtype=tf.float32)) - prediction = tf.matmul(final_state[-1][1], w) + bias - - if not is_train: - return (word_idx, sequence_length), tf.nn.softmax(prediction) - - label = tf.placeholder(tf.int64, shape=[None, ]) - loss = tf.nn.softmax_cross_entropy_with_logits( - labels=tf.one_hot(label, 2), logits=prediction) - avg_loss = tf.reduce_mean(loss) - - correct_count = tf.equal(tf.argmax(prediction, 1), label) - acc = tf.reduce_mean(tf.cast(correct_count, tf.float32)) - - with tf.variable_scope("reset_metrics_accuracy_scope") as scope: - g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1)) - vars = tf.contrib.framework.get_variables( - scope, collection=tf.GraphKeys.LOCAL_VARIABLES) - reset_op = tf.variables_initializer(vars) - - return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op - - -def padding_data(data, padding_size, value): - data = data + [value] * padding_size - return data[:padding_size] - - -def train(args): - word_dict = paddle.dataset.imdb.word_dict() - dict_size = len(word_dict) - - feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model( - dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num) - - adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) - train_op = adam_optimizer.minimize(avg_loss) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.train(word_dict), buf_size=25000), - batch_size=args.batch_size) - - test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.imdb.test(word_dict), buf_size=25000), - batch_size=args.batch_size) - - def do_validation(sess): - sess.run(reset_op) - for batch_id, data in 
enumerate(test_reader()): - word_idx = map(lambda x: x[0], data) - sequence_length = np.array( - [len(seq) for seq in word_idx]).astype('int64') - maxlen = np.max(sequence_length) - word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] - word_idx = np.array(word_idx).astype('int64') - label = np.array(map(lambda x: x[1], data)).astype('int64') - - _, loss, fetch_acc, fetch_g_acc = sess.run( - [train_op, avg_loss, acc, g_acc], - feed_dict={ - feeding_list[0]: word_idx, - feeding_list[1]: sequence_length, - feeding_list[2]: label - }) - - return fetch_g_acc[1] - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - with tf.Session(config=config) as sess: - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - sess.run(init_l) - sess.run(init_g) - - for pass_id in xrange(args.pass_num): - # clear accuracy local variable - sess.run(reset_op) - pass_start_time = time.time() - words_seen = 0 - - for batch_id, data in enumerate(train_reader()): - word_idx = map(lambda x: x[0], data) - sequence_length = np.array( - [len(seq) for seq in word_idx]).astype('int64') - words_seen += np.sum(sequence_length) - maxlen = np.max(sequence_length) - word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] - word_idx = np.array(word_idx).astype('int64') - label = np.array(map(lambda x: x[1], data)).astype('int64') - - _, loss, fetch_acc, fetch_g_acc = sess.run( - [train_op, avg_loss, acc, g_acc], - feed_dict={ - feeding_list[0]: word_idx, - feeding_list[1]: sequence_length, - feeding_list[2]: label - }) - - print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f" - % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1])) - - pass_end_time = time.time() - time_consumed = pass_end_time - pass_start_time - words_per_sec = words_seen / time_consumed - test_acc = do_validation(sess) - print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" % - (pass_id, 
test_acc, words_per_sec, time_consumed)) - - -if __name__ == '__main__': - args = parse_args() - print_arguments(args) - - if args.infer_only: - pass - else: - train(args) diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py deleted file mode 100644 index d32c835bd7a7dafaafe0970fb6b422db3c866370..0000000000000000000000000000000000000000 --- a/benchmark/tensorflow/vgg.py +++ /dev/null @@ -1,323 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""VGG16 benchmark in TensorFlow""" -import tensorflow as tf -import numpy as np -import argparse -import time - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - '--batch_size', type=int, default=128, help="Batch size for training.") -parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test') -parser.add_argument( - '--iterations', type=int, default=80, help='The number of minibatches.') -parser.add_argument( - '--learning_rate', - type=float, - default=1e-3, - help="Learning rate for training.") -parser.add_argument('--num_passes', type=int, default=50, help="No. 
of passes.") -parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help="The device type.") -parser.add_argument( - '--data_format', - type=str, - default='NHWC', - choices=['NCHW', 'NHWC'], - help='The data order, NCHW=[batch, channels, height, width].' - 'Only support NHWC right now.') -parser.add_argument( - '--data_set', - type=str, - default='cifar10', - choices=['cifar10', 'flowers'], - help='Optional dataset for benchmark.') -args = parser.parse_args() - - -class VGG16Model(object): - def __init__(self): - self.parameters = [] - - def batch_norm_relu(self, inputs, is_training): - """Performs a batch normalization followed by a ReLU.""" - # We set fused=True for a significant speed boost. See - # https://www.tensorflow.org/speed/speed_guide#common_fused_ops - inputs = tf.layers.batch_normalization( - inputs=inputs, - axis=1 if args.data_format == 'NCHW' else -1, - momentum=0.9, - epsilon=1e-05, - center=True, - scale=True, - training=is_training, - fused=True) - inputs = tf.nn.relu(inputs) - return inputs - - def conv_bn_layer(self, - name, - images, - kernel_shape, - is_training, - drop_rate=0.0): - with tf.name_scope(name) as scope: - kernel = tf.Variable( - tf.truncated_normal( - kernel_shape, dtype=tf.float32, stddev=1e-1), - name='weights') - conv = tf.nn.conv2d( - images, - kernel, [1, 1, 1, 1], - data_format=args.data_format, - padding='SAME') - biases = tf.Variable( - tf.constant( - 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), - trainable=True, - name='biases') - out = tf.nn.bias_add(conv, biases) - out = self.batch_norm_relu(out, is_training) - out = tf.layers.dropout(out, rate=drop_rate, training=is_training) - return out - - def fc_layer(self, name, inputs, shape): - with tf.name_scope(name) as scope: - fc_w = tf.Variable( - tf.truncated_normal( - shape, dtype=tf.float32, stddev=1e-1), - name='weights') - fc_b = tf.Variable( - tf.constant( - 0.0, shape=[shape[-1]], dtype=tf.float32), - trainable=True, - 
name='biases') - out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) - return out - - def network(self, images, class_dim, is_training): - """ VGG16 model structure. - - TODO(kuke): enable this network to support the 'NCHW' data format - """ - - # conv1 - conv1_1 = self.conv_bn_layer( - 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) - conv1_2 = self.conv_bn_layer( - 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) - # pool1 - pool1 = tf.nn.max_pool( - conv1_2, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool1') - # conv2 - conv2_1 = self.conv_bn_layer( - 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) - conv2_2 = self.conv_bn_layer( - 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) - # pool2 - pool2 = tf.nn.max_pool( - conv2_2, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool2') - # conv3 - conv3_1 = self.conv_bn_layer( - 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) - conv3_2 = self.conv_bn_layer( - 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) - conv3_3 = self.conv_bn_layer( - 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) - # pool3 - pool3 = tf.nn.max_pool( - conv3_3, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool3') - # conv4 - conv4_1 = self.conv_bn_layer( - 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) - conv4_2 = self.conv_bn_layer( - 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv4_3 = self.conv_bn_layer( - 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) - # pool4 - pool4 = tf.nn.max_pool( - conv4_3, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool4') - # conv5 - conv5_1 = self.conv_bn_layer( - 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv5_2 = self.conv_bn_layer( - 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) - conv5_3 = 
self.conv_bn_layer( - 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) - # pool5 - pool5 = tf.nn.max_pool( - conv5_3, - ksize=[1, 2, 2, 1], - strides=[1, 2, 2, 1], - padding='SAME', - name='pool4') - # flatten - shape = int(np.prod(pool5.get_shape()[1:])) - pool5_flat = tf.reshape(pool5, [-1, shape]) - # fc1 - drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) - fc1 = self.fc_layer('fc1', drop, [shape, 512]) - # fc2 - bn = self.batch_norm_relu(fc1, is_training) - drop = tf.layers.dropout(bn, rate=0.5, training=is_training) - fc2 = self.fc_layer('fc2', drop, [512, 512]) - - fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) - - return fc3 - - -def run_benchmark(): - """Run benchmark on cifar10 or flowers.""" - - if args.data_set == "cifar10": - class_dim = 10 - raw_shape = (3, 32, 32) - dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( - None, 3, 32, 32) - else: - class_dim = 102 - raw_shape = (3, 224, 224) - dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( - None, 3, 224, 224) - - device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' - - with tf.device(device): - images = tf.placeholder(tf.float32, shape=dat_shape) - labels = tf.placeholder(tf.int64, shape=(None, )) - is_training = tf.placeholder('bool') - onehot_labels = tf.one_hot(labels, depth=class_dim) - - vgg16 = VGG16Model() - logits = vgg16.network(images, class_dim, is_training) - loss = tf.losses.softmax_cross_entropy( - onehot_labels=onehot_labels, logits=logits) - avg_loss = tf.reduce_mean(loss) - - correct = tf.equal(tf.argmax(logits, 1), labels) - accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) - - optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - train_op = optimizer.minimize(avg_loss) - - # data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10() - 
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), - buf_size=5120), - batch_size=args.batch_size) - test_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.test10() - if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), - buf_size=5120), - batch_size=args.batch_size) - - # test - def test(): - test_accs = [] - for batch_id, data in enumerate(test_reader()): - test_images = np.array( - map(lambda x: np.transpose(x[0].reshape(raw_shape), - axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") - test_labels = np.array(map(lambda x: x[1], data)).astype('int64') - test_accs.append( - accuracy.eval(feed_dict={ - images: test_images, - labels: test_labels, - is_training: False - })) - return np.mean(test_accs) - - config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - - with tf.Session(config=config) as sess: - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - sess.run(init_g) - sess.run(init_l) - iters, num_samples, start_time = 0, 0, time.time() - for pass_id in range(args.num_passes): - # train - num_samples = 0 - start_time = time.time() - for batch_id, data in enumerate(train_reader()): - if iters == args.skip_batch_num: - start_time = time.time() - num_samples = 0 - if iters == args.iterations: - break - train_images = np.array( - map(lambda x: np.transpose(x[0].reshape(raw_shape), - axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") - train_labels = np.array(map(lambda x: x[1], data)).astype( - 'int64') - _, loss, acc = sess.run([train_op, avg_loss, accuracy], - feed_dict={ - images: train_images, - labels: train_labels, - is_training: True - }) - iters += 1 - num_samples += len(data) - print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % - (pass_id, iters, loss, acc)) - train_elapsed = time.time() - start_time - # test - pass_test_acc = 
test() - print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % - (pass_id, num_samples / train_elapsed, pass_test_acc)) - - -def print_arguments(): - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -if __name__ == '__main__': - print_arguments() - run_benchmark()