提交 aed8803a 编写于 作者: Y Yu Yang

Merge branch 'develop' of github.com:baidu/Paddle into feature/refine_doc_drnn

.gitignore
\ No newline at end of file
......@@ -8,3 +8,4 @@ build/
.cproject
.pydevproject
Makefile
.test_env/
[submodule "warp-ctc"]
path = warp-ctc
url = https://github.com/baidu-research/warp-ctc.git
......@@ -2,6 +2,7 @@
sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks:
- id: remove-crlf
files: (?!.*warp-ctc)^.*$
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
......@@ -13,6 +14,7 @@
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*warp-ctc)^.*$
- id: end-of-file-fixer
- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
......
......@@ -50,7 +50,7 @@ before_install:
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
script:
- paddle/scripts/travis/main.sh
notifications:
......
cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
......@@ -12,6 +8,17 @@ include(package)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
# Check protobuf library version.
execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
OUTPUT_VARIABLE PROTOBUF_VERSION)
string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
set(PROTOBUF_3 OFF)
if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
set(PROTOBUF_3 ON)
endif()
find_package(PythonLibs 2.7 REQUIRED)
find_package(PythonInterp 2.7 REQUIRED)
find_package(ZLIB REQUIRED)
......@@ -45,7 +52,7 @@ option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
......@@ -64,36 +71,16 @@ include(check_packages)
include(swig)
include(coveralls)
# add PaddlePaddle version
if(DEFINED ENV{PADDLE_VERSION})
add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
else()
if(EXISTS ${PROJ_ROOT}/.svn/)
find_package(Subversion REQUIRED)
if(SUBVERSION_FOUND)
Subversion_WC_INFO(${PROJ_ROOT} Project)
add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
endif()
elseif(EXISTS ${PROJ_ROOT}/.git/)
find_package(Git REQUIRED)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
WORKING_DIRECTORY ${PROJ_ROOT}
OUTPUT_VARIABLE GIT_SHA1
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_RESULT})
add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
else()
message(WARNING "Cannot add paddle version from git tag")
endif()
endif()
endif()
# Set PaddlePaddle version to Git tag name or Git commit ID.
find_package(Git REQUIRED)
# version.cmake will get the current PADDLE_VERSION
include(version)
add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
......@@ -115,15 +102,15 @@ else()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double)
......
# Release v0.9.0
## New Features:
* New Layers
* bilinear interpolation layer.
* spatial pyramid-pool layer.
* de-convolution layer.
* maxout layer.
* Support rectangle padding, stride, window and input for Pooling Operation.
* Add —job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON.
* Expose cost_weight/nce_layer in `trainer_config_helpers`
* Add FAQ, concepts, h-rnn docs.
* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
* Add usage track scripts.
## Improvements
* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
* Add code coverage tools.
* Refine convolution layer to speedup and reduce GPU memory.
* Speed up PyDataProvider2
* Add ubuntu deb package build scripts.
* Make Paddle use git-flow branching model.
* PServer support no parameter blocks.
## Bug Fixes
* add zlib link to py_paddle
* add input sparse data check for sparse layer at runtime
* Bug fix for sparse matrix multiplication
* Fix floating-point overflow problem of tanh
* Fix some nvcc compile options
* Fix a bug in yield dictionary in DataProvider
* Fix SRL hang when exit.
# Release v0.8.0beta.1
New features:
* Mac OSX is supported by source code. #138
* Both GPU and CPU versions of PaddlePaddle are supported.
* Support CUDA 8.0
* Enhance `PyDataProvider2`
* Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features.
* Add `min_pool_size` to control memory pool in provider.
* Add `deb` install package & docker image for no_avx machines.
* Especially for cloud computing and virtual machines
* Automatically disable `avx` instructions in cmake when machine's CPU don't support `avx` instructions.
* Add Parallel NN api in trainer_config_helpers.
* Add `travis ci` for Github
Bug fixes:
* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
* Check if PaddlePaddle is installed when unittest.
* Fix bugs in GTX series GPU
* Fix bug in MultinomialSampler
Also more documentation was written since last release.
# Release v0.8.0beta.0
PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
paddle/image/logs
paddle/image/*.pyc
paddle/image/train.list
paddle/rnn/logs
paddle/rnn/*.pyc
paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
# Benchmark
Machine:
- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- system: Docker 1.12.1, all platforms are tested in docker environment.
Platforms:
- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
- Caffe: kaixhin/cuda-caffe
Several convolutional neural networks and recurrent neural networks are used to test.
## Image
### Benchmark Model
AlexNet, GoogleNet and a small network used in Caffe.
- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
### Single-GPU
- AlexNet: input - 3 * 227 * 227, Time: ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|-----| -----| ------| -----|
| PaddlePaddle | 195 | 334 | 602 | 1629 |
| TensorFlow | 223 | 364 | 645 | 1235 |
| Caffe | 324 | 627 | 1232 | 2513 |
**Notation**
All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
- GoogletNet: input - 3 * 224 * 224, Time: ms/batch
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -------| --------|
| PaddlePaddle | 613 | 1149 | 2348 |
| TensorFlow | 644 | 1176 | 2219 |
| Caffe | 694 | 1364 | out of memory |
- SmallNet: input - 3 * 32 * 32, Time ms/batch
| BatchSize | 64 | 128 | 256 | 512 |
|--------------|--------| -------- | --------|---------|
| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
| TensorFlow | 9 | 15 | 28 | 59 |
| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
**Notation**
All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
### Multi-GPU: 4 GPUs
- AlexNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|------------------|----------| -----------|
| PaddlePaddle | 347 | 622 |
| TensorFlow | 377 | 675 |
| Caffe | 1229 | 2435 |
For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (334 * 4)/347
= 3.85
```
<img src="figs/alexnet-4gpu.png" width="420">
- GoogleNet, ms / batch
| total-BatchSize | 128 * 4 | 256 * 4 |
|-------------------|--------------| ----------- |
| PaddlePaddle | 1178 | 2367 |
| TensorFlow | 1210 | 2292 |
| Caffe | 2007 | out of memory |
<img src="figs/googlenet-4gpu.png" width="420">
## RNN
We use lstm network for text classfication to test benchmark.
### Dataset
- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
- Dictionary size=30000
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
### Single-GPU
#### LSTM in Text Classification
Testing `2 lstm layer + fc` network with different hidden size and batch size.
- Batch size = 64, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 83 | 184 | 641 |
| TensorFlow | 175 | 280 | 818 |
- Batch size = 128, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|------- | -------| --------|
| PaddlePaddle | 110 | 261 | 1007 |
| TensorFlow | 181 | 361 | 1237 |
- Batch size = 256, ms / batch
| hidden_size | 256 | 512 | 1280 |
|--------------|-------| -------| --------|
| PaddlePaddle | 170 | 414 | 1655 |
| TensorFlow | 238 | 536 | 1905 |
<img src="figs/rnn_lstm_cls.png" width="600">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
### Multi GPU: 4 GPUs
#### LSTM in Text Classification
- hidden_size = 256, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 90 | 118 |
| TensorFlow | 226 | 118 |
- hidden_size = 512, ms / batch
| batch_size | 256 | 512 |
|--------------| -------| --------|
| PaddlePaddle | 189 | 268 |
| TensorFlow | 297 | 383 |
<img src="figs/rnn_lstm_4gpus.png" width="420">
#### Seq2Seq
The benchmark of sequence-to-sequence network will be added later.
name: "alexnet"
input: "data"
input_dim: 64
input_dim: 3
input_dim: 227
input_dim: 227
input: "label"
input_dim: 64
input_dim: 1
input_dim: 1
input_dim: 1
force_backward: true
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "conv1"
top: "conv1"
}
layer {
name: "norm1"
type: "LRN"
bottom: "conv1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "norm1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "norm2"
type: "LRN"
bottom: "conv2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layer {
name: "pool2"
type: "Pooling"
bottom: "norm2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "conv4"
type: "Convolution"
bottom: "conv3"
top: "conv4"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu4"
type: "ReLU"
bottom: "conv4"
top: "conv4"
}
layer {
name: "conv5"
type: "Convolution"
bottom: "conv4"
top: "conv5"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu5"
type: "ReLU"
bottom: "conv5"
top: "conv5"
}
layer {
name: "pool5"
type: "Pooling"
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "fc6"
type: "InnerProduct"
bottom: "pool5"
top: "fc6"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu6"
type: "ReLU"
bottom: "fc6"
top: "fc6"
}
layer {
name: "drop6"
type: "Dropout"
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc7"
type: "InnerProduct"
bottom: "fc6"
top: "fc7"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 0.1
}
}
}
layer {
name: "relu7"
type: "ReLU"
bottom: "fc7"
top: "fc7"
}
layer {
name: "drop7"
type: "Dropout"
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layer {
name: "fc8"
type: "InnerProduct"
bottom: "fc7"
top: "fc8"
param {
lr_mult: 1
decay_mult: 1
}
param {
lr_mult: 2
decay_mult: 0
}
inner_product_param {
num_output: 1000
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
此差异已折叠。
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 64 alexnet
test alexnet.prototxt 128 alexnet
test alexnet.prototxt 256 alexnet
test alexnet.prototxt 512 alexnet
# googlenet
test googlenet.prototxt 64 googlenet
test googlenet.prototxt 128 googlenet
# small net
test smallnet_mnist_cifar.prototxt 64 smallnet
test smallnet_mnist_cifar.prototxt 128 smallnet
test smallnet_mnist_cifar.prototxt 256 smallnet
test smallnet_mnist_cifar.prototxt 512 smallnet
#!/bin/bash
set -e
function test() {
cfg=$1
batch=$2
prefix=$3
batch_per_gpu=`expr ${batch} / 4`
sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
sed -i "1c\net : \"${cfg}\"" solver.prototxt
caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.prototxt 512 alexnet
test alexnet.prototxt 1024 alexnet
# googlnet
test googlenet.prototxt 512 googlenet
name: "mnist/cifar"
input: "data"
input_dim: 128
input_dim: 3
input_dim: 32
input_dim: 32
input: "label"
input_dim: 128
input_dim: 1
input_dim: 1
input_dim: 1
layer {
name: "conv1"
type: "Convolution"
bottom: "data"
top: "conv1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.0001
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "pool1"
type: "Pooling"
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layer {
name: "relu1"
type: "ReLU"
bottom: "pool1"
top: "pool1"
}
layer {
name: "conv2"
type: "Convolution"
bottom: "pool1"
top: "conv2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 32
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu2"
type: "ReLU"
bottom: "conv2"
top: "conv2"
}
layer {
name: "pool2"
type: "Pooling"
bottom: "conv2"
top: "pool2"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "conv3"
type: "Convolution"
bottom: "pool2"
top: "conv3"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
convolution_param {
num_output: 64
pad: 2
kernel_size: 5
stride: 1
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "relu3"
type: "ReLU"
bottom: "conv3"
top: "conv3"
}
layer {
name: "pool3"
type: "Pooling"
bottom: "conv3"
top: "pool3"
pooling_param {
pool: AVE
kernel_size: 3
stride: 2
}
}
layer {
name: "ip1"
type: "InnerProduct"
bottom: "pool3"
top: "ip1"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 64
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "ip2"
type: "InnerProduct"
bottom: "ip1"
top: "ip2"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 10
weight_filler {
type: "gaussian"
std: 0.1
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "ip2"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "ip2"
bottom: "label"
top: "loss"
}
net: "alexnet.prototxt"
base_lr: 0.01
lr_policy: "fixed"
display: 20
max_iter: 200
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "models/caffe_alexnet_train"
solver_mode: GPU
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 227
width = 227
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=11,
num_channels=3,
num_filters=96,
stride=4,
padding=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4
net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
# conv5
net = img_conv_layer(
input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(
input=net,
size=4096,
act=ReluActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
def inception2(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
conv1 = name + '_1'
conv3r = name + '_3r'
conv3 = name + '_3'
conv5r = name + '_5r'
conv5 = name + '_5'
maxpool = name + '_max'
convproj = name + '_proj'
cov1 = img_conv_layer(
name=conv1,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=conv3r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = img_conv_layer(
name=conv3,
input=cov3r,
filter_size=3,
num_filters=filter3,
stride=1,
padding=1)
cov5r = img_conv_layer(
name=conv5r,
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = img_conv_layer(
name=conv5,
input=cov5r,
filter_size=5,
num_filters=filter5,
stride=1,
padding=2)
pool1 = img_pool_layer(
name=maxpool,
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = img_conv_layer(
name=convproj,
input=pool1,
filter_size=1,
num_filters=proj,
stride=1,
padding=0)
cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
return cat
def inception(name, input, channels, \
filter1,
filter3R, filter3,
filter5R, filter5,
proj):
cov1 = conv_projection(
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = img_conv_layer(
name=name + '_3r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = conv_projection(
input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
cov5r = img_conv_layer(
name=name + '_5r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = conv_projection(
input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
pool1 = img_pool_layer(
name=name + '_max',
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = conv_projection(
input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
cat = concat_layer(
name=name,
input=[cov1, cov3, cov5, covprj],
bias_attr=True,
act=ReluActivation())
return cat
lab = data_layer(name="label", size=1000)
data = data_layer(name="input", size=3 * height * width)
# stage 1
conv1 = img_conv_layer(
name="conv1",
input=data,
filter_size=7,
num_channels=3,
num_filters=64,
stride=2,
padding=3)
pool1 = img_pool_layer(
name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
# stage 2
conv2_1 = img_conv_layer(
name="conv2_1",
input=pool1,
filter_size=1,
num_filters=64,
stride=1,
padding=0)
conv2_2 = img_conv_layer(
name="conv2_2",
input=conv2_1,
filter_size=3,
num_filters=192,
stride=1,
padding=1)
pool2 = img_pool_layer(
name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
# stage 3
ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
pool3 = img_pool_layer(
name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
# stage 4
ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
pool4 = img_pool_layer(
name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
# stage 5
ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
pool5 = img_pool_layer(
name="pool5",
input=ince5b,
num_channels=1024,
pool_size=7,
stride=7,
pool_type=AvgPooling())
# We remove loss1 and loss2 for all system when testing benchmark
# output 1
# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
# output 2
#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
# output 3
dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
out3 = fc_layer(
name="output3", input=dropout, size=1000, act=SoftmaxActivation())
loss3 = cross_entropy(name='loss3', input=out3, label=lab)
outputs(loss3)
import io, os
import random
import numpy as np
from paddle.trainer.PyDataProvider2 import *
def initHook(settings, height, width, color, num_class, **kwargs):
settings.height = height
settings.width = width
settings.color = color
settings.num_class = num_class
if settings.color:
settings.data_size = settings.height * settings.width * 3
else:
settings.data_size = settings.height * settings.width
settings.slots = [dense_vector(settings.data_size), integer_value(1)]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
for i in xrange(1024):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
lab = random.randint(0, settings.num_class)
yield img.astype('float32'), int(lab)
set -e
function train() {
cfg=$1
thread=$2
bz=$3
args="batch_size=$3"
prefix=$4
paddle train --job=time \
--config=$cfg \
--use_gpu=True \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--config_args=$args \
> logs/$prefix-${thread}gpu-$bz.log 2>&1
}
if [ ! -d "train.list" ]; then
echo " " > train.list
fi
if [ ! -d "logs" ]; then
mkdir logs
fi
#========single-gpu=========#
# alexnet
train alexnet.py 1 64 alexnet
train alexnet.py 1 128 alexnet
train alexnet.py 1 256 alexnet
train alexnet.py 1 512 alexnet
# googlenet
train googlenet.py 1 64 googlenet
train googlenet.py 1 128 googlenet
train googlenet.py 1 256 googlenet
# smallnet
train smallnet_mnist_cifar.py 1 64 smallnet
train smallnet_mnist_cifar.py 1 128 smallnet
train smallnet_mnist_cifar.py 1 256 smallnet
train smallnet_mnist_cifar.py 1 512 smallnet
############################
#========multi-gpus=========#
train alexnet.py 4 512 alexnet
train alexnet.py 4 1024 alexnet
train googlenet.py 4 512 googlenet
train googlenet.py 4 1024 googlenet
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
height = 32
width = 32
num_class = 10
batch_size = get_config_arg('batch_size', int, 128)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=0.01 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
# conv1
net = data_layer('data', size=height * width * 3)
net = img_conv_layer(
input=net,
filter_size=5,
num_channels=3,
num_filters=32,
stride=1,
padding=2)
net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
# conv2
net = img_conv_layer(
input=net, filter_size=5, num_filters=32, stride=1, padding=2)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
# conv3
net = img_conv_layer(
input=net, filter_size=3, num_filters=64, stride=1, padding=1)
net = img_pool_layer(
input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
net = fc_layer(input=net, size=64, act=ReluActivation())
net = fc_layer(input=net, size=10, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
from __future__ import print_function
import six.moves.cPickle as pickle
import gzip
import os
import numpy
def get_dataset_file(dataset, default_dataset, origin):
data_dir, data_file = os.path.split(dataset)
if (not os.path.isfile(dataset)) and data_file == default_dataset:
from six.moves import urllib
print('Downloading data from %s' % origin)
urllib.request.urlretrieve(origin, dataset)
return dataset
def create_data(path="imdb.pkl"):
if (not os.path.isfile('imdb.train.pkl')):
path = get_dataset_file(
path, "imdb.pkl",
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
if path.endswith(".gz"):
f = gzip.open(path, 'rb')
else:
f = open(path, 'rb')
train_set = pickle.load(f)
test_set = pickle.load(f)
f.close()
pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
if (not os.path.isfile('train.list')):
file('train.list', 'w').write('imdb.train.pkl\n')
def main():
create_data('imdb.pkl')
if __name__ == "__main__":
main()
import io, os
import random
import numpy as np
import six.moves.cPickle as pickle
from paddle.trainer.PyDataProvider2 import *
def remove_unk(x, n_words):
return [[1 if w >= n_words else w for w in sen] for sen in x]
# ==============================================================
# tensorflow uses fixed length, but PaddlePaddle can process
# variable-length. Padding is used in benchmark in order to
# compare with other platform.
# ==============================================================
def pad_sequences(sequences,
maxlen=None,
dtype='int32',
padding='post',
truncating='post',
value=0.):
lengths = [len(s) for s in sequences]
nb_samples = len(sequences)
if maxlen is None:
maxlen = np.max(lengths)
x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
for idx, s in enumerate(sequences):
if len(s) == 0:
continue # empty list was found
if truncating == 'pre':
trunc = s[-maxlen:]
elif truncating == 'post':
trunc = s[:maxlen]
else:
raise ValueError("Truncating type '%s' not understood" % padding)
if padding == 'post':
x[idx, :len(trunc)] = trunc
elif padding == 'pre':
x[idx, -len(trunc):] = trunc
else:
raise ValueError("Padding type '%s' not understood" % padding)
return x
def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
settings.vocab_size = vocab_size
settings.pad_seq = pad_seq
settings.maxlen = maxlen
settings.input_types = [
integer_value_sequence(vocab_size), integer_value(2)
]
@provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file):
f = open(file, 'rb')
train_set = pickle.load(f)
f.close()
x, y = train_set
# remove unk, namely remove the words out of dictionary
x = remove_unk(x, settings.vocab_size)
if settings.pad_seq:
x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
for i in range(len(y)):
yield map(int, x[i]), int(y[i])
#!/usr/bin/env python
from paddle.trainer_config_helpers import *
import imdb
num_class = 2
vocab_size = 30000
fixedlen = 100
batch_size = get_config_arg('batch_size', int, 128)
lstm_num = get_config_arg('lstm_num', int, 1)
hidden_size = get_config_arg('hidden_size', int, 128)
# whether to pad sequence into fixed length
pad_seq = get_config_arg('pad_seq', bool, True)
imdb.create_data('imdb.pkl')
args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
net = data_layer('data', size=vocab_size)
net = embedding_layer(input=net, size=128)
for i in xrange(lstm_num):
net = simple_lstm(input=net, size=hidden_size)
net = last_seq(input=net)
net = fc_layer(input=net, size=2, act=SoftmaxActivation())
lab = data_layer('label', num_class)
loss = classification_cost(input=net, label=lab)
outputs(loss)
set -e
function train() {
cfg=$1
thread=$2
args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
paddle train --job=time \
--config=$cfg \
--use_gpu=1 \
--trainer_count=$thread \
--log_period=10 \
--test_period=100 \
--num_passes=1 \
--feed_data=1 \
--config_args=$args \
>logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
## padding, single gpu
#-----config--gpu--lstm_num--padding--hidden_size--batch_size
## lstm_num=2, batch_size=64
train rnn.py 1 2 1 256 64
train rnn.py 1 2 1 512 64
train rnn.py 1 2 1 1280 64
## lstm_num=2, batch_size=128
train rnn.py 1 2 1 256 128
train rnn.py 1 2 1 512 128
train rnn.py 1 2 1 1280 128
## lstm_num=4, batch_size=256
train rnn.py 1 2 1 256 256
train rnn.py 1 2 1 512 256
train rnn.py 1 2 1 1280 256
#==================multi gpus=====================#
# hidden_size=256, lstm_num=2, different batch size
train rnn.py 4 2 1 256 128
train rnn.py 4 2 1 256 256
train rnn.py 4 2 1 256 512
# hidden_size=512, lstm_num=4, different batch size
train rnn.py 4 2 1 512 128
train rnn.py 4 2 1 512 256
train rnn.py 4 2 1 512 512
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
output = tf.nn.dropout(affine1, drop) if drop else affine1
return output
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5)
affn2 = _affine('fc7', affn1, 4096, 4096, 0.5)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def _add_loss_summaries(total_loss):
"""
Generates moving average for all losses and associated summaries for
visualizing the performance of the network.
Args:
total_loss: Total loss from loss().
Returns:
loss_averages_op: op for generating moving averages of losses.
"""
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
losses = tf.get_collection('losses')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(l.op.name + ' (raw)', l)
tf.scalar_summary(l.op.name, loss_averages.average(l))
return loss_averages_op
def run_benchmark():
with tf.Graph().as_default():
with tf.device('/gpu:0'):
# Generate some dummy images.
image_size = 224
# Note that our padding definition is slightly different the cuda-convnet.
# In order to force the model to start with the same activations sizes,
# we add 3 to the image_size and employ VALID padding above.
if FLAGS.data_format == 'NCHW':
image_shape = [
FLAGS.batch_size, 3, image_size + 3, image_size + 3
]
else:
image_shape = [
FLAGS.batch_size, image_size + 3, image_size + 3, 3
]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute the gradient with respect to all the parameters.
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(
grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9,
global_step)
variables_averages_op = variable_averages.apply(
tf.trainable_variables())
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective],
"Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
from six.moves import xrange # pylint: disable=redefined-builtin
from datetime import datetime
import math
import re
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_string('train_dir', '/train_model',
"""Directory where to write event logs """
"""and checkpoint.""")
tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 50
INITIAL_LEARNING_RATE = 0.1
LEARNING_RATE_DECAY_FACTOR = 0.1
TOWER_NAME = 'tower'
def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [kH, kW, nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.get_variable(
name=name + '_b',
shape=[nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32)
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
return conv1
def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
with tf.name_scope(name) as scope:
kernel = tf.get_variable(
name + '_w', [nIn, nOut],
initializer=tf.truncated_normal_initializer(
stddev=0.01, dtype=tf.float32),
dtype=tf.float32)
if wd is not None:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(
name + '_b', [nOut],
initializer=tf.constant_initializer(
value=0.0, dtype=tf.float32),
dtype=tf.float32,
trainable=True)
affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
tf.matmul(inpOp, kernel) + biases
return affine1
def _mpool(name, inpOp, kH, kW, dH, dW):
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding='VALID',
data_format=FLAGS.data_format,
name=name)
def _norm(name, l_input, lsize=4):
return tf.nn.lrn(l_input,
lsize,
bias=1.0,
alpha=0.001 / 9.0,
beta=0.75,
name=name)
def loss(logits, labels):
labels = tf.cast(labels, tf.int64)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits, labels, name='cross_entropy_per_example')
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
# The total loss is defined as the cross entropy loss plus all of the weight
# decay terms (L2 loss).
return tf.add_n(tf.get_collection('losses'), name='total_loss')
def get_incoming_shape(incoming):
""" Returns the incoming data shape """
if isinstance(incoming, tf.Tensor):
return incoming.get_shape().as_list()
elif type(incoming) in [np.array, list, tuple]:
return np.shape(incoming)
else:
raise Exception("Invalid incoming layer.")
def inference(images):
conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
norm1 = _norm('norm1', pool1, lsize=5)
conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
norm2 = _norm('norm2', pool2, lsize=5)
conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
affn2 = _affine('fc7', affn1, 4096, 4096)
affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
return affn3
def tower_loss(scope):
"""Calculate the total loss on a single tower running the model.
Args:
scope: unique prefix string identifying the tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
else:
image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
_ = loss(last_layer, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.scalar_summary(loss_name + ' (raw)', l)
tf.scalar_summary(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
def average_gradients(tower_grads):
"""Calculate the average gradient for each shared variable across all towers.
Note that this function provides a synchronization point across all towers.
Args:
tower_grads: List of lists of (gradient, variable) tuples. The outer list
is over individual gradients. The inner list is over the gradient
calculation for each tower.
Returns:
List of pairs of (gradient, variable) where the gradient has been averaged
across all towers.
"""
average_grads = []
for grad_and_vars in zip(*tower_grads):
# Note that each grad_and_vars looks like the following:
# ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
grads = []
for g, _ in grad_and_vars:
# Add 0 dimension to the gradients to represent the tower.
expanded_g = tf.expand_dims(g, 0)
# Append on a 'tower' dimension which we will average over below.
grads.append(expanded_g)
# Average over the 'tower' dimension.
grad = tf.concat(0, grads)
grad = tf.reduce_mean(grad, 0)
# Keep in mind that the Variables are redundant because they are shared
# across towers. So .. we will just return the first tower's pointer to
# the Variable.
v = grad_and_vars[0][1]
grad_and_var = (grad, v)
average_grads.append(grad_and_var)
return average_grads
def time_tensorflow_run(session, target):
num_steps_burn_in = 50
total_duration = 0.0
total_duration_squared = 0.0
for i in xrange(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_, loss_value = session.run(target)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
examples_per_sec = num_examples_per_step / duration
sec_per_batch = duration
format_str = (
'%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
'sec/batch batch_size = %d)')
print(format_str %
(datetime.now(), i - num_steps_burn_in, loss_value,
duration, sec_per_batch, num_examples_per_step))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), FLAGS.num_batches, mn, sd))
def run_benchmark():
with tf.Graph().as_default(), tf.device('/cpu:0'):
# Create a variable to count the number of train() calls. This equals the
# number of batches processed * FLAGS.num_gpus.
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(0),
trainable=False)
# Calculate the learning rate schedule.
num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
FLAGS.batch_size)
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(
INITIAL_LEARNING_RATE,
global_step,
decay_steps,
LEARNING_RATE_DECAY_FACTOR,
staircase=True)
# Create an optimizer that performs gradient descent.
opt = tf.train.MomentumOptimizer(lr, 0.9)
# Calculate the gradients for each model tower.
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % i):
with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
# Calculate the loss for one tower of the model. This function
# constructs the entire model but shares the variables across
# all towers.
loss = tower_loss(scope)
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
# Apply the gradients to adjust the shared variables.
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Group all updates to into a single train op.
train_op = tf.group(apply_gradient_op)
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph. allow_soft_placement must be set to
# True to build towers on GPU, as some of the ops do not have GPU
# implementations.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
time_tensorflow_run(sess, [train_op, loss])
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
from six.moves import xrange
from datetime import datetime
import math
import time
import tensorflow.python.platform
import tensorflow as tf
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('forward_only', False,
"""Only run the forward pass.""")
tf.app.flags.DEFINE_boolean('forward_backward_only', False,
"""Only run the forward-forward pass.""")
tf.app.flags.DEFINE_string('data_format', 'NCHW',
"""The data format for Convnet operations.
Can be either NHWC or NCHW.
""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
"""Whether to log device placement.""")
parameters = []
conv_counter = 1
pool_counter = 1
affine_counter = 1
def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
global conv_counter
global parameters
name = 'conv' + str(conv_counter)
conv_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
if FLAGS.data_format == 'NCHW':
strides = [1, 1, dH, dW]
else:
strides = [1, dH, dW, 1]
conv = tf.nn.conv2d(
inpOp,
kernel,
strides,
padding=padType,
data_format=FLAGS.data_format)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
bias = tf.reshape(
tf.nn.bias_add(
conv, biases, data_format=FLAGS.data_format),
conv.get_shape())
conv1 = tf.nn.relu(bias, name=scope)
parameters += [kernel, biases]
return conv1
def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
global affine_counter
global parameters
name = 'affine' + str(affine_counter)
affine_counter += 1
with tf.name_scope(name) as scope:
kernel = tf.Variable(
tf.truncated_normal(
[nIn, nOut], dtype=tf.float32, stddev=1e-1),
name='weights')
if wd is not None and wd > 0:
weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.Variable(
tf.constant(
0.0, shape=[nOut], dtype=tf.float32),
trainable=True,
name='biases')
affine1 = tf.nn.relu_layer(
inpOp, kernel, biases,
name=name) if act else tf.matmul(inpOp, kernel) + biases
parameters += [kernel, biases]
return affine1
def _mpool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.max_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _apool(inpOp, kH, kW, dH, dW, padding):
global pool_counter
global parameters
name = 'pool' + str(pool_counter)
pool_counter += 1
if FLAGS.data_format == 'NCHW':
ksize = [1, 1, kH, kW]
strides = [1, 1, dH, dW]
else:
ksize = [1, kH, kW, 1]
strides = [1, dH, dW, 1]
return tf.nn.avg_pool(
inpOp,
ksize=ksize,
strides=strides,
padding=padding,
data_format=FLAGS.data_format,
name=name)
def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
if FLAGS.data_format == 'NCHW':
channel_dim = 1
else:
channel_dim = 3
incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
return incept
def loss(logits, labels):
batch_size = tf.size(labels)
labels = tf.expand_dims(labels, 1)
indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
concated = tf.concat(1, [indices, labels])
onehot_labels = tf.sparse_to_dense(concated,
tf.pack([batch_size, 1000]), 1.0, 0.0)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
logits, onehot_labels, name='xentropy')
loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
return loss
def inference(images):
# stage 1
conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
# stage 2
conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
# stage 3
incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
# stage 4
incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
# stage 5
incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
# output 1
resh1 = tf.reshape(pool6, [-1, 1024])
drop = tf.nn.dropout(resh1, 0.4)
affn1 = _affine(resh1, 1024, 1000, act=False)
return affn1
def time_tensorflow_run(session, target, info_string):
num_steps_burn_in = 10
total_duration = 0.0
total_duration_squared = 0.0
if not isinstance(target, list):
target = [target]
target_op = tf.group(*target)
for i in range(FLAGS.num_batches + num_steps_burn_in):
start_time = time.time()
_ = session.run(target_op)
duration = time.time() - start_time
if i > num_steps_burn_in:
if not i % 10:
print('%s: step %d, duration = %.3f' %
(datetime.now(), i - num_steps_burn_in, duration))
total_duration += duration
total_duration_squared += duration * duration
mn = total_duration / FLAGS.num_batches
vr = total_duration_squared / FLAGS.num_batches - mn * mn
sd = math.sqrt(vr)
print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
(datetime.now(), info_string, FLAGS.num_batches, mn, sd))
def run_benchmark():
global parameters
with tf.Graph().as_default():
# Generate some dummy images.
image_size = 224
if FLAGS.data_format == 'NCHW':
image_shape = [FLAGS.batch_size, 3, image_size, image_size]
else:
image_shape = [FLAGS.batch_size, image_size, image_size, 3]
images = tf.get_variable(
'image',
image_shape,
initializer=tf.truncated_normal_initializer(
stddev=0.1, dtype=tf.float32),
dtype=tf.float32,
trainable=False)
labels = tf.get_variable(
'label', [FLAGS.batch_size],
initializer=tf.constant_initializer(1),
dtype=tf.int32,
trainable=False)
# Build a Graph that computes the logits predictions from the
# inference model.
last_layer = inference(images)
objective = loss(last_layer, labels)
# Compute gradients.
# opt = tf.train.GradientDescentOptimizer(0.001)
opt = tf.train.MomentumOptimizer(0.001, 0.9)
grads = opt.compute_gradients(objective)
global_step = tf.get_variable(
'global_step', [],
initializer=tf.constant_initializer(
0.0, dtype=tf.float32),
trainable=False,
dtype=tf.float32)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
# Track the moving averages of all trainable variables.
variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables(
))
# Build an initialization operation.
init = tf.initialize_all_variables()
# Start running operations on the Graph.
sess = tf.Session(config=tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=FLAGS.log_device_placement))
sess.run(init)
run_forward = True
run_forward_backward = True
if FLAGS.forward_only and FLAGS.forward_backward_only:
raise ValueError("Cannot specify --forward_only and "
"--forward_backward_only at the same time.")
if FLAGS.forward_only:
run_forward_backward = False
elif FLAGS.forward_backward_only:
run_forward = False
if run_forward:
# Run the forward benchmark.
time_tensorflow_run(sess, last_layer, "Forward")
if run_forward_backward:
with tf.control_dependencies(
[apply_gradient_op, variables_averages_op]):
train_op = tf.no_op(name='train')
time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
def main(_):
run_benchmark()
if __name__ == '__main__':
tf.app.run()
此差异已折叠。
set -e
function test() {
cfg=$1
batch_size=$2
prefix=$3
python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet.py 64 alexnet
test alexnet.py 128 alexnet
test alexnet.py 256 alexnet
test alexnet.py 512 alexnet
# googlenet
test googlenet.py 64 googlenet
test googlenet.py 128 googlenet
# smallnet
test smallnet_mnist_cifar.py 64 smallnet
test smallnet_mnist_cifar.py 128 smallnet
test smallnet_mnist_cifar.py 256 smallnet
test smallnet_mnist_cifar.py 512 smallnet
set -e
function test() {
cfg=$1
num_gpu=$2
batch_size=$3
batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
prefix=$4
python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
# alexnet
test alexnet_multi_gpu.py 4 512 alexnet
test alexnet_multi_gpu.py 4 1024 alexnet
# googlenet
test googlenet_multi_gpu.py 4 512 alexnet
test googlenet_multi_gpu.py 4 1024 alexnet
此差异已折叠。
You also should install tflearn:
```bash
pip install -r requirements.txt
```
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
set -e
function test() {
num_gpu=$1
lstm_num=$2
hid_size=$3
batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
batch_size=$4
python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
--num_gpus=${num_gpu} \
--hidden_size=${hid_size} \
--forward_backward_only=1 \
> logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
}
if [ ! -d "logs" ]; then
mkdir logs
fi
#--num_gpus--lstm_num--hiddne_size--batch_size--#
test 4 2 256 128
test 4 2 256 256
test 4 2 256 512
test 4 2 512 128
test 4 2 512 256
test 4 2 512 512
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册