提交 fd4eeaf5 编写于 作者: L liaogang

Merge conflict with maxout layer

...@@ -3,4 +3,6 @@ build/ ...@@ -3,4 +3,6 @@ build/
*.user *.user
.vscode .vscode
.idea .idea
\ No newline at end of file .project
.pydevproject
...@@ -2,9 +2,17 @@ language: cpp ...@@ -2,9 +2,17 @@ language: cpp
cache: ccache cache: ccache
sudo: required sudo: required
dist: trusty dist: trusty
os:
- linux
- osx
env: env:
- JOB=DOCS - JOB=DOCS
- JOB=BUILD_AND_TEST - JOB=BUILD_AND_TEST
matrix:
exclude:
- os: osx
env: JOB=DOCS # Only generate documentation in linux
addons: addons:
apt: apt:
packages: packages:
...@@ -27,9 +35,11 @@ addons: ...@@ -27,9 +35,11 @@ addons:
- libgoogle-glog-dev - libgoogle-glog-dev
- libgflags-dev - libgflags-dev
- libgtest-dev - libgtest-dev
- graphviz
before_install: before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark - pip install wheel protobuf sphinx breathe recommonmark
- sudo paddle/scripts/travis/before_install.sh
script: script:
- paddle/scripts/travis/main.sh - paddle/scripts/travis/main.sh
notifications: notifications:
......
...@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) ...@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C) project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8) set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b1) set(PADDLE_PATCH_VERSION 0b2)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
...@@ -104,7 +104,7 @@ else() ...@@ -104,7 +104,7 @@ else()
endif(NOT WITH_GPU) endif(NOT WITH_GPU)
if(WITH_DOUBLE) if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double) set(ACCURACY double)
else(WITH_DOUBLE) else(WITH_DOUBLE)
set(ACCURACY float) set(ACCURACY float)
......
...@@ -17,10 +17,17 @@ ...@@ -17,10 +17,17 @@
## Find MKL First. ## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL") set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) find_path(MKL_INCLUDE_DIR mkl.h PATHS
find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib) ${MKL_ROOT}/include)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib) find_library(MKL_CORE_LIB NAMES mkl_core PATHS
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib) ${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
......
...@@ -64,7 +64,9 @@ set(COMMON_FLAGS ...@@ -64,7 +64,9 @@ set(COMMON_FLAGS
-Wdelete-non-virtual-dtor -Wdelete-non-virtual-dtor
-Wno-unused-parameter -Wno-unused-parameter
-Wno-error=literal-suffix -Wno-error=literal-suffix
-Wno-error=unused-local-typedefs) -Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
foreach(flag ${COMMON_FLAGS}) foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cflag(CMAKE_C_FLAGS ${flag})
......
...@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME) ...@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME)
cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
endmacro() endmacro()
# Creates a C resource file from the contents of a given resource file.
#
# Arguments:
#   res_file - path of the file whose bytes are embedded.
#   output   - path of the C source file; it is truncated and rewritten.
#
# The generated file defines `const unsigned char <name>[]` holding the bytes
# of res_file and `const unsigned <name>_size`, where <name> is the last path
# component of res_file with every '.', ' ' and '-' replaced by '_'.
#
# All variable expansions are quoted so that an empty value (for example the
# hex dump of an empty file) or a value containing ';' is still passed as
# exactly one argument.
function(create_resources res_file output)
    # Create empty output file
    file(WRITE "${output}" "")
    # Get short filename
    string(REGEX MATCH "([^/]+)$" filename "${res_file}")
    # Replace filename spaces & extension separator for C compatibility
    string(REGEX REPLACE "\\.| |-" "_" filename "${filename}")
    # Read hex data from file
    file(READ "${res_file}" filedata HEX)
    # Convert hex data for C compatibility
    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata "${filedata}")
    # Append data to output file
    file(APPEND "${output}" "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
endfunction()
data/raw_data
data/*.list
mnist_vgg_model
plot.png
train.log
*pyc
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Write the two list files into the current directory. Each list holds a
# single line: the filename prefix of one data split.
# `with` closes the file even if the write raises.
for list_name, data_prefix in (("train.list", "./data/raw_data/train"),
                               ("test.list", "./data/raw_data/t10k")):
    with open("./" + list_name, "w") as o:
        o.write(data_prefix + "\n")
\ No newline at end of file
#!/usr/bin/env sh
# This script downloads the mnist data and unzips it.
set -e

# Absolute path of the directory that contains this script.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"

# Start from an empty raw_data directory on every run.
rm -rf "$DIR/raw_data"
mkdir "$DIR/raw_data"

cd "$DIR/raw_data"

echo "Downloading..."

for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
    if [ ! -e "$fname" ]; then
        wget --no-check-certificate "http://yann.lecun.com/exdb/mnist/${fname}.gz"
        gunzip "${fname}.gz"
    fi
done

# Regenerate the list files next to this script.
# "$DIR" is quoted so that a path containing spaces does not break the cd.
cd "$DIR"
rm -f *.list
python generate_list.py
from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types={
    'pixel': dense_vector(28 * 28),
    'label': integer_value(10)
})
def process(settings, filename):  # settings is not used currently.
    """Yield the samples of one data split.

    :param settings: provider settings object (unused).
    :param filename: path prefix of the split; "-images-idx3-ubyte" and
        "-labels-idx1-ubyte" are appended to locate the image and label files.
    :return: generator of dicts with a 'pixel' list of 28 * 28 floats in
        [0, 1] and an integer 'label'.
    """
    imgf = filename + "-images-idx3-ubyte"
    labelf = filename + "-labels-idx1-ubyte"

    # Define number of samples for train/test
    if "train" in filename:
        n = 60000
    else:
        n = 10000

    # `with` closes both files even when the second open fails, a read
    # raises, or the consumer stops iterating before the generator finishes.
    with open(imgf, "rb") as f:
        with open(labelf, "rb") as l:
            # Skip the leading bytes that precede the per-sample data.
            f.read(16)
            l.read(8)

            for i in range(n):
                label = ord(l.read(1))
                pixels = []
                for j in range(28 * 28):
                    # One byte per pixel, scaled from 0..255 to 0..1.
                    pixels.append(float(ord(f.read(1))) / 255.0)
                yield {"pixel": pixels, 'label': label}
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# The training command is piped into tee. Without pipefail the pipeline
# returns tee's exit status, so a failed `paddle train` would not stop the
# script and the plot step would run on an incomplete log.
set -o pipefail

config=vgg_16_mnist.py
output=./mnist_vgg_model
log=train.log

# Train on CPU with a single trainer; stdout and stderr both go to $log.
paddle train \
--config="$config" \
--dot_period=10 \
--log_period=100 \
--test_all_data_in_one_period=1 \
--use_gpu=0 \
--trainer_count=1 \
--num_passes=100 \
--save_dir="$output" \
2>&1 | tee "$log"

# Render the curves found in the training log.
python -m paddle.utils.plotcurve -i "$log" > plot.png
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir='./data/'
define_py_data_sources2(train_list= data_dir + 'train.list',
test_list= data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(
batch_size = 128,
learning_rate = 0.1 / 128.0,
learning_method = MomentumOptimizer(0.9),
regularization = L2Regularization(0.0005 * 128)
)
#######################Network Configuration #############
data_size=1*28*28
label_size=10
img = data_layer(name='pixel', size=data_size)
# small_vgg is predefined in trainer_config_helpers.network
predict = small_vgg(input_image=img,
num_channels=1,
num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
inputs(img, lbl)
outputs(classification_cost(input=predict, label=lbl))
else:
outputs(predict)
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
set -e set -e
export LC_ALL=C
mkdir -p data/tmp mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle # uniq and shuffle
......
...@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py ...@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py #cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py #cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py #cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
paddle train \ paddle train \
--config=$cfg \ --config=$cfg \
--save_dir=./output \ --save_dir=./output \
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
# First pair of the stack: a 128-wide projection of the embedding and an
# LSTM on top of it.
hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0]
# Add seven more fc + lstm pairs; each fc takes both layers of the previous
# pair as its input.
for i in range(1,8):
fc = fc_layer(input=input_layers, size=128)
# reverse is True for odd i, so the LSTM direction alternates between pairs.
lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1,)
input_layers = [fc, lstm]
# Max-pool the LSTM created in the final loop iteration.
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
...@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf, ...@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
encoded_vector = concat_layer(input=[src_forward, src_backward]) encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj: with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(encoded_vector) encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward) backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size, with mixed_layer(size=decoder_size,
act=TanhActivation(), ) as decoder_boot: act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(backward_first) decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word): def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder', decoder_mem = memory(name='gru_decoder',
...@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf, ...@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
decoder_state=decoder_mem, ) decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs: with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(context) decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(current_word) decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder', gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs, input=decoder_inputs,
......
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Download into the directory that contains this script.
# "$DIR" is quoted so that a path containing spaces does not break the cd.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"

wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import gzip
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
)
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)
OOV_POLICY_IGNORE = 0
OOV_POLICY_USE = 1
OOV_POLICY_ERROR = 2
num_original_columns = 3
# Feature combination patterns.
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
[[-2,0]],
[[-1,0]],
[[0,0]],
[[1,0]],
[[2,0]],
[[-1,0], [0,0]],
[[0,0], [1,0]],
[[-2,1]],
[[-1,1]],
[[0,1]],
[[1,1]],
[[2,1]],
[[-2,1], [-1,1]],
[[-1,1], [0,1]],
[[0,1], [1,1]],
[[1,1], [2,1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
]
dict_label = {
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
}
def make_features(sequence):
    """Append the combined pattern features to every timestep, in place.

    :param sequence: list of timesteps, each a list of column strings. One
        string per entry of the module-level `patterns` is appended to each
        timestep. Nothing is returned.
    """
    length = len(sequence)
    if length == 0:
        # Nothing to extend; also avoids indexing sequence[0] below.
        return
    num_features = len(sequence[0])

    def get_features(pos):
        # Positions before the start / after the end of the sequence are
        # replaced by '#B<k>' / '#E<k>' markers, repeated for every column.
        if pos < 0:
            return ['#B%s' % -pos] * num_features
        if pos >= length:
            return ['#E%s' % (pos - length + 1)] * num_features
        return sequence[pos]

    for i in xrange(length):
        for pattern in patterns:
            # Join the referenced (relative position, column) cells with '/'.
            fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
            sequence[i].append(fname)
def create_dictionaries(filename, cutoff, oov_policy):
    '''
    Build one feature-to-id dictionary per column from a gzipped source file.

    Source file format:
    Each line is for one timestep. The features are separated by space.
    An empty line indicates end of a sequence.

    cutoff: a list of numbers. If count of a feature is smaller than this,
    it will be ignored.
    if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
    i-th column.

    return a list of dict for each column
    '''
    def add_to_dict(sequence, dicts):
        # Count every feature occurrence, column by column.
        num_features = len(dicts)
        for features in sequence:
            l = len(features)
            # Report the offending timestep itself: the enclosing `line`
            # variable is always the blank separator line at this point.
            assert l == num_features, \
                "Wrong number of features " + ' '.join(features)
            for i in xrange(l):
                if features[i] in dicts[i]:
                    dicts[i][features[i]] += 1
                else:
                    dicts[i][features[i]] = 1

    num_features = len(cutoff)
    dicts = []
    for i in xrange(num_features):
        dicts.append(dict())

    f = gzip.open(filename, 'rb')
    try:
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                # Skip empty sequences produced by consecutive blank lines.
                if sequence:
                    make_features(sequence)
                    add_to_dict(sequence, dicts)
                    sequence = []
                continue
            features = line.split(' ')
            sequence.append(features)
        if sequence:
            # The file did not end with a blank line; count the last
            # sequence as well instead of silently dropping it.
            make_features(sequence)
            add_to_dict(sequence, dicts)
    finally:
        # Close the file even when a malformed line aborts the scan.
        f.close()

    # Replace the counts by consecutive ids and drop the rare features.
    for i in xrange(num_features):
        dct = dicts[i]
        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
        todo = []
        for k, v in dct.iteritems():
            if v < cutoff[i]:
                todo.append(k)
            else:
                dct[k] = n
                n += 1

        if oov_policy[i] == OOV_POLICY_USE:
            # placeholder so that len(dct) will be the number of features
            # including OOV
            dct['#OOV#'] = 0

        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
        for k in todo:
            del dct[k]

    return dicts
def initializer(settings, **xargs):
    """Init hook: build the dictionaries and declare the provider's slots.

    Stores `dicts`, `oov_policy` and `input_types` on `settings`. There is
    one integer-sequence slot per original column and, when `patterns` is
    non-empty, one sparse binary slot sized to the sum of all pattern
    dictionaries.
    """
    pattern_count = len(patterns)
    cutoff = [3, 1, 0] + [3] * pattern_count
    oov_policy = ([OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] +
                  [OOV_POLICY_IGNORE] * pattern_count)

    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
    # The third column uses the fixed label dictionary instead of the
    # one built from the data.
    dicts[2] = dict_label
    settings.dicts = dicts
    settings.oov_policy = oov_policy

    slot_types = []
    for column in xrange(num_original_columns):
        slot_size = len(dicts[column])
        slot_types.append(integer_sequence(slot_size))
        logger.info("slot %s size=%s" % (column, slot_size))

    if patterns:
        # All pattern dictionaries share one sparse slot.
        dim = sum(len(d) for d in dicts[num_original_columns:])
        slot_types.append(sparse_binary_vector_sequence(dim))
        logger.info("feature size=%s" % dim)

    settings.input_types = slot_types
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
    '''
    Yield one sample per sequence read from the gzipped file `filename`.

    A sample holds one id list per original column and, when `patterns` is
    non-empty, one extra list with a vector of active feature ids per
    timestep.

    if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
    existed in dicts[i] will be assigned to id 0.
    if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
    in dicts[i].
    '''
    input_file = filename
    dicts = settings.dicts
    oov_policy = settings.oov_policy

    def gen_sample(sequence):
        num_features = len(dicts)
        sample = [list() for i in xrange(num_original_columns)]
        if patterns:
            sample.append([])
        for features in sequence:
            # Report the offending timestep itself: the enclosing `line`
            # variable is always the blank separator line at this point.
            assert len(features) == num_features, \
                "Wrong number of features: " + ' '.join(features)
            for i in xrange(num_original_columns):
                id = dicts[i].get(features[i], -1)
                if id != -1:
                    sample[i].append(id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    sample[i].append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    sample[i].append(0)

            if patterns:
                # `dim` is the id offset of the current pattern dictionary
                # inside the shared sparse slot.
                dim = 0
                vec = []
                for i in xrange(num_original_columns, num_features):
                    id = dicts[i].get(features[i], -1)
                    if id != -1:
                        vec.append(dim + id)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        # `vec` is a plain list, so append to it directly
                        # (it has no `.ids` attribute).
                        vec.append(dim + 0)

                    dim += len(dicts[i])
                sample[-1].append(vec)
        return sample

    f = gzip.open(input_file, 'rb')
    try:
        num_sequences = 0
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                # Skip empty sequences produced by consecutive blank lines.
                if sequence:
                    make_features(sequence)
                    yield gen_sample(sequence)
                    sequence = []
                    num_sequences += 1
                continue
            features = line.split(' ')
            sequence.append(features)
        if sequence:
            # The file did not end with a blank line; emit the last
            # sequence as well instead of silently dropping it.
            make_features(sequence)
            yield gen_sample(sequence)
            num_sequences += 1
    finally:
        # Close the file even when the consumer stops iterating early.
        f.close()

    logger.info("num_sequences=%s" % num_sequences)
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 1
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-4),
average_window=0.5,
learning_rate=1e-1,
learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25,
)
num_label_types=23
def get_simd_size(size):
    """Round `size` up to the nearest multiple of 8."""
    blocks_of_eight = math.ceil(float(size) / 8)
    return int(blocks_of_eight) * 8
# Currently, in order to use sparse_update=True,
# the size has to be aligned.
num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types)
crf_input = fc_layer(
input=features,
size=num_label_types,
act=LinearActivation(),
bias_attr=False,
param_attr=ParamAttr(initial_std=0, sparse_update=True))
crf=crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
)
crf_decoding=crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
)
sum_evaluator(
name="error",
input=crf_decoding,
)
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
inputs(word, pos, chunk, features)
outputs(crf)
# Sequence Tagging
This demo is a sequence model for assigning a tag to each token in a sentence. The task is described on the <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a> page.
## Download data
```bash
cd demo/sequence_tagging
./data/get_data.sh
```
## Train model
```bash
cd demo/sequence_tagging
./train.sh
```
## Model description
We provide two models. One is a linear CRF model (linear_crf.py), which is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
<center>
<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
<thead>
<th scope="col" class="left">Model name</th>
<th scope="col" class="left">Number of parameters</th>
<th scope="col" class="left">F1 score</th>
</thead>
<tbody>
<tr>
<td class="left">linear_crf</td>
<td class="left"> 1.8M </td>
<td class="left"> 0.937</td>
</tr>
<tr>
<td class="left">rnn_crf</td>
<td class="left"> 960K </td>
<td class="left">0.941</td>
</tr>
</tbody>
</table>
</center>
<br>
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 16
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-5),
average_window=0.5,
learning_rate = 2e-3,
learning_rate_decay_a = 5e-7,
learning_rate_decay_b = 0.5,
)
word_dim=128
hidden_dim = 128
with_rnn = True
initial_std=1/math.sqrt(hidden_dim)
param_attr=ParamAttr(initial_std=initial_std)
cpu_layer_attr=ExtraLayerAttribute(device=-1)
default_device(0)
num_label_types=23
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types,
layer_attr=cpu_layer_attr)
emb = embedding_layer(
input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
hidden1 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(emb),
table_projection(pos, param_attr=param_attr)]
)
if with_rnn:
rnn1 = recurrent_layer(
act=ReluActivation(),
bias_attr=True,
input=hidden1,
param_attr=ParamAttr(initial_std=0),
)
hidden2 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(hidden1)
] + ([
full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
if with_rnn:
rnn2=recurrent_layer(
reverse=True,
act=ReluActivation(),
bias_attr=True,
input=hidden2,
param_attr=ParamAttr(initial_std=0),
)
crf_input = mixed_layer(
size=num_label_types,
bias_attr=False,
input=[
full_matrix_projection(hidden2),
] + ([
full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
layer_attr=cpu_layer_attr,
)
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
layer_attr=cpu_layer_attr,
)
sum_evaluator(
name="error",
input=crf_decoding,
)
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
inputs(word, pos, chunk, features)
outputs(crf)
#!/bin/bash
paddle train \
--config rnn_crf.py \
--parallel_nn=1 \
--use_gpu=1 \
--dot_period=10 \
--log_period=1000 \
--test_period=0 \
--num_passes=10
#!/bin/bash
paddle train \
--config linear_crf.py \
--use_gpu=0 \
--dot_period=100 \
--log_period=10000 \
--test_period=0 \
--num_passes=10
...@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD ...@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD
git push -f origin HEAD git push -f origin HEAD
``` ```
Now your Pull Request is updated with the latest version. Now your Pull Request is updated with the latest version.
## Revise your pull request
When you revise your pull request according to the reviewers' comments, please use 'git commit' instead of 'git commit --amend' to commit your changes, so that the reviewers can see the difference between the new pull request and the old pull request.
...@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme ...@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme
.. code-block:: bash .. code-block:: bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}" export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
......
...@@ -134,7 +134,7 @@ def process(settings, file_name): ...@@ -134,7 +134,7 @@ def process(settings, file_name):
You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies: You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
- The path of the training and testing data (`data/train.list`, `data/test.list`). - The path of the training and testing data (`data/train.list`, `data/test.list`).
- The location of the data provider file (`dataprovider_pow`). - The location of the data provider file (`dataprovider_bow`).
- The function to call to get data. (`process`). - The function to call to get data. (`process`).
- Additional arguments or data. Here it passes the path of word dictionary. - Additional arguments or data. Here it passes the path of word dictionary.
......
...@@ -73,6 +73,12 @@ img_pool_layer ...@@ -73,6 +73,12 @@ img_pool_layer
:members: img_pool_layer :members: img_pool_layer
:noindex: :noindex:
maxout_layer
------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: maxout_layer
:noindex:
Norm Layer Norm Layer
========== ==========
...@@ -130,6 +136,12 @@ gru_step_layer ...@@ -130,6 +136,12 @@ gru_step_layer
Recurrent Layer Group Recurrent Layer Group
===================== =====================
memory
------
.. automodule:: paddle.trainer_config_helpers.layers
:members: memory
:noindex:
recurrent_group recurrent_group
--------------- ---------------
.. automodule:: paddle.trainer_config_helpers.layers .. automodule:: paddle.trainer_config_helpers.layers
...@@ -377,6 +389,12 @@ ctc_layer ...@@ -377,6 +389,12 @@ ctc_layer
:members: ctc_layer :members: ctc_layer
:noindex: :noindex:
nce_layer
-----------
.. automodule:: paddle.trainer_config_helpers.layers
:members: nce_layer
:noindex:
hsigmoid hsigmoid
--------- ---------
.. automodule:: paddle.trainer_config_helpers.layers .. automodule:: paddle.trainer_config_helpers.layers
......
# 支持双层序列作为输入的Layer
## 概述
在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。
双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。
我们可以按照如下层次定义非序列,单层序列,以及双层序列。
+ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型
+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息
+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列
在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。
## pooling_layer
pooling_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>
```python
seq_pool = pooling_layer(input=layer,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列
- 输入:一个双层序列,或一个单层序列
- 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值)
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值)
## last_seq 和 first_seq
last_seq的使用示例如下(first_seq类似),详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>
```python
last = last_seq(input=layer,
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列
- 输入:一个双层序列或一个单层序列
- 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。
## expand_layer
expand_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>
```python
expand = expand_layer(input=layer1,
expand_as=layer2,
expand_level=ExpandLevel.FROM_TIMESTEP)
```
- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值):
- 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
- 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
- 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
- `expand_level=ExpandLevel.FROM_SEQUENCE`时:
- 作用:一个单层序列经过运算扩展成一个双层序列
- 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息
- 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。
\ No newline at end of file
# 双层RNN配置与示例
我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。
## 示例1:双进双出,subseq间无memory
配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。
### 读取双层序列的方法
首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式):
- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。
```text
2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。
2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 *
2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。
2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 .
2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 !
2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 *
2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格
```
- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。
```text
2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。
2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 *
2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。
2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 .
2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 !
2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 *
2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格
```
其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`):
- 单层序列的dataprovider如下:
- word_slot是integer_value_sequence类型,代表单层序列。
- label是integer_value类型,代表一个向量。
```python
def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)),
integer_value(3)]
@provider(init_hook=hook)
def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
label, comment = line.strip().split('\t')
label = int(''.join(label.split()))
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
yield word_slot, label
```
- 双层序列的dataprovider如下:
- word_slot是integer_value_sub_sequence类型,代表双层序列。
- label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。
- 关于dataprovider中input_types的详细用法,参见PyDataProvider2。
```python
def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sequence(3)]
@provider(init_hook=hook2)
def process2(settings, file_name):
with open(file_name) as fdata:
label_list = []
word_slot_list = []
for line in fdata:
if (len(line)) > 1:
label,comment = line.strip().split('\t')
label = int(''.join(label.split()))
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
label_list.append(label)
word_slot_list.append(word_slot)
else:
yield word_slot_list, label_list
label_list = []
word_slot_list = []
```
### 模型中的配置
首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。
```python
settings(batch_size=5)
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory
with mixed_layer(size=hidden_dim*4) as lstm_input:
lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory_group(input=lstm_input,
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm_last = last_seq(input=lstm)
with mixed_layer(size=label_dim,
act=SoftmaxActivation(),
bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
```
其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析:
- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。
- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。
- lstmemory:
- 单层序列过了一个mixed_layer和lstmemory_group。
- 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。
- last_seq:
- 单层序列直接取了最后一个元素
- 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。
- 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样的。
```python
settings(batch_size=2)
data = data_layer(name="word", size=dict_dim)
emb_group = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory
def lstm_group(lstm_group_input):
with mixed_layer(size=hidden_dim*4) as group_input:
group_input += full_matrix_projection(input=lstm_group_input)
lstm_output = lstmemory_group(input=group_input,
name="lstm_group",
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
return lstm_output
lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
step=lstm_group,
name="lstm_nest_group")
# hasSubseq ->(seqlastins) seq
lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
# seq ->(expand) hasSubseq
lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE)
# hasSubseq ->(average) seq
lstm_average = pooling_layer(input=lstm_expand,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
with mixed_layer(size=label_dim,
act=SoftmaxActivation(),
bias_attr=True) as output:
output += full_matrix_projection(input=lstm_average)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
```
## 示例2:双进双出,subseq间有memory
配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。
### 读取双层序列的方法
我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`):
```python
data = [
[[[1, 3, 2], [4, 5, 2]], 0],
[[[0, 2], [2, 5], [0, 1, 2]], 1],
]
@provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)])
def process_subseq(settings, file_name):
for d in data:
yield d
@provider(input_types=[integer_value_sequence(10),
                       integer_value(3)])
def process_seq(settings, file_name):
    # Data provider for the single-level RNN: every sample of `data` is
    # flattened by concatenating its sub-sequences in order, so that the
    # single-level sequence carries exactly the same words as the nested one
    # (e.g. [[1, 3, 2], [4, 5, 2]] becomes [1, 3, 2, 4, 5, 2]).
    # `settings` and `file_name` are unused; the samples come from `data`.
    for d in data:
        seq = []
        for subseq in d[0]:
            seq += subseq
        # d[1] is the sample's label, passed through unchanged.
        yield seq, d[1]
```
- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。
- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。
- 单双层序列的label都分别是0和1
### 模型中的配置
我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。
- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
```python
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
return fc_layer(input=[y, mem],
size=hidden_dim,
act=TanhActivation(),
bias_attr=True,
name="rnn_state")
out = recurrent_group(step=step, input=emb)
```
- 双层序列,外层memory是一个元素:
- 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。
- 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。
```python
def outer_step(x):
outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
def inner_step(y):
inner_mem = memory(name="inner_rnn_state",
size=hidden_dim,
boot_layer=outer_mem)
return fc_layer(input=[y, inner_mem],
size=hidden_dim,
act=TanhActivation(),
bias_attr=True,
name="inner_rnn_state")
inner_rnn_output = recurrent_group(
step=inner_step,
input=x)
last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
return inner_rnn_output
out = recurrent_group(step=outer_step, input=SubsequenceInput(emb))
```
- 双层序列,外层memory是单层序列:
- 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。
- 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。
## 示例3:双进双出,输入不等长
**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用<font color="red">targetInlink</font>表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)。
### 读取双层序列的方法
我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`):
```python
data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
]
@provider(input_types=[integer_value_sub_sequence(10),
integer_value_sub_sequence(10),
integer_value(2)],
should_shuffle=False)
def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider
for d in data2:
yield d
@provider(input_types=[integer_value_sequence(10),
integer_value_sequence(10),
integer_value(2)],
should_shuffle=False)
def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider
for d in data2:
words1=reduce(lambda x,y: x+y, d[0])
words2=reduce(lambda x,y: x+y, d[1])
yield words1, words2, d[2]
```
data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。
- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]]
- 双层序列:两个样本分别为
- **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]]
- **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。<br/>
- **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。
- 单双层序列中,两个样本的label都分别是0和1
### 模型中的配置
单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。
- 单层序列:
- 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
- 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。
```python
def step(x1, x2):
def calrnn(y):
mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim)
out = fc_layer(input = [y, mem],
size = hidden_dim,
act = TanhActivation(),
bias_attr = True,
name = 'rnn_state_' + y.name)
return out
encoder1 = calrnn(x1)
encoder2 = calrnn(x2)
return [encoder1, encoder2]
encoder1_rep, encoder2_rep = recurrent_group(
name="stepout",
step=step,
input=[emb1, emb2])
encoder1_last = last_seq(input = encoder1_rep)
encoder1_expandlast = expand_layer(input = encoder1_last,
expand_as = encoder2_rep)
context = mixed_layer(input = [identity_projection(encoder1_expandlast),
identity_projection(encoder2_rep)],
size = hidden_dim)
```
- 双层序列:
- 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。
- 函数`outer_step`中可以分别处理这两个特征,但我们需要用<font color=red>targetInlink</font>指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。
- 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
```python
def outer_step(x1, x2):
outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim)
outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim)
def inner_step1(y):
inner_mem = memory(name = 'inner_rnn_state_' + y.name,
size = hidden_dim,
boot_layer = outer_mem1)
out = fc_layer(input = [y, inner_mem],
size = hidden_dim,
act = TanhActivation(),
bias_attr = True,
name = 'inner_rnn_state_' + y.name)
return out
def inner_step2(y):
inner_mem = memory(name = 'inner_rnn_state_' + y.name,
size = hidden_dim,
boot_layer = outer_mem2)
out = fc_layer(input = [y, inner_mem],
size = hidden_dim,
act = TanhActivation(),
bias_attr = True,
name = 'inner_rnn_state_' + y.name)
return out
encoder1 = recurrent_group(
step = inner_step1,
name = 'inner1',
input = x1)
encoder2 = recurrent_group(
step = inner_step2,
name = 'inner2',
input = x2)
sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1')
sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2')
encoder1_expand = expand_layer(input = sentence_last_state1,
expand_as = encoder2)
return [encoder1_expand, encoder2]
encoder1_rep, encoder2_rep = recurrent_group(
name="outer",
step=outer_step,
input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
targetInlink=emb2)
encoder1_last = last_seq(input = encoder1_rep)
encoder1_expandlast = expand_layer(input = encoder1_last,
expand_as = encoder2_rep)
context = mixed_layer(input = [identity_projection(encoder1_expandlast),
identity_projection(encoder2_rep)],
size = hidden_dim)
```
## 示例4:beam_search的生成
TBD
\ No newline at end of file
# Recurrent Group教程
## 概述
序列数据是自然语言处理任务面对的一种主要输入数据类型。
一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。
双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。
在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。
更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。
目前,在PaddlePaddle中,能够对双层序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
## 相关概念
### 基本原理
`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。
PaddlePaddle中,`recurrent_group`的一个简单调用如下:
``` python
recurrent_group(step, input, reverse)
```
- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算
- input:输入,必须是一个单层序列,或者一个双层序列
- reverse:是否以逆序处理输入序列
使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。
### 输入
`recurrent_group`处理的输入序列主要分为以下三种类型:
- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。
- **只读Memory输入**:`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。
- **序列生成任务的输入**:`GeneratedInput`只用于在序列生成任务中指定输入数据。
### 输入示例
序列生成任务大多遵循encoder-decoder架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。
给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入:
- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。
- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machines](https://arxiv.org/abs/1410.5401)。
在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。
### 输出
`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。
### memory
memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出,因此,可以将memory理解为一个时延操作。
可以显式地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。
## 双层RNN介绍
`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。
利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。
- 单层(word-level)RNN:每个状态(state)对应一个词(word)。
- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。
为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。
## 双层RNN的使用
### 训练流程的使用方法
使用 `recurrent_group`需要遵循以下约定:
- **单进单出**:输入和输出都是单层序列。
- 如果有多个输入,不同输入序列含有的词语数必须严格相等。
- 输出一个单层序列,输出序列的词语数和输入序列一致。
- memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。
- boot_layer:memory的初始状态,默认初始状态为0,memory的is_seq参数必须为false。
- **双进双出**:输入和输出都是双层序列。
- 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。
- 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。
- memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。
- boot_layer:memory 初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。
- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。
### 生成流程的使用方法
使用`beam_search`需要遵循以下约定:
- 单层RNN:从一个word生成下一个word。
- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。
\ No newline at end of file
...@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本 ...@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本
+-----------------+------------------+------------------------+-----------------------+ +-----------------+------------------+------------------------+-----------------------+
| GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest | | GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest |
+-----------------+------------------+------------------------+-----------------------+ +-----------------+------------------+------------------------+-----------------------+
| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest | | CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+ +-----------------+------------------+------------------------+-----------------------+
| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest | | GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+ +-----------------+------------------+------------------------+-----------------------+
其中,横向包括三个版本,normal,devel和demo。 其中,横向包括三个版本,normal,devel和demo。
......
...@@ -47,6 +47,7 @@ extensions = [ ...@@ -47,6 +47,7 @@ extensions = [
'sphinx.ext.autosummary', 'sphinx.ext.autosummary',
'sphinx.ext.mathjax', 'sphinx.ext.mathjax',
'sphinx.ext.napoleon', 'sphinx.ext.napoleon',
'sphinx.ext.graphviz'
] ]
table_styling_embed_css = True table_styling_embed_css = True
......
####################
PaddlePaddle常见问题
####################
.. contents::
1. 如何减少PaddlePaddle的内存占用
---------------------------------
神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
PaddlePaddle的内存占用主要分为如下几个方面\:
* DataProvider缓冲池内存 (只针对内存)
* 神经元激活内存 (针对内存和显存)
* 参数内存 (针对内存和显存)
* 其他内存杂项
这其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,
这些内存就不考虑如何缩减了。
其他的内存的减少方法依次为
减少DataProvider缓冲池内存
++++++++++++++++++++++++++
PyDataProvider使用的是异步加载,同时在内存里直接随机选取数据来做Shuffle。即
.. graphviz::
digraph {
rankdir=LR;
数据文件 -> 内存池 -> PaddlePaddle训练
}
所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
.. literalinclude:: reduce_min_pool_size.py
这样做可以极大的减少内存占用,并且可能会加速训练过程。 详细文档参考 `这里
<../ui/data_provider/pydataprovider2.html#provider>`_ 。
神经元激活内存
++++++++++++++
神经网络在训练的时候,会对每一个激活暂存一些数据,包括激活,残差等等。
在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
的时间步信息成正比。
所以,做法可以有两种。他们是
* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
但是突然有一个10000长的序列,就很容易导致内存超限。特别是在LSTM等RNN中。
参数内存
++++++++
PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
例如如果使用 :code:`adadelta` 算法,则需要使用参数规模大约5倍的内存。 如果参数保存下来的
文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
可以考虑使用一些优化算法,例如 :code:`momentum`。
2. 如何加速PaddlePaddle的训练速度
---------------------------------
PaddlePaddle是神经网络训练平台,加速PaddlePaddle训练有如下几个方面\:
* 减少数据载入的耗时
* 加速训练速度
* 利用更多的计算资源
减少数据载入的耗时
++++++++++++++++++
使用 :code:`pydataprovider` 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
:code:`DataProvider` 缓存池的减小,和之前通过减小缓存池来减小内存占用的原理一致。
.. literalinclude:: reduce_min_pool_size.py
同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
加速训练速度
++++++++++++
PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任意一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True` 。
这里使用简单的 :code:`word2vec` 训练语言模型举例,具体使用方法为\:
使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
.. literalinclude:: word2vec_dataprovider.py
这个任务的配置为\:
.. literalinclude:: word2vec_config.py
更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
利用更多的计算资源
++++++++++++++++++
利用更多的计算资源可以分为以下几个方式来进行\:
* 单机CPU训练
* 使用多线程训练。设置命令行参数 :code:`trainer_count`,即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
* 单机GPU训练
* 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
* 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练,使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
* 多机训练
* 使用多机训练的方法也比较简单,需要先在每个节点启动 :code:`paddle pserver`,再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址。
* 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
3. 遇到“非法指令”或者是“illegal instruction”
--------------------------------------------
paddle在进行计算的时候为了提升计算性能,使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。(另:用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉,请当成是不支持,看下面的解决方案)
解决办法是\:
* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
* 或者,使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
4. 如何选择SGD算法的学习率
--------------------------
在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
5. 如何初始化参数
-----------------
默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
.. code-block:: python
hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。
6. 如何共享参数
---------------
PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
简单的全连接网络,参数共享的配置示例为\:
.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
# Illustrative data provider that keeps the in-memory pool minimal
# (min_pool_size=0) and shuffles the data file itself before reading it.
# The "..." in the decorator stands for the remaining @provider arguments,
# which are omitted in this snippet.
@provider(min_pool_size=0, ...)
def process(settings, filename):
# Write a shuffled copy of the input to "<filename>.shuf" using the external
# `shuf` command, then stream samples from that copy.
# NOTE(review): filename is interpolated into the shell command unquoted, so
# paths containing spaces or shell metacharacters will break -- confirm that
# inputs are trusted.
os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before.
with open('%s.shuf' % filename, 'r') as f:
for line in f:
# get_sample_from_line is not defined in this snippet; presumably it parses
# one text line into one sample -- confirm.
yield get_sample_from_line(line)
\ No newline at end of file
... # the settings and the data provider definition are omitted.
DICT_DIM = 3000  # dictionary dimension.
# Input layer holding the ids of the context words.
word_ids = data_layer('word_ids', size=DICT_DIM)
# Embedding whose parameter is updated sparsely (sparse_update=True).
emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
# Sum-pool the context word embeddings.
emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
# Softmax over the dictionary; SoftmaxActivation is the activation class the
# other configurations use (a bare `Softmax` is not used anywhere else).
predict = fc_layer(input=emb_sum, size=DICT_DIM, act=SoftmaxActivation())
outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM)))
\ No newline at end of file
# Dictionary dimension; used as the size of both declared input types.
DICT_DIM=3000
# Illustrative word2vec data provider: each sample is a sequence of context
# word ids plus the id of the word to predict.
@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
def process(settings, filename):
with open(filename) as f:
# Yield the context word ids used to predict the inner word id,
# such as [28, 29, 10, 4], 4
# It means the sentence is 28, 29, 4, 10, 4.
# NOTE(review): read_next_from_file is not defined in this snippet, and as
# written only a single sample is yielded per file -- confirm intent.
yield read_next_from_file(f)
\ No newline at end of file
...@@ -3,6 +3,7 @@ PaddlePaddle文档 ...@@ -3,6 +3,7 @@ PaddlePaddle文档
使用指南 使用指南
-------- --------
* `快速入门 <demo/quick_start/index.html>`_ * `快速入门 <demo/quick_start/index.html>`_
* `编译与安装 <build_and_install/index.html>`_ * `编译与安装 <build_and_install/index.html>`_
* `用户接口 <ui/index.html>`_ * `用户接口 <ui/index.html>`_
...@@ -16,4 +17,13 @@ PaddlePaddle文档 ...@@ -16,4 +17,13 @@ PaddlePaddle文档
算法教程 算法教程
-------- --------
* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_
* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
* `双层RNN示例 <algorithm/rnn/hierarchical-rnn.html>`_
* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
常见问题
--------
* `常见问题 <faq/index.html>`_
...@@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider # Define a py data provider
@provider(input_types=[ @provider(input_types={
dense_vector(28 * 28), 'pixel': dense_vector(28 * 28),
integer_value(10) 'label': integer_value(10)
]) })
def process(settings, filename): # settings is not used currently. def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file f = open(filename, 'r') # open one of training file
...@@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently. ...@@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently.
pixels_float.append(float(each_pixel_str)) pixels_float.append(float(each_pixel_str))
# give data to paddle. # give data to paddle.
yield { "pixel": pixels_float, 'label': int(label) } yield {"pixel": pixels_float, 'label': int(label)}
f.close() # close file f.close() # close file
...@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: ...@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size 是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size
* cache 是数据缓存的策略,参考 `cache`_ * cache 是数据缓存的策略,参考 `cache`_
* init_hook 是初始化时调用的函数,参考 `init_hook`_ * init_hook 是初始化时调用的函数,参考 `init_hook`_
* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以
返回一个list或者tuple。如果是false的话,只能够返回list或者tuple
* check 设置成true的话,会根据input_types检查数据的合法性。 * check 设置成true的话,会根据input_types检查数据的合法性。
* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 * check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果
check是false的话,没有作用。 check是false的话,没有作用。
......
...@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then ...@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
esac esac
done done
shift $(($OPTIND - 1)) shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH export PYTHONPATH=$PYPATH:$PYTHONPATH
$@ $@
else else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
......
...@@ -2,10 +2,17 @@ set(AVX_SOURCES ...@@ -2,10 +2,17 @@ set(AVX_SOURCES
src/hl_math.cc src/hl_math.cc
src/hl_avx_functions.cc src/hl_avx_functions.cc
) )
set(CUDA_SOURCES
src/hl_time.cc if(WITH_AVX)
src/hl_cpu_functions.cc set(CUDA_SOURCES
${AVX_SOURCES}) src/hl_time.cc
src/hl_cpu_functions.cc
${AVX_SOURCES})
else()
set(CUDA_SOURCES
src/hl_time.cc
src/hl_cpu_functions.cc)
endif()
set(CUDA_CXX_WITH_GPU_SOURCES set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cuda_cublas.cc src/hl_cuda_cublas.cc
......
...@@ -185,7 +185,7 @@ typedef struct { ...@@ -185,7 +185,7 @@ typedef struct {
size_t nnz; size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s; } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
/** /**
* HPPL data type: real (float or double) * HPPL data type: real (float or double)
* *
......
...@@ -169,7 +169,7 @@ extern void hl_avgpool_forward( ...@@ -169,7 +169,7 @@ extern void hl_avgpool_forward(
* @brief Maximum pool backward. * @brief Maximum pool backward.
* *
* @param[in] frameCnt batch size of input image. * @param[in] frameCnt batch size of input image.
* @param[in] outGrad input data. * @param[in] outGrad output grad data.
* @param[in] channels number of channel. * @param[in] channels number of channel.
* @param[in] height image height. * @param[in] height image height.
* @param[in] width image width. * @param[in] width image width.
...@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad, ...@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad,
const size_t outputW, const size_t outputW,
const size_t numChannels); const size_t numChannels);
/**
 * @brief MaxOut forward.
 *
 * For every output element the maximum over `groups` candidate input values
 * is selected, and the position of the winning candidate is recorded.
 *
 * @param[in] inData input data; the implementation reads
 * batchSize * size * groups values.
 * @param[out] outData output data (batchSize * size values): the
 * selected maxima.
 * @param[out] idData output maxId (batchSize * size values): index in
 * [0, groups) of the candidate that produced each
 * maximum.
 * @param[in] batchSize batchSize.
 * @param[in] size per-sample OUTPUT size = number of output channels
 * * image height * image width.
 * @param[in] featLen feature length = image height * image width.
 * @param[in] groups number of groups, i.e. how many consecutive input
 * channels are reduced into one output channel.
 */
extern void hl_maxout_forward(
 const real* inData, real* outData, int* idData,
 size_t batchSize, size_t size, size_t featLen, size_t groups);
/**
 * @brief MaxOut backward.
 *
 * Routes each output gradient back to the single input position that won
 * the maximum in the forward pass. Gradients are accumulated (added) into
 * inGrad, not assigned.
 *
 * @param[in,out] inGrad input grad data (batchSize * size * groups
 * values); updated in place with +=.
 * @param[in] outGrad output grad data (batchSize * size values).
 * @param[in] idData output maxId recorded by hl_maxout_forward
 * (batchSize * size values).
 * @param[in] batchSize batchSize.
 * @param[in] size per-sample OUTPUT size = number of output
 * channels * image height * image width.
 * @param[in] featLen feature length = image height * image width.
 * @param[in] groups number of groups.
 */
extern void hl_maxout_backward(
 real* inGrad, const real* outGrad, const int* idData,
 size_t batchSize, size_t size, size_t featLen, size_t groups);
#endif /* HL_CNN_H_ */ #endif /* HL_CNN_H_ */
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/math/MathFunctions.h" #include "paddle/math/MathFunctions.h"
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float> #define CBLAS_GEMM paddle::gemm<float>
#else #else
#define CBLAS_GEMM paddle::gemm<double> #define CBLAS_GEMM paddle::gemm<double>
......
...@@ -28,7 +28,7 @@ namespace hppl { ...@@ -28,7 +28,7 @@ namespace hppl {
const real min = SIGMOID_THRESHOLD_MIN; const real min = SIGMOID_THRESHOLD_MIN;
const real max = SIGMOID_THRESHOLD_MAX; const real max = SIGMOID_THRESHOLD_MAX;
real tmp = (a < min) ? min : ((a > max) ? max : a); real tmp = (a < min) ? min : ((a > max) ? max : a);
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
return __fdividef(1.0f, 1.0f + __expf(-tmp)); return __fdividef(1.0f, 1.0f + __expf(-tmp));
#else #else
return 1.0 / (1.0 + exp(-tmp)); return 1.0 / (1.0 + exp(-tmp));
...@@ -36,7 +36,7 @@ namespace hppl { ...@@ -36,7 +36,7 @@ namespace hppl {
} }
__device__ static real tanh(const real a) { __device__ static real tanh(const real a) {
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f; return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
#else #else
return (2.0 / (1.0 + exp(-2.0*a))) - 1.0; return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
......
...@@ -30,7 +30,7 @@ limitations under the License. */ ...@@ -30,7 +30,7 @@ limitations under the License. */
#define INLINE inline #define INLINE inline
#endif #endif
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
#define DEVICE_FMAX fmaxf #define DEVICE_FMAX fmaxf
#define DEVICE_FMIN fminf #define DEVICE_FMIN fminf
#else #else
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
// typedef void* vecType; // typedef void* vecType;
#include <vector_types.h> #include <vector_types.h>
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType; typedef float4 vecType;
#else #else
typedef double2 vecType; typedef double2 vecType;
...@@ -30,7 +30,7 @@ typedef double2 vecType; ...@@ -30,7 +30,7 @@ typedef double2 vecType;
#include <mmintrin.h> #include <mmintrin.h>
#include <xmmintrin.h> #include <xmmintrin.h>
#include <emmintrin.h> #include <emmintrin.h>
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
typedef __m128 vecType; typedef __m128 vecType;
#else #else
typedef __m128d vecType; typedef __m128d vecType;
......
...@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad, ...@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
*/ */
extern void hl_sequence2batch_copy(real *batch, extern void hl_sequence2batch_copy(real *batch,
real *sequence, real *sequence,
int *batchIndex, const int *batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch); bool seq2batch);
......
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#define VECTOR_SIZE 16 #define VECTOR_SIZE 16
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
/* number of float in vector */ /* number of float in vector */
#define VECTOR_LEN 4 #define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1 #define VECTOR_SET _mm_set_ps1
...@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) { ...@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast<size_t>(ptr)); return hl_check_align(reinterpret_cast<size_t>(ptr));
} }
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
template <class Agg> template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) { inline real hl_agg_op(Agg agg, vecType mm) {
__m128 lo = _mm_unpacklo_ps(mm, mm); __m128 lo = _mm_unpacklo_ps(mm, mm);
......
...@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad, ...@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad,
const size_t outputW, const size_t outputW,
const size_t numChannels) {} const size_t numChannels) {}
inline void hl_maxout_forward(
const real* inData, real* outData, int* idData,
size_t batchSize, size_t size, size_t featLen, size_t group) {}
inline void hl_maxout_backward(
real* inGrad, const real* outGrad, const int* idData,
size_t batchSize, size_t size, size_t featLen, size_t group) {}
#endif // HL_CNN_STUB_H_ #endif // HL_CNN_STUB_H_
...@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad, ...@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
inline void hl_sequence2batch_copy(real *batch, inline void hl_sequence2batch_copy(real *batch,
real *sequence, real *sequence,
int *batchIndex, const int *batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) {} bool seq2batch) {}
......
...@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad, ...@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad,
threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad, threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad,
outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW); outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW);
CHECK_SYNC("hl_bilinear_backward failed"); CHECK_SYNC("hl_bilinear_backward failed");
} }
\ No newline at end of file
/**
 * MaxOut forward kernel: each thread produces one output element.
 *
 * A thread whose global index is `index` handles sample `index / size`
 * and, inside that sample, output channel `(index % size) / featLen` at
 * feature position `(index % size) % featLen`. Its `groups` candidates sit
 * `featLen` elements apart in inData; the largest one goes to
 * outData[index] and its group number to idData[index]. Ties keep the
 * lowest group number because the comparison is strict.
 *
 * @param nthreads number of output elements; threads at or past it idle.
 * @param inData   input values.
 * @param outData  receives the selected maxima.
 * @param idData   receives the winning group index of each maximum.
 * @param size     output elements per sample.
 * @param featLen  elements per channel.
 * @param groups   number of candidates reduced into each output element.
 */
__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
                                real * outData, int* idData,
                                size_t size, size_t featLen, size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(index < nthreads)) {
    return;  // surplus thread of the last block
  }

  const size_t sample = index / size;
  const size_t offsetInSample = index % size;
  const size_t outChannel = offsetInSample / featLen;
  const size_t position = offsetInSample % featLen;

  // First candidate (group 0) of this output element.
  const real* candidates =
      inData + (sample * size + outChannel * featLen) * groups + position;

  real best = candidates[0];
  int bestGroup = 0;
  size_t g = 1;
  while (g < groups) {
    const real candidate = candidates[g * featLen];
    if (candidate > best) {
      best = candidate;
      bestGroup = g;
    }
    ++g;
  }

  outData[index] = best;
  idData[index] = bestGroup;
}
/**
 * Launches the MaxOut forward kernel on STREAM_DEFAULT.
 *
 * One thread is started per output element (batchSize * size in total),
 * in blocks of 1024 threads.
 *
 * @param inData    input values.
 * @param outData   receives the selected maxima.
 * @param idData    receives the winning group index of each maximum.
 * @param batchSize number of samples.
 * @param size      output elements per sample.
 * @param featLen   elements per channel.
 * @param groups    number of candidates reduced into each output element.
 */
void hl_maxout_forward(const real* inData, real* outData,
                       int* idData, size_t batchSize, size_t size,
                       size_t featLen, size_t groups) {
  const int kThreadsPerBlock = 1024;
  // Keep the element count in size_t: the kernel takes its bound as size_t,
  // so narrowing the product to int first would hand it a truncated value.
  // NOTE(review): the kernel still derives its global index as an int, so
  // counts beyond INT_MAX remain unsupported.
  const size_t numOutputs = size * batchSize;
  if (numOutputs > 0) {
    // A grid of zero blocks is an invalid launch configuration, so the
    // launch is skipped entirely when there is nothing to compute.
    const int blocks = static_cast<int>(
        (numOutputs + kThreadsPerBlock - 1) / kThreadsPerBlock);
    maxoutFpCompute<<< blocks, kThreadsPerBlock, 0, STREAM_DEFAULT >>>(
        numOutputs, inData, outData, idData, size, featLen, groups);
  }
  CHECK_SYNC("hl_maxout_forward failed");
}
/**
 * MaxOut backward kernel: each thread handles one output-gradient element.
 *
 * The thread looks up which group won the forward maximum for its output
 * element (idData) and adds the element's output gradient to the matching
 * input-gradient slot. Every thread targets a different slot, since the
 * slot is determined by (sample, output channel, feature position) plus
 * the recorded group.
 *
 * @param nthreads number of output elements; threads at or past it idle.
 * @param inGrad   input gradients, accumulated into with +=.
 * @param outGrad  output gradients.
 * @param idData   winning group index per output element.
 * @param size     output elements per sample.
 * @param featLen  elements per channel.
 * @param groups   number of candidates behind each output element.
 */
__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
                                const real* outGrad, const int* idData,
                                size_t size, size_t featLen, size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (!(index < nthreads)) {
    return;  // surplus thread of the last block
  }

  const size_t sample = index / size;
  const size_t offsetInSample = index % size;
  const size_t outChannel = offsetInSample / featLen;
  const size_t position = offsetInSample % featLen;

  // Start of this sample in the output-sized buffers (outGrad, idData).
  const size_t sampleBase = sample * size;
  const size_t winner = idData[sampleBase + offsetInSample];

  // Slot of the winning candidate inside this sample's input gradient,
  // whose per-sample extent is `groups` times the output extent.
  const size_t slot = (outChannel * groups + winner) * featLen + position;
  inGrad[sampleBase * groups + slot] += outGrad[sampleBase + offsetInSample];
}
/**
 * Launches the MaxOut backward kernel on STREAM_DEFAULT.
 *
 * One thread is started per output-gradient element (batchSize * size in
 * total), in blocks of 1024 threads.
 *
 * @param inGrad    input gradients, accumulated into by the kernel.
 * @param outGrad   output gradients.
 * @param idData    winning group index per output element.
 * @param batchSize number of samples.
 * @param size      output elements per sample.
 * @param featLen   elements per channel.
 * @param groups    number of candidates behind each output element.
 */
void hl_maxout_backward(real* inGrad, const real* outGrad,
                        const int* idData, size_t batchSize, size_t size,
                        size_t featLen, size_t groups) {
  const int kThreadsPerBlock = 1024;
  // Keep the element count in size_t: the kernel takes its bound as size_t,
  // so narrowing the product to int first would hand it a truncated value.
  // NOTE(review): the kernel still derives its global index as an int, so
  // counts beyond INT_MAX remain unsupported.
  const size_t numOutputs = size * batchSize;
  if (numOutputs > 0) {
    // A grid of zero blocks is an invalid launch configuration, so the
    // launch is skipped entirely when there is nothing to propagate.
    const int blocks = static_cast<int>(
        (numOutputs + kThreadsPerBlock - 1) / kThreadsPerBlock);
    maxoutBpCompute<<< blocks, kThreadsPerBlock, 0, STREAM_DEFAULT >>>(
        numOutputs, inGrad, outGrad, idData, size, featLen, groups);
  }
  CHECK_SYNC("hl_maxout_backward failed");
}
...@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) ...@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */ } /* namespace dynload */
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
#define CUBLAS_GEAM dynload::cublasSgeam #define CUBLAS_GEAM dynload::cublasSgeam
#define CUBLAS_GEMV dynload::cublasSgemv #define CUBLAS_GEMV dynload::cublasSgemv
#define CUBLAS_GEMM dynload::cublasSgemm #define CUBLAS_GEMM dynload::cublasSgemm
......
...@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, ...@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc); CHECK_NOTNULL(hl_desc);
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
...@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { ...@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc); CHECK_NOTNULL(hl_desc);
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
...@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, ...@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
...@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input, ...@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input,
int height, int height,
int width) int width)
{ {
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
...@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value, ...@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value,
int height, int height,
int width) int width)
{ {
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT; cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else #else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......
...@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) { ...@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) {
void hl_rand(real *dest_d, size_t num) { void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex); pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ( CHECK_EQ(
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
dynload::curandGenerateUniform(t_resource.gen, dest_d, num), dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else #else
dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
......
...@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d, ...@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d,
CHECK_SYNC("hl_matrix_add failed"); CHECK_SYNC("hl_matrix_add failed");
} }
#ifdef HPPL_TYPE_DOUBLE #ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128 #define THRESHOLD 128
#else #else
#define THRESHOLD 64 #define THRESHOLD 64
...@@ -102,7 +102,7 @@ void subMaxAndExp(real* I, ...@@ -102,7 +102,7 @@ void subMaxAndExp(real* I,
val = -THRESHOLD; val = -THRESHOLD;
} }
I[nextIdx] = val; I[nextIdx] = val;
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
O[nextIdx] = __expf(val); O[nextIdx] = __expf(val);
#else #else
O[nextIdx] = exp(val); O[nextIdx] = exp(val);
......
...@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd> ...@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
__global__ __global__
void KeSequence2Batch(real *batch, void KeSequence2Batch(real *batch,
real *sequence, real *sequence,
int *batchIndex, const int *batchIndex,
int seqWidth, int seqWidth,
int batchCount) { int batchCount) {
int idx = threadIdx.x; int idx = threadIdx.x;
...@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch, ...@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
void hl_sequence2batch_copy(real *batch, void hl_sequence2batch_copy(real *batch,
real *sequence, real *sequence,
int *batchIndex, const int *batchIndex,
int seqWidth, int seqWidth,
int batchCount, int batchCount,
bool seq2batch) { bool seq2batch) {
......
...@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, ...@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
} }
/* best perf */ /* best perf */
#ifndef HPPL_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
#define CU_CSCMM_THREAD_M_BEST 9 #define CU_CSCMM_THREAD_M_BEST 9
#else #else
#define CU_CSCMM_THREAD_M_BEST 4 #define CU_CSCMM_THREAD_M_BEST 4
......
...@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { ...@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
} }
} }
DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu, DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
bool useGpu,
int64_t batchSize) { int64_t batchSize) {
batchSize_ = batchSize; batchSize_ = batchSize;
dataPool_ = dataPool; dataPool_ = dataPool;
...@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) { ...@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
} }
void DoubleBuffer::insertOneBatch(DataBatch* batch) { void DoubleBuffer::insertOneBatch(DataBatch* batch) {
while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out
if (stopping_) return;
}
BufferBatch* bufBatch = bufferQueue_->dequeue(); BufferBatch* bufBatch = bufferQueue_->dequeue();
// clone and copy the data from an Threadlocal Variable // clone and copy the data from an Threadlocal Variable
bufBatch->clone(batch, useGpu_); bufBatch->clone(batch, useGpu_);
...@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() { ...@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() {
actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch); actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
} }
insertOneBatch(&newBatch); insertOneBatch(&newBatch);
} while (actualSize > 0); } while (actualSize > 0 && !stopping_);
} }
} }
......
...@@ -259,7 +259,9 @@ typedef Queue<BufferBatch*> BufferBatchQueue; ...@@ -259,7 +259,9 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
class DoubleBuffer { class DoubleBuffer {
public: public:
DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize = 0);
virtual ~DoubleBuffer(); virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch); void removeOneBatch(DataBatch* dataBatch);
...@@ -308,7 +310,8 @@ public: ...@@ -308,7 +310,8 @@ public:
/** /**
* @brief create only used for unittest. * @brief create only used for unittest.
*/ */
inline static DataProvider* create(const DataConfig &config, bool useGpu) { inline static DataProvider* create(const DataConfig &config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu); return create(config, ModelConfig(), useGpu);
} }
...@@ -348,7 +351,6 @@ public: ...@@ -348,7 +351,6 @@ public:
*/ */
virtual void reset() { virtual void reset() {
if (doubleBuffer_ != nullptr) { if (doubleBuffer_ != nullptr) {
LOG(INFO) << "the double-buffer is starting ...";
doubleBuffer_->startAsyncLoad(); doubleBuffer_->startAsyncLoad();
} }
} }
......
...@@ -14,13 +14,20 @@ limitations under the License. */ ...@@ -14,13 +14,20 @@ limitations under the License. */
#ifndef PADDLE_NO_PYTHON #ifndef PADDLE_NO_PYTHON
#include <Python.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <unordered_set> #include <unordered_set>
#include <list> #include <list>
#include <numpy/numpyconfig.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/ndarrayobject.h>
#include "DataProvider.h" #include "DataProvider.h"
#include "paddle/utils/PythonUtil.h" #include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Locks.h"
#include "paddle/utils/Stat.h"
namespace paddle { namespace paddle {
...@@ -202,7 +209,10 @@ public: ...@@ -202,7 +209,10 @@ public:
PyDataProvider2(const DataConfig& config, PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig, const ModelConfig& modelConfig,
bool useGpu) bool useGpu)
:DataProvider(config, useGpu), callingContextCreated_(2) { :DataProvider(config, useGpu),
callingContextCreated_(2) {
if (PyArray_API == NULL)
import_array();
auto& args = config.load_data_args(); auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) { if (!args.empty()) {
...@@ -246,8 +256,7 @@ private: ...@@ -246,8 +256,7 @@ private:
PyObjectPtr && kwargs) { PyObjectPtr && kwargs) {
LOG(INFO) << "loading dataprovider " << model <<"::" << className; LOG(INFO) << "loading dataprovider " << model <<"::" << className;
PyObjectPtr module(PyImport_ImportModule(model.c_str())); PyObjectPtr module = py::import(model);
CHECK_PY(module) << "Cannot imort module " << model.c_str();
PyObjectPtr moduleDict(PyModule_GetDict(module.get())); PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
...@@ -455,6 +464,7 @@ private: ...@@ -455,6 +464,7 @@ private:
std::condition_variable pushCV_; std::condition_variable pushCV_;
std::condition_variable pullCV_; std::condition_variable pullCV_;
std::mutex mtx_; std::mutex mtx_;
ThreadBarrier callingContextCreated_; ThreadBarrier callingContextCreated_;
std::unique_ptr<IPyDataProviderCache> cache_; std::unique_ptr<IPyDataProviderCache> cache_;
...@@ -497,8 +507,8 @@ public: ...@@ -497,8 +507,8 @@ public:
* Resetting the PyDataProvider. May start reading thread here. * Resetting the PyDataProvider. May start reading thread here.
*/ */
virtual void reset() { virtual void reset() {
DataProvider::reset();
resetImpl(true); resetImpl(true);
DataProvider::reset();
} }
/** /**
...@@ -519,6 +529,7 @@ public: ...@@ -519,6 +529,7 @@ public:
* Loading a batch of data. * Loading a batch of data.
*/ */
int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0); CHECK_GE(size_, 0);
size_t size = (size_t) size_; size_t size = (size_t) size_;
if (loadThread_) { // loading from thread should wait for data pool ready. if (loadThread_) { // loading from thread should wait for data pool ready.
...@@ -699,10 +710,22 @@ public: ...@@ -699,10 +710,22 @@ public:
*/ */
virtual void fill(Argument &argument, PyObject *obj) { virtual void fill(Argument &argument, PyObject *obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim; real* dat = argument.value->getData() + height_ * headerPtr_->dim;
py::SequenceHelper s(obj); if (PyArray_Check(obj)) {
// TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
for (size_t i=0; i < headerPtr_->dim; ++i) { if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
dat[i] = (real) s.getDouble(i); real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
auto sz = PyArray_SIZE((PyArrayObject*)obj);
std::copy(data, data + sz, dat);
} else {
LOG(FATAL) << "You should yield float" << sizeof(real) * 8
<< " array";
}
} else {
py::SequenceHelper s(obj);
// TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
for (size_t i=0; i < headerPtr_->dim; ++i) {
dat[i] = (real) s.getDouble(i);
}
} }
++height_; ++height_;
} }
......
...@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator { ...@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
public: public:
virtual void init(const EvaluatorConfig& config) { virtual void init(const EvaluatorConfig& config) {
CHECK(!FLAGS_use_gpu) << "Not supported";
Evaluator::init(config); Evaluator::init(config);
if (config.chunk_scheme() == "IOB") { if (config.chunk_scheme() == "IOB") {
numTagTypes_ = 2; numTagTypes_ = 2;
...@@ -137,6 +136,7 @@ public: ...@@ -137,6 +136,7 @@ public:
CHECK_EQ(arguments.size(), (size_t)2); CHECK_EQ(arguments.size(), (size_t)2);
IVectorPtr& output = arguments[0].ids; IVectorPtr& output = arguments[0].ids;
IVectorPtr& label = arguments[1].ids; IVectorPtr& label = arguments[1].ids;
CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
auto sequenceStartPositions = auto sequenceStartPositions =
arguments[1].sequenceStartPositions->getVector(false); arguments[1].sequenceStartPositions->getVector(false);
CHECK_EQ(output->getSize(), label->getSize()); CHECK_EQ(output->getSize(), label->getSize());
......
...@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse( ...@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
para->getMat(PARAMETER_GRADIENT).get()); para->getMat(PARAMETER_GRADIENT).get());
std::vector<uint32_t>& ids = mainMat->getIds(threadId_); std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
ids.clear();
for (auto slaveParams : slaveParameters) { for (auto slaveParams : slaveParameters) {
SparseRowCpuMatrix* mat = SparseRowCpuMatrix* mat =
dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid] dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid]
......
...@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs, ...@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
const std::vector<Argument> inArgs; const std::vector<Argument> inArgs;
std::vector<Argument> outArgs; std::vector<Argument> outArgs;
frames_[i]->forward(inArgs, &outArgs, passType); frames_[i]->forward(inArgs, &outArgs, passType);
if (hasSubseq) {
for (auto& outFrameLine : outFrameLines_) {
CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
<< "In hierachical RNN, all out links should be from sequences.";
}
}
} }
if (evaluator_ && passType == PASS_TEST) { if (evaluator_ && passType == PASS_TEST) {
this->eval(evaluator_.get()); this->eval(evaluator_.get());
...@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, ...@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
std::vector<int> sequenceStartPositions; std::vector<int> sequenceStartPositions;
const int* subSequenceStartPositions = nullptr; const int* subSequenceStartPositions = nullptr;
if (hasSubseq) { // for sequenceScatterAgentLayer if (hasSubseq) { // for sequenceScatterAgentLayer
subSequenceStartPositions = subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
input.subSequenceStartPositions->getData(false);
inlinkInfo->seqStartPosIndex.clear(); inlinkInfo->seqStartPosIndex.clear();
inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0
} }
// maxSequenceLength_: max topLevelLength in allsamples // maxSequenceLength_: max topLevelLength in allsamples
for (int i = 0; i < maxSequenceLength_; ++i) { for (int i = 0; i < maxSequenceLength_; ++i) {
if (hasSubseq) { if (hasSubseq) {
sequenceStartPositions.push_back(0); // first element = 0 sequenceStartPositions.push_back(0); // first element = 0
} }
int numSeqs = 0; int numSeqs = 0;
for (size_t j = 0; j < numSequences; ++j) { for (size_t j = 0; j < numSequences; ++j) {
...@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, ...@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
} }
if (hasSubseq) { if (hasSubseq) {
// inFrameLine create sequenceStartPositions one time // inFrameLine create sequenceStartPositions one time
CHECK_EQ(sequenceStartPositions.size(), CHECK_EQ(
static_cast<size_t>(maxSequenceLength_ + sequenceStartPositions.size(),
input.getNumSubSequences())); static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
static_cast<size_t>(maxSequenceLength_ + 1)); static_cast<size_t>(maxSequenceLength_ + 1));
createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
...@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths, ...@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
newPaths.end(), Path::greaterPath); newPaths.end(), Path::greaterPath);
newPaths.resize(totalExpandCount + minNewPathSize); newPaths.resize(totalExpandCount + minNewPathSize);
real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize, real minPathLogProb =
newPaths.end())->logProb; std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize, ->logProb;
newPaths.end())->logProb; real maxPathLogProb =
std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
->logProb;
// Remove the already formed paths that are relatively short // Remove the already formed paths that are relatively short
finalPaths_[seqId].erase( finalPaths_[seqId].erase(
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "AgentLayer.h" #include "AgentLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
...@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) { ...@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
// get Arguments from real layers // get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realNumSequences) { if (numSamples_ > 0 && numSamples_ < realNumSequences) {
int numRows = realOutput.sequenceStartPositions-> int numRows =
getData(false)[numSamples_]; realOutput.sequenceStartPositions->getData(false)[numSamples_];
CHECK(!realOutput.ids) << "Not supported"; CHECK(!realOutput.ids) << "Not supported";
output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_, output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
/* trans */ false, /* seqFlag */ true, /* trans */ false, /* seqFlag */ true,
...@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) { ...@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
int width = this->getSize(); int width = this->getSize();
if (realOutArg_.value || realOutArg_.ids) { if (realOutArg_.value || realOutArg_.ids) {
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
width, useGpu_); useGpu_);
} else { // used in generation } else { // used in generation
if (realLayer_->getOutput().ids) { if (realLayer_->getOutput().ids) {
IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
...@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { ...@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
if (realOutArg_.value || realOutArg_.ids) { if (realOutArg_.value || realOutArg_.ids) {
CHECK(realOutArg_.sequenceStartPositions); CHECK(realOutArg_.sequenceStartPositions);
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
width, useGpu_, /* trans */ false, /* seqFlag */ true, useGpu_, /* trans */ false, /* seqFlag */ true,
/* seqStart */ seqStartPosIndex_, /* seqStart */ seqStartPosIndex_,
/* seqSize */ numSequences_); /* seqSize */ numSequences_);
} else { } else {
...@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) { ...@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
CHECK_NE(input.sequenceStartPositions.get(), CHECK_NE(input.sequenceStartPositions.get(),
output_.sequenceStartPositions.get()); output_.sequenceStartPositions.get());
ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
numSequences + 1, false); numSequences + 1, false);
int* outStarts = output_.sequenceStartPositions->getMutableData(false); int* outStarts = output_.sequenceStartPositions->getMutableData(false);
IVector::resizeOrCreate(cpuInputStartPos_, height, false); ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
int* inStarts = cpuInputStartPos_->getData(); int* inStarts = inputStartPos_->getMutableData(false);
size_t offsetOut = 0; size_t offsetOut = 0;
for (size_t i = 0; i < numSequences; ++i) { for (size_t i = 0; i < numSequences; ++i) {
outStarts[i] = offsetOut; outStarts[i] = offsetOut;
...@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { ...@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
} }
outStarts[numSequences] = offsetOut; outStarts[numSequences] = offsetOut;
if (useGpu_) { outputValue->copyByRowIndex(*input.value,
IVector::resizeOrCreate(inputStartPos_, height, true); *inputStartPos_->getVector(useGpu_));
inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
} else {
inputStartPos_ = cpuInputStartPos_;
}
outputValue->copyByRowIndex(*input.value, *inputStartPos_);
} }
} }
......
...@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer { ...@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
protected: protected:
// use to store expanded cpuStartPositions or subSequenceStartPositions // use to store expanded cpuStartPositions or subSequenceStartPositions
// of real layer. // of real layer.
IVectorPtr cpuInputStartPos_; ICpuGpuVectorPtr inputStartPos_;
// point to cpuInputStartPos_ when useGpu_ is false
// copy from cpuInputStartPos_ when useGpu_ is true
IVectorPtr inputStartPos_;
public: public:
explicit SequenceScatterAgentLayer(const LayerConfig& config) explicit SequenceScatterAgentLayer(const LayerConfig& config)
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "AverageLayer.h" #include "AverageLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
...@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer); ...@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
bool AverageLayer::init(const LayerMap& layerMap, bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
/* Initialize the basic parent class */ SequencePoolLayer::init(layerMap, parameterMap);
Layer::init(layerMap, parameterMap);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy // average strategy
...@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap, ...@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
} else { } else {
LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
} }
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true; return true;
} }
void AverageLayer::forward(PassType passType) { void AverageLayer::forward(PassType passType) {
Layer::forward(passType); SequencePoolLayer::forward(passType);
// average layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
size_t dim = getSize();
const Argument& input = getInput(0);
int64_t newBatchSize =
type_ ? input.getNumSubSequences() : input.getNumSequences();
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
const int* starts = startPositions->getData(false);
size_t numSequences = startPositions->getSize() - 1;
// check
CHECK_EQ(numSequences, (size_t)newBatchSize);
CHECK_EQ(starts[numSequences], input.getBatchSize());
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
CHECK_EQ(dim, input.value->getWidth());
resetOutput(newBatchSize, dim);
auto startsPos = startPositions->getVector(useGpu_);
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_); getOutputValue()->sequenceAvgForward(
*inputValue, *startPositions_->getVector(useGpu_), mode_);
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new sequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
}
/* add the bias-vector AFTER average operation */ /* add the bias-vector AFTER average operation */
if (biases_.get() != NULL) { if (biases_.get() != NULL) {
...@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) { ...@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
} }
void AverageLayer::backward(const UpdateCallback& callback) { void AverageLayer::backward(const UpdateCallback& callback) {
const Argument& input = getInput(0); SequencePoolLayer::backward(callback);
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
const int* starts = startPositions->getData(false);
/* Do derivation */ { backwardActivation(); }
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
const int* starts = startPositions_->getData(false);
MatrixPtr grad = getInputGrad(0); MatrixPtr grad = getInputGrad(0);
if (grad) { if (grad) {
size_t dim = getSize(); size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData(); real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData(); real* gradient = getOutputGrad()->getData();
size_t numSequences = startPositions->getSize() - 1; size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU // TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
......
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "Layer.h" #include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
namespace paddle { namespace paddle {
...@@ -23,20 +22,21 @@ namespace paddle { ...@@ -23,20 +22,21 @@ namespace paddle {
/** /**
* A layer for "internal average" for sequence input. * A layer for "internal average" for sequence input.
* Input: one or more sequences. Each sequence contains some instances. * Input: one or more sequences. Each sequence contains some instances.
* If AverageLevel = kNonSeq: * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances) * Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]} * output[i] = average_{for each instance in this sequence}{input[i]}
* If AverageLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
* output[i] = average_{for each instance in this sub-sequence}{input[i]} * output[i] = average_{for each instance in this sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/ */
class AverageLayer : public SequencePoolLayer {
class AverageLayer : public Layer {
public: public:
enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
enum AverageLevel { kNonSeq = 0, kSeq = 1 }; explicit AverageLayer(const LayerConfig& config)
explicit AverageLayer(const LayerConfig& config) : Layer(config) {} : SequencePoolLayer(config) {}
~AverageLayer() {} ~AverageLayer() {}
...@@ -46,11 +46,8 @@ public: ...@@ -46,11 +46,8 @@ public:
void backward(const UpdateCallback& callback = nullptr); void backward(const UpdateCallback& callback = nullptr);
protected: protected:
std::unique_ptr<Weight> biases_;
MatrixPtr outMtx_; MatrixPtr outMtx_;
MatrixPtr dataMtx_; MatrixPtr dataMtx_;
int mode_; int mode_;
int type_;
}; };
} // namespace paddle } // namespace paddle
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "ExpandLayer.h" #include "ExpandLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
...@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) { ...@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
const Argument& shapeInput = getInput(1); const Argument& shapeInput = getInput(1);
const Argument& dataInput = getInput(0); const Argument& dataInput = getInput(0);
size_t outputBatchSize = shapeInput.getBatchSize(); size_t outputBatchSize = shapeInput.getBatchSize();
auto startPositions = auto startPositions = type_ ? shapeInput.subSequenceStartPositions
type_ ? shapeInput.subSequenceStartPositions : shapeInput.sequenceStartPositions;
: shapeInput.sequenceStartPositions;
size_t numSequences = startPositions->getSize() - 1; size_t numSequences = startPositions->getSize() - 1;
const int* starts = startPositions->getData(false); const int* starts = startPositions->getData(false);
...@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) { ...@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
// set output sequence info as shape sequence // set output sequence info as shape sequence
output_.sequenceStartPositions = shapeInput.sequenceStartPositions; output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
if (shapeInput.hasSubseq()) { if (shapeInput.hasSubseq()) {
output_.subSequenceStartPositions = output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
shapeInput.subSequenceStartPositions;
} }
// reserve output: Expand output to batchsize of sequence data. // reserve output: Expand output to batchsize of sequence data.
...@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) { ...@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue(); MatrixPtr outputValue = getOutputValue();
IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false); ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
int* expandStarts = cpuExpandStartsPos_->getData(); int* expandStarts = expandStartsPos_->getMutableData(false);
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
for (int j = 0; j < sequenceLength; j++) { for (int j = 0; j < sequenceLength; j++) {
...@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) { ...@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
} }
} }
if (useGpu_) { outputValue->copyByRowIndex(*inputValue,
// TODO(Dangqingqing) move copyFrom *expandStartsPos_->getVector(useGpu_));
IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
} else {
expandStartsPos_ = cpuExpandStartsPos_;
}
outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
if (biases_.get() != NULL) { if (biases_.get() != NULL) {
outputValue->addBias(*(biases_->getW()), 1); outputValue->addBias(*(biases_->getW()), 1);
...@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) { ...@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
void ExpandLayer::backward(const UpdateCallback& callback) { void ExpandLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) { if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1); biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
/* Increasing the number of gradient */ /* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
} }
if (!getInputGrad(0)) return; if (!getInputGrad(0)) return;
MatrixPtr inputGrad = getInputGrad(0); MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad(); MatrixPtr outputGrad = getOutputGrad();
auto cpuSeqStartPos = auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
type_ ? getInput(1).subSequenceStartPositions : getInput(1).sequenceStartPositions;
: getInput(1).sequenceStartPositions;
size_t numSequences = cpuSeqStartPos->getSize() - 1; size_t numSequences = cpuSeqStartPos->getSize() - 1;
const int* starts = cpuSeqStartPos->getData(false); const int* starts = cpuSeqStartPos->getData(false);
......
...@@ -44,14 +44,9 @@ protected: ...@@ -44,14 +44,9 @@ protected:
enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
/// store the ExpandLevel /// store the ExpandLevel
int type_; int type_;
// TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
// and expandStartsPos_
/// expanded sequenceStartPositions or subSequenceStartPositions /// expanded sequenceStartPositions or subSequenceStartPositions
/// of input[1] /// of input[1]
IVectorPtr cpuExpandStartsPos_; ICpuGpuVectorPtr expandStartsPos_;
/// point to cpuExpandStartsPos_ when useGpu_ is false,
/// copy from cpuExpandStartsPos_ when useGpu_ is true
IVectorPtr expandStartsPos_;
public: public:
explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "MaxLayer.h" #include "MaxLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
...@@ -21,55 +20,11 @@ namespace paddle { ...@@ -21,55 +20,11 @@ namespace paddle {
REGISTER_LAYER(max, MaxLayer); REGISTER_LAYER(max, MaxLayer);
bool MaxLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true;
}
void MaxLayer::forward(PassType passType) { void MaxLayer::forward(PassType passType) {
Layer::forward(passType); SequencePoolLayer::forward(passType);
// max layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
size_t dim = getSize();
const Argument& input = getInput(0);
int64_t newBatchSize =
type_ ? input.getNumSubSequences() : input.getNumSequences();
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
auto starts = startPositions->getVector(useGpu_);
size_t numSequences = startPositions->getSize() - 1;
CHECK_EQ(dim, input.value->getWidth()); IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
CHECK_EQ(numSequences, (size_t)newBatchSize); useGpu(deviceId_));
CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
// reset output: resize to "num of sequences", not "batch size".
resetOutput(newBatchSize, dim);
IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
maxIndex_->zeroMem(); maxIndex_->zeroMem();
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
...@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) { ...@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
{ {
REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_); outputValue->maxSequenceForward(
} *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no cpuSequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new cpuSequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
} }
if (config_.output_max_index()) { if (config_.output_max_index()) {
...@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) { ...@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
void MaxLayer::backward(const UpdateCallback& callback) { void MaxLayer::backward(const UpdateCallback& callback) {
CHECK(!config_.output_max_index()) CHECK(!config_.output_max_index())
<< "backward is not available when output_max_index is set"; << "backward is not available when output_max_index is set";
/* Do derivation */ { backwardActivation(); } SequencePoolLayer::backward(callback);
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
MatrixPtr inputGrad = getInputGrad(0); MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad(); MatrixPtr outputGrad = getOutputGrad();
if (inputGrad) { if (inputGrad) {
ICpuGpuVectorPtr starts =
type_ ? getInput(0).subSequenceStartPositions
: getInput(0).sequenceStartPositions;
REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
inputGrad->maxSequenceBackward(*outputGrad, inputGrad->maxSequenceBackward(
*(starts->getVector(useGpu_)), *maxIndex_); *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
} }
} }
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include "Layer.h" #include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h" #include "paddle/utils/ThreadLocal.h"
...@@ -24,29 +24,30 @@ namespace paddle { ...@@ -24,29 +24,30 @@ namespace paddle {
/** /**
* A layer for "internal max" for sequence input. * A layer for "internal max" for sequence input.
* Input: one or more sequences. Each sequence contains some instances. * Input: one or more sequences. Each sequence contains some instances.
* If MaxLevel = kNonSeq: * If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances) * Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]} * output[i] = max_{for each instance in this sequence}{input[i]}
* If MaxLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: output size is the number of input sub-sequences * Output: output size is the number of input sub-sequences
* output[i] = max_{for each instance in this sub-sequence}{input[i]} * output[i] = max_{for each instance in this sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/ */
class MaxLayer : public Layer { class MaxLayer : public SequencePoolLayer {
protected: protected:
std::unique_ptr<Weight> biases_;
// maxIndex_[i][j] = k : the value at (i, j) is from input[k]. // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
IVectorPtr maxIndex_; IVectorPtr maxIndex_;
int type_;
public: public:
explicit MaxLayer(const LayerConfig& config) : Layer(config) {} explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
enum MaxLevel {kNonSeq = 0, kSeq = 1 };
~MaxLayer() {} ~MaxLayer() {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
return SequencePoolLayer::init(layerMap, parameterMap);
}
void forward(PassType passType); void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr); void backward(const UpdateCallback& callback = nullptr);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxOutLayer.h"
#include "hl_gpu.h"
#include "hl_cnn.h"
namespace paddle {
REGISTER_LAYER(maxout, MaxOutLayer);
size_t MaxOutLayer::getSize() {
const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf();
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = maxoutConf.img_size_y();
}
if (imgSizeW_ == 0) {
imgSizeW_ = maxoutConf.img_size_x();
}
featLen_ = imgSizeH_ * imgSizeW_;
size_t layerSize = featLen_ * outputChannels_;
getOutput().setFrameHeight(imgSizeH_);
getOutput().setFrameWidth(imgSizeW_);
return layerSize;
}
bool MaxOutLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
/* the size of inputs for maxout-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1);
const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
groups_ = conf.groups();
channels_ = conf.channels();
CHECK_EQ(channels_ % groups_, 0UL);
outputChannels_ = channels_ / groups_;
return true;
}
/**
 * Forward pass: resize the output and the index vector for the current
 * batch, then run maxoutForward from the input value into the output value.
 *
 * @param passType forwarded to Layer::forward
 */
void MaxOutLayer::forward(PassType passType) {
  Layer::forward(passType);

  /* (re)allocate output_ for the current batch if necessary */
  const size_t numSamples = getInput(0).getBatchSize();
  const size_t layerSize = getSize();
  resetOutput(numSamples, layerSize);

  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();

  /* one index entry per output element */
  IVector::resizeOrCreate(maxoutId_, layerSize * numSamples, useGpu_);
  outputValue->maxoutForward(
      *inputValue, *maxoutId_, outputChannels_, groups_);
}
void MaxOutLayer::backward(const UpdateCallback& callback) {
(void)callback;
/* Do derivation */
MatrixPtr inputG = getInputGrad(0);
MatrixPtr outG = getOutputGrad();
if (inputG) {
inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_);
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
 * A layer that performs maxout on the output of a conv layer.
 *
 * Input:  output of a conv layer.
 * Output: feature map of the same size as the input, with
 *         (input channels) / groups channels. The number of input channels
 *         must therefore be divisible by groups.
 *
 * The config file api is maxout_layer.
 */
class MaxOutLayer : public Layer {
public:
  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
  virtual ~MaxOutLayer() {}

  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
  size_t getSize();

  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);

protected:
  /// divisor applied to channels_ to obtain outputChannels_
  size_t groups_;
  /// frame height and width used to compute the layer size
  size_t imgSizeH_;
  size_t imgSizeW_;
  /// number of input channels, read from the maxout config
  size_t channels_;
  /// outputChannels_ = channels_ / groups_
  size_t outputChannels_;
  /// feature length = imgSizeH_ * imgSizeW_
  size_t featLen_;
  /// index vector passed to maxoutForward / maxoutBackward
  IVectorPtr maxoutId_;
};
} // namespace paddle
...@@ -21,14 +21,18 @@ limitations under the License. */ ...@@ -21,14 +21,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
/** /**
* Noise-contrastive estimation * Noise-contrastive estimation.
* Implements the method in the following paper: * Implements the method in the following paper:
* A fast and simple algorithm for training neural probabilistic language models * A fast and simple algorithm for training neural probabilistic language models.
*
* The config file api is nce_layer.
*/ */
class NCELayer : public Layer { class NCELayer : public Layer {
int numClasses_; int numClasses_;
int numInputs_; // number of input layer besides labelLayer and weightLayer /// number of input layer besides labelLayer and weightLayer
int numInputs_;
LayerPtr labelLayer_; LayerPtr labelLayer_;
/// weight layer, can be None
LayerPtr weightLayer_; LayerPtr weightLayer_;
WeightList weights_; WeightList weights_;
std::unique_ptr<Weight> biases_; std::unique_ptr<Weight> biases_;
...@@ -43,7 +47,8 @@ class NCELayer : public Layer { ...@@ -43,7 +47,8 @@ class NCELayer : public Layer {
real weight; real weight;
}; };
std::vector<Sample> samples_; std::vector<Sample> samples_;
bool prepared_; // whether samples_ is prepared /// whether samples_ is prepared
bool prepared_;
Argument sampleOut_; Argument sampleOut_;
IVectorPtr labelIds_; IVectorPtr labelIds_;
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "Layer.h" #include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
...@@ -29,20 +29,19 @@ namespace paddle { ...@@ -29,20 +29,19 @@ namespace paddle {
* If SequenceLevel = kSeq: * If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence * Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence * Output: a sequence containing only the last instance of each sub-sequence
* of the input sequence * of the input sequence
*
* The config file api is last_seq and first_seq.
*/ */
class SequenceLastInstanceLayer : public Layer { class SequenceLastInstanceLayer : public SequencePoolLayer {
protected: protected:
std::unique_ptr<Weight> biases_;
MatrixPtr tmpSrc_; MatrixPtr tmpSrc_;
MatrixPtr tmpDest_; MatrixPtr tmpDest_;
enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
int type_;
public: public:
explicit SequenceLastInstanceLayer(const LayerConfig& config) explicit SequenceLastInstanceLayer(const LayerConfig& config)
: Layer(config) {} : SequencePoolLayer(config) {}
~SequenceLastInstanceLayer() {} ~SequenceLastInstanceLayer() {}
...@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); ...@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
/* Initialize the basic parent class */ SequencePoolLayer::init(layerMap, parameterMap);
Layer::init(layerMap, parameterMap);
// seqlastins layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
tmpSrc_ = tmpSrc_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
tmpDest_ = tmpDest_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true; return true;
} }
void SequenceLastInstanceLayer::forward(PassType passType) { void SequenceLastInstanceLayer::forward(PassType passType) {
Layer::forward(passType); SequencePoolLayer::forward(passType);
size_t dim = getSize();
const Argument& input = getInput(0);
// check
auto startPositions =
type_ ? input.subSequenceStartPositions->getVector(false)
: input.sequenceStartPositions->getVector(false);
size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
CHECK_EQ(dim, input.value->getWidth());
CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
CHECK_EQ(height, startPositions->getSize() - 1);
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
reserveOutput(height, dim); const int* starts = startPositions_->getData(false);
const int* starts = startPositions->getData();
MatrixPtr inputValue = getInputValue(0); MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue(); MatrixPtr outputValue = getOutputValue();
...@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) { ...@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
for (size_t seqId = 0; seqId < height; ++seqId) { for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId = int insId =
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
outputValue->subMatrix(seqId, 1, tmpDest_) outputValue->subMatrix(seqId, 1, tmpDest_)
->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
} }
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new sequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
}
} }
if (biases_.get() != NULL) { if (biases_.get() != NULL) {
...@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) { ...@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
} }
void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
/* activation, should set to 'linear' in most cases */ SequencePoolLayer::backward(callback);
backwardActivation();
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
MatrixPtr inputGrad = getInputGrad(0); MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad(); MatrixPtr outputGrad = getOutputGrad();
auto startPositions = const int* starts = startPositions_->getData(false);
type_ ? getInput(0).subSequenceStartPositions->getVector(false) size_t numSequences = startPositions_->getSize() - 1;
: getInput(0).sequenceStartPositions->getVector(false);
const int* starts = startPositions->getData();
size_t numSequences = startPositions->getSize() - 1;
if (inputGrad) { if (inputGrad) {
AsyncGpuBlock asyncGpuBlock; AsyncGpuBlock asyncGpuBlock;
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Logging.h"
#include "SequencePoolLayer.h"
namespace paddle {
/**
 * Prepare the pooling layer for use.
 *
 * Initializes the Layer base class, verifies that there is exactly one input,
 * creates the bias weight when a bias parameter is configured, and decodes
 * config_.trans_type() ("non-seq" or "seq") into type_.
 *
 * @return false if the base class failed to initialize, true otherwise.
 *         An unknown trans_type aborts through LOG(FATAL).
 */
bool SequencePoolLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
  /* Initialize the basic parent class; do not continue if it failed. */
  if (!Layer::init(layerMap, parameterMap)) {
    return false;
  }

  // seqlastins/max/average layer should have exactly 1 input
  CHECK_EQ(1U, inputLayers_.size());

  /* initialize biases_ */
  if (biasParameter_.get() != NULL) {
    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
  }

  // transform to which sequence type
  const auto& transType = config_.trans_type();
  if (transType == "non-seq") {
    type_ = kNonSeq;
  } else if (transType == "seq") {
    type_ = kSeq;
  } else {
    LOG(FATAL) << "Unknown trans_type: " << transType;
  }

  setNeedSequenceInfo(false);
  return true;
}
/**
 * Shared forward bookkeeping for the sequence pooling layers.
 *
 * Picks the start-position vector that matches type_ (sub-sequence starts for
 * kSeq, sequence starts for kNonSeq), records it in startPositions_, computes
 * newBatchSize_, validates both against the input, and resets the output to
 * newBatchSize_ x getSize().
 */
void SequencePoolLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);

  // Validate the sub-sequence requirement BEFORE anything dereferences the
  // start positions; checking afterwards could never report the error.
  if (type_) {
    CHECK(input.subSequenceStartPositions)
        << "when trans_type = seq, input must hasSubseq";
  }

  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
  size_t dim = getSize();
  // check
  CHECK_EQ(dim, input.value->getWidth());

  startPositions_ =
      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
  // The start positions are dereferenced right below, so fail with a message
  // instead of a null-pointer crash when the input carries no sequence info.
  CHECK(startPositions_) << "input must have sequence start positions";
  auto starts = startPositions_->getVector(false);
  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
  CHECK_EQ(newBatchSize_, starts->getSize() - 1);

  resetOutput(newBatchSize_, dim);

  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
   * thus, in this case, output_ has no sequenceStartPositions.
   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
   * case, we should compute the new sequenceStartPositions.
   */
  if (type_) {
    output_.degradeSequence(input, useGpu_);
  }
}
/**
 * Shared backward step: run the activation's backward pass, then, when a bias
 * with a gradient buffer exists, collect the output gradient into it and
 * notify the parameter through incUpdate(callback).
 */
void SequencePoolLayer::backward(const UpdateCallback& callback) {
  backwardActivation();

  // No bias, or a bias without a gradient buffer: nothing left to update.
  if (!biases_ || !biases_->getWGrad()) {
    return;
  }

  biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
  // Increase the gradient update count for the bias parameter.
  biases_->getParameterPtr()->incUpdate(callback);
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
*
* Input: one or more sequences. Each sequence contains some instances.
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = seqlastin/average/max_{for each instance in this
* sequence}{input[i]}
* If SequenceLevel = kSeq:
* The input must have sub-sequences (this is checked in forward())
* Output: output size is the number of input sub-sequences
* output[i] = seqlastin/average/max_{for each instance in this
* sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/
class SequencePoolLayer : public Layer {
protected:
  /// Level the input is pooled down to; selected in init() from trans_type.
  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };

  /// One of SequenceLevel. Given a defined value so it is never read
  /// uninitialized before init() has run.
  int type_ = kNonSeq;
  /// Bias weight; only created in init() when a bias parameter is configured.
  std::unique_ptr<Weight> biases_;
  /// Number of output rows; recomputed on every forward().
  size_t newBatchSize_ = 0;
  /// Start positions chosen in forward() according to type_.
  ICpuGpuVectorPtr startPositions_;

public:
  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}

  virtual ~SequencePoolLayer() {}

  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);
};
} // namespace paddle
...@@ -14,12 +14,15 @@ ...@@ -14,12 +14,15 @@
from paddle.trainer.PyDataProvider2 import * from paddle.trainer.PyDataProvider2 import *
# Note that each config should has an independent provider
# in current design of PyDataProvider2.
#######################################################
data = [ data = [
[[[1, 3, 2], [4, 5, 2]], 0], [[[1, 3, 2], [4, 5, 2]], 0],
[[[0, 2], [2, 5], [0, 1, 2]], 1], [[[0, 2], [2, 5], [0, 1, 2]], 1],
] ]
# Used for sequence_nest_rnn.conf
@provider(input_types=[integer_value_sub_sequence(10), @provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)], integer_value(3)],
should_shuffle=False) should_shuffle=False)
...@@ -27,7 +30,7 @@ def process_subseq(settings, file_name): ...@@ -27,7 +30,7 @@ def process_subseq(settings, file_name):
for d in data: for d in data:
yield d yield d
# Used for sequence_rnn.conf
@provider(input_types=[integer_value_sequence(10), @provider(input_types=[integer_value_sequence(10),
integer_value(3)], integer_value(3)],
should_shuffle=False) should_shuffle=False)
...@@ -38,11 +41,32 @@ def process_seq(settings, file_name): ...@@ -38,11 +41,32 @@ def process_seq(settings, file_name):
seq += subseq seq += subseq
yield seq, d[1] yield seq, d[1]
# Used for sequence_nest_rnn_multi_input.conf
@provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
def process_subseq2(settings, file_name):
    """Yield every entry of the module-level ``data`` list unchanged."""
    for sample in data:
        yield sample
# Used for sequence_rnn_multi_input.conf
@provider(input_types=[integer_value_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
def process_seq2(settings, file_name):
    """Yield each entry of ``data`` with its sub-sequences joined into one list.

    The first field of an entry is a list of sub-sequences; they are
    concatenated in order. The second field is passed through unchanged.
    """
    for sample in data:
        flattened = [item for subseq in sample[0] for item in subseq]
        yield flattened, sample[1]
###########################################################
data2 = [ data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
] ]
# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sub_sequence(10), @provider(input_types=[integer_value_sub_sequence(10),
integer_value_sub_sequence(10), integer_value_sub_sequence(10),
integer_value(2)], integer_value(2)],
...@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name): ...@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name):
yield d yield d
# Used for sequence_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sequence(10), @provider(input_types=[integer_value_sequence(10),
integer_value_sequence(10), integer_value_sequence(10),
integer_value(2)], integer_value(2)],
......
...@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dict_file, **kwargs): def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)), settings.input_types = [integer_value_sequence(len(settings.word_dict)),
integer_value_sequence(3)] integer_value(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict))) settings.logger.info('dict len : %d' % (len(settings.word_dict)))
...@@ -34,14 +34,14 @@ def process(settings, file_name): ...@@ -34,14 +34,14 @@ def process(settings, file_name):
words = comment.split() words = comment.split()
word_slot = [settings.word_dict[w] for w in words if word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict] w in settings.word_dict]
yield word_slot, [label] yield word_slot, label
## for hierarchical sequence network ## for hierarchical sequence network
def hook2(settings, dict_file, **kwargs): def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sub_sequence(3)] integer_value_sequence(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict))) settings.logger.info('dict len : %d' % (len(settings.word_dict)))
...@@ -57,7 +57,7 @@ def process2(settings, file_name): ...@@ -57,7 +57,7 @@ def process2(settings, file_name):
words = comment.split() words = comment.split()
word_slot = [settings.word_dict[w] for w in words if word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict] w in settings.word_dict]
label_list.append([label]) label_list.append(label)
word_slot_list.append(word_slot) word_slot_list.append(word_slot)
else: else:
yield word_slot_list, label_list yield word_slot_list, label_list
......
...@@ -56,9 +56,8 @@ def outer_step(x): ...@@ -56,9 +56,8 @@ def outer_step(x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state") last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine # "return last" should also work. But currently RecurrentGradientMachine
# does not handle it correctly. Current implementation requires that # does not handle it, and will report error: In hierachical RNN, all out
# all the out links are from sequences. However, it does not report error # links should be from sequences now.
# when the out links are not sequences.
return inner_rnn_output return inner_rnn_output
out = recurrent_group( out = recurrent_group(
......
...@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * ...@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None, test_list=None,
module='rnn_data_provider', module='rnn_data_provider',
obj='process_subseq') obj='process_subseq2')
settings(batch_size=2, learning_rate=0.01) settings(batch_size=2, learning_rate=0.01)
...@@ -57,9 +57,8 @@ def outer_step(wid, x): ...@@ -57,9 +57,8 @@ def outer_step(wid, x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state") last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine # "return last" should also work. But currently RecurrentGradientMachine
# does not handle it correctly. Current implementation requires that # does not handle it, and will report error: In hierachical RNN, all out
# all the out links are from sequences. However, it does not report error # links should be from sequences now.
# when the out links are not sequences.
return inner_rnn_output return inner_rnn_output
out = recurrent_group( out = recurrent_group(
......
...@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * ...@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None, test_list=None,
module='rnn_data_provider', module='rnn_data_provider',
obj='process_seq') obj='process_seq2')
settings(batch_size=2, learning_rate=0.01) settings(batch_size=2, learning_rate=0.01)
......
...@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) { ...@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) {
} }
} }
// Gradient check for the "maxout" layer: a 4096-wide input described as a
// 32x32 image with 4 channels and 2 groups, run with useGpu false then true.
TEST(Layer, maxoutLayer) {
  TestConfig config;
  config.biasSize = 0;
  config.layerConfig.set_type("maxout");
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});

  MaxOutConfig* maxoutConf =
      config.layerConfig.add_inputs()->mutable_maxout_conf();
  maxoutConf->set_img_size_x(32);
  maxoutConf->set_img_size_y(32);
  maxoutConf->set_channels(4);
  maxoutConf->set_groups(2);

  const bool deviceModes[] = {false, true};
  for (bool useGpu : deviceModes) {
    testLayerGrad(config, "maxout", 10, false, useGpu);
  }
}
void testFcLayer(string format, size_t nnz) { void testFcLayer(string format, size_t nnz) {
TestConfig config; TestConfig config;
config.biasSize = 4096; config.biasSize = 4096;
......
...@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) { ...@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) {
} }
TEST(PyDataProvider2, init_hook) { TEST(PyDataProvider2, init_hook) {
paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle")); paddle::PyObjectPtr pickle = paddle::py::import("pickle");
paddle::PyObjectPtr globals( paddle::PyObjectPtr globals(
PyModule_GetDict(PyImport_AddModule("__main__"))); PyModule_GetDict(PyImport_AddModule("__main__")));
PyDict_SetItemString(globals.get(), "pickle", pickle.get()); PyDict_SetItemString(globals.get(), "pickle", pickle.get());
......
...@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename): ...@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename):
yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
@provider(input_types=[index_slot(10), index_slot(10)]) @provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
def test_input_order(setting, filename): def test_input_order(setting, filename):
for _ in xrange(1000): for _ in xrange(1000):
yield { yield {
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <paddle/utils/Util.h> #include <paddle/utils/Util.h>
#include <paddle/utils/Version.h> #include <paddle/utils/Version.h>
...@@ -24,7 +23,7 @@ limitations under the License. */ ...@@ -24,7 +23,7 @@ limitations under the License. */
P_DECLARE_int32(seed); P_DECLARE_int32(seed);
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
class TrainerForTest : public paddle::Trainer { class TrainerForTest : public paddle::Trainer {
public: public:
void startTrain() { void startTrain() {
...@@ -44,11 +43,10 @@ public: ...@@ -44,11 +43,10 @@ public:
*/ */
size_t getTotalParameterSize() const { size_t getTotalParameterSize() const {
auto p = const_cast<TrainerForTest*>(this); auto p = const_cast<TrainerForTest*>(this);
auto & params = p->getGradientMachine()->getParameters(); auto& params = p->getGradientMachine()->getParameters();
return std::accumulate(params.begin(), params.end(), 0UL, return std::accumulate(
[](size_t a, const ParameterPtr& p){ params.begin(), params.end(), 0UL,
return a+p->getSize(); [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
});
} }
}; };
......
...@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) { ...@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
copyFrom(matrix); copyFrom(matrix);
} }
void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight(); size_t height = getHeight();
size_t width = getWidth(); size_t width = getWidth();
CHECK_EQ(b.getWidth(), width); CHECK_EQ(b.getWidth(), width);
real* dst = getData(); real* dst = getData();
real* src = b.getData(); real* src = b.getData();
int* index = rowIndex.getData(); const int* index = rowIndex.getData();
hl_sequence2batch_copy(dst, src, index, width, height, true); hl_sequence2batch_copy(dst, src, index, width, height, true);
} }
...@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) { ...@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) {
max.maxCols(*this); max.maxCols(*this);
} }
void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
LOG(FATAL) << "Is not supported";
}
/**
 * Maxout forward on the GPU: this matrix receives the output, `a` supplies
 * the input and `id` receives the indices written by hl_maxout_forward.
 * Both `a` and `id` must be GPU objects and `a` must have the same height
 * as this matrix.
 */
void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
                              size_t groups) {
  CHECK(dynamic_cast<GpuMatrix*>(&a));
  CHECK(dynamic_cast<GpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  const size_t outWidth = getWidth();
  const size_t numRows = getHeight();
  const size_t featLen = outWidth / channels;

  const real* src = a.getData();
  real* dst = getData();
  int* maxIds = id.getData();

  hl_maxout_forward(src, dst, maxIds, numRows, outWidth, featLen, groups);
}
/**
 * Maxout backward on the GPU: this matrix is the buffer passed as "input" to
 * hl_maxout_backward, `a` is passed as "output" and `id` holds the indices.
 * Both `a` and `id` must be GPU objects and `a` must have the same height
 * as this matrix. The width used for the kernel is that of `a`.
 */
void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
                               size_t groups) {
  CHECK(dynamic_cast<GpuMatrix*>(&a));
  CHECK(dynamic_cast<GpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  const size_t outWidth = a.getWidth();
  const size_t numRows = getHeight();
  const size_t featLen = outWidth / channels;

  real* inGrad = getData();
  const real* outGrad = a.getData();
  const int* maxIds = id.getData();

  hl_maxout_backward(inGrad, outGrad, maxIds, numRows, outWidth, featLen,
                     groups);
}
/*calulate the error of classification */ /*calulate the error of classification */
void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output); GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output);
...@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) { ...@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
} }
} }
void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight(); size_t height = getHeight();
size_t width = getWidth(); size_t width = getWidth();
CHECK_EQ(b.getWidth(), width); CHECK_EQ(b.getWidth(), width);
int* index = rowIndex.getData(); const int* index = rowIndex.getData();
for (size_t i = 0; i < height; i++) { for (size_t i = 0; i < height; i++) {
CHECK_LT(static_cast<size_t>(index[i]), b.getHeight()); CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
real* src = b.getData() + index[i] * width; real* src = b.getData() + index[i] * width;
...@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) { ...@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) {
max.maxCols(*this); max.maxCols(*this);
} }
/**
 * For every column of this matrix, find the `beam` largest entries, where
 * beam = maxVal.getHeight().
 *
 * For column i and rank j (0 = largest), the value is written to
 * maxVal[j][i] and its row index to maxIds[i + j * width].
 *
 * Requirements (all enforced with CHECK): the matrix is contiguous, both
 * outputs live on the CPU, maxIds holds width * beam entries, maxVal has
 * `width` columns, and beam does not exceed the number of rows.
 */
void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
  CHECK(isContiguous());
  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
  size_t numSamples = getWidth();
  size_t beam = maxVal.getHeight();
  size_t dim = getHeight();
  CHECK_EQ(maxIds.getSize(), numSamples * beam);
  CHECK_EQ(maxVal.getWidth(), numSamples);
  // partial_sort below is given vec.begin() + beam as its middle iterator;
  // that is only valid when beam <= number of candidates (dim).
  CHECK_LE(beam, dim);

  real* a = getData();
  int* s = maxIds.getData();
  real* t = maxVal.getData();

  // One scratch buffer reused for every column instead of reallocating.
  std::vector<std::pair<real, size_t>> vec;
  vec.reserve(dim);
  for (size_t i = 0; i < numSamples; i++) {
    vec.clear();
    for (size_t j = 0; j < dim; j++) {
      vec.emplace_back(a[i + j * numSamples], j);
    }

    // Largest values first.
    std::partial_sort(
        vec.begin(), vec.begin() + beam, vec.end(),
        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
          return l.first > r.first;
        });
    for (size_t j = 0; j < beam; j++) {
      t[i + j * numSamples] = vec[j].first;
      s[i + j * numSamples] = vec[j].second;
    }
  }
}
/**
 * Maxout forward on the CPU. This matrix receives the output, `a` supplies
 * the input and `id` receives, for every output element, the group index
 * chosen by colMax. `a` and `id` must be CPU objects and `a` must have the
 * same height as this matrix.
 *
 * With size = getWidth() and featLen = size / channels, each input row is
 * read as channels * groups consecutive chunks of featLen values.
 * NOTE(review): this assumes a.getWidth() == getWidth() * groups and that
 * `id` holds at least getHeight() * getWidth() entries; neither is checked
 * here - confirm against callers.
 */
void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
size_t groups) {
CHECK(dynamic_cast<CpuMatrix*>(&a));
CHECK(dynamic_cast<CpuIVector*>(&id));
CHECK_EQ(a.getHeight(), getHeight());
size_t size = getWidth();
size_t batchSize = getHeight();
size_t featLen = size / channels;
const real* input = a.getData();
int* idForCpu = id.getData();
// Scratch buffers: one row per group for the candidates, one row for the max.
MatrixPtr maxInMat, maxOutMat;
Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
// Offset of this sample in the output / id buffers.
size_t newIndex = batch_idx * size;
// View over this sample's slice of `id`, so colMax writes straight into it.
IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
for (size_t i = 0; i < channels; ++i) {
size_t newFeatLen = i * featLen;
for (size_t j = 0; j < groups; ++j) {
// Copy the featLen values of chunk (i * groups + j) of this sample into
// row j of maxInMat, at the columns belonging to output channel i.
maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
featLen);
}
}
// Column-wise maximum over the group rows gives this sample's output row.
maxInMat->colMax(*tmpId, *maxOutMat);
this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
}
}
/**
 * Maxout backward on the CPU. This matrix is the input-gradient buffer and is
 * accumulated into (+=); `a` is the output gradient and `id` holds the group
 * index recorded for every output element. `a` and `id` must be CPU objects
 * and `a` must have the same height as this matrix.
 *
 * NOTE(review): this assumes getWidth() == a.getWidth() * groups and that
 * every id value is in [0, groups); neither is checked here - confirm.
 */
void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
size_t groups) {
CHECK(dynamic_cast<CpuMatrix*>(&a));
CHECK(dynamic_cast<CpuIVector*>(&id));
CHECK_EQ(a.getHeight(), getHeight());
size_t size = a.getWidth();
size_t batchSize = getHeight();
size_t featLen = size / channels;
// Number of input values that belong to one output channel.
size_t newFeatLen = groups * featLen;
real* inputG = getData();
const real* outG = a.getData();
int* idForCpu = id.getData();
for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
// Offset of this sample in the output-gradient / id buffers.
size_t newIndex = batch_idx * size;
int* idData = idForCpu + newIndex;
for (size_t i = 0; i < size; ++i) {
// Position inside this sample's input row: channel (i / featLen) starts at
// (i / featLen) * newFeatLen, the selected group adds idData[i] * featLen,
// and i % featLen is the offset inside the chunk.
int gradIdx =
idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
// The input row of a sample is `groups` times as wide as its output row.
(inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
}
}
}
void CpuMatrix::rowNormalizeL1(Matrix& out) { void CpuMatrix::rowNormalizeL1(Matrix& out) {
CHECK(!out.useGpu()); CHECK(!out.useGpu());
......
...@@ -253,7 +253,7 @@ public: ...@@ -253,7 +253,7 @@ public:
LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
} }
virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) { virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
} }
...@@ -493,16 +493,40 @@ public: ...@@ -493,16 +493,40 @@ public:
LOG(FATAL) << "Not implemeted"; LOG(FATAL) << "Not implemeted";
} }
/**
* set the max of each column of this to mat
*/
virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
/**
* @brief Get the top k elements of each column of this matrix.
*
* The row ids and values of these elements are stored in
* maxIds and maxVal respectively, where k is the height of maxVal
* (maxIds holds k ids for every column).
* Note that the top k elements are not guaranteed to be sorted.
*
* This base-class version is not implemented and aborts with LOG(FATAL).
*/
virtual void colMax(IVector& maxIds, Matrix& maxVal) {
LOG(FATAL) << "not implemented";
}
/**
* @brief Maxout forward: this matrix is the output, a is the input and id
* receives the index of the selected group for every output element.
*
* This base-class version is not implemented and aborts with LOG(FATAL).
*/
virtual void maxoutForward(Matrix& a, IVector& id, size_t channels,
size_t groups) {
LOG(FATAL) << "not implemented";
}
/**
* @brief Maxout backward: this matrix is the input gradient, a is the output
* gradient and id holds the group indices recorded by maxoutForward.
*
* This base-class version is not implemented and aborts with LOG(FATAL).
*/
virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels,
size_t groups) {
LOG(FATAL) << "not implemented";
}
virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
/** /**
* @brief Get the top k elements of each row of this matrix. * @brief Get the top k elements of each row of this matrix.
* *
* The column ids and values of these elements are stored in * The column ids and values of these elements are stored in
* maxIds and max respectively. Note that the top k * maxIds and max respectively. where k is the size of maxIds.
* elements are not sorted. * And note that the top k elements are not sorted.
*/ */
virtual void rowMax(IVector& maxIds, Matrix& max) { virtual void rowMax(IVector& maxIds, Matrix& max) {
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
...@@ -995,7 +1019,7 @@ public: ...@@ -995,7 +1019,7 @@ public:
void copyFrom(const IVector& src); void copyFrom(const IVector& src);
void copyByRowIndex(Matrix& b, IVector& rowIndex); void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false); MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
...@@ -1101,6 +1125,9 @@ public: ...@@ -1101,6 +1125,9 @@ public:
void rowMax(Matrix& max); void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& max); void rowMax(IVector& maxIds, Matrix& max);
void colMax(Matrix& max); void colMax(Matrix& max);
void colMax(IVector& maxIds, Matrix& max);
void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropy(Matrix& output, IVector& label);
void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
...@@ -1271,7 +1298,7 @@ public: ...@@ -1271,7 +1298,7 @@ public:
void copyFrom(CpuSparseMatrix& src); void copyFrom(CpuSparseMatrix& src);
void copyByRowIndex(Matrix& b, IVector& rowIndex); void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false); MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
...@@ -1425,6 +1452,9 @@ public: ...@@ -1425,6 +1452,9 @@ public:
void rowMax(Matrix& max); void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& maxVal); void rowMax(IVector& maxIds, Matrix& maxVal);
void colMax(Matrix& max); void colMax(Matrix& max);
void colMax(IVector& maxIds, Matrix& maxVal);
void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void rowNormalizeL1(Matrix& out); void rowNormalizeL1(Matrix& out);
void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropy(Matrix& output, IVector& label);
......
...@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, ...@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices; std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
for (size_t i = 0; i < len; i ++) {
CHECK_LT(*(ids + i), this->getHeight())
<< "id:" << *(ids + i) << "Height:" << this->getHeight()
<< "sparse id value exceeds the max input dimension, "
<< "it could be caused invalid input data samples";
}
localIndices.insert(localIndices.end(), ids, ids + len); localIndices.insert(localIndices.end(), ids, ids + len);
} }
void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get()); CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
CHECK(mat) << "only support non value sparse matrix"; CHECK(mat) << "only support sparse matrix";
addRows(reinterpret_cast<const unsigned int*>(mat->getCols()), addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
mat->getElementCnt()); mat->getElementCnt());
} }
...@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { ...@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
int* index = ids->getData(); int* index = ids->getData();
for (size_t i = 0; i < numSamples; ++i) { for (size_t i = 0; i < numSamples; ++i) {
if (index[i] == -1) continue; if (index[i] == -1) continue;
localIndices.push_back((unsigned int)index[i]);
unsigned int id = (unsigned int)index[i];
CHECK_LT(id, this->getHeight())
<< "id:" << id << "Height:" << this->getHeight()
<< "sparse id value exceeds the max input dimension, "
<< "it could be caused invalid input data samples";
localIndices.push_back(id);
} }
} }
......
...@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) { ...@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) {
} }
} }
void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
int channels, int groups) {
int inWidth = imgSizeH * imgSizeW * channels;
int outChannels = channels / groups;
int outWidth = imgSizeH * imgSizeW * outChannels;
// forward
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->maxoutForward(*input, *id, outChannels, groups);
targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
// check
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
idCheck->copyFrom(*idGpu);
VectorCheckEqual(*id, *idCheck);
// backward
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
true);
MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false,
false);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
// check
targetCheckGrad->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
// Sweep testMaxOutFwdBwd over every combination of the listed sample counts,
// channel counts, image heights, image widths and group counts.
TEST(Matrix, MaxOutFwdBwd) {
  for (int numSamples : {5, 10}) {
    for (int channels : {8, 16}) {
      for (int imgSizeH : {14, 28}) {
        for (int imgSizeW : {16, 30}) {
          for (int groups : {2, 4}) {
            VLOG(3) << " numSamples=" << numSamples
                    << " channels=" << channels
                    << " imgSizeH=" << imgSizeH
                    << " imgSizeW=" << imgSizeW
                    << " groups=" << groups;
            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
          }
        }
      }
    }
  }
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -146,6 +146,12 @@ public: ...@@ -146,6 +146,12 @@ public:
} }
} }
// Lazily create the buffer for `type`: if none exists yet, allocate a
// parallel vector of config_.size() elements and zero it. Does nothing when
// the buffer already exists.
void enableBufType(ParameterType type) {
  if (!bufs_[type]) {
    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
    bufs_[type]->zeroMem();
  }
}
void enableIntType(ParameterType type, size_t intStoreSize = 0) { void enableIntType(ParameterType type, size_t intStoreSize = 0) {
if (!intBufs_[type]) { if (!intBufs_[type]) {
SetDevice device(deviceId_); SetDevice device(deviceId_);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pserver/ParameterClient.h"
#include "paddle/pserver/ParameterServer.h"
#include "paddle/parameter/Parameter.h"
#include <Python.h>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace paddle {
struct PyObjectDeleter {
void operator()(PyObject* obj) {
if (obj) {
Py_DECREF(obj);
}
}
};
/**
 * ParameterClient variant whose methods take plain ints, strings and
 * serialized protobuf messages, and which initializes / finalizes the
 * embedded Python interpreter in its constructor / destructor.
 */
class ParameterClientPy : public ParameterClient {
protected:
  typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;

  /// Parameters built from the serialized ParameterConfig strings.
  std::vector<ParameterPtr> parameter_;
  /// Number of entries in initArgv_ (excluding the trailing nullptr).
  int initArgc_;
  /// C-style argument vector handed to initMain(); entries point into
  /// initArgvStorage_, the array itself is owned by this object.
  char** initArgv_;
  /// Owns the NUL-terminated argument strings referenced by initArgv_.
  std::vector<std::vector<char>> initArgvStorage_;

public:
  /**
   * @param configs  serialized ParameterConfig messages, one per parameter.
   * @param argc     number of command-line arguments to forward to initMain;
   *                 must be in [0, argv.size()].
   * @param argv     the command-line arguments.
   * @param useGpu   forwarded to each Parameter constructor.
   */
  ParameterClientPy(std::vector<std::string> configs, int argc,
                    std::vector<std::string> argv, bool useGpu) {
    CHECK(argc >= 0 && static_cast<size_t>(argc) <= argv.size())
        << "argc (" << argc << ") does not match the number of arguments ("
        << argv.size() << ").";
    initArgc_ = argc;
    initArgvStorage_.resize(argc);
    // One extra slot so the vector is nullptr-terminated like a real argv.
    initArgv_ = new char*[argc + 1];
    for (int i = 0; i < argc; i++) {
      const std::string& arg = argv[i];
      // size() + 1 bytes: the copy must include the terminating NUL.
      initArgvStorage_[i].assign(arg.c_str(), arg.c_str() + arg.size() + 1);
      initArgv_[i] = initArgvStorage_[i].data();
    }
    initArgv_[argc] = nullptr;

    ParameterConfig pyConfig;
    ParameterPtr param;
    for (auto& config : configs) {
      pyConfig.ParseFromString(config);
      param.reset(new Parameter(pyConfig, useGpu));
      parameter_.push_back(param);
    }
    Py_Initialize();
    CHECK(Py_IsInitialized());
  }

  ~ParameterClientPy() {
    // Only the pointer array is released here. The strings live in
    // initArgvStorage_, so they are freed exactly once even if the argument
    // parser permuted or overwrote entries of initArgv_.
    // NOTE(review): assumes initMain may rearrange argv in place, as common
    // flag parsers do -- confirm against initMain.
    delete[] initArgv_;
    Py_Finalize();
  }

  /// Returns a copy of the idx-th parameter (no bounds checking).
  Parameter getParameter(int idx) { return *(parameter_[idx].get()); }

  /// Runs initMain with the stored arguments, then init() with the
  /// parameters; aborts if init() reports failure.
  void initClientPy() {
    initMain(initArgc_, initArgv_);
    CHECK(init(parameter_)) << "Init Client Failed.";
  }

  /// Parses a serialized OptimizationConfig and passes it to setConfig().
  void setConfigPy(std::string config) {
    OptimizationConfig optConfig;
    optConfig.ParseFromString(config);
    setConfig(optConfig);
  }

  bool inStatusPy(int status) { return inStatus(PServerStatus(status)); }

  void setStatusPy(int status) { setStatus(PServerStatus(status)); }

  void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); }

  /// Integer-typed front end for sendParameter(); the ints are converted to
  /// the corresponding enum / int64_t arguments.
  void sendParameterPy(int updateMode, int parameterType, int numSamples,
                       real cost, bool sendBackParameter) {
    sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType),
                  int64_t(numSamples), real(cost), sendBackParameter);
  }

  /**
   * Issues ubClient_.asyncCall and blocks until its callback has run.
   *
   * @param serviceName  forwarded to asyncCall.
   * @param funcName     forwarded to asyncCall.
   * @param in           serialized ProtoIn request.
   * @return the serialized ProtoOut response object (whatever it contains
   *         after the call, also when the call reported failure).
   */
  template <class ProtoIn, class ProtoOut>
  std::string asyncCallPy(const char* serviceName, const char* funcName,
                          const std::string in) {
    ProtoIn protoIn;
    ProtoOut protoOut;
    std::string data;
    protoIn.ParseFromString(in);

    // Completion is signalled through a condition variable. A bare mutex
    // must not be unlocked by a thread that does not own it, nor destroyed
    // while locked, so it cannot serve as the completion signal itself.
    std::mutex doneLock;
    std::condition_variable doneCV;
    bool done = false;
    auto callback = [&](ProtoOut* /*pOut*/, bool isSuccessful) {
      if (!isSuccessful) {
        LOG(INFO) << "Async Talk Failed.";
      }
      std::lock_guard<std::mutex> guard(doneLock);
      done = true;
      doneCV.notify_one();
    };
    ubClient_.asyncCall<ProtoIn, ProtoOut>(serviceName, funcName, protoIn,
                                           &protoOut, callback);
    {
      // Works whether the callback runs on another thread or has already
      // run synchronously inside asyncCall.
      std::unique_lock<std::mutex> lock(doneLock);
      doneCV.wait(lock, [&done] { return done; });
    }
    protoOut.SerializeToString(&data);
    return data;
  }
};
} // namespace paddle
...@@ -63,7 +63,8 @@ class SparseBinaryScanner(IScanner): ...@@ -63,7 +63,8 @@ class SparseBinaryScanner(IScanner):
def scan(self, dat): def scan(self, dat):
self.extend_cols(dat) self.extend_cols(dat)
self.__rows__.append(len(dat)) self.__rows__.append(len(dat) + self.__rows__[-1])
self.__height__ += 1
def extend_cols(self, dat): def extend_cols(self, dat):
self.__cols__.extend(dat) self.__cols__.extend(dat)
......
#!/bin/bash
# Travis "before_install" step for macOS: installs the build dependencies
# with Homebrew / pip and builds googletest 1.8.0 from source.
brew update
brew tap homebrew/science
brew install python
sudo pip install --upgrade protobuf==2.6.0
brew install homebrew/versions/protobuf260 --without-python
brew install cmake python glog gflags openblas wget md5sha1sum
wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
tar xf gtest.tar.gz
# Stop here if the download or extraction failed; otherwise cmake and
# "make install" would run in whatever directory the script started in.
cd googletest-release-1.8.0/ || exit 1
cmake .
make install
#!/bin/bash #!/bin/bash
source ./common.sh source ./common.sh
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON CMAKE_EXTRA=""
make -j `nproc` if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`" CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
fi
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA}
# Pick the number of parallel build/test jobs for the current Travis OS;
# fall back to a single job on anything else.
NPROC=1
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
  # Was misspelled "NRPOC", which left NPROC at 1 on Linux.
  NPROC=`nproc`
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
  NPROC=`sysctl -n hw.ncpu`
fi
make -j $NPROC
env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
sudo make install sudo make install
sudo paddle version sudo paddle version
...@@ -20,6 +20,8 @@ limitations under the License. */ ...@@ -20,6 +20,8 @@ limitations under the License. */
#include "paddle/math/SparseRowMatrix.h" #include "paddle/math/SparseRowMatrix.h"
#include "paddle/utils/Thread.h" #include "paddle/utils/Thread.h"
P_DECLARE_int32(trainer_count);
namespace paddle { namespace paddle {
SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig) SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
...@@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) { ...@@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
false /*inPserver*/)); false /*inPserver*/));
size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0; size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
optimizers_[pid]->init(numRows, &para->getConfig()); optimizers_[pid]->init(numRows, &para->getConfig());
if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
// For trainer_count=1, the gradient machine is NeuralNetwork, which does
// not create parameter buf for PARAMETER_GRADIENT for sparse update in
// Parameter::enableType(). But gradient parameter buf is still used
// in SgdThreadUpdater. We need to explicitly create it.
para->enableBufType(PARAMETER_GRADIENT);
}
} }
} }
...@@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse( ...@@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse(
// From MultiGradientMachine // From MultiGradientMachine
SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>( SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
para->getMat(PARAMETER_GRADIENT).get()); para->getMat(PARAMETER_GRADIENT).get());
const std::vector<uint32_t>& sparseIds = mainMat->getIds(tid); std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
for (auto id : sparseIds) { for (auto id : sparseIds) {
// setup sub bufs // setup sub bufs
...@@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse( ...@@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse(
optimizer->update(vecs, para->getConfig(), id); optimizer->update(vecs, para->getConfig(), id);
vecs[PARAMETER_GRADIENT]->zeroMem(); vecs[PARAMETER_GRADIENT]->zeroMem();
} }
sparseIds.clear();
} else if (dynamic_cast<SparseRowCpuMatrix*>( } else if (dynamic_cast<SparseRowCpuMatrix*>(
para->getMat(PARAMETER_GRADIENT).get())) { para->getMat(PARAMETER_GRADIENT).get())) {
// From NeuralNetwork // From NeuralNetwork
...@@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse( ...@@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse(
optimizer->update(vecs, para->getConfig(), id); optimizer->update(vecs, para->getConfig(), id);
vecs[PARAMETER_GRADIENT]->zeroMem(); vecs[PARAMETER_GRADIENT]->zeroMem();
} }
// For numThreads > 1, MultiGradientMachine is used, which goes
// to the above branch.
CHECK_EQ(numThreads, 1UL);
mainMat->clearIndices();
} else { } else {
auto & m = *para->getMat(PARAMETER_GRADIENT).get(); auto & m = *para->getMat(PARAMETER_GRADIENT).get();
LOG(FATAL) << "Internal error: " << para->getName() << " " LOG(FATAL) << "Internal error: " << para->getName() << " "
......
...@@ -13,157 +13,71 @@ ...@@ -13,157 +13,71 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. from paddle.trainer_config_helpers import *
default_initial_std(0.5) TrainData(ProtoData(
files = "dummy_list",
model_type("nn") constant_slots = [1.0],
async_load_data = True))
DataLayer(
name = "input", TestData(SimpleData(
size = 3, files = "trainer/tests/sample_filelist.txt",
) feat_dim = 3,
context_len = 0,
DataLayer( buffer_capacity = 1000000,
name = "weight", async_load_data = False))
size = 1,
) settings(batch_size = 100)
Layer( data = data_layer(name='input', size=3)
name = "layer1_1",
type = "fc", wt = data_layer(name='weight', size=1)
size = 5,
active_type = "sigmoid", fc1 = fc_layer(input=data, size=5,
inputs = "input", bias_attr=True,
) act=SigmoidActivation())
Layer( fc2 = fc_layer(input=data, size=12,
name = "layer1_2", bias_attr=True,
type = "fc", param_attr=ParamAttr(name='sharew'),
size = 12, act=LinearActivation())
active_type = "linear",
inputs = Input("input", parameter_name='sharew'), fc3 = fc_layer(input=data, size=3,
) bias_attr=True,
act=TanhActivation())
Layer(
name = "layer1_3", fc4 = fc_layer(input=data, size=5,
type = "fc", bias_attr=True,
size = 3, layer_attr=ExtraAttr(drop_rate=0.5),
active_type = "tanh", act=SquareActivation())
inputs = "input",
) pool = img_pool_layer(input=fc2,
pool_size=2,
Layer( pool_size_y=3,
name = "layer1_5", num_channels=1,
type = "fc", padding=1,
size = 3, padding_y=2,
active_type = "tanh", stride=2,
inputs = Input("input", stride_y=3,
learning_rate=0.01, img_width=3,
momentum=0.9, pool_type=CudnnAvgPooling())
decay_rate=0.05,
initial_mean=0.0, concat = concat_layer(input=[fc3, fc4])
initial_std=0.01,
format = "csc", with mixed_layer(size=3, act=SoftmaxActivation()) as output:
nnz = 4) output += full_matrix_projection(input=fc1)
) output += trans_full_matrix_projection(input=fc2,
param_attr=ParamAttr(name='sharew'))
FCLayer( output += full_matrix_projection(input=concat)
name = "layer1_4", output += identity_projection(input=fc3)
size = 5,
active_type = "square", lbl = data_layer(name='label', size=1)
inputs = "input",
drop_rate = 0.5, cost = classification_cost(input=output, label=lbl, weight=wt,
) layer_attr=ExtraAttr(device=-1))
Layer( nce = nce_layer(input=fc2, label=lbl, weight=wt,
name = "pool", num_classes=3,
type = "pool", neg_distribution=[0.1, 0.3, 0.6])
inputs = Input("layer1_2",
pool = Pool(pool_type="cudnn-avg-pool", outputs(cost, nce)
channels = 1,
size_x = 2,
size_y = 3,
img_width = 3,
padding = 1,
padding_y = 2,
stride = 2,
stride_y = 3))
)
Layer(
name = "concat",
type = "concat",
inputs = ["layer1_3", "layer1_4"],
)
MixedLayer(
name = "output",
size = 3,
active_type = "softmax",
inputs = [
FullMatrixProjection("layer1_1",
learning_rate=0.1),
TransposedFullMatrixProjection("layer1_2", parameter_name='sharew'),
FullMatrixProjection("concat"),
IdentityProjection("layer1_3"),
],
)
Layer(
name = "label",
type = "data",
size = 1,
)
Layer(
name = "cost",
type = "multi-class-cross-entropy",
inputs = ["output", "label", "weight"],
)
Layer(
name = "cost2",
type = "nce",
num_classes = 3,
active_type = "sigmoid",
neg_sampling_dist = [0.1, 0.3, 0.6],
inputs = ["layer1_2", "label", "weight"],
)
Evaluator(
name = "error",
type = "classification_error",
inputs = ["output", "label", "weight"]
)
Inputs("input", "label", "weight")
Outputs("cost", "cost2")
TrainData(
ProtoData(
files = "dummy_list",
constant_slots = [1.0],
async_load_data = True,
)
)
TestData(
SimpleData(
files = "trainer/tests/sample_filelist.txt",
feat_dim = 3,
context_len = 0,
buffer_capacity = 1000000,
async_load_data = False,
),
)
Settings(
algorithm = "sgd",
num_batches_per_send_parameter = 1,
num_batches_per_get_parameter = 1,
batch_size = 100,
learning_rate = 0.001,
learning_rate_decay_a = 1e-5,
learning_rate_decay_b = 0.5,
)
enable_virtualenv.c
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp) file(GLOB UTIL_SOURCES . *.cpp)
# Turn enable_virtualenv.py into the C source enable_virtualenv.c.
# NOTE(review): create_resources() is defined elsewhere; presumably it embeds
# the script's text as a C character array -- confirm against its definition.
create_resources(enable_virtualenv.py enable_virtualenv.c)
# Name of the generated source, kept in a variable so it can be added to the
# library's source list.
set(UTIL_RES enable_virtualenv.c)
if(APPLE) if(APPLE)
file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
else() else()
...@@ -9,7 +12,8 @@ else() ...@@ -9,7 +12,8 @@ else()
endif() endif()
add_library(paddle_utils STATIC add_library(paddle_utils STATIC
${UTIL_SOURCES} ${UTIL_SOURCES}
${UTIL_ARCH_SOURCES}) ${UTIL_ARCH_SOURCES}
${UTIL_RES})
add_style_check_target(paddle_utils ${UTIL_HEADERS}) add_style_check_target(paddle_utils ${UTIL_HEADERS})
add_style_check_target(paddle_utils ${UTIL_SOURCES} add_style_check_target(paddle_utils ${UTIL_SOURCES}
${UTIL_ARCH_SOURCES}) ${UTIL_ARCH_SOURCES})
......
...@@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int)); ...@@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int));
} }
#endif // PADDLE_USE_GLOG #endif // PADDLE_USE_GLOG
#ifdef NDEBUG #ifndef NDEBUG
#define DEBUG_LEVEL 5 #define DEBUG_LEVEL 5
#define DBG VLOG(DEBUG_LEVEL) #define DBG VLOG(DEBUG_LEVEL)
#else #else
......
...@@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex; ...@@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex;
PyGuard::PyGuard() : guard_(g_pyMutex) {} PyGuard::PyGuard() : guard_(g_pyMutex) {}
static void printPyErrorStack(std::ostream& os, bool withEndl = false) { static void printPyErrorStack(std::ostream& os, bool withEndl = false,
bool withPyPath = true) {
PyObject * ptype, *pvalue, *ptraceback; PyObject * ptype, *pvalue, *ptraceback;
PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_Fetch(&ptype, &pvalue, &ptraceback);
PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
PyErr_Clear(); PyErr_Clear();
if (withPyPath) {
os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path")));
if (withEndl) {
os << std::endl;
}
}
PyTracebackObject* obj = (PyTracebackObject*)ptraceback; PyTracebackObject* obj = (PyTracebackObject*)ptraceback;
os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) os << "Python Error: " << PyString_AsString(PyObject_Str(ptype))
...@@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, ...@@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
const std::string& funcName, const std::string& funcName,
const std::vector<std::string>& args) { const std::vector<std::string>& args) {
PyGuard guard; PyGuard guard;
PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str())); PyObjectPtr pyModule = py::import(moduleName);
CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName;
PyObjectPtr pyModule(PyImport_Import(pyModuleName.get()));
CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed.";
PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str())); PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str()));
CHECK_PY(pyFunc) << "GetAttrString failed."; CHECK_PY(pyFunc) << "GetAttrString failed.";
PyObjectPtr pyArgs(PyTuple_New(args.size())); PyObjectPtr pyArgs(PyTuple_New(args.size()));
...@@ -143,7 +147,7 @@ PyObjectPtr createPythonClass( ...@@ -143,7 +147,7 @@ PyObjectPtr createPythonClass(
const std::vector<std::string>& args, const std::vector<std::string>& args,
const std::map<std::string, std::string>& kwargs) { const std::map<std::string, std::string>& kwargs) {
PyGuard guard; PyGuard guard;
PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); PyObjectPtr pyModule = py::import(moduleName);
LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str();
CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; CHECK_PY(pyModule) << "Import module " << moduleName << " failed.";
PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); PyObjectPtr pyDict(PyModule_GetDict(pyModule.get()));
...@@ -181,18 +185,29 @@ std::string getPyCallStack() { ...@@ -181,18 +185,29 @@ std::string getPyCallStack() {
printPyErrorStack(os, true); printPyErrorStack(os, true);
return os.str(); return os.str();
} }
/**
 * Import a Python module by name.
 *
 * Aborts through CHECK_PY (which appends the Python call stack) when the
 * import fails; otherwise returns the module wrapped in a PyObjectPtr.
 */
PyObjectPtr import(const std::string &moduleName) {
  PyObject* module = PyImport_ImportModule(moduleName.c_str());
  // A space before "Error" keeps the module name readable in the message.
  CHECK_PY(module) << "Import " << moduleName << " Error";
  return PyObjectPtr(module);
}
} // namespace py } // namespace py
#endif #endif
extern "C" {
extern const char enable_virtualenv_py[];
}
void initPython(int argc, char** argv) { void initPython(int argc, char** argv) {
#ifndef PADDLE_NO_PYTHON #ifndef PADDLE_NO_PYTHON
Py_SetProgramName(argv[0]); Py_SetProgramName(argv[0]);
Py_Initialize(); Py_Initialize();
PySys_SetArgv(argc, argv); PySys_SetArgv(argc, argv);
// python blocks SIGINT. Need to enable it. // python blocks SIGINT. Need to enable it.
signal(SIGINT, SIG_DFL); signal(SIGINT, SIG_DFL);
// Manually activate virtualenv when user is using virtualenv
PyRun_SimpleString(enable_virtualenv_py);
#endif #endif
} }
......
...@@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName, ...@@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
namespace py { namespace py {
PyObjectPtr import(const std::string& moduleName);
/** /**
* Cast a PyLong or PyInt to int type T. * Cast a PyLong or PyInt to int type T.
* @tparam T return type. * @tparam T return type.
......
...@@ -135,6 +135,21 @@ public: ...@@ -135,6 +135,21 @@ public:
queueCV_.wait(lock, [this]() { return numElements_ == 0; }); queueCV_.wait(lock, [this]() { return numElements_ == 0; });
} }
/**
* @brief wait queue is not empty at most for some seconds.
* @param seconds wait time limit.
* @return true if queue is not empty. false if timeout.
*/
bool waitNotEmptyFor(int seconds) {
std::unique_lock<std::mutex> lock(queueLock_);
return queueCV_.wait_for(
lock,
std::chrono::seconds(seconds),
[this] {
return numElements_ != 0;
});
}
private: private:
std::deque<T> elements_; std::deque<T> elements_;
int numElements_; int numElements_;
......
import os


def __activate_virtual_env__():
    """Activate the virtualenv named by $VIRTUAL_ENV inside this interpreter.

    Does nothing when VIRTUAL_ENV is unset, or when the environment it points
    to has no ``bin/activate_this.py`` (for example a stale variable or an
    environment created by a tool that does not ship that script).
    """
    venv_root = os.getenv('VIRTUAL_ENV')
    if venv_root is None:
        return
    activate_script = os.path.join(venv_root, 'bin', 'activate_this.py')
    if not os.path.isfile(activate_script):
        # Nothing to execute; running execfile() would raise IOError here.
        return
    execfile(activate_script, {'__file__': activate_script})


__activate_virtual_env__()
...@@ -170,6 +170,15 @@ message BlockExpandConfig { ...@@ -170,6 +170,15 @@ message BlockExpandConfig {
required uint32 img_size_y = 11; required uint32 img_size_y = 11;
} }
// Per-input configuration of a maxout layer.
message MaxOutConfig {
// NOTE(review): presumably the number of channels of the input feature map
// -- confirm against the layer implementation.
required uint32 channels = 1;
// Number of groups used by the maxout operation.
required uint32 groups = 2;
// The size of input feature map.
required uint32 img_size_x = 3;
required uint32 img_size_y = 4;
}
message ProjectionConfig { message ProjectionConfig {
required string type = 1; required string type = 1;
required string name = 2; required string name = 2;
...@@ -235,6 +244,7 @@ message LayerInputConfig { ...@@ -235,6 +244,7 @@ message LayerInputConfig {
// Set the argument name. // Set the argument name.
optional string input_layer_argument = 9; optional string input_layer_argument = 9;
optional BilinearInterpConfig bilinear_interp_conf = 10; optional BilinearInterpConfig bilinear_interp_conf = 10;
optional MaxOutConfig maxout_conf = 11;
} }
message LayerConfig { message LayerConfig {
......
...@@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, ...@@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
calc_batch_size=None, calc_batch_size=None,
cache=CacheType.NO_CACHE, cache=CacheType.NO_CACHE,
check=False, check_fail_continue=False, check=False, check_fail_continue=False,
use_dynamic_order=True,
init_hook=None, **kwargs): init_hook=None, **kwargs):
""" """
Provider decorator. Use it to make a function into PyDataProvider2 object. Provider decorator. Use it to make a function into PyDataProvider2 object.
...@@ -228,9 +227,15 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, ...@@ -228,9 +227,15 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
The configuration of data provider should be setup by\: The configuration of data provider should be setup by\:
:param input_types: Specify the input types, can also be set in init_hook. :param input_types: Specify the input types, can also be set in init_hook.
It is a list of InputType object. For example, input_types= \ It could be a list of InputType object. For example,
[dense_vector(9), integer_value(2)]. input_types=[dense_vector(9), integer_value(2)]. Or user
:type input_types: list|tuple can set a dict of InputType object, which key is
data_layer's name. For example, input_types=\
{'img': img_features, 'label': label}. when using dict of
InputType, user could yield a dict of feature values, which
key is also data_layer's name.
:type input_types: list|tuple|dict
:param should_shuffle: True if data should shuffle. Pass None means shuffle :param should_shuffle: True if data should shuffle. Pass None means shuffle
when is training and not to shuffle when is testing. when is training and not to shuffle when is testing.
...@@ -281,12 +286,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, ...@@ -281,12 +286,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
drop the wrong format data when it is True. Has drop the wrong format data when it is True. Has
no effect when check set to False. no effect when check set to False.
:type check_fail_continue: bool :type check_fail_continue: bool
:param use_dynamic_order: Allow provider to yield a dictionary object, whose
key is a input data layer name, and value is the
feature value. The tuples are still allowed when
use_dynmaic_order is True.
:type use_dynamic_order: bool
""" """
def __wrapper__(generator): def __wrapper__(generator):
...@@ -340,6 +339,11 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, ...@@ -340,6 +339,11 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
assert self.slots is not None assert self.slots is not None
assert self.generator is not None assert self.generator is not None
use_dynamic_order = False
if isinstance(self.slots, dict): # reorder input_types
self.slots = [self.slots[ipt] for ipt in self.input_order]
use_dynamic_order = True
if len(self.slots) == 1: if len(self.slots) == 1:
self.generator = SingleSlotWrapper(self.generator) self.generator = SingleSlotWrapper(self.generator)
......
...@@ -216,6 +216,10 @@ def Inputs(*args): ...@@ -216,6 +216,10 @@ def Inputs(*args):
if g_current_submodel is g_root_submodel: if g_current_submodel is g_root_submodel:
g_config.model_config.input_layer_names.append(name) g_config.model_config.input_layer_names.append(name)
@config_func
def HasInputsSet():
    """Return True when at least one input layer name has been registered
    in the model config, False otherwise."""
    input_names = g_config.model_config.input_layer_names
    return len(input_names) > 0
# Define the name of the output layers of the NeuralNetwork. # Define the name of the output layers of the NeuralNetwork.
# Usually the output is simply the cost layer. # Usually the output is simply the cost layer.
...@@ -466,6 +470,7 @@ class Input(Cfg): ...@@ -466,6 +470,7 @@ class Input(Cfg):
pool=None, pool=None,
image=None, image=None,
block_expand=None, block_expand=None,
maxout=None,
format=None, format=None,
nnz=None, nnz=None,
is_static=None, is_static=None,
...@@ -794,6 +799,16 @@ class BlockExpand(Cfg): ...@@ -794,6 +799,16 @@ class BlockExpand(Cfg):
output_y = 0): output_y = 0):
self.add_keys(locals()) self.add_keys(locals())
# Configuration holder for the maxout settings attached to a layer input.
# Arguments: channels, groups, and the input feature-map size
# (img_size_x / img_size_y, both defaulting to 0).
@config_class
class MaxOut(Cfg):
def __init__(
self,
channels,
groups,
img_size_x = 0,
img_size_y = 0):
# locals() is passed as-is, so every local name that exists at this point is
# handed to add_keys; do not introduce extra locals above this line.
# NOTE(review): presumably add_keys stores each entry as an attribute of the
# config object -- confirm against Cfg.add_keys.
self.add_keys(locals())
def DataBase(async_load_data=False, def DataBase(async_load_data=False,
constant_slots=None, constant_slots=None,
data_ratio=1, data_ratio=1,
...@@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): ...@@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \ int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \
- block_expand.block_y) / float(block_expand.stride_y))) - block_expand.block_y) / float(block_expand.stride_y)))
def parse_maxout(maxout, input_layer_name, maxout_conf):
    """Copy the maxout settings from ``maxout`` into ``maxout_conf``.

    The fields channels, groups, img_size_x and img_size_y are copied in that
    order. ``input_layer_name`` is accepted but not used.
    """
    for field in ('channels', 'groups', 'img_size_x', 'img_size_y'):
        setattr(maxout_conf, field, getattr(maxout, field))
# Define an evaluator # Define an evaluator
@config_func @config_func
def Evaluator( def Evaluator(
...@@ -1721,6 +1742,21 @@ class BlockExpandLayer(LayerBase): ...@@ -1721,6 +1742,21 @@ class BlockExpandLayer(LayerBase):
self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y
* block_expand_conf.channels) * block_expand_conf.channels)
@config_layer('maxout')
class MaxOutLayer(LayerBase):
    """Config-parser class for layers of type 'maxout'.

    Fills the first input's maxout_conf from that input's ``maxout`` settings
    and sets the layer size to the input layer's size divided by the number
    of groups.
    """

    def __init__(self, name, inputs, **xargs):
        super(MaxOutLayer, self).__init__(
            name, 'maxout', 0, inputs=inputs, **xargs)
        source_layer = self.get_input_layer(0)
        conf = self.config.inputs[0].maxout_conf
        parse_maxout(self.inputs[0].maxout, source_layer.name, conf)
        source_size = g_layer_map[source_layer.name].size
        self.set_layer_size(source_size / conf.groups)
# key: cost type # key: cost type
# value: cost class # value: cost class
g_cost_map = {} g_cost_map = {}
...@@ -1735,7 +1771,6 @@ def define_cost(class_name, cost_type): ...@@ -1735,7 +1771,6 @@ def define_cost(class_name, cost_type):
g_cost_map[cost_type] = cls g_cost_map[cost_type] = cls
define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy') define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
define_cost('ClassificationErrorLayer', 'classification_error')
define_cost('RankingCost', 'rank-cost') define_cost('RankingCost', 'rank-cost')
define_cost('AucValidation', 'auc-validation') define_cost('AucValidation', 'auc-validation')
define_cost('PnpairValidation', 'pnpair-validation') define_cost('PnpairValidation', 'pnpair-validation')
......
...@@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module, ...@@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module,
file_list_name = 'train.list' file_list_name = 'train.list'
if isinstance(cls, TestData): if isinstance(cls, TestData):
file_list_name = 'test.list' file_list_name = 'test.list'
with open(file_list_name, 'r') as f: with open(file_list_name, 'w') as f:
f.writelines(file_list) f.writelines(file_list)
file_list = file_list_name file_list = file_list_name
...@@ -84,6 +84,7 @@ def define_py_data_source(file_list, cls, module, ...@@ -84,6 +84,7 @@ def define_py_data_source(file_list, cls, module,
data.load_data_module = load_data_module data.load_data_module = load_data_module
data.load_data_object = load_data_object data.load_data_object = load_data_object
data.load_data_args = load_data_args data.load_data_args = load_data_args
data.async_load_data = True
return data return data
data_cls = py_data2 data_cls = py_data2
......
...@@ -50,11 +50,12 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", ...@@ -50,11 +50,12 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
'slope_intercept_layer', 'trans_full_matrix_projection', 'slope_intercept_layer', 'trans_full_matrix_projection',
'linear_comb_layer', 'linear_comb_layer',
'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer',
'nce_layer',
'cross_entropy_with_selfnorm', 'cross_entropy', 'cross_entropy_with_selfnorm', 'cross_entropy',
'multi_binary_label_cross_entropy', 'multi_binary_label_cross_entropy',
'rank_cost', 'lambda_cost', 'huber_cost', 'rank_cost', 'lambda_cost', 'huber_cost',
# 'block_expand_layer', # TODO(yuyang18): this layer is not correct # 'block_expand_layer', # TODO(yuyang18): this layer is not correct
'out_prod_layer', 'print_layer' 'maxout_layer', 'out_prod_layer', 'print_layer'
] ]
...@@ -110,12 +111,14 @@ class LayerType(object): ...@@ -110,12 +111,14 @@ class LayerType(object):
SLOPE_INTERCEPT_LAYER = "slope_intercept" SLOPE_INTERCEPT_LAYER = "slope_intercept"
LINEAR_COMBINATION_LAYER = "convex_comb" LINEAR_COMBINATION_LAYER = "convex_comb"
BLOCK_EXPAND = "blockexpand" BLOCK_EXPAND = "blockexpand"
MAXOUT = "maxout"
PRINT_LAYER = "print" PRINT_LAYER = "print"
CTC_LAYER = "ctc" CTC_LAYER = "ctc"
CRF_LAYER = "crf" CRF_LAYER = "crf"
CRF_DECODING_LAYER = "crf_decoding" CRF_DECODING_LAYER = "crf_decoding"
NCE_LAYER = 'nce'
RANK_COST = "rank-cost" RANK_COST = "rank-cost"
LAMBDA_COST = "lambda_cost" LAMBDA_COST = "lambda_cost"
...@@ -169,7 +172,7 @@ class LayerOutput(object): ...@@ -169,7 +172,7 @@ class LayerOutput(object):
:param activation: Layer Activation. :param activation: Layer Activation.
:type activation: BaseActivation. :type activation: BaseActivation.
:param parents: Layer's parents. :param parents: Layer's parents.
:type parents: list|tuple|collection.Sequence :type parents: list|tuple|collections.Sequence
""" """
def __init__(self, name, layer_type, parents=None, activation=None, def __init__(self, name, layer_type, parents=None, activation=None,
...@@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters, ...@@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters,
@layer_support() @layer_support()
def img_pool_layer(input, pool_size, name=None, def img_pool_layer(input, pool_size, name=None,
num_channels=None, pool_type=None, num_channels=None, pool_type=None,
stride=1, start=None, padding=0, layer_attr=None, stride=1, padding=0, layer_attr=None,
pool_size_y=None, stride_y=None, padding_y=None, pool_size_y=None, stride_y=None, padding_y=None,
img_width=None): img_width=None):
""" """
...@@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None, ...@@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None,
:type stride: int :type stride: int
:param stride_y: stride height of pooling. It is equal to stride by default. :param stride_y: stride height of pooling. It is equal to stride by default.
:type stride_y: int|None :type stride_y: int|None
:param start: start position of pooling operation. Note it is deprecated now.
:type start: int|None
:param layer_attr: Extra Layer attribute. :param layer_attr: Extra Layer attribute.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:param img_width: the width of input feature map. If it is None, the input feature :param img_width: the width of input feature map. If it is None, the input feature
...@@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None, ...@@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None,
pool_type=type_name, pool_type=type_name,
channels=num_channels, channels=num_channels,
size_x=pool_size, size_x=pool_size,
start=start, start=None,
stride=stride, stride=stride,
padding=padding, padding=padding,
size_y=pool_size_y, size_y=pool_size_y,
...@@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None): ...@@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None):
Concat all input vector into one huge vector. Concat all input vector into one huge vector.
Inputs can be list of LayerOutput or list of projection. Inputs can be list of LayerOutput or list of projection.
The example usage is:
.. code-block:: python
concat = concat_layer(input=[layer1, layer2])
:param name: Layer name. :param name: Layer name.
:type name: basestring :type name: basestring
:param input: input layers or projections :param input: input layers or projections
:type input: list|tuple|collection.Sequence :type input: list|tuple|collections.Sequence
:param act: Activation type. :param act: Activation type.
:type act: BaseActivation :type act: BaseActivation
:param layer_attr: Extra Layer Attribute. :param layer_attr: Extra Layer Attribute.
...@@ -2842,30 +2849,52 @@ def beam_search(step, input, bos_id, eos_id, beam_size, ...@@ -2842,30 +2849,52 @@ def beam_search(step, input, bos_id, eos_id, beam_size,
return tmp return tmp
def __cost_input__(input, label, weight=None):
    """
    Build the Input configs and the parent layers shared by cost layers.

    Returns a pair ``(ipts, parents)``: ``ipts`` holds one Input per layer
    name and ``parents`` the layers themselves, in the order input, label
    and, when given, weight. A weight layer must be of type LayerType.DATA.
    """
    parents = [input, label]
    ipts = [Input(layer.name) for layer in parents]
    if weight is None:
        return ipts, parents
    assert weight.layer_type == LayerType.DATA
    ipts.append(Input(weight.name))
    parents.append(weight)
    return ipts, parents
@wrap_name_default() @wrap_name_default()
def regression_cost(input, label, cost='square_error', name=None): @layer_support()
def regression_cost(input, label, weight=None, name=None,
layer_attr=None):
""" """
Regression Layer. Regression Layer.
TODO(yuyang18): Complete this method. TODO(yuyang18): Complete this method.
:param name: layer name. :param name: layer name.
:type name: basestring
:param input: Network prediction. :param input: Network prediction.
:type input: LayerOutput
:param label: Data label. :param label: Data label.
:param cost: Cost method. :type label: LayerOutput
:param weight: The weight affects the cost, namely the scale of cost.
It is an optional argument.
:type weight: LayerOutput
:param layer_attr: layer's extra attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput
""" """
Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name) ipts, parents = __cost_input__(input, label, weight)
return LayerOutput(
name, LayerType.COST, parents=[input, label] Layer(inputs=ipts, type="square_error", name=name,
) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(name, LayerType.COST, parents=parents)
@wrap_name_default("cost") @wrap_name_default("cost")
@layer_support() @layer_support()
def classification_cost(input, label, name=None, def classification_cost(input, label, weight=None, name=None,
cost="multi-class-cross-entropy",
evaluator=classification_error_evaluator, evaluator=classification_error_evaluator,
layer_attr=None): layer_attr=None):
""" """
...@@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None, ...@@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None,
:type input: LayerOutput :type input: LayerOutput
:param label: label layer name. data_layer often. :param label: label layer name. data_layer often.
:type label: LayerOutput :type label: LayerOutput
:param cost: cost method. :param weight: The weight affects the cost, namely the scale of cost.
:type cost: basestring It is an optional argument.
:type weight: LayerOutput
:param evaluator: Evaluator method. :param evaluator: Evaluator method.
:param layer_attr: layer's extra attribute. :param layer_attr: layer's extra attribute.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
...@@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None, ...@@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None,
assert input.layer_type != LayerType.DATA assert input.layer_type != LayerType.DATA
assert isinstance(input.activation, SoftmaxActivation) assert isinstance(input.activation, SoftmaxActivation)
assert label.layer_type == LayerType.DATA assert label.layer_type == LayerType.DATA
Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)],
ipts, parents = __cost_input__(input, label, weight)
Layer(name=name, type="multi-class-cross-entropy", inputs=ipts,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
def __add_evaluator__(e): def __add_evaluator__(e):
...@@ -2900,7 +2933,7 @@ def classification_cost(input, label, name=None, ...@@ -2900,7 +2933,7 @@ def classification_cost(input, label, name=None,
assert isinstance(e.for_classification, bool) assert isinstance(e.for_classification, bool)
assert e.for_classification assert e.for_classification
e(name=e.__name__, input=input, label=label) e(name=e.__name__, input=input, label=label, weight=weight)
if not isinstance(evaluator, collections.Sequence): if not isinstance(evaluator, collections.Sequence):
evaluator = [evaluator] evaluator = [evaluator]
...@@ -2908,7 +2941,7 @@ def classification_cost(input, label, name=None, ...@@ -2908,7 +2941,7 @@ def classification_cost(input, label, name=None,
for each_evaluator in evaluator: for each_evaluator in evaluator:
__add_evaluator__(each_evaluator) __add_evaluator__(each_evaluator)
return LayerOutput(name, LayerType.COST, parents=[input, label]) return LayerOutput(name, LayerType.COST, parents=parents)
def conv_operator(img, filter, filter_size, num_filters, def conv_operator(img, filter, filter_size, num_filters,
...@@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters, ...@@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters,
@wrap_name_default() @wrap_name_default()
def conv_shift_layer(a, b, name=None): @layer_support()
def conv_shift_layer(a, b, name=None, layer_attr=None):
""" """
This layer performs cyclic convolution for two input. For example: This layer performs cyclic convolution for two input. For example:
- a[in]: contains M elements. - a[in]: contains M elements.
...@@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None): ...@@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None):
:type a: LayerOutput :type a: LayerOutput
:param b: input layer b :param b: input layer b
:type b: LayerOutput :type b: LayerOutput
:param layer_attr: layer's extra attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None): ...@@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None):
name=name, name=name,
type=LayerType.CONV_SHIFT_LAYER, type=LayerType.CONV_SHIFT_LAYER,
inputs=[a.name, b.name], inputs=[a.name, b.name],
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b],
...@@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None, ...@@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None,
@wrap_param_attr_default() @wrap_param_attr_default()
@wrap_bias_attr_default() @wrap_bias_attr_default()
@wrap_act_default() @wrap_act_default()
@layer_support()
def selective_fc_layer(input, select, size, act=None, name=None, def selective_fc_layer(input, select, size, act=None, name=None,
pass_generation=False, pass_generation=False,
has_selected_colums=True, has_selected_colums=True,
...@@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None, ...@@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None,
@wrap_name_default() @wrap_name_default()
def sampling_id_layer(input, name=None): @layer_support()
def sampling_id_layer(input, name=None, layer_attr=None):
""" """
A layer for sampling id from multinomial distribution from the input layer. A layer for sampling id from multinomial distribution from the input layer.
Sampling one id for one sample. Sampling one id for one sample.
...@@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None): ...@@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None):
:type input: LayerOutput :type input: LayerOutput
:param name: The Layer Name. :param name: The Layer Name.
:type name: basestring :type name: basestring
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None): ...@@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None):
name=name, name=name,
type=LayerType.SAMPLING_ID_LAYER, type=LayerType.SAMPLING_ID_LAYER,
inputs=[Input(input.name)], inputs=[Input(input.name)],
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input) return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input)
@wrap_name_default() @wrap_name_default()
def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): @layer_support()
def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0,
layer_attr=None):
""" """
This layer for applying a slope and an intercept to the input This layer for applying a slope and an intercept to the input
element-wise. There is no activation and weight. element-wise. There is no activation and weight.
...@@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): ...@@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
:type slope: float. :type slope: float.
:param intercept: the offset. :param intercept: the offset.
:type intercept: float. :type intercept: float.
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): ...@@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
slope=slope, slope=slope,
intercept=intercept, intercept=intercept,
inputs=[Input(input.name)], inputs=[Input(input.name)],
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input) return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input)
@wrap_name_default() @wrap_name_default()
def linear_comb_layer(weights, vectors, size=None, name=None): @layer_support()
def linear_comb_layer(weights, vectors, size=None, name=None,
layer_attr=None):
""" """
A layer for weighted sum of vectors takes two inputs. A layer for weighted sum of vectors takes two inputs.
- Input: size of weights is M - Input: size of weights is M
...@@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None): ...@@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
:type size: int :type size: int
:param name: The Layer Name. :param name: The Layer Name.
:type name: basestring :type name: basestring
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None): ...@@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
type=LayerType.LINEAR_COMBINATION_LAYER, type=LayerType.LINEAR_COMBINATION_LAYER,
size=size, size=size,
inputs=[Input(weights.name), Input(vectors.name)], inputs=[Input(weights.name), Input(vectors.name)],
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER, return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
[weights, vectors], size=size) [weights, vectors], size=size)
...@@ -3295,6 +3347,7 @@ convex_comb_layer = linear_comb_layer ...@@ -3295,6 +3347,7 @@ convex_comb_layer = linear_comb_layer
@wrap_name_default() @wrap_name_default()
@layer_support()
def block_expand_layer(input, def block_expand_layer(input,
channel=0, channel=0,
block_x=0, block_x=0,
...@@ -3303,7 +3356,8 @@ def block_expand_layer(input, ...@@ -3303,7 +3356,8 @@ def block_expand_layer(input,
stride_y=0, stride_y=0,
padding_x=0, padding_x=0,
padding_y=0, padding_y=0,
name=None): name=None,
layer_attr=None):
""" """
Expand feature map to minibatch matrix. Expand feature map to minibatch matrix.
- matrix width is: block_y * block_x * channel - matrix width is: block_y * block_x * channel
...@@ -3350,6 +3404,8 @@ def block_expand_layer(input, ...@@ -3350,6 +3404,8 @@ def block_expand_layer(input,
:type padding_y: int :type padding_y: int
:param name: The name of this layer, which can not specify. :param name: The name of this layer, which can not specify.
:type name: None|basestring. :type name: None|basestring.
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3364,13 +3420,83 @@ def block_expand_layer(input, ...@@ -3364,13 +3420,83 @@ def block_expand_layer(input,
padding_y=padding_y) padding_y=padding_y)
), ),
type=LayerType.BLOCK_EXPAND, type=LayerType.BLOCK_EXPAND,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input]) return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input])
@wrap_name_default() @wrap_name_default()
def ctc_layer(input, label, size=None, name=None, norm_by_times=False): @layer_support()
def maxout_layer(input,
                 groups,
                 num_channels=None,
                 size_x=None,
                 size_y=None,
                 name=None,
                 layer_attr=None):
    """
    A layer to do max out on conv layer output.

      - Input: output of a conv layer.
      - Output: feature map size same as input. Channel is
        (input channel) / groups.

    So groups should be larger than 1, and the number of channels should be
    divisible by groups.

    Please refer to Paper:
      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
      - Multi-digit Number Recognition from Street View
        Imagery using Deep Convolutional Neural Networks:
        https://arxiv.org/pdf/1312.6082v4.pdf

    The simple usage is:

    .. code-block:: python

       maxout = maxout_layer(input,
                             num_channels=128,
                             groups=4)

    :param input: The input layer. Must be a conv layer with a linear
                  activation (both are asserted below).
    :type input: LayerOutput
    :param num_channels: The channel number of input layer. If None,
                         ``input.num_filters`` is used.
    :type num_channels: int|None
    :param groups: The group number of input layer. Must be greater than 1
                   and divide num_channels evenly.
    :type groups: int
    :param size_x: conv output width. NOTE(review): accepted but never read
                   in this function's body.
    :type size_x: int|None
    :param size_y: conv output height. NOTE(review): accepted but never read
                   in this function's body.
    :type size_y: int|None
    :param name: The name of this layer. It is optional.
    :type name: None|basestring.
    :param layer_attr: Extra Layer attribute.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    # Preconditions on the producing layer and the group count.
    assert input.layer_type == LayerType.CONV_LAYER
    assert isinstance(input.activation, LinearActivation)
    assert groups > 1

    # Fall back to the filter count recorded on the input layer.
    if num_channels is None:
        assert input.num_filters is not None
        num_channels = input.num_filters
    assert num_channels % groups == 0

    maxout_conf = MaxOut(channels=num_channels, groups=groups)
    layer_input = Input(input.name, maxout=maxout_conf)
    Layer(name=name,
          inputs=layer_input,
          type=LayerType.MAXOUT,
          **ExtraLayerAttribute.to_kwargs(layer_attr))
    return LayerOutput(name, LayerType.MAXOUT, parents=[input])
@wrap_name_default()
@layer_support()
def ctc_layer(input, label, size=None, name=None, norm_by_times=False,
layer_attr=None):
""" """
Connectionist Temporal Classification (CTC) is designed for temporal Connectionist Temporal Classification (CTC) is designed for temporal
classification task. That is, for sequence labeling problems where the classification task. That is, for sequence labeling problems where the
...@@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): ...@@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
:type name: basestring|None :type name: basestring|None
:param norm_by_times: Whether to normalize by times. False by default. :param norm_by_times: Whether to normalize by times. False by default.
:type norm_by_times: bool :type norm_by_times: bool
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): ...@@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
type=LayerType.CTC_LAYER, type=LayerType.CTC_LAYER,
size=size, size=size,
norm_by_times=norm_by_times, norm_by_times=norm_by_times,
inputs=[input.name, label.name] inputs=[input.name, label.name],
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
@wrap_name_default() @wrap_name_default()
@wrap_param_attr_default() @wrap_param_attr_default()
def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): @layer_support()
def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None,
layer_attr=None):
""" """
A layer for calculating the cost of sequential conditional random A layer for calculating the cost of sequential conditional random
field model. field model.
...@@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): ...@@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:param name: The name of this layers. It is not necessary. :param name: The name of this layers. It is not necessary.
:type name: None|basestring :type name: None|basestring
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): ...@@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
type=LayerType.CRF_LAYER, type=LayerType.CRF_LAYER,
size=size, size=size,
inputs=ipts, inputs=ipts,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
parents = [input, label] parents = [input, label]
if weight is not None: if weight is not None:
...@@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): ...@@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
@wrap_name_default() @wrap_name_default()
@wrap_param_attr_default() @wrap_param_attr_default()
def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): @layer_support()
def crf_decoding_layer(input, size, label=None, param_attr=None, name=None,
layer_attr=None):
""" """
A layer for calculating the decoding sequence of sequential conditional A layer for calculating the decoding sequence of sequential conditional
random field model. The decoding sequence is stored in output.ids. random field model. The decoding sequence is stored in output.ids.
...@@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): ...@@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:param name: The name of this layers. It is not necessary. :param name: The name of this layers. It is not necessary.
:type name: None|basestring :type name: None|basestring
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): ...@@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
type=LayerType.CRF_DECODING_LAYER, type=LayerType.CRF_DECODING_LAYER,
size=size, size=size,
inputs=ipts, inputs=ipts,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
parents = [input] parents = [input]
if label is not None: if label is not None:
parents.append(label) parents.append(label)
return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size) return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size)
@wrap_bias_attr_default(has_bias=True)
@wrap_name_default()
@layer_support()
def nce_layer(input, label, num_classes, weight=None,
              num_neg_samples=10, neg_distribution=None,
              name=None, bias_attr=None, layer_attr=None):
    """
    Noise-contrastive estimation.
    Implements the method in the following paper:
    A fast and simple algorithm for training neural probabilistic language models.

    The example usage is:

    .. code-block:: python

       cost = nce_layer(input=layer1, label=layer2, weight=layer3,
                        num_classes=3, neg_distribution=[0.1,0.3,0.6])

    :param name: layer name
    :type name: basestring
    :param input: input layers. It could be a LayerOutput or a list/tuple of
                  LayerOutput. A single LayerOutput is wrapped into a list.
    :type input: LayerOutput|list|tuple|collections.Sequence
    :param label: label layer. Must be a data layer.
    :type label: LayerOutput
    :param weight: weight layer, can be None(default). When given it must be
                   a data layer.
    :type weight: LayerOutput
    :param num_classes: number of classes.
    :type num_classes: int
    :param num_neg_samples: number of negative samples. Default is 10.
    :type num_neg_samples: int
    :param neg_distribution: The distribution for generating the random negative labels.
                             A uniform distribution will be used if not provided.
                             If not None, its length must be equal to num_classes
                             and its entries must sum to 1 (within a small
                             floating-point tolerance).
    :type neg_distribution: list|tuple|collections.Sequence|None
    :param bias_attr: Bias parameter attribute, forwarded through
                      ParamAttr.to_bias. NOTE(review): the earlier text said
                      "True if no bias"; the declared type suggests False is
                      the value that disables the bias -- confirm.
    :type bias_attr: ParameterAttribute|None|False
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    if isinstance(input, LayerOutput):
        input = [input]
    assert isinstance(input, collections.Sequence)
    assert isinstance(label, LayerOutput)
    assert label.layer_type == LayerType.DATA
    if neg_distribution is not None:
        assert isinstance(neg_distribution, collections.Sequence)
        assert len(neg_distribution) == num_classes
        # Compare with a tolerance: an exact ``== 1`` rejects valid
        # distributions whose float sum is off by rounding error
        # (e.g. [0.1] * 10 sums to 0.9999999999999999).
        assert abs(sum(neg_distribution) - 1.0) < 1e-5

    # Inputs are ordered: every feature input, then the label, then the
    # optional weight. ``parents`` mirrors that order.
    ipts_for_layer = []
    parents = []
    for each_input in input:
        assert isinstance(each_input, LayerOutput)
        ipts_for_layer.append(each_input.name)
        parents.append(each_input)
    ipts_for_layer.append(label.name)
    parents.append(label)

    if weight is not None:
        assert isinstance(weight, LayerOutput)
        assert weight.layer_type == LayerType.DATA
        ipts_for_layer.append(weight.name)
        parents.append(weight)

    Layer(
        name=name,
        type=LayerType.NCE_LAYER,
        num_classes=num_classes,
        neg_sampling_dist=neg_distribution,
        num_neg_samples=num_neg_samples,
        inputs=ipts_for_layer,
        bias=ParamAttr.to_bias(bias_attr),
        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.NCE_LAYER, parents=parents)
""" """
following are cost Layers. following are cost Layers.
...@@ -3534,7 +3750,8 @@ following are cost Layers. ...@@ -3534,7 +3750,8 @@ following are cost Layers.
@wrap_name_default() @wrap_name_default()
def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): @layer_support()
def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr=None):
""" """
A cost Layer for learning to rank using gradient descent. Details can refer A cost Layer for learning to rank using gradient descent. Details can refer
to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/ to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/
...@@ -3578,6 +3795,8 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): ...@@ -3578,6 +3795,8 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
:type name: None|basestring :type name: None|basestring
:param coeff: The coefficient affects the gradient in the backward. :param coeff: The coefficient affects the gradient in the backward.
:type coeff: float :type coeff: float
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3595,13 +3814,15 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): ...@@ -3595,13 +3814,15 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
type=LayerType.RANK_COST, type=LayerType.RANK_COST,
inputs=ipts, inputs=ipts,
coeff=coeff, coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.RANK_COST, parents=parents) return LayerOutput(name, LayerType.RANK_COST, parents=parents)
@wrap_name_default() @wrap_name_default()
def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1): @layer_support()
def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1, layer_attr=None):
""" """
lambdaCost for lambdaRank LTR approach. lambdaCost for lambdaRank LTR approach.
...@@ -3632,6 +3853,8 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1): ...@@ -3632,6 +3853,8 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1):
:type max_sort_size: int :type max_sort_size: int
:param name: The name of this layers. It is not necessary. :param name: The name of this layers. It is not necessary.
:type name: None|basestring :type name: None|basestring
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3642,14 +3865,16 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1): ...@@ -3642,14 +3865,16 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1):
type=LayerType.LAMBDA_COST, type=LayerType.LAMBDA_COST,
inputs=[input.name, score.name], inputs=[input.name, score.name],
NDCG_num=NDCG_num, NDCG_num=NDCG_num,
max_sort_size=max_sort_size max_sort_size=max_sort_size,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score]) return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score])
@wrap_name_default() @wrap_name_default()
def cross_entropy(input, label, name=None, coeff=1.0): @layer_support()
def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
""" """
A loss layer for multi class entropy. A loss layer for multi class entropy.
...@@ -3667,6 +3892,8 @@ def cross_entropy(input, label, name=None, coeff=1.0): ...@@ -3667,6 +3892,8 @@ def cross_entropy(input, label, name=None, coeff=1.0):
:type name: None|basestring. :type name: None|basestring.
:param coeff: The coefficient affects the gradient in the backward. :param coeff: The coefficient affects the gradient in the backward.
:type coeff: float. :type coeff: float.
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput. :rtype: LayerOutput.
""" """
...@@ -3675,13 +3902,16 @@ def cross_entropy(input, label, name=None, coeff=1.0): ...@@ -3675,13 +3902,16 @@ def cross_entropy(input, label, name=None, coeff=1.0):
type=LayerType.CROSS_ENTROPY, type=LayerType.CROSS_ENTROPY,
inputs=[input.name, label.name], inputs=[input.name, label.name],
coeff=coeff, coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label]) return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label])
@wrap_name_default() @wrap_name_default()
@layer_support()
def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
softmax_selfnorm_alpha=0.1): softmax_selfnorm_alpha=0.1,
layer_attr=None):
""" """
A loss layer for multi class entropy with selfnorm. A loss layer for multi class entropy with selfnorm.
...@@ -3701,6 +3931,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, ...@@ -3701,6 +3931,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
:type coeff: float. :type coeff: float.
:param softmax_selfnorm_alpha: The scale factor affects the cost. :param softmax_selfnorm_alpha: The scale factor affects the cost.
:type softmax_selfnorm_alpha: float. :type softmax_selfnorm_alpha: float.
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput. :rtype: LayerOutput.
""" """
...@@ -3709,6 +3941,7 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, ...@@ -3709,6 +3941,7 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
inputs=[input.name, label.name], inputs=[input.name, label.name],
coeff=coeff, coeff=coeff,
softmax_selfnorm_alpha=softmax_selfnorm_alpha, softmax_selfnorm_alpha=softmax_selfnorm_alpha,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, return LayerOutput(name,
...@@ -3717,7 +3950,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0, ...@@ -3717,7 +3950,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
@wrap_name_default() @wrap_name_default()
def huber_cost(input, label, name=None, coeff=1.0): @layer_support()
def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
""" """
A loss layer for huber loss. A loss layer for huber loss.
...@@ -3733,6 +3967,8 @@ def huber_cost(input, label, name=None, coeff=1.0): ...@@ -3733,6 +3967,8 @@ def huber_cost(input, label, name=None, coeff=1.0):
:type name: None|basestring. :type name: None|basestring.
:param coeff: The coefficient affects the gradient in the backward. :param coeff: The coefficient affects the gradient in the backward.
:type coeff: float. :type coeff: float.
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput. :rtype: LayerOutput.
""" """
...@@ -3743,12 +3979,15 @@ def huber_cost(input, label, name=None, coeff=1.0): ...@@ -3743,12 +3979,15 @@ def huber_cost(input, label, name=None, coeff=1.0):
type=LayerType.HUBER, type=LayerType.HUBER,
inputs=[input.name, label.name], inputs=[input.name, label.name],
coeff=coeff, coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.HUBER, parents=[input, label]) return LayerOutput(name, LayerType.HUBER, parents=[input, label])
@wrap_name_default() @wrap_name_default()
def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0): @layer_support()
def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0,
layer_attr=None):
""" """
A loss layer for multi binary label cross entropy. A loss layer for multi binary label cross entropy.
...@@ -3766,6 +4005,8 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0): ...@@ -3766,6 +4005,8 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0):
:type name: None|basestring :type name: None|basestring
:param coeff: The coefficient affects the gradient in the backward. :param coeff: The coefficient affects the gradient in the backward.
:type coeff: float :type coeff: float
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3773,13 +4014,14 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0): ...@@ -3773,13 +4014,14 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0):
if input.activation is None or \ if input.activation is None or \
not isinstance(input.activation, SigmoidActivation): not isinstance(input.activation, SigmoidActivation):
logger.log(logging.WARN, logger.log(logging.WARN,
"%s is not recommend for batch normalization's activation, " "%s is not recommend for multi_binary_label_cross_entropy's activation, "
"maybe the relu is better" % repr(input.activation)) "maybe the sigmoid is better" % repr(input.activation))
Layer(name=name, Layer(name=name,
type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY, type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
inputs=[input.name, label.name], inputs=[input.name, label.name],
coeff=coeff, coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr)
) )
return LayerOutput(name, LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY, return LayerOutput(name, LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
parents=[input, label]) parents=[input, label])
...@@ -20,7 +20,7 @@ from activations import LinearActivation, ReluActivation, SoftmaxActivation, \ ...@@ -20,7 +20,7 @@ from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
IdentityActivation, TanhActivation, SequenceSoftmaxActivation IdentityActivation, TanhActivation, SequenceSoftmaxActivation
from attrs import ExtraAttr from attrs import ExtraAttr
from default_decorators import wrap_name_default, wrap_act_default, \ from default_decorators import wrap_name_default, wrap_act_default, \
wrap_param_default wrap_param_default, wrap_bias_attr_default, wrap_param_attr_default
from layers import * # There are too many layers used in network, so import * from layers import * # There are too many layers used in network, so import *
from poolings import MaxPooling, SumPooling from poolings import MaxPooling, SumPooling
from paddle.trainer.config_parser import * from paddle.trainer.config_parser import *
...@@ -30,7 +30,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", ...@@ -30,7 +30,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network',
'gru_unit', 'gru_group', 'simple_gru', 'simple_attention', 'gru_unit', 'gru_group', 'simple_gru', 'simple_attention',
'text_conv_pool', 'text_conv_pool',
'bidirectional_lstm', 'outputs'] 'bidirectional_lstm', 'inputs', 'outputs']
###################################################### ######################################################
...@@ -133,7 +133,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -133,7 +133,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
pool_type=None, act=None, groups=1, conv_stride=1, pool_type=None, act=None, groups=1, conv_stride=1,
conv_padding=0, bias_attr=None, num_channel=None, conv_padding=0, bias_attr=None, num_channel=None,
param_attr=None, shared_bias=True, param_attr=None, shared_bias=True,
conv_layer_attr=None, pool_stride=1, pool_start=None, conv_layer_attr=None, pool_stride=1,
pool_padding=0, pool_layer_attr=None): pool_padding=0, pool_layer_attr=None):
""" """
Simple image convolution and pooling group. Simple image convolution and pooling group.
...@@ -172,8 +172,6 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -172,8 +172,6 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
:type conv_layer_attr: ExtraLayerAttribute :type conv_layer_attr: ExtraLayerAttribute
:param pool_stride: see img_pool_layer for details :param pool_stride: see img_pool_layer for details
:type pool_stride: int :type pool_stride: int
:param pool_start: see img_pool_layer for details. It is deprecated now.
:type pool_start: int
:param pool_padding: see img_pool_layer for details :param pool_padding: see img_pool_layer for details
:type pool_padding: int :type pool_padding: int
:param pool_layer_attr: see img_pool_layer for details :param pool_layer_attr: see img_pool_layer for details
...@@ -192,7 +190,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -192,7 +190,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
return img_pool_layer(name="%s_pool" % name, input=_conv_, return img_pool_layer(name="%s_pool" % name, input=_conv_,
pool_size=pool_size, pool_size=pool_size,
pool_type=pool_type, stride=pool_stride, pool_type=pool_type, stride=pool_stride,
start=pool_start, padding=pool_padding, padding=pool_padding,
layer_attr=pool_layer_attr) layer_attr=pool_layer_attr)
...@@ -203,7 +201,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -203,7 +201,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
conv_param_attr=None, shared_bias=True, conv_param_attr=None, shared_bias=True,
conv_layer_attr=None, bn_param_attr=None, conv_layer_attr=None, bn_param_attr=None,
bn_bias_attr=None, bn_layer_attr=None, pool_stride=1, bn_bias_attr=None, bn_layer_attr=None, pool_stride=1,
pool_start=None, pool_padding=0, pool_layer_attr=None): pool_padding=0, pool_layer_attr=None):
""" """
Convolution, batch normalization, pooling group. Convolution, batch normalization, pooling group.
...@@ -243,8 +241,6 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -243,8 +241,6 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
:param bn_layer_attr: ParameterAttribute. :param bn_layer_attr: ParameterAttribute.
:param pool_stride: see img_pool_layer's document. :param pool_stride: see img_pool_layer's document.
:type pool_stride: int :type pool_stride: int
:param pool_start: see img_pool_layer's document. It is deprecated now.
:type pool_start: int
:param pool_padding: see img_pool_layer's document. :param pool_padding: see img_pool_layer's document.
:type pool_padding: int :type pool_padding: int
:param pool_layer_attr: see img_pool_layer's document. :param pool_layer_attr: see img_pool_layer's document.
...@@ -268,7 +264,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None, ...@@ -268,7 +264,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
return img_pool_layer(name="%s_pool" % name, return img_pool_layer(name="%s_pool" % name,
input=__bn__, pool_type=pool_type, input=__bn__, pool_type=pool_type,
pool_size=pool_size, stride=pool_stride, pool_size=pool_size, stride=pool_stride,
start=pool_start, padding=pool_padding, padding=pool_padding,
layer_attr=pool_layer_attr) layer_attr=pool_layer_attr)
...@@ -372,8 +368,8 @@ def small_vgg(input_image, num_channels, num_classes): ...@@ -372,8 +368,8 @@ def small_vgg(input_image, num_channels, num_classes):
tmp = __vgg__(tmp, 128, 2, [0.4, 0]) tmp = __vgg__(tmp, 128, 2, [0.4, 0])
tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0]) tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0])
tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0]) tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0])
tmp = img_pool_layer(input = tmp, stride = 2, tmp = img_pool_layer(input=tmp, stride=2,
pool_size = 2, pool_type = MaxPooling()) pool_size=2, pool_type=MaxPooling())
tmp = dropout_layer(input=tmp, dropout_rate=0.5) tmp = dropout_layer(input=tmp, dropout_rate=0.5)
tmp = fc_layer(input=tmp, size=512, layer_attr=ExtraAttr(drop_rate=0.5), tmp = fc_layer(input=tmp, size=512, layer_attr=ExtraAttr(drop_rate=0.5),
act=LinearActivation()) act=LinearActivation())
...@@ -505,7 +501,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None, ...@@ -505,7 +501,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
def lstmemory_unit(input, name=None, size=None, param_attr=None, def lstmemory_unit(input, name=None, size=None, param_attr=None,
act=None, gate_act=None, state_act=None, act=None, gate_act=None, state_act=None,
mixed_bias_attr=None, lstm_bias_attr=None, mixed_bias_attr=None, lstm_bias_attr=None,
mixed_layer_attr=None,lstm_layer_attr=None, mixed_layer_attr=None, lstm_layer_attr=None,
get_output_layer_attr=None): get_output_layer_attr=None):
""" """
Define calculations that a LSTM unit performs in a single time step. Define calculations that a LSTM unit performs in a single time step.
...@@ -745,7 +741,6 @@ def gru_group(input, ...@@ -745,7 +741,6 @@ def gru_group(input,
gru_bias_attr=None, gru_bias_attr=None,
act=None, gate_act=None, act=None, gate_act=None,
gru_layer_attr=None): gru_layer_attr=None):
""" """
gru_group is a recurrent layer group version Gated Recurrent Unit. It gru_group is a recurrent layer group version Gated Recurrent Unit. It
does exactly the same calculation as the grumemory layer does. A promising does exactly the same calculation as the grumemory layer does. A promising
...@@ -919,12 +914,12 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, ...@@ -919,12 +914,12 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
fw = simple_lstm(name='%s_fw' % name, input=input, size=size, fw = simple_lstm(name='%s_fw' % name, input=input, size=size,
**dict((k[len('fwd_'):], v) for k, v in args.iteritems() **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
if k.startswith('fwd_'))) if k.startswith('fwd_')))
bw = simple_lstm(name="%s_bw" % name, input=input, size=size, bw = simple_lstm(name="%s_bw" % name, input=input, size=size,
reverse=True, reverse=True,
**dict((k[len('bwd_'):], v) for k, v in args.iteritems() **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
if k.startswith('bwd_'))) if k.startswith('bwd_')))
if return_seq: if return_seq:
return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr, return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr,
...@@ -1052,14 +1047,30 @@ def dropout_layer(input, dropout_rate, name=None): ...@@ -1052,14 +1047,30 @@ def dropout_layer(input, dropout_rate, name=None):
layer_attr=ExtraAttr(drop_rate=dropout_rate)) layer_attr=ExtraAttr(drop_rate=dropout_rate))
def outputs(layers, *args): def inputs(layers, *args):
"""
Declare the inputs of network. The order of input should be as same as
the data provider's return order.
:param layers: Input Layers.
:type layers: list|tuple|LayerOutput.
:return:
""" """
Declare the end of network. Currently it will only calculate the
input/output order of network. It will calculate the predict network or
train network's output automatically.
if isinstance(layers, LayerOutput) or isinstance(layers, basestring):
layers = [layers]
if len(args) != 0:
layers.extend(args)
:param layers: Inputs(*[l.name for l in layers])
def outputs(layers, *args):
"""
Declare the outputs of network. If user have not defined the inputs of
network, this method will calculate the input order by dfs travel.
:param layers: Output layers.
:type layers: list|tuple|LayerOutput :type layers: list|tuple|LayerOutput
:return: :return:
""" """
...@@ -1093,6 +1104,11 @@ def outputs(layers, *args): ...@@ -1093,6 +1104,11 @@ def outputs(layers, *args):
layers.extend(args) layers.extend(args)
assert len(layers) > 0 assert len(layers) > 0
if HasInputsSet(): # input already set
Outputs(*[l.name for l in layers])
return # just return outputs.
if len(layers) != 1: if len(layers) != 1:
logger.warning("`outputs` routine try to calculate network's" logger.warning("`outputs` routine try to calculate network's"
" inputs and outputs order. It might not work well." " inputs and outputs order. It might not work well."
......
...@@ -362,6 +362,13 @@ def __extends__(dict1, dict2): ...@@ -362,6 +362,13 @@ def __extends__(dict1, dict2):
default_factory=lambda _: BaseRegularization()) default_factory=lambda _: BaseRegularization())
def settings(batch_size, def settings(batch_size,
learning_rate=1e-3, learning_rate=1e-3,
learning_rate_decay_a=0.,
learning_rate_decay_b=0.,
learning_rate_schedule='poly',
learning_rate_args='',
average_window=0,
do_average_in_cpu=False,
max_average_window=None,
learning_method=None, learning_method=None,
regularization=None, regularization=None,
is_async=False, is_async=False,
...@@ -408,10 +415,14 @@ def settings(batch_size, ...@@ -408,10 +415,14 @@ def settings(batch_size,
else: else:
algorithm = 'owlqn' algorithm = 'owlqn'
args=['batch_size', 'learning_rate', 'learning_rate_decay_a',
'learning_rate_decay_b', 'learning_rate_schedule',
'learning_rate_args', 'average_window', 'do_average_in_cpu',
'max_average_window']
kwargs = dict() kwargs = dict()
kwargs['batch_size'] = batch_size
kwargs['learning_rate'] = learning_rate
kwargs['algorithm'] = algorithm kwargs['algorithm'] = algorithm
for arg in args:
kwargs[arg] = locals()[arg]
kwargs = __extends__(kwargs, learning_method.to_setting_kwargs()) kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
learning_method.extra_settings() learning_method.extra_settings()
......
...@@ -2,13 +2,17 @@ ...@@ -2,13 +2,17 @@
a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
9c038249ec8ff719753a746cdb04c026 layer_activations.protostr 9c038249ec8ff719753a746cdb04c026 layer_activations.protostr
5913f87b39cee3b2701fa158270aca26 projections.protostr 5913f87b39cee3b2701fa158270aca26 projections.protostr
7334ba0a4544f0623231330fc51d390d shared_fc.protostr
8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr
6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr 6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr
0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr 0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr
6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr
144bc6d3a509de74115fa623741797ed test_expand_layer.protostr 144bc6d3a509de74115fa623741797ed test_expand_layer.protostr
2378518bdb71e8c6e888b1842923df58 test_fc.protostr 2378518bdb71e8c6e888b1842923df58 test_fc.protostr
8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr 8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr
1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr 1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr
d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr
6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr
251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr 251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr
e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr
2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr 2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr
......
...@@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer ...@@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer
test_sequence_pooling test_lstmemory_layer test_grumemory_layer test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers util_layers simple_rnn_layers unused_layers test_cost_layers img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group test_bilinear_interp) test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
test_bilinear_interp test_maxout)
for conf in ${configs[*]} for conf in ${configs[*]}
do do
......
# Test config: two fully-connected branches that are handed the very same
# ParamAttr objects, feeding a softmax classifier whose two inputs also reuse
# a single ParamAttr.
from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-4)

# Two input features of equal width.
feature_a = data_layer(name='feature_a', size=200)
feature_b = data_layer(name='feature_b', size=200)

# Attributes that are passed to more than one layer below.
# NOTE(review): reusing a named ParamAttr presumably makes the layers share
# one underlying parameter -- confirm against the config parser.
hidden_weight = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0)
hidden_bias = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0)
classifier_weight = ParamAttr(name='softmax_param',
                              initial_max=1.0,
                              initial_min=-1.0)

# Both hidden layers get identical size / weight / bias settings.
hidden_kwargs = dict(size=200,
                     param_attr=hidden_weight,
                     bias_attr=hidden_bias)
hidden_a = fc_layer(input=feature_a, **hidden_kwargs)
hidden_b = fc_layer(input=feature_b, **hidden_kwargs)

# Bias-free softmax classifier over both hidden layers; each input uses the
# same weight attribute.
predict = fc_layer(input=[hidden_a, hidden_b],
                   size=10,
                   act=SoftmaxActivation(),
                   param_attr=[classifier_weight, classifier_weight],
                   bias_attr=False)

# The label layer is created after the classifier, just before the cost.
label = data_layer(name='label', size=10)
cost = classification_cost(input=predict, label=label)
outputs(cost)
# Test config: two mixed-layer + LSTM-group branches that are handed the same
# ParamAttr objects, joined by a bias-free softmax classifier.
from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-4)

seq_a = data_layer(name='data_a', size=100)
seq_b = data_layer(name='data_b', size=100)

# One projection attribute used by both mixed layers.
projection_attr = ParamAttr(name='mixed_param')

with mixed_layer(size=400, bias_attr=False) as proj_a:
    proj_a += full_matrix_projection(input=seq_a, param_attr=projection_attr)

with mixed_layer(size=400, bias_attr=False) as proj_b:
    proj_b += full_matrix_projection(input=seq_b, param_attr=projection_attr)

# One weight attribute and one bias attribute used by both LSTM groups.
recurrent_weight = ParamAttr(name='lstm_param')
recurrent_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.)

lstm_kwargs = dict(param_attr=recurrent_weight,
                   lstm_bias_attr=recurrent_bias,
                   mixed_bias_attr=False)
lstm_a = lstmemory_group(input=proj_a, **lstm_kwargs)
lstm_b = lstmemory_group(input=proj_b, **lstm_kwargs)

classifier_weight = ParamAttr(name='softmax_param')

# Feed last_seq of each LSTM branch to the classifier, first branch first.
tail_a = last_seq(input=lstm_a)
tail_b = last_seq(input=lstm_b)
predict = fc_layer(input=[tail_a, tail_b],
                   size=10,
                   param_attr=[classifier_weight, classifier_weight],
                   bias_attr=False,
                   act=SoftmaxActivation())

# The label layer is created after the classifier, just before the cost.
label = data_layer(name='label', size=10)
cost = classification_cost(input=predict, label=label)
outputs(cost)
# Test config: a classification cost and a regression cost that both receive
# an extra `weight` input layer.
from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-4)

features = data_layer(name='input', size=300)
label = data_layer(name='label', size=1)
sample_weight = data_layer(name='weight', size=1)

hidden = fc_layer(input=features, size=10, act=SoftmaxActivation())

# Both costs read the same prediction, label and weight layers; the
# classification cost is built first, then the regression cost.
cost_inputs = dict(input=hidden, label=label, weight=sample_weight)
cls_cost = classification_cost(**cost_inputs)
reg_cost = regression_cost(**cost_inputs)

outputs(cls_cost, reg_cost)
# Test config: convolution -> maxout -> max pooling -> fully-connected layer.
from paddle.trainer_config_helpers import *

settings(learning_rate=1e-5, batch_size=1000)

image = data_layer(name='data', size=2304)

# Single-channel input, 16 linear 3x3 filters with padding 1 and a bias.
conv = img_conv_layer(input=image,
                      num_channels=1,
                      num_filters=16,
                      filter_size=3,
                      padding=1,
                      bias_attr=True,
                      act=LinearActivation())

# Maxout over the 16 convolution channels with groups=2.
maxout = maxout_layer(input=conv, num_channels=16, groups=2)

# The pooling layer is given num_channels=8 explicitly
# (presumably 16 channels / 2 maxout groups -- confirm against maxout_layer).
pool = img_pool_layer(input=maxout,
                      num_channels=8,
                      pool_type=MaxPooling(),
                      pool_size=2,
                      stride=2)

# Bias-free fully-connected output layer.
fc = fc_layer(input=pool, size=384, bias_attr=False)

outputs(fc)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册