提交 fd4eeaf5 编写于 作者: L liaogang

Merge conflict with maxout layer

......@@ -3,4 +3,6 @@ build/
*.user
.vscode
.idea
\ No newline at end of file
.idea
.project
.pydevproject
......@@ -2,9 +2,17 @@ language: cpp
cache: ccache
sudo: required
dist: trusty
os:
- linux
- osx
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
matrix:
exclude:
- os: osx
env: JOB=DOCS # Only generate documentation in linux
addons:
apt:
packages:
......@@ -27,9 +35,11 @@ addons:
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
- graphviz
before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark
- sudo paddle/scripts/travis/before_install.sh
script:
- paddle/scripts/travis/main.sh
notifications:
......
......@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b1)
set(PADDLE_PATCH_VERSION 0b2)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
......@@ -104,7 +104,7 @@ else()
endif(NOT WITH_GPU)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double)
else(WITH_DOUBLE)
set(ACCURACY float)
......
......@@ -17,10 +17,17 @@
## Find MKL First.
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
......
......@@ -64,7 +64,9 @@ set(COMMON_FLAGS
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs)
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
......
......@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME)
cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
endmacro()
# Creates a C source file that embeds the bytes of a resource file.
#
# Arguments:
#   res_file - path of the file whose contents are embedded.
#   output   - path of the C file to (over)write.
#
# The generated file defines `const unsigned char <name>[]` holding the bytes
# and `const unsigned <name>_size`, where <name> is the last path component of
# res_file with '.', ' ' and '-' replaced by '_'.
#
# Example:
#   create_resources(${CMAKE_CURRENT_SOURCE_DIR}/logo.png ${CMAKE_CURRENT_BINARY_DIR}/logo.c)
function(create_resources res_file output)
  # Create empty output file
  file(WRITE "${output}" "")
  # Get short filename (everything after the last '/')
  string(REGEX MATCH "([^/]+)$" filename "${res_file}")
  # Replace filename spaces & extension separator for C compatibility
  string(REGEX REPLACE "\\.| |-" "_" filename "${filename}")
  # Read hex data from file
  file(READ "${res_file}" filedata HEX)
  # Convert hex data for C compatibility. The input is quoted so that an empty
  # resource file still passes an (empty) input argument instead of making
  # string(REGEX REPLACE) fail with too few arguments.
  string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata "${filedata}")
  # Append data to output file
  file(APPEND "${output}" "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
endfunction()
data/raw_data
data/*.list
mnist_vgg_model
plot.png
train.log
*pyc
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Write train.list and test.list into the current directory. Each list file
# holds a single line: the path prefix of the corresponding MNIST split.
for list_name, data_prefix in (("train.list", "./data/raw_data/train"),
                               ("test.list", "./data/raw_data/t10k")):
    # `with` closes the handle even when the write raises.
    with open("./" + list_name, "w") as list_file:
        list_file.write(data_prefix + "\n")
\ No newline at end of file
#!/usr/bin/env sh
# This script downloads the MNIST data, unzips it into raw_data/ next to this
# script, and regenerates the *.list files.
set -e

# Absolute directory containing this script.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"

# Start from a clean raw_data directory.
rm -rf "$DIR/raw_data"
mkdir "$DIR/raw_data"

cd "$DIR/raw_data"

echo "Downloading..."

for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
do
    if [ ! -e "$fname" ]; then
        wget --no-check-certificate "http://yann.lecun.com/exdb/mnist/${fname}.gz"
        gunzip "${fname}.gz"
    fi
done

# Quoted so that a checkout path containing spaces does not break the cd.
cd "$DIR"
rm -f *.list
python generate_list.py
from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types={
    'pixel': dense_vector(28 * 28),
    'label': integer_value(10)
})
def process(settings, filename):  # settings is not used currently.
    """Yield MNIST samples read from a pair of idx files.

    filename is a path prefix; the images are read from
    ``filename + "-images-idx3-ubyte"`` and the labels from
    ``filename + "-labels-idx1-ubyte"``.

    Each yielded item is a dict with:
      'pixel': list of 28 * 28 floats, each byte divided by 255.0
      'label': the label byte as an int
    """
    imgf = filename + "-images-idx3-ubyte"
    labelf = filename + "-labels-idx1-ubyte"

    # Define number of samples for train/test
    if "train" in filename:
        n = 60000
    else:
        n = 10000

    # `with` closes both files even if a read fails or the consumer stops
    # iterating before the generator is exhausted.
    with open(imgf, "rb") as f, open(labelf, "rb") as l:
        # Skip the file headers (16 bytes for images, 8 bytes for labels).
        f.read(16)
        l.read(8)

        for i in range(n):
            label = ord(l.read(1))
            pixels = []
            for j in range(28 * 28):
                pixels.append(float(ord(f.read(1))) / 255.0)
            yield {"pixel": pixels, 'label': label}
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Without pipefail the pipeline below reports the exit status of `tee`, so a
# failing `paddle train` would not stop the script despite `set -e`.
set -o pipefail

config=vgg_16_mnist.py
output=./mnist_vgg_model
log=train.log

# Train on CPU with one trainer; stdout and stderr are mirrored into $log.
paddle train \
  --config="$config" \
  --dot_period=10 \
  --log_period=100 \
  --test_all_data_in_one_period=1 \
  --use_gpu=0 \
  --trainer_count=1 \
  --num_passes=100 \
  --save_dir="$output" \
  2>&1 | tee "$log"

# Render the training log into plot.png.
python -m paddle.utils.plotcurve -i "$log" > plot.png
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir='./data/'
define_py_data_sources2(train_list= data_dir + 'train.list',
test_list= data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(
batch_size = 128,
learning_rate = 0.1 / 128.0,
learning_method = MomentumOptimizer(0.9),
regularization = L2Regularization(0.0005 * 128)
)
#######################Network Configuration #############
data_size=1*28*28
label_size=10
img = data_layer(name='pixel', size=data_size)
# small_vgg is predefined in trainer_config_helpers.network
predict = small_vgg(input_image=img,
num_channels=1,
num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
inputs(img, lbl)
outputs(classification_cost(input=predict, label=lbl))
else:
outputs(predict)
......@@ -20,6 +20,8 @@
set -e
export LC_ALL=C
mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle
......
......@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
#cfg=trainer_config.lstm.py
#cfg=trainer_config.bidi-lstm.py
#cfg=trainer_config.db-lstm.py
paddle train \
--config=$cfg \
--save_dir=./output \
......
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
# edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0]
for i in range(1,8):
fc = fc_layer(input=input_layers, size=128)
lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1,)
input_layers = [fc, lstm]
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
else:
label = data_layer(name="label", size=2)
cls = classification_cost(input=output, label=label)
outputs(cls)
......@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(encoded_vector)
encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(backward_first)
decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
......@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(context)
decoder_inputs += full_matrix_projection(current_word)
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
......
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Absolute directory containing this script; downloads are stored there.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
# Quoted so that a checkout path containing spaces does not break the cd.
cd "$DIR"

wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import gzip
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
)
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)
OOV_POLICY_IGNORE = 0
OOV_POLICY_USE = 1
OOV_POLICY_ERROR = 2
num_original_columns = 3
# Feature combination patterns.
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
[[-2,0]],
[[-1,0]],
[[0,0]],
[[1,0]],
[[2,0]],
[[-1,0], [0,0]],
[[0,0], [1,0]],
[[-2,1]],
[[-1,1]],
[[0,1]],
[[1,1]],
[[2,1]],
[[-2,1], [-1,1]],
[[-1,1], [0,1]],
[[0,1], [1,1]],
[[1,1], [2,1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
]
dict_label = {
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
}
def make_features(sequence):
    """Append the combined pattern features to every timestep, in place.

    sequence is a list of timesteps, each a list of column values. For each
    timestep and each entry of the module-level `patterns`, the referenced
    (relative position, column) values are joined with '/' and appended to
    that timestep's list. Positions before the start map to '#B<k>' and
    positions past the end map to '#E<k>'.

    An empty sequence is left untouched.
    """
    # Consecutive (or leading) blank lines in the input produce an empty
    # sequence; there is nothing to extend and sequence[0] would raise.
    if not sequence:
        return

    length = len(sequence)
    num_features = len(sequence[0])

    def get_features(pos):
        # Out-of-range positions yield a padding marker for every column.
        if pos < 0:
            return ['#B%s' % -pos] * num_features
        if pos >= length:
            return ['#E%s' % (pos - length + 1)] * num_features
        return sequence[pos]

    for i in xrange(length):
        for pattern in patterns:
            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
            sequence[i].append(fname)
def create_dictionaries(filename, cutoff, oov_policy):
    '''
    Build one feature -> id dictionary per column from a gzipped source file.

    Source file format:
    Each line is for one timestep. The features are separated by space.
    An empty line indicates end of a sequence.

    cutoff: a list of numbers. If count of a feature is smaller than this,
    it will be ignored.
    if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
    i-th column.

    return a list of dict for each column
    '''

    def add_to_dict(sequence, dicts):
        # Count the occurrences of every feature value, column by column.
        num_features = len(dicts)
        for features in sequence:
            l = len(features)
            # Report the offending timestep itself; the enclosing `line` is
            # always the empty separator line when this runs.
            assert l == num_features, \
                "Wrong number of features: " + ' '.join(features)
            for i in xrange(l):
                if features[i] in dicts[i]:
                    dicts[i][features[i]] += 1
                else:
                    dicts[i][features[i]] = 1

    num_features = len(cutoff)
    dicts = []
    for i in xrange(num_features):
        dicts.append(dict())

    f = gzip.open(filename, 'rb')
    try:
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                # A blank line terminates the current sequence.
                make_features(sequence)
                add_to_dict(sequence, dicts)
                sequence = []
                continue
            features = line.split(' ')
            sequence.append(features)
    finally:
        # Close the file even if parsing or the assertion above fails.
        f.close()

    # Replace the counts by consecutive ids and drop the rare features.
    for i in xrange(num_features):
        dct = dicts[i]
        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
        todo = []
        for k, v in dct.iteritems():
            if v < cutoff[i]:
                todo.append(k)
            else:
                dct[k] = n
                n += 1

        if oov_policy[i] == OOV_POLICY_USE:
            # placeholder so that len(dct) will be the number of features
            # including OOV
            dct['#OOV#'] = 0

        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
        for k in todo:
            del dct[k]

    return dicts
def initializer(settings, **xargs):
    """Init hook: build the dictionaries and declare the provider input types.

    Stores `dicts`, `oov_policy` and `input_types` on `settings`. The input
    types are one integer sequence per original column and, when `patterns`
    is non-empty, one sparse binary vector sequence whose dimension is the
    total size of all pattern dictionaries.
    """
    pattern_count = len(patterns)
    cutoff = [3, 1, 0] + [3] * pattern_count
    oov_policy = ([OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] +
                  [OOV_POLICY_IGNORE] * pattern_count)

    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
    # The label column uses the fixed label mapping, not the counted one.
    dicts[2] = dict_label
    settings.dicts = dicts
    settings.oov_policy = oov_policy

    input_types = []
    for column in xrange(num_original_columns):
        column_size = len(dicts[column])
        input_types.append(integer_sequence(column_size))
        logger.info("slot %s size=%s" % (column, column_size))

    if patterns:
        # All pattern columns share one sparse vector; sum their sizes.
        dim = sum(len(dicts[column])
                  for column in xrange(num_original_columns, len(dicts)))
        input_types.append(sparse_binary_vector_sequence(dim))
        logger.info("feature size=%s" % dim)

    settings.input_types = input_types
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
    '''
    Yield one sample per sequence of the gzipped file `filename`.

    A sample is a list holding one id list per original column and, when
    `patterns` is non-empty, a final list with one sparse index vector per
    timestep.

    if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
    existed in dicts[i] will be assigned to id 0.
    if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
    in dicts[i].
    '''
    input_file = filename
    dicts = settings.dicts
    oov_policy = settings.oov_policy

    def gen_sample(sequence):
        num_features = len(dicts)
        sample = [list() for i in xrange(num_original_columns)]
        if patterns:
            sample.append([])
        for features in sequence:
            # Report the offending timestep itself; the enclosing `line` is
            # always the empty separator line when this runs.
            assert len(features) == num_features, \
                "Wrong number of features: " + ' '.join(features)
            for i in xrange(num_original_columns):
                id = dicts[i].get(features[i], -1)
                if id != -1:
                    sample[i].append(id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    sample[i].append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    # NOTE(review): this only logs; nothing is appended for
                    # this timestep. Confirm whether it should raise instead.
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    sample[i].append(0)

            if patterns:
                # Offset of the current pattern column inside the shared
                # sparse vector.
                dim = 0
                vec = []
                for i in xrange(num_original_columns, num_features):
                    id = dicts[i].get(features[i], -1)
                    if id != -1:
                        vec.append(dim + id)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        # vec is a plain list (it has no `.ids` attribute);
                        # id 0 of this column is the OOV slot.
                        vec.append(dim + 0)

                    dim += len(dicts[i])
                sample[-1].append(vec)
        return sample

    num_features = len(dicts)
    f = gzip.open(input_file, 'rb')
    try:
        num_sequences = 0
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                # A blank line terminates the current sequence.
                make_features(sequence)
                yield gen_sample(sequence)
                sequence = []
                num_sequences += 1
                continue
            features = line.split(' ')
            sequence.append(features)
    finally:
        # Also runs when the consumer stops iterating early.
        f.close()

    logger.info("num_sequences=%s" % num_sequences)
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 1
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-4),
average_window=0.5,
learning_rate=1e-1,
learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25,
)
num_label_types=23
def get_simd_size(size):
    """Return `size` rounded up to the nearest multiple of 8."""
    alignment = 8
    blocks = int(math.ceil(float(size) / alignment))
    return blocks * alignment
# Currently, in order to use sparse_update=True,
# the size has to be aligned.
num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types)
crf_input = fc_layer(
input=features,
size=num_label_types,
act=LinearActivation(),
bias_attr=False,
param_attr=ParamAttr(initial_std=0, sparse_update=True))
crf=crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
)
crf_decoding=crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
)
sum_evaluator(
name="error",
input=crf_decoding,
)
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
inputs(word, pos, chunk, features)
outputs(crf)
# Sequence Tagging
This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a>.
## Download data
```bash
cd demo/sequence_tagging
./data/get_data.sh
```
## Train model
```bash
cd demo/sequence_tagging
./train.sh
```
## Model description
We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
<center>
<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
<thead>
<th scope="col" class="left">Model name</th>
<th scope="col" class="left">Number of parameters</th>
<th scope="col" class="left">F1 score</th>
</thead>
<tbody>
<tr>
<td class="left">linear_crf</td>
<td class="left"> 1.8M </td>
<td class="left"> 0.937</td>
</tr>
<tr>
<td class="left">rnn_crf</td>
<td class="left"> 960K </td>
<td class="left">0.941</td>
</tr>
</tbody>
</table>
</center>
<br>
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 16
settings(
learning_method=MomentumOptimizer(),
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-5),
average_window=0.5,
learning_rate = 2e-3,
learning_rate_decay_a = 5e-7,
learning_rate_decay_b = 0.5,
)
word_dim=128
hidden_dim = 128
with_rnn = True
initial_std=1/math.sqrt(hidden_dim)
param_attr=ParamAttr(initial_std=initial_std)
cpu_layer_attr=ExtraLayerAttribute(device=-1)
default_device(0)
num_label_types=23
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types,
layer_attr=cpu_layer_attr)
emb = embedding_layer(
input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
hidden1 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(emb),
table_projection(pos, param_attr=param_attr)]
)
if with_rnn:
rnn1 = recurrent_layer(
act=ReluActivation(),
bias_attr=True,
input=hidden1,
param_attr=ParamAttr(initial_std=0),
)
hidden2 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(hidden1)
] + ([
full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
if with_rnn:
rnn2=recurrent_layer(
reverse=True,
act=ReluActivation(),
bias_attr=True,
input=hidden2,
param_attr=ParamAttr(initial_std=0),
)
crf_input = mixed_layer(
size=num_label_types,
bias_attr=False,
input=[
full_matrix_projection(hidden2),
] + ([
full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
layer_attr=cpu_layer_attr,
)
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
layer_attr=cpu_layer_attr,
)
sum_evaluator(
name="error",
input=crf_decoding,
)
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
inputs(word, pos, chunk, features)
outputs(crf)
#!/bin/bash
paddle train \
--config rnn_crf.py \
--parallel_nn=1 \
--use_gpu=1 \
--dot_period=10 \
--log_period=1000 \
--test_period=0 \
--num_passes=10
#!/bin/bash
paddle train \
--config linear_crf.py \
--use_gpu=0 \
--dot_period=100 \
--log_period=10000 \
--test_period=0 \
--num_passes=10
......@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD
git push -f origin HEAD
```
Now your Pull Request is updated with the latest version.
## Revise your pull request
When you revise your pull request according to the reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull request and the old pull request.
......@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme
.. code-block:: bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}"
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
......
......@@ -134,7 +134,7 @@ def process(settings, file_name):
You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
- The path of the training and testing data (`data/train.list`, `data/test.list`).
- The location of the data provider file (`dataprovider_pow`).
- The location of the data provider file (`dataprovider_bow`).
- The function to call to get data. (`process`).
- Additional arguments or data. Here it passes the path of word dictionary.
......
......@@ -73,6 +73,12 @@ img_pool_layer
:members: img_pool_layer
:noindex:
maxout_layer
------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: maxout_layer
:noindex:
Norm Layer
==========
......@@ -130,6 +136,12 @@ gru_step_layer
Recurrent Layer Group
=====================
memory
------
.. automodule:: paddle.trainer_config_helpers.layers
:members: memory
:noindex:
recurrent_group
---------------
.. automodule:: paddle.trainer_config_helpers.layers
......@@ -377,6 +389,12 @@ ctc_layer
:members: ctc_layer
:noindex:
nce_layer
-----------
.. automodule:: paddle.trainer_config_helpers.layers
:members: nce_layer
:noindex:
hsigmoid
---------
.. automodule:: paddle.trainer_config_helpers.layers
......
# 支持双层序列作为输入的Layer
## 概述
在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。
双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。
我们可以按照如下层次定义非序列,单层序列,以及双层序列。
+ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型
+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息
+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列
在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。
## pooling_layer
pooling_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>
```python
seq_pool = pooling_layer(input=layer,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列
- 输入:一个双层序列,或一个单层序列
- 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值)
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值)
## last_seq 和 first_seq
last_seq的使用示例如下(first_seq类似),详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>
```python
last = last_seq(input=layer,
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列
- 输入:一个双层序列或一个单层序列
- 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。
## expand_layer
expand_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>
```python
expand = expand_layer(input=layer1,
expand_as=layer2,
expand_level=ExpandLevel.FROM_TIMESTEP)
```
- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值):
- 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
- 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
- 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
- `expand_level=ExpandLevel.FROM_SEQUENCE`时:
- 作用:一个单层序列经过运算扩展成一个双层序列
- 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息
- 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。
\ No newline at end of file
此差异已折叠。
# Recurrent Group教程
## 概述
序列数据是自然语言处理任务面对的一种主要输入数据类型。
一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。
双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。
在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。
更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。
目前,在PaddlePaddle中,能够对双层序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
## 相关概念
### 基本原理
`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。
PaddlePaddle中,`recurrent_group`的一个简单调用如下:
``` python
recurrent_group(step, input, reverse)
```
- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算
- input:输入,必须是一个单层序列,或者一个双层序列
- reverse:是否以逆序处理输入序列
使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。
### 输入
`recurrent_group`处理的输入序列主要分为以下三种类型:
- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。
- **只读Memory输入**`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。
- **序列生成任务的输入**`GeneratedInput`只用于在序列生成任务中指定输入数据。
### 输入示例
序列生成任务大多遵循encoder-decoder架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。
给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入:
- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。
- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machines](https://arxiv.org/abs/1410.5401)。
在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。
### 输出
`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。
### memory
memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到这个layer上一时刻的输出,因此,可以将memory理解为一个时延操作。
可以显式地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。
## 双层RNN介绍
`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。
利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。
- 单层(word-level)RNN:每个状态(state)对应一个词(word)。
- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。
为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。
## 双层RNN的使用
### 训练流程的使用方法
使用 `recurrent_group`需要遵循以下约定:
- **单进单出**:输入和输出都是单层序列。
- 如果有多个输入,不同输入序列含有的词语数必须严格相等。
- 输出一个单层序列,输出序列的词语数和输入序列一致。
- memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。
- boot_layer:memory的初始状态,默认初始状态为0,memory的is_seq参数必须为false。
- **双进双出**:输入和输出都是双层序列。
- 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。
- 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。
- memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。
- boot_layer:memory 初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。
- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。
### 生成流程的使用方法
使用`beam_search`需要遵循以下约定:
- 单层RNN:从一个word生成下一个word。
- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。
\ No newline at end of file
......@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本
+-----------------+------------------+------------------------+-----------------------+
| GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest |
| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest |
| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
+-----------------+------------------+------------------------+-----------------------+
其中,横向包括三个版本,normal,devel和demo。
......
......@@ -47,6 +47,7 @@ extensions = [
'sphinx.ext.autosummary',
'sphinx.ext.mathjax',
'sphinx.ext.napoleon',
'sphinx.ext.graphviz'
]
table_styling_embed_css = True
......
####################
PaddlePaddle常见问题
####################
.. contents::
1. 如何减少PaddlePaddle的内存占用
---------------------------------
神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
PaddlePaddle的内存占用主要分为如下几个方面\:
* DataProvider缓冲池内存 (只针对内存)
* 神经元激活内存 (针对内存和显存)
* 参数内存 (针对内存和显存)
* 其他内存杂项
这其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,
这些内存就不考虑如何缩减了。
其他的内存的减少方法依次为
减少DataProvider缓冲池内存
++++++++++++++++++++++++++
PyDataProvider使用的是异步加载,同时在内存里直接随机选取数据来做Shuffle。即
.. graphviz::
digraph {
rankdir=LR;
数据文件 -> 内存池 -> PaddlePaddle训练
}
所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
.. literalinclude:: reduce_min_pool_size.py
这样做可以极大的减少内存占用,并且可能会加速训练过程。 详细文档参考 `这里
<../ui/data_provider/pydataprovider2.html#provider>`_ 。
神经元激活内存
++++++++++++++
神经网络在训练的时候,会对每一个激活暂存一些数据,包括激活,残差等等。
在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
的时间步信息成正比。
所以,做法可以有两种。他们是
* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
但是突然有一个10000长的序列,就很容易导致内存超限。特别是在LSTM等RNN中。
参数内存
++++++++
PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
例如如果使用 :code:`adadelta` 算法,则需要使用参数规模大约5倍的内存。 如果参数保存下来的
文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
可以考虑使用一些优化算法,例如 :code:`momentum`。
2. 如何加速PaddlePaddle的训练速度
---------------------------------
PaddlePaddle是神经网络训练平台,加速PaddlePaddle训练有如下几个方面\:
* 减少数据载入的耗时
* 加速训练速度
* 利用更多的计算资源
减少数据载入的耗时
++++++++++++++++++
使用 :code:`pydataprovider` 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
:code:`DataProvider` 缓存池的减小,和之前通过减小缓存池来减小内存占用的原理一致。
.. literalinclude:: reduce_min_pool_size.py
同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
加速训练速度
++++++++++++
PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`
这里使用简单的 :code:`word2vec` 训练语言模型举例,具体使用方法为\:
使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
.. literalinclude:: word2vec_dataprovider.py
这个任务的配置为\:
.. literalinclude:: word2vec_config.py
更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
利用更多的计算资源
++++++++++++++++++
利用更多的计算资源可以分为以下几个方式来进行\:
* 单机CPU训练
* 使用多线程训练。设置命令行参数 :code:`trainer_count`,即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
* 单机GPU训练
* 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
* 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练,使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
* 多机训练
* 使用多机训练的方法也比较简单,需要先在每个节点启动 :code:`paddle pserver`,再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
* 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
3. 遇到“非法指令”或者是“illegal instruction”
--------------------------------------------
paddle在进行计算的时候为了提升计算性能,使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。(另:用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉,请当成是不支持,看下面的解决方案)
解决办法是\:
* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
* 或者,使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
4. 如何选择SGD算法的学习率
--------------------------
在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
5. 如何初始化参数
-----------------
默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
.. code-block:: python
hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。
6. 如何共享参数
---------------
PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
简单的全连接网络,参数共享的配置示例为\:
.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
# Documentation example: a data provider declared with min_pool_size=0 that
# shuffles its input file on disk before reading it.
# NOTE(review): the `...` in the decorator stands for arguments omitted from
# this example, and get_sample_from_line is not defined here.
@provider(min_pool_size=0, ...)
def process(settings, filename):
# Write a shuffled copy of the input to "<filename>.shuf" with the external `shuf` command.
os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before.
# Yield one sample for every line of the shuffled copy.
with open('%s.shuf' % filename, 'r') as f:
for line in f:
yield get_sample_from_line(line)
\ No newline at end of file
... # the settings and define data provider is omitted.
DICT_DIM=3000 # dictionary dimension.
word_ids=data_layer('word_ids', size=DICT_DIM)
emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM)))
\ No newline at end of file
DICT_DIM=3000
# Documentation example: word2vec-style data provider. Each sample has two
# slots, matching input_types: a sequence of word ids and a single word id.
# NOTE(review): read_next_from_file is not defined in this example.
@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
def process(settings, filename):
with open(filename) as f:
# Yield the surrounding word ids together with the inner word id to predict,
# such as [28, 29, 10, 4], 4
# which means the sentence is 28, 29, 4, 10, 4.
yield read_next_from_file(f)
\ No newline at end of file
......@@ -3,6 +3,7 @@ PaddlePaddle文档
使用指南
--------
* `快速入门 <demo/quick_start/index.html>`_
* `编译与安装 <build_and_install/index.html>`_
* `用户接口 <ui/index.html>`_
......@@ -16,4 +17,13 @@ PaddlePaddle文档
算法教程
--------
* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_
* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
* `双层RNN示例 <algorithm/rnn/hierarchical-rnn.html>`_
* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
常见问题
--------
* `常见问题 <faq/index.html>`_
......@@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types=[
dense_vector(28 * 28),
integer_value(10)
])
@provider(input_types={
'pixel': dense_vector(28 * 28),
'label': integer_value(10)
})
def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file
......@@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently.
pixels_float.append(float(each_pixel_str))
# give data to paddle.
yield { "pixel": pixels_float, 'label': int(label) }
yield {"pixel": pixels_float, 'label': int(label)}
f.close() # close file
......@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size
* cache 是数据缓存的策略,参考 `cache`_
* init_hook 是初始化时调用的函数,参考 `init_hook`_
* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以
返回一个list或者tuple。如果是false的话,只能够返回list或者tuple
* check 设置成true的话,会根据input_types检查数据的合法性。
* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔掉这条数据,继续训练。 如果
check是false的话,没有作用。
......
......@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
esac
done
shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
......
......@@ -2,10 +2,17 @@ set(AVX_SOURCES
src/hl_math.cc
src/hl_avx_functions.cc
)
set(CUDA_SOURCES
src/hl_time.cc
src/hl_cpu_functions.cc
${AVX_SOURCES})
if(WITH_AVX)
set(CUDA_SOURCES
src/hl_time.cc
src/hl_cpu_functions.cc
${AVX_SOURCES})
else()
set(CUDA_SOURCES
src/hl_time.cc
src/hl_cpu_functions.cc)
endif()
set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cuda_cublas.cc
......
......@@ -185,7 +185,7 @@ typedef struct {
size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
/**
* HPPL data type: real (float or double)
*
......
......@@ -169,7 +169,7 @@ extern void hl_avgpool_forward(
* @brief Maximum pool backward.
*
* @param[in] frameCnt batch size of input image.
* @param[in] outGrad input data.
* @param[in] outGrad output grad data.
* @param[in] channels number of channel.
* @param[in] height image height.
* @param[in] width image width.
......@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad,
const size_t outputW,
const size_t numChannels);
/**
* @brief MaxOut forward.
*
* @param[in] inData input data.
* @param[out] outData output data.
* @param[out] idData output maxId.
* @param[in] batchSize batchSize.
* @param[in] size number of channels * image height * image width.
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
extern void hl_maxout_forward(
const real* inData, real* outData, int* idData,
size_t batchSize, size_t size, size_t featLen, size_t groups);
/**
* @brief MaxOut backward.
*
* @param[out] inGrad input grad data.
* @param[in] outGrad output grad data.
* @param[in] idData output maxId.
* @param[in] batchSize batchSize.
* @param[in] size number of channels * image height * image width.
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
extern void hl_maxout_backward(
real* inGrad, const real* outGrad, const int* idData,
size_t batchSize, size_t size, size_t featLen, size_t groups);
#endif /* HL_CNN_H_ */
......@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/math/MathFunctions.h"
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
......
......@@ -28,7 +28,7 @@ namespace hppl {
const real min = SIGMOID_THRESHOLD_MIN;
const real max = SIGMOID_THRESHOLD_MAX;
real tmp = (a < min) ? min : ((a > max) ? max : a);
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
return __fdividef(1.0f, 1.0f + __expf(-tmp));
#else
return 1.0 / (1.0 + exp(-tmp));
......@@ -36,7 +36,7 @@ namespace hppl {
}
__device__ static real tanh(const real a) {
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
#else
return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
......
......@@ -30,7 +30,7 @@ limitations under the License. */
#define INLINE inline
#endif
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
#define DEVICE_FMAX fmaxf
#define DEVICE_FMIN fminf
#else
......
......@@ -21,7 +21,7 @@ limitations under the License. */
#ifdef __CUDA_ARCH__
// typedef void* vecType;
#include <vector_types.h>
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
#else
typedef double2 vecType;
......@@ -30,7 +30,7 @@ typedef double2 vecType;
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
typedef __m128 vecType;
#else
typedef __m128d vecType;
......
......@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
*/
extern void hl_sequence2batch_copy(real *batch,
real *sequence,
int *batchIndex,
const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#define VECTOR_SIZE 16
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
/* number of float in vector */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
......@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) {
return hl_check_align(reinterpret_cast<size_t>(ptr));
}
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
template <class Agg>
inline real hl_agg_op(Agg agg, vecType mm) {
__m128 lo = _mm_unpacklo_ps(mm, mm);
......
......@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad,
const size_t outputW,
const size_t numChannels) {}
// No-op stand-in for hl_maxout_forward: the body is empty, so a call reads
// and writes nothing.
// NOTE(review): presumably the variant used when the GPU implementation is
// not compiled in — confirm against the header that includes this stub.
inline void hl_maxout_forward(
const real* inData, real* outData, int* idData,
size_t batchSize, size_t size, size_t featLen, size_t group) {}
// No-op stand-in for hl_maxout_backward: the body is empty, so a call reads
// and writes nothing.
// NOTE(review): presumably the variant used when the GPU implementation is
// not compiled in — confirm against the header that includes this stub.
inline void hl_maxout_backward(
real* inGrad, const real* outGrad, const int* idData,
size_t batchSize, size_t size, size_t featLen, size_t group) {}
#endif // HL_CNN_STUB_H_
......@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
inline void hl_sequence2batch_copy(real *batch,
real *sequence,
int *batchIndex,
const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
......
......@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad,
threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad,
outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW);
CHECK_SYNC("hl_bilinear_backward failed");
}
\ No newline at end of file
}
/**
 * MaxOut forward kernel: each thread produces one output element.
 *
 * For the output element at (batch, channel, feature) the candidates are
 * `groups` input values spaced `featLen` apart. The largest candidate is
 * stored in outData and the index of its group in idData; when several
 * candidates are equal, the lowest group index is kept.
 *
 * @param nthreads  total number of output elements to compute.
 * @param inData    input values.
 * @param outData   receives the per-element maximum.
 * @param idData    receives the group index of the maximum.
 * @param size      number of output elements per batch item.
 * @param featLen   number of elements per channel.
 * @param groups    number of candidates per output element.
 */
__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
                                real * outData, int* idData,
                                size_t size, size_t featLen, size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads beyond the last output element have nothing to do.
  if (index >= nthreads) {
    return;
  }

  const size_t sample = index / size;
  const size_t offsetInSample = index % size;
  const size_t channel = offsetInSample / featLen;
  const size_t feature = offsetInSample % featLen;

  // Points at the group-0 candidate for this output element.
  const real* candidates =
      inData + (sample * size + channel * featLen) * groups + feature;

  real best = candidates[0];
  int bestGroup = 0;
  for (size_t g = 1; g < groups; ++g) {
    const real value = candidates[g * featLen];
    if (value > best) {
      best = value;
      bestGroup = g;
    }
  }

  outData[index] = best;
  idData[index] = bestGroup;
}
/**
 * Launches maxoutFpCompute with one thread per output element
 * (size * batchSize elements, 1024 threads per block) on STREAM_DEFAULT,
 * then runs CHECK_SYNC with a failure message.
 */
void hl_maxout_forward(const real* inData, real* outData,
                       int* idData, size_t batchSize, size_t size,
                       size_t featLen, size_t groups) {
  const int kThreadsPerBlock = 1024;
  int totalOutputs = size * batchSize;
  // Round up so that a partially filled last block is still launched.
  int gridSize = (totalOutputs + kThreadsPerBlock - 1) / kThreadsPerBlock;
  maxoutFpCompute<<< gridSize, kThreadsPerBlock, 0, STREAM_DEFAULT >>>(
    totalOutputs, inData, outData, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_forward failed");
}
/**
 * MaxOut backward kernel: each thread handles one output-gradient element.
 *
 * The gradient of the output element at (batch, channel, feature) is added
 * to the single input-gradient slot whose group index is stored in idData
 * for that element.
 *
 * @param nthreads  total number of output-gradient elements to process.
 * @param inGrad    input gradient, accumulated into.
 * @param outGrad   output gradient.
 * @param idData    group index recorded for each output element.
 * @param size      number of output elements per batch item.
 * @param featLen   number of elements per channel.
 * @param groups    number of candidates per output element.
 */
__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
                                const real* outGrad, const int* idData,
                                size_t size, size_t featLen, size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads beyond the last element have nothing to do.
  if (index >= nthreads) {
    return;
  }

  const size_t sample = index / size;
  const size_t offsetInSample = index % size;
  const size_t channel = offsetInSample / featLen;
  const size_t feature = offsetInSample % featLen;

  // Offset of this batch item within outGrad / idData.
  const size_t sampleStart = sample * size;
  const int winner = idData[sampleStart + offsetInSample];
  // Offset of the selected input element inside this batch item's input gradient.
  const size_t gradOffset = (channel * groups + winner) * featLen + feature;

  inGrad[sampleStart * groups + gradOffset] +=
      outGrad[sampleStart + offsetInSample];
}
/**
 * Launches maxoutBpCompute with one thread per output-gradient element
 * (size * batchSize elements, 1024 threads per block) on STREAM_DEFAULT,
 * then runs CHECK_SYNC with a failure message.
 */
void hl_maxout_backward(real* inGrad, const real* outGrad,
                        const int* idData, size_t batchSize, size_t size,
                        size_t featLen, size_t groups) {
  const int kThreadsPerBlock = 1024;
  int totalOutputs = size * batchSize;
  // Round up so that a partially filled last block is still launched.
  int gridSize = (totalOutputs + kThreadsPerBlock - 1) / kThreadsPerBlock;
  maxoutBpCompute<<< gridSize, kThreadsPerBlock, 0, STREAM_DEFAULT >>>(
    totalOutputs, inGrad, outGrad, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_backward failed");
}
......@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
#define CUBLAS_GEAM dynload::cublasSgeam
#define CUBLAS_GEMV dynload::cublasSgemv
#define CUBLAS_GEMM dynload::cublasSgemm
......
......@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc);
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
CHECK_NOTNULL(hl_desc);
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input,
int height,
int width)
{
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value,
int height,
int width)
{
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
......
......@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) {
void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
......
......@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d,
CHECK_SYNC("hl_matrix_add failed");
}
#ifdef HPPL_TYPE_DOUBLE
#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
......@@ -102,7 +102,7 @@ void subMaxAndExp(real* I,
val = -THRESHOLD;
}
I[nextIdx] = val;
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
O[nextIdx] = __expf(val);
#else
O[nextIdx] = exp(val);
......
......@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
__global__
void KeSequence2Batch(real *batch,
real *sequence,
int *batchIndex,
const int *batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x;
......@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
void hl_sequence2batch_copy(real *batch,
real *sequence,
int *batchIndex,
const int *batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......
......@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
}
/* best perf */
#ifndef HPPL_TYPE_DOUBLE
#ifndef PADDLE_TYPE_DOUBLE
#define CU_CSCMM_THREAD_M_BEST 9
#else
#define CU_CSCMM_THREAD_M_BEST 4
......
......@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu,
DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
dataPool_ = dataPool;
......@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
}
void DoubleBuffer::insertOneBatch(DataBatch* batch) {
while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out
if (stopping_) return;
}
BufferBatch* bufBatch = bufferQueue_->dequeue();
// clone and copy the data from an Threadlocal Variable
bufBatch->clone(batch, useGpu_);
......@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() {
actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
}
insertOneBatch(&newBatch);
} while (actualSize > 0);
} while (actualSize > 0 && !stopping_);
}
}
......
......@@ -259,7 +259,9 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
class DoubleBuffer {
public:
DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
......@@ -308,7 +310,8 @@ public:
/**
* @brief create only used for unittest.
*/
inline static DataProvider* create(const DataConfig &config, bool useGpu) {
inline static DataProvider* create(const DataConfig &config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
......@@ -348,7 +351,6 @@ public:
*/
virtual void reset() {
if (doubleBuffer_ != nullptr) {
LOG(INFO) << "the double-buffer is starting ...";
doubleBuffer_->startAsyncLoad();
}
}
......
......@@ -14,13 +14,20 @@ limitations under the License. */
#ifndef PADDLE_NO_PYTHON
#include <Python.h>
#include <stdio.h>
#include <stdlib.h>
#include <unordered_set>
#include <list>
#include <numpy/numpyconfig.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/ndarrayobject.h>
#include "DataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Locks.h"
#include "paddle/utils/Stat.h"
namespace paddle {
......@@ -202,7 +209,10 @@ public:
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
:DataProvider(config, useGpu), callingContextCreated_(2) {
:DataProvider(config, useGpu),
callingContextCreated_(2) {
if (PyArray_API == NULL)
import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
......@@ -246,8 +256,7 @@ private:
PyObjectPtr && kwargs) {
LOG(INFO) << "loading dataprovider " << model <<"::" << className;
PyObjectPtr module(PyImport_ImportModule(model.c_str()));
CHECK_PY(module) << "Cannot imort module " << model.c_str();
PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
......@@ -455,6 +464,7 @@ private:
std::condition_variable pushCV_;
std::condition_variable pullCV_;
std::mutex mtx_;
ThreadBarrier callingContextCreated_;
std::unique_ptr<IPyDataProviderCache> cache_;
......@@ -497,8 +507,8 @@ public:
* Resetting the PyDataProvider. May start reading thread here.
*/
virtual void reset() {
DataProvider::reset();
resetImpl(true);
DataProvider::reset();
}
/**
......@@ -519,6 +529,7 @@ public:
* Loading a batch of data.
*/
int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
size_t size = (size_t) size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
......@@ -699,10 +710,22 @@ public:
*/
virtual void fill(Argument &argument, PyObject *obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
py::SequenceHelper s(obj);
// TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
for (size_t i=0; i < headerPtr_->dim; ++i) {
dat[i] = (real) s.getDouble(i);
if (PyArray_Check(obj)) {
auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
auto sz = PyArray_SIZE((PyArrayObject*)obj);
std::copy(data, data + sz, dat);
} else {
LOG(FATAL) << "You should yield float" << sizeof(real) * 8
<< " array";
}
} else {
py::SequenceHelper s(obj);
// TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
for (size_t i=0; i < headerPtr_->dim; ++i) {
dat[i] = (real) s.getDouble(i);
}
}
++height_;
}
......
......@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
public:
virtual void init(const EvaluatorConfig& config) {
CHECK(!FLAGS_use_gpu) << "Not supported";
Evaluator::init(config);
if (config.chunk_scheme() == "IOB") {
numTagTypes_ = 2;
......@@ -137,6 +136,7 @@ public:
CHECK_EQ(arguments.size(), (size_t)2);
IVectorPtr& output = arguments[0].ids;
IVectorPtr& label = arguments[1].ids;
CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
auto sequenceStartPositions =
arguments[1].sequenceStartPositions->getVector(false);
CHECK_EQ(output->getSize(), label->getSize());
......
......@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
para->getMat(PARAMETER_GRADIENT).get());
std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
ids.clear();
for (auto slaveParams : slaveParameters) {
SparseRowCpuMatrix* mat =
dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid]
......
......@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
const std::vector<Argument> inArgs;
std::vector<Argument> outArgs;
frames_[i]->forward(inArgs, &outArgs, passType);
if (hasSubseq) {
for (auto& outFrameLine : outFrameLines_) {
CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
<< "In hierachical RNN, all out links should be from sequences.";
}
}
}
if (evaluator_ && passType == PASS_TEST) {
this->eval(evaluator_.get());
......@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
std::vector<int> sequenceStartPositions;
const int* subSequenceStartPositions = nullptr;
if (hasSubseq) { // for sequenceScatterAgentLayer
subSequenceStartPositions =
input.subSequenceStartPositions->getData(false);
if (hasSubseq) { // for sequenceScatterAgentLayer
subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
inlinkInfo->seqStartPosIndex.clear();
inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0
}
// maxSequenceLength_: max topLevelLength in allsamples
for (int i = 0; i < maxSequenceLength_; ++i) {
if (hasSubseq) {
sequenceStartPositions.push_back(0); // first element = 0
sequenceStartPositions.push_back(0); // first element = 0
}
int numSeqs = 0;
for (size_t j = 0; j < numSequences; ++j) {
......@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
}
if (hasSubseq) {
// inFrameLine create sequenceStartPositions one time
CHECK_EQ(sequenceStartPositions.size(),
static_cast<size_t>(maxSequenceLength_ +
input.getNumSubSequences()));
CHECK_EQ(
sequenceStartPositions.size(),
static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
static_cast<size_t>(maxSequenceLength_ + 1));
createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
......@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
newPaths.end(), Path::greaterPath);
newPaths.resize(totalExpandCount + minNewPathSize);
real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize,
newPaths.end())->logProb;
real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize,
newPaths.end())->logProb;
real minPathLogProb =
std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
->logProb;
real maxPathLogProb =
std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
->logProb;
// Remove the already formed paths that are relatively short
finalPaths_[seqId].erase(
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "AgentLayer.h"
#include "paddle/utils/Logging.h"
......@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
// get Arguments from real layers
if (numSamples_ > 0 && numSamples_ < realNumSequences) {
int numRows = realOutput.sequenceStartPositions->
getData(false)[numSamples_];
int numRows =
realOutput.sequenceStartPositions->getData(false)[numSamples_];
CHECK(!realOutput.ids) << "Not supported";
output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
/* trans */ false, /* seqFlag */ true,
......@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
int width = this->getSize();
if (realOutArg_.value || realOutArg_.ids) {
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
width, useGpu_);
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
useGpu_);
} else { // used in generation
if (realLayer_->getOutput().ids) {
IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
......@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
if (realOutArg_.value || realOutArg_.ids) {
CHECK(realOutArg_.sequenceStartPositions);
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
width, useGpu_, /* trans */ false, /* seqFlag */ true,
output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
useGpu_, /* trans */ false, /* seqFlag */ true,
/* seqStart */ seqStartPosIndex_,
/* seqSize */ numSequences_);
} else {
......@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
CHECK_NE(input.sequenceStartPositions.get(),
output_.sequenceStartPositions.get());
ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
numSequences + 1, false);
numSequences + 1, false);
int* outStarts = output_.sequenceStartPositions->getMutableData(false);
IVector::resizeOrCreate(cpuInputStartPos_, height, false);
int* inStarts = cpuInputStartPos_->getData();
ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
int* inStarts = inputStartPos_->getMutableData(false);
size_t offsetOut = 0;
for (size_t i = 0; i < numSequences; ++i) {
outStarts[i] = offsetOut;
......@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
}
outStarts[numSequences] = offsetOut;
if (useGpu_) {
IVector::resizeOrCreate(inputStartPos_, height, true);
inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
} else {
inputStartPos_ = cpuInputStartPos_;
}
outputValue->copyByRowIndex(*input.value, *inputStartPos_);
outputValue->copyByRowIndex(*input.value,
*inputStartPos_->getVector(useGpu_));
}
}
......
......@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
protected:
// use to store expanded cpuStartPositions or subSequenceStartPositions
// of real layer.
IVectorPtr cpuInputStartPos_;
// point to cpuInputStartPos_ when useGpu_ is false
// copy from cpuInputStartPos_ when useGpu_ is true
IVectorPtr inputStartPos_;
ICpuGpuVectorPtr inputStartPos_;
public:
explicit SequenceScatterAgentLayer(const LayerConfig& config)
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "AverageLayer.h"
#include "paddle/utils/Logging.h"
......@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
bool AverageLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
SequencePoolLayer::init(layerMap, parameterMap);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
// average strategy
......@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
} else {
LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
}
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true;
}
void AverageLayer::forward(PassType passType) {
Layer::forward(passType);
// average layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
size_t dim = getSize();
const Argument& input = getInput(0);
int64_t newBatchSize =
type_ ? input.getNumSubSequences() : input.getNumSequences();
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
const int* starts = startPositions->getData(false);
size_t numSequences = startPositions->getSize() - 1;
// check
CHECK_EQ(numSequences, (size_t)newBatchSize);
CHECK_EQ(starts[numSequences], input.getBatchSize());
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
SequencePoolLayer::forward(passType);
CHECK_EQ(dim, input.value->getWidth());
resetOutput(newBatchSize, dim);
auto startsPos = startPositions->getVector(useGpu_);
MatrixPtr inputValue = getInputValue(0);
getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new sequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
}
getOutputValue()->sequenceAvgForward(
*inputValue, *startPositions_->getVector(useGpu_), mode_);
/* add the bias-vector AFTER average operation */
if (biases_.get() != NULL) {
......@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
}
void AverageLayer::backward(const UpdateCallback& callback) {
const Argument& input = getInput(0);
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
const int* starts = startPositions->getData(false);
/* Do derivation */ { backwardActivation(); }
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
SequencePoolLayer::backward(callback);
const int* starts = startPositions_->getData(false);
MatrixPtr grad = getInputGrad(0);
if (grad) {
size_t dim = getSize();
real* gradientData = getInputGrad(0)->getData();
real* gradient = getOutputGrad()->getData();
size_t numSequences = startPositions->getSize() - 1;
size_t numSequences = startPositions_->getSize() - 1;
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
// TODO(Dangqingqing) optimization for GPU
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
......
......@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
......@@ -23,20 +22,21 @@ namespace paddle {
/**
* A layer for "internal average" for sequence input.
* Input: one or more sequences. Each sequence contains some instances.
* If AverageLevel = kNonSeq:
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = average_{for each instance in this sequence}{input[i]}
* If AverageLevel = kSeq:
* If SequenceLevel = kSeq:
* The input sequence must have sub-sequences
* Output: output size is the number of input sub-sequences
* output[i] = average_{for each instance in this sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/
class AverageLayer : public Layer {
class AverageLayer : public SequencePoolLayer {
public:
enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
enum AverageLevel { kNonSeq = 0, kSeq = 1 };
explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
explicit AverageLayer(const LayerConfig& config)
: SequencePoolLayer(config) {}
~AverageLayer() {}
......@@ -46,11 +46,8 @@ public:
void backward(const UpdateCallback& callback = nullptr);
protected:
std::unique_ptr<Weight> biases_;
MatrixPtr outMtx_;
MatrixPtr dataMtx_;
int mode_;
int type_;
};
} // namespace paddle
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ExpandLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
......@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
const Argument& shapeInput = getInput(1);
const Argument& dataInput = getInput(0);
size_t outputBatchSize = shapeInput.getBatchSize();
auto startPositions =
type_ ? shapeInput.subSequenceStartPositions
: shapeInput.sequenceStartPositions;
auto startPositions = type_ ? shapeInput.subSequenceStartPositions
: shapeInput.sequenceStartPositions;
size_t numSequences = startPositions->getSize() - 1;
const int* starts = startPositions->getData(false);
......@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
// set output sequence info as shape sequence
output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
if (shapeInput.hasSubseq()) {
output_.subSequenceStartPositions =
shapeInput.subSequenceStartPositions;
output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
}
// reserve output: Expand output to batchsize of sequence data.
......@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
int* expandStarts = cpuExpandStartsPos_->getData();
ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
int* expandStarts = expandStartsPos_->getMutableData(false);
for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
for (int j = 0; j < sequenceLength; j++) {
......@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
}
}
if (useGpu_) {
// TODO(Dangqingqing) move copyFrom
IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
} else {
expandStartsPos_ = cpuExpandStartsPos_;
}
outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
outputValue->copyByRowIndex(*inputValue,
*expandStartsPos_->getVector(useGpu_));
if (biases_.get() != NULL) {
outputValue->addBias(*(biases_->getW()), 1);
......@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
void ExpandLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
/* Increasing the number of gradient */
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
}
if (!getInputGrad(0)) return;
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
auto cpuSeqStartPos =
type_ ? getInput(1).subSequenceStartPositions
: getInput(1).sequenceStartPositions;
auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
: getInput(1).sequenceStartPositions;
size_t numSequences = cpuSeqStartPos->getSize() - 1;
const int* starts = cpuSeqStartPos->getData(false);
......
......@@ -44,14 +44,9 @@ protected:
enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
/// store the ExpandLevel
int type_;
// TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
// and expandStartsPos_
/// expanded sequenceStartPositions or subSequenceStartPositions
/// of input[1]
IVectorPtr cpuExpandStartsPos_;
/// point to cpuExpandStartsPos_ when useGpu_ is false,
/// copy from cpuExpandStartsPos_ when useGpu_ is true
IVectorPtr expandStartsPos_;
ICpuGpuVectorPtr expandStartsPos_;
public:
explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
......@@ -21,55 +20,11 @@ namespace paddle {
REGISTER_LAYER(max, MaxLayer);
bool MaxLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true;
}
void MaxLayer::forward(PassType passType) {
Layer::forward(passType);
// max layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
size_t dim = getSize();
const Argument& input = getInput(0);
int64_t newBatchSize =
type_ ? input.getNumSubSequences() : input.getNumSequences();
ICpuGpuVectorPtr startPositions =
type_ ? input.subSequenceStartPositions
: input.sequenceStartPositions;
auto starts = startPositions->getVector(useGpu_);
size_t numSequences = startPositions->getSize() - 1;
SequencePoolLayer::forward(passType);
CHECK_EQ(dim, input.value->getWidth());
CHECK_EQ(numSequences, (size_t)newBatchSize);
CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
// reset output: resize to "num of sequences", not "batch size".
resetOutput(newBatchSize, dim);
IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
useGpu(deviceId_));
maxIndex_->zeroMem();
MatrixPtr inputValue = getInputValue(0);
......@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
{
REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
}
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no cpuSequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new cpuSequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
outputValue->maxSequenceForward(
*inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
}
if (config_.output_max_index()) {
......@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
void MaxLayer::backward(const UpdateCallback& callback) {
CHECK(!config_.output_max_index())
<< "backward is not available when output_max_index is set";
/* Do derivation */ { backwardActivation(); }
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
SequencePoolLayer::backward(callback);
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
if (inputGrad) {
ICpuGpuVectorPtr starts =
type_ ? getInput(0).subSequenceStartPositions
: getInput(0).sequenceStartPositions;
REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
inputGrad->maxSequenceBackward(*outputGrad,
*(starts->getVector(useGpu_)), *maxIndex_);
inputGrad->maxSequenceBackward(
*outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
}
}
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include "Layer.h"
#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/ThreadLocal.h"
......@@ -24,29 +24,30 @@ namespace paddle {
/**
* A layer for "internal max" for sequence input.
* Input: one or more sequences. Each sequence contains some instances.
* If MaxLevel = kNonSeq:
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = max_{for each instance in this sequence}{input[i]}
* If MaxLevel = kSeq:
* If SequenceLevel = kSeq:
* The input sequence must have sub-sequences
* Output: output size is the number of input sub-sequences
* output[i] = max_{for each instance in this sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/
class MaxLayer : public Layer {
class MaxLayer : public SequencePoolLayer {
protected:
std::unique_ptr<Weight> biases_;
// maxIndex_[i][j] = k : the value at (i, j) is from input[k].
IVectorPtr maxIndex_;
int type_;
public:
explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
enum MaxLevel {kNonSeq = 0, kSeq = 1 };
explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
~MaxLayer() {}
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
return SequencePoolLayer::init(layerMap, parameterMap);
}
void forward(PassType passType);
void backward(const UpdateCallback& callback = nullptr);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxOutLayer.h"
#include "hl_gpu.h"
#include "hl_cnn.h"
namespace paddle {
REGISTER_LAYER(maxout, MaxOutLayer);
size_t MaxOutLayer::getSize() {
const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf();
imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imgSizeH_ == 0) {
imgSizeH_ = maxoutConf.img_size_y();
}
if (imgSizeW_ == 0) {
imgSizeW_ = maxoutConf.img_size_x();
}
featLen_ = imgSizeH_ * imgSizeW_;
size_t layerSize = featLen_ * outputChannels_;
getOutput().setFrameHeight(imgSizeH_);
getOutput().setFrameWidth(imgSizeW_);
return layerSize;
}
bool MaxOutLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
/* the size of inputs for maxout-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1);
const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
groups_ = conf.groups();
channels_ = conf.channels();
CHECK_EQ(channels_ % groups_, 0UL);
outputChannels_ = channels_ / groups_;
return true;
}
void MaxOutLayer::forward(PassType passType) {
  Layer::forward(passType);

  // Allocate (or reuse) the output: one row per sample of the input batch,
  // getSize() columns. getSize() also refreshes the cached frame geometry.
  const size_t numSamples = getInput(0).getBatchSize();
  const size_t layerSize = getSize();
  resetOutput(numSamples, layerSize);

  MatrixPtr inValue = getInputValue(0);
  MatrixPtr outValue = getOutputValue();

  // maxoutId_ holds one int per output element; it is handed to
  // maxoutForward here and to maxoutBackward in backward().
  IVector::resizeOrCreate(maxoutId_, layerSize * numSamples, useGpu_);
  outValue->maxoutForward(*inValue, *maxoutId_, outputChannels_, groups_);
}
void MaxOutLayer::backward(const UpdateCallback& callback) {
(void)callback;
/* Do derivation */
MatrixPtr inputG = getInputGrad(0);
MatrixPtr outG = getOutputGrad();
if (inputG) {
inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_);
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
 * A layer to do max out on conv layer output.
 * Input: output of a conv layer.
 * Output: feature map size same as input. Channel is (input channel) / groups.
 * So the number of channels must be divisible by groups.
 *
 * The config file api is maxout_layer.
 */
class MaxOutLayer : public Layer {
protected:
/// number of groups, read from the maxout configuration in init()
size_t groups_;
/// frame height / width; refreshed by every call to getSize()
size_t imgSizeH_, imgSizeW_;
/// outputChannels_ = channels_ / groups_
size_t channels_, outputChannels_;
/// feature length = imgSizeH_ * imgSizeW_
size_t featLen_;
/// index buffer sized to (layer size * batch size) in forward(); passed to
/// maxoutForward() and reused by maxoutBackward()
IVectorPtr maxoutId_;
public:
/// return imgSizeH_ * imgSizeW_ * outputChannels_;
/// NOTE: not a pure getter - it recomputes imgSizeH_, imgSizeW_ and featLen_
/// and sets the frame height/width of this layer's output.
size_t getSize();
explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
virtual ~MaxOutLayer() {}
/// Reads groups/channels from the config and derives outputChannels_.
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/// Resizes the output and maxoutId_, then runs maxoutForward().
void forward(PassType passType);
/// Runs maxoutBackward() when the input has a gradient; callback is unused.
void backward(const UpdateCallback& callback = nullptr);
};
} // namespace paddle
......@@ -21,14 +21,18 @@ limitations under the License. */
namespace paddle {
/**
* Noise-contrastive estimation
* Noise-contrastive estimation.
* Implements the method in the following paper:
* A fast and simple algorithm for training neural probabilistic language models
* A fast and simple algorithm for training neural probabilistic language models.
*
* The config file api is nce_layer.
*/
class NCELayer : public Layer {
int numClasses_;
int numInputs_; // number of input layer besides labelLayer and weightLayer
/// number of input layer besides labelLayer and weightLayer
int numInputs_;
LayerPtr labelLayer_;
/// weight layer, can be None
LayerPtr weightLayer_;
WeightList weights_;
std::unique_ptr<Weight> biases_;
......@@ -43,7 +47,8 @@ class NCELayer : public Layer {
real weight;
};
std::vector<Sample> samples_;
bool prepared_; // whether samples_ is prepared
/// whether samples_ is prepared
bool prepared_;
Argument sampleOut_;
IVectorPtr labelIds_;
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "Layer.h"
#include "SequencePoolLayer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Stat.h"
......@@ -29,20 +29,19 @@ namespace paddle {
* If SequenceLevel = kSeq:
* Check input sequence must has sub-sequence
* Output: a sequence containing only the last instance of each sub-sequence
* of the input sequence
* of the input sequence
*
* The config file api is last_seq and first_seq.
*/
class SequenceLastInstanceLayer : public Layer {
class SequenceLastInstanceLayer : public SequencePoolLayer {
protected:
std::unique_ptr<Weight> biases_;
MatrixPtr tmpSrc_;
MatrixPtr tmpDest_;
enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
int type_;
public:
explicit SequenceLastInstanceLayer(const LayerConfig& config)
: Layer(config) {}
: SequencePoolLayer(config) {}
~SequenceLastInstanceLayer() {}
......@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
// seqlastins layer should have exactly 1 input
CHECK_EQ(1U, inputLayers_.size());
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
SequencePoolLayer::init(layerMap, parameterMap);
tmpSrc_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
tmpDest_ =
Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
// transform to which sequence type
if (config_.trans_type() == "non-seq") {
type_ = kNonSeq;
} else if (config_.trans_type() == "seq") {
type_ = kSeq;
} else {
LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
}
setNeedSequenceInfo(false);
return true;
}
void SequenceLastInstanceLayer::forward(PassType passType) {
Layer::forward(passType);
size_t dim = getSize();
const Argument& input = getInput(0);
// check
auto startPositions =
type_ ? input.subSequenceStartPositions->getVector(false)
: input.sequenceStartPositions->getVector(false);
size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
CHECK_EQ(dim, input.value->getWidth());
CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
CHECK_EQ(height, startPositions->getSize() - 1);
if (type_) {
// when trans_type = seq, input must hasSubseq
CHECK_EQ(input.hasSubseq(), 1UL);
}
SequencePoolLayer::forward(passType);
reserveOutput(height, dim);
const int* starts = startPositions->getData();
const int* starts = startPositions_->getData(false);
MatrixPtr inputValue = getInputValue(0);
MatrixPtr outputValue = getOutputValue();
......@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
AsyncGpuBlock asyncGpuBlock;
REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
for (size_t seqId = 0; seqId < height; ++seqId) {
for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
int insId =
config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
outputValue->subMatrix(seqId, 1, tmpDest_)
->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
}
/* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
* thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
* case, we should compute the new sequenceStartPositions.
*/
if (type_) {
output_.degradeSequence(input, useGpu_);
}
}
if (biases_.get() != NULL) {
......@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
}
void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
/* activation, should set to 'linear' in most cases */
backwardActivation();
if (biases_ && biases_->getWGrad()) {
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
// Increasing the number of gradient
biases_->getParameterPtr()->incUpdate(callback);
}
SequencePoolLayer::backward(callback);
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
auto startPositions =
type_ ? getInput(0).subSequenceStartPositions->getVector(false)
: getInput(0).sequenceStartPositions->getVector(false);
const int* starts = startPositions->getData();
size_t numSequences = startPositions->getSize() - 1;
const int* starts = startPositions_->getData(false);
size_t numSequences = startPositions_->getSize() - 1;
if (inputGrad) {
AsyncGpuBlock asyncGpuBlock;
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Logging.h"
#include "SequencePoolLayer.h"
namespace paddle {
bool SequencePoolLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
  // Base-class initialization comes first.
  Layer::init(layerMap, parameterMap);

  // Sequence pooling layers (seqlastins/max/average) take exactly one input.
  CHECK_EQ(1U, inputLayers_.size());

  // Optional bias: a single row of getSize() values.
  if (biasParameter_) {
    biases_.reset(new Weight(1, getSize(), biasParameter_));
  }

  // Map the configured trans_type string onto the SequenceLevel stored in
  // type_; any other value is a fatal configuration error.
  const auto& transType = config_.trans_type();
  if (transType == "seq") {
    type_ = kSeq;
  } else if (transType == "non-seq") {
    type_ = kNonSeq;
  } else {
    LOG(FATAL) << "Unknown trans_type: " << transType;
  }

  setNeedSequenceInfo(false);
  return true;
}
/**
 * Shared forward step for the sequence pooling layers.
 *
 * Selects the start positions to pool over (sub-sequence starts when
 * type_ == kSeq, sequence starts otherwise), validates them against the
 * input, stores them in startPositions_ / newBatchSize_ for the derived
 * layer, and resizes the output to newBatchSize_ x getSize().
 *
 * All validation failures abort through glog CHECK macros.
 *
 * @param passType forwarded to Layer::forward.
 */
void SequencePoolLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);

  // Validate before any dereference: with trans_type = seq the start
  // positions come from subSequenceStartPositions, which is null when the
  // input has no sub-sequences.
  if (type_) {
    CHECK(input.subSequenceStartPositions)
        << "when trans_type = seq, input must hasSubseq";
  }
  startPositions_ =
      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
  CHECK(startPositions_)
      << "sequence pooling requires an input with sequence start positions";

  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
  size_t dim = getSize();
  // check
  CHECK_EQ(dim, input.value->getWidth());

  auto starts = startPositions_->getVector(false);
  // Check the length first so that the index used on the next line is known
  // to be in bounds.
  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());

  resetOutput(newBatchSize_, dim);

  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
   * thus, in this case, output_ has no sequenceStartPositions.
   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
   * case, we should compute the new sequenceStartPositions.
   */
  if (type_) {
    output_.degradeSequence(input, useGpu_);
  }
}
void SequencePoolLayer::backward(const UpdateCallback& callback) {
  // Back-propagate through the activation first.
  backwardActivation();

  // The bias gradient is only handled when a bias with a gradient buffer
  // exists.
  if (!biases_ || !biases_->getWGrad()) {
    return;
  }

  biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
  // Notify the parameter of the update via the callback.
  biases_->getParameterPtr()->incUpdate(callback);
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
*
* Input: one or more sequences. Each sequence contains some instances.
* If SequenceLevel = kNonSeq:
* Output: output size is the number of input sequences (NOT input instances)
* output[i] = seqlastin/average/max_{for each instance in this
* sequence}{input[i]}
* If SequenceLevel = kSeq:
* The input must have sub-sequences (this is checked at run time)
* Output: output size is the number of input sub-sequences
* output[i] = seqlastin/average/max_{for each instance in this
* sub-sequence}{input[i]}
*
* The config file api is pooling_layer.
*/
class SequencePoolLayer : public Layer {
protected:
  // Pooling level. Holds a SequenceLevel value that init() derives from
  // config_.trans_type(): "non-seq" -> kNonSeq, "seq" -> kSeq.
  int type_;
  // Optional bias; only created in init() when a bias parameter exists.
  std::unique_ptr<Weight> biases_;
  // kNonSeq: pool each whole sequence; kSeq: pool each sub-sequence.
  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
  // Number of output rows, computed in forward(): the number of input
  // sequences (kNonSeq) or of input sub-sequences (kSeq).
  size_t newBatchSize_;
  // Start positions of the (sub-)sequences being pooled, set in forward().
  ICpuGpuVectorPtr startPositions_;

public:
  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
  virtual ~SequencePoolLayer() {}
  // Reads the layer configuration; see the definition for details.
  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
  // Validates the input sequence information and resizes the output.
  void forward(PassType passType);
  // Back-propagates through the activation and accumulates the bias gradient.
  void backward(const UpdateCallback& callback = nullptr);
};
} // namespace paddle
......@@ -14,12 +14,15 @@
from paddle.trainer.PyDataProvider2 import *
# Note that each config should has an independent provider
# in current design of PyDataProvider2.
#######################################################
data = [
[[[1, 3, 2], [4, 5, 2]], 0],
[[[0, 2], [2, 5], [0, 1, 2]], 1],
]
# Used for sequence_nest_rnn.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)],
should_shuffle=False)
......@@ -27,7 +30,7 @@ def process_subseq(settings, file_name):
for d in data:
yield d
# Used for sequence_rnn.conf
@provider(input_types=[integer_value_sequence(10),
integer_value(3)],
should_shuffle=False)
......@@ -38,11 +41,32 @@ def process_seq(settings, file_name):
seq += subseq
yield seq, d[1]
# Used for sequence_nest_rnn_multi_input.conf
@provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
def process_subseq2(settings, file_name):
    """Yield every entry of the module-level ``data`` list unchanged.

    Each entry is a ``[nested_sequence, label]`` pair, matching the declared
    input types. ``settings`` and ``file_name`` are not used.
    """
    for sample in data:
        yield sample
# Used for sequence_rnn_multi_input.conf
@provider(input_types=[integer_value_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
def process_seq2(settings, file_name):
    """Yield ``(flat_sequence, label)`` for every entry of ``data``.

    The sub-sequences of each entry are concatenated, in order, into one flat
    list. ``settings`` and ``file_name`` are not used.
    """
    for sample in data:
        sub_sequences, label = sample[0], sample[1]
        flat = [word for sub_seq in sub_sequences for word in sub_seq]
        yield flat, label
###########################################################
data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
]
# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value_sub_sequence(10),
integer_value(2)],
......@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name):
yield d
# Used for sequence_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sequence(10),
integer_value_sequence(10),
integer_value(2)],
......
......@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)),
integer_value_sequence(3)]
integer_value(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -34,14 +34,14 @@ def process(settings, file_name):
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
yield word_slot, [label]
yield word_slot, label
## for hierarchical sequence network
def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sub_sequence(3)]
integer_value_sequence(3)]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -57,7 +57,7 @@ def process2(settings, file_name):
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
label_list.append([label])
label_list.append(label)
word_slot_list.append(word_slot)
else:
yield word_slot_list, label_list
......
......@@ -56,9 +56,8 @@ def outer_step(x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine
# does not handle it correctly. Current implementation requires that
# all the out links are from sequences. However, it does not report error
# when the out links are not sequences.
# does not handle it, and will report error: In hierachical RNN, all out
# links should be from sequences now.
return inner_rnn_output
out = recurrent_group(
......
......@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None,
module='rnn_data_provider',
obj='process_subseq')
obj='process_subseq2')
settings(batch_size=2, learning_rate=0.01)
......@@ -57,9 +57,8 @@ def outer_step(wid, x):
last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
# "return last" should also work. But currently RecurrentGradientMachine
# does not handle it correctly. Current implementation requires that
# all the out links are from sequences. However, it does not report error
# when the out links are not sequences.
# does not handle it, and will report error: In hierachical RNN, all out
# links should be from sequences now.
return inner_rnn_output
out = recurrent_group(
......
......@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
test_list=None,
module='rnn_data_provider',
obj='process_seq')
obj='process_seq2')
settings(batch_size=2, learning_rate=0.01)
......
......@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) {
}
}
// Gradient check of the "maxout" layer on CPU and GPU. The single input has
// 4096 values, matching the configured 32 x 32 image with 4 channels.
TEST(Layer, maxoutLayer) {
  TestConfig config;
  config.biasSize = 0;
  config.layerConfig.set_type("maxout");
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});

  MaxOutConfig* maxoutConf =
      config.layerConfig.add_inputs()->mutable_maxout_conf();
  maxoutConf->set_img_size_x(32);
  maxoutConf->set_img_size_y(32);
  maxoutConf->set_channels(4);
  maxoutConf->set_groups(2);

  const bool devices[] = {false, true};
  for (bool useGpu : devices) {
    testLayerGrad(config, "maxout", 10, false, useGpu);
  }
}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
config.biasSize = 4096;
......
......@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) {
}
TEST(PyDataProvider2, init_hook) {
paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle"));
paddle::PyObjectPtr pickle = paddle::py::import("pickle");
paddle::PyObjectPtr globals(
PyModule_GetDict(PyImport_AddModule("__main__")));
PyDict_SetItemString(globals.get(), "pickle", pickle.get());
......
......@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename):
yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
@provider(input_types=[index_slot(10), index_slot(10)])
@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
def test_input_order(setting, filename):
for _ in xrange(1000):
yield {
......
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/utils/Util.h>
#include <paddle/utils/Version.h>
......@@ -24,7 +23,7 @@ limitations under the License. */
P_DECLARE_int32(seed);
using namespace paddle; // NOLINT
using namespace std; // NOLINT
using namespace std; // NOLINT
class TrainerForTest : public paddle::Trainer {
public:
void startTrain() {
......@@ -44,11 +43,10 @@ public:
*/
size_t getTotalParameterSize() const {
auto p = const_cast<TrainerForTest*>(this);
auto & params = p->getGradientMachine()->getParameters();
return std::accumulate(params.begin(), params.end(), 0UL,
[](size_t a, const ParameterPtr& p){
return a+p->getSize();
});
auto& params = p->getGradientMachine()->getParameters();
return std::accumulate(
params.begin(), params.end(), 0UL,
[](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
}
};
......
......@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
copyFrom(matrix);
}
void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight();
size_t width = getWidth();
CHECK_EQ(b.getWidth(), width);
real* dst = getData();
real* src = b.getData();
int* index = rowIndex.getData();
const int* index = rowIndex.getData();
hl_sequence2batch_copy(dst, src, index, width, height, true);
}
......@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) {
max.maxCols(*this);
}
// Top-k-per-column selection has no GPU implementation: calling this
// overload on a GpuMatrix aborts through LOG(FATAL).
void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
  LOG(FATAL) << "Is not supported";
}
/**
 * GPU maxout forward pass: this matrix receives the output, `a` is the input
 * and `id` is handed to the kernel as its index buffer.
 *
 * @param a        input matrix (must be a GpuMatrix with the same height).
 * @param id       index buffer (must be a GpuIVector).
 * @param channels divisor applied to the output width to obtain the
 *                 per-channel feature length passed to the kernel.
 * @param groups   forwarded unchanged to the kernel.
 */
void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
                              size_t groups) {
  CHECK(dynamic_cast<GpuMatrix*>(&a));
  CHECK(dynamic_cast<GpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  const size_t outWidth = getWidth();
  const size_t numSamples = getHeight();

  const real* inData = a.getData();
  real* outData = getData();
  int* maxIds = id.getData();

  hl_maxout_forward(inData, outData, maxIds, numSamples, outWidth,
                    outWidth / channels, groups);
}
/**
 * GPU maxout backward pass: this matrix is the input-side buffer handed to
 * the kernel, `a` is the output-side buffer and `id` holds the indices
 * produced by the forward pass.
 *
 * @param a        output-side matrix (must be a GpuMatrix, same height).
 * @param id       index buffer (must be a GpuIVector).
 * @param channels divisor applied to the width of `a` to obtain the
 *                 per-channel feature length passed to the kernel.
 * @param groups   forwarded unchanged to the kernel.
 */
void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
                               size_t groups) {
  CHECK(dynamic_cast<GpuMatrix*>(&a));
  CHECK(dynamic_cast<GpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  const size_t outWidth = a.getWidth();
  const size_t numSamples = getHeight();

  real* inGrad = getData();
  const real* outGrad = a.getData();
  const int* maxIds = id.getData();

  hl_maxout_backward(inGrad, outGrad, maxIds, numSamples, outWidth,
                     outWidth / channels, groups);
}
/* calculate the error of classification */
void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output);
......@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
}
}
void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
size_t height = getHeight();
size_t width = getWidth();
CHECK_EQ(b.getWidth(), width);
int* index = rowIndex.getData();
const int* index = rowIndex.getData();
for (size_t i = 0; i < height; i++) {
CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
real* src = b.getData() + index[i] * width;
......@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) {
max.maxCols(*this);
}
/**
 * For every column of this matrix, select the `beam` largest elements, where
 * `beam` is the height of maxVal.
 *
 * For column i, the j-th largest value is written to maxVal(j, i) and its row
 * index to maxIds[i + j * width]; both outputs are indexed with the width of
 * this matrix as row stride.
 *
 * @param maxIds CPU vector of size width * beam receiving the row indices.
 * @param maxVal CPU matrix of size beam x width receiving the values.
 */
void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
  CHECK(isContiguous());
  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
  size_t numSamples = getWidth();
  size_t beam = maxVal.getHeight();
  size_t dim = getHeight();
  CHECK_EQ(maxIds.getSize(), numSamples * beam);
  CHECK_EQ(maxVal.getWidth(), numSamples);
  // std::partial_sort requires begin + beam to lie inside [begin, end];
  // asking for more elements than there are rows is undefined behaviour.
  CHECK(beam <= dim) << "cannot select " << beam << " elements from " << dim
                     << " rows";

  real* a = getData();
  int* s = maxIds.getData();
  real* t = maxVal.getData();

  typedef std::pair<real, size_t> ValueAndRow;
  // One scratch buffer reused for every column instead of a fresh vector
  // (and its reallocations) per column.
  std::vector<ValueAndRow> column;
  column.reserve(dim);

  for (size_t i = 0; i < numSamples; i++) {
    column.clear();
    for (size_t j = 0; j < dim; j++) {
      column.push_back(ValueAndRow(a[i + j * numSamples], j));
    }

    // Bring the `beam` largest values to the front, in descending order.
    std::partial_sort(
        column.begin(), column.begin() + beam, column.end(),
        [](const ValueAndRow& l, const ValueAndRow& r) {
          return l.first > r.first;
        });

    for (size_t j = 0; j < beam; j++) {
      t[i + j * numSamples] = column[j].first;
      s[i + j * numSamples] = static_cast<int>(column[j].second);
    }
  }
}
/**
 * CPU maxout forward pass. This matrix receives the output; `a` is the input.
 *
 * For every sample the input row is regrouped into a (groups x width)
 * scratch matrix and the column-wise maximum is taken: the maximum becomes
 * the output value and the winning row (group) index is stored in `id`.
 *
 * @param a        input, CpuMatrix of size height x (width * groups).
 * @param id       CpuIVector with at least height * width entries; receives
 *                 the winning group index of every output element.
 * @param channels number of channels of the output; must divide the output
 *                 width.
 * @param groups   number of input feature maps reduced into one output
 *                 feature map.
 */
void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
                              size_t groups) {
  CHECK(dynamic_cast<CpuMatrix*>(&a));
  CHECK(dynamic_cast<CpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  size_t size = getWidth();
  size_t batchSize = getHeight();

  // The raw pointer arithmetic below relies on these relations; without
  // them it divides by zero or reads/writes out of bounds.
  CHECK(channels > 0 && groups > 0) << "channels and groups must be positive";
  CHECK(size % channels == 0)
      << "output width " << size << " is not a multiple of channels "
      << channels;
  CHECK_EQ(a.getWidth(), size * groups);
  CHECK(id.getSize() >= batchSize * size)
      << "id vector is too small: " << id.getSize() << " < "
      << batchSize * size;

  size_t featLen = size / channels;
  const real* input = a.getData();
  int* idForCpu = id.getData();

  MatrixPtr maxInMat, maxOutMat;
  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);

  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
    size_t newIndex = batch_idx * size;
    // View onto the slice of `id` that belongs to this sample.
    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);

    // Regroup: row j of maxInMat holds the j-th feature map of every
    // output channel, so that a column-wise max yields the maxout result.
    for (size_t i = 0; i < channels; ++i) {
      size_t newFeatLen = i * featLen;
      for (size_t j = 0; j < groups; ++j) {
        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
                       featLen);
      }
    }
    maxInMat->colMax(*tmpId, *maxOutMat);
    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
  }
}
/**
 * CPU maxout backward pass. This matrix is the input gradient; `a` is the
 * output gradient.
 *
 * Every output-gradient element is added to the input-gradient element that
 * won the maximum in the forward pass, as recorded in `id`.
 *
 * @param a        output gradient, CpuMatrix of size height x outWidth.
 * @param id       CpuIVector with at least height * outWidth entries, each in
 *                 [0, groups), as written by maxoutForward().
 * @param channels number of channels of the output; must divide outWidth.
 * @param groups   number of input feature maps per output feature map; this
 *                 matrix must be outWidth * groups wide.
 */
void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
                               size_t groups) {
  CHECK(dynamic_cast<CpuMatrix*>(&a));
  CHECK(dynamic_cast<CpuIVector*>(&id));
  CHECK_EQ(a.getHeight(), getHeight());

  size_t size = a.getWidth();
  size_t batchSize = getHeight();

  // The indexed writes below rely on these relations; without them they
  // divide by zero or write out of bounds.
  CHECK(channels > 0 && groups > 0) << "channels and groups must be positive";
  CHECK(size % channels == 0)
      << "output width " << size << " is not a multiple of channels "
      << channels;
  CHECK_EQ(getWidth(), size * groups);
  CHECK(id.getSize() >= batchSize * size)
      << "id vector is too small: " << id.getSize() << " < "
      << batchSize * size;

  size_t featLen = size / channels;
  size_t newFeatLen = groups * featLen;
  real* inputG = getData();
  const real* outG = a.getData();
  int* idForCpu = id.getData();

  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
    size_t newIndex = batch_idx * size;
    int* idData = idForCpu + newIndex;
    real* sampleInG = inputG + newIndex * groups;
    const real* sampleOutG = outG + newIndex;

    for (size_t i = 0; i < size; ++i) {
      // A corrupt id would otherwise turn into a wild write; the cast also
      // rejects negative values.
      CHECK_LT(static_cast<size_t>(idData[i]), groups);
      // Kept in size_t: the offset can exceed the range of int for wide rows.
      size_t gradIdx = static_cast<size_t>(idData[i]) * featLen +
                       (i / featLen) * newFeatLen + i % featLen;
      sampleInG[gradIdx] += sampleOutG[i];
    }
  }
}
void CpuMatrix::rowNormalizeL1(Matrix& out) {
CHECK(!out.useGpu());
......
......@@ -253,7 +253,7 @@ public:
LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
}
virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
LOG(FATAL) << "Not implemented";
}
......@@ -493,16 +493,40 @@ public:
LOG(FATAL) << "Not implemeted";
}
/**
* set the max of each column of this to mat
*/
virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
/**
* @brief Get the top k elements of each column of this matrix.
*
* The row ids and values of these elements are stored in
* maxIds and maxVal respectively, where k is the height of maxVal.
* Note that the top k elements are not guaranteed to be sorted.
*/
virtual void colMax(IVector& maxIds, Matrix& maxVal) {
LOG(FATAL) << "not implemented";
}
/**
 * @brief Maxout forward pass; this matrix receives the output.
 *
 * The base implementation only aborts with "not implemented"; subclasses
 * are expected to override it.
 *
 * @param a        input matrix.
 * @param id       receives an index for every output element.
 * @param channels number of channels (presumably of the output; confirm
 *                 against the overriding implementations).
 * @param groups   number of groups reduced into one output value.
 */
virtual void maxoutForward(Matrix& a, IVector& id, size_t channels,
                           size_t groups) {
  LOG(FATAL) << "not implemented";
}
/**
 * @brief Maxout backward pass; this matrix is the gradient buffer updated.
 *
 * The base implementation only aborts with "not implemented"; subclasses
 * are expected to override it.
 *
 * @param a        gradient flowing in from the output side.
 * @param id       indices recorded by maxoutForward().
 * @param channels number of channels (presumably of the output; confirm
 *                 against the overriding implementations).
 * @param groups   number of groups reduced into one output value.
 */
virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels,
                            size_t groups) {
  LOG(FATAL) << "not implemented";
}
virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
/**
* @brief Get the top k elements of each row of this matrix.
*
* The column ids and values of these elements are stored in
* maxIds and max respectively. Note that the top k
* elements are not sorted.
* maxIds and max respectively. where k is the size of maxIds.
* And note that the top k elements are not sorted.
*/
virtual void rowMax(IVector& maxIds, Matrix& max) {
LOG(FATAL) << "Not implemented";
......@@ -995,7 +1019,7 @@ public:
void copyFrom(const IVector& src);
void copyByRowIndex(Matrix& b, IVector& rowIndex);
void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
......@@ -1101,6 +1125,9 @@ public:
void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& max);
void colMax(Matrix& max);
void colMax(IVector& maxIds, Matrix& max);
void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void oneHotCrossEntropy(Matrix& output, IVector& label);
void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
......@@ -1271,7 +1298,7 @@ public:
void copyFrom(CpuSparseMatrix& src);
void copyByRowIndex(Matrix& b, IVector& rowIndex);
void copyByRowIndex(Matrix& b, const IVector& rowIndex);
MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
......@@ -1425,6 +1452,9 @@ public:
void rowMax(Matrix& max);
void rowMax(IVector& maxIds, Matrix& maxVal);
void colMax(Matrix& max);
void colMax(IVector& maxIds, Matrix& maxVal);
void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
void rowNormalizeL1(Matrix& out);
void oneHotCrossEntropy(Matrix& output, IVector& label);
......
......@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
/**
 * Append `len` row ids to the local index list used for prefetching.
 *
 * Every id is validated against the matrix height first, so that invalid
 * input data is reported here rather than as an out-of-range access later.
 *
 * @param ids pointer to `len` row ids.
 * @param len number of ids to append.
 */
void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
  const size_t height = this->getHeight();
  for (size_t i = 0; i < len; ++i) {
    // Separators added: the fields of this message used to run together
    // ("id:5Height:10sparse id value ...").
    CHECK_LT(ids[i], height)
        << "id: " << ids[i] << ", height: " << height
        << "; sparse id value exceeds the max input dimension, "
        << "it could be caused by invalid input data samples";
  }
  localIndices.insert(localIndices.end(), ids, ids + len);
}
void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
CHECK(mat) << "only support non value sparse matrix";
CHECK(mat) << "only support sparse matrix";
addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
mat->getElementCnt());
}
......@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
int* index = ids->getData();
for (size_t i = 0; i < numSamples; ++i) {
if (index[i] == -1) continue;
localIndices.push_back((unsigned int)index[i]);
unsigned int id = (unsigned int)index[i];
CHECK_LT(id, this->getHeight())
<< "id:" << id << "Height:" << this->getHeight()
<< "sparse id value exceeds the max input dimension, "
<< "it could be caused invalid input data samples";
localIndices.push_back(id);
}
}
......
......@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) {
}
}
void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
int channels, int groups) {
int inWidth = imgSizeH * imgSizeW * channels;
int outChannels = channels / groups;
int outWidth = imgSizeH * imgSizeW * outChannels;
// forward
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->maxoutForward(*input, *id, outChannels, groups);
targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
// check
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
idCheck->copyFrom(*idGpu);
VectorCheckEqual(*id, *idCheck);
// backward
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
true);
MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false,
false);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
// check
targetCheckGrad->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
// Sweeps testMaxOutFwdBwd over a small grid of batch sizes, channel counts,
// image sizes and group counts.
TEST(Matrix, MaxOutFwdBwd) {
  const int sampleCounts[] = {5, 10};
  const int channelCounts[] = {8, 16};
  const int heights[] = {14, 28};
  const int widths[] = {16, 30};
  const int groupCounts[] = {2, 4};

  for (int numSamples : sampleCounts) {
    for (int channels : channelCounts) {
      for (int imgSizeH : heights) {
        for (int imgSizeW : widths) {
          for (int groups : groupCounts) {
            VLOG(3) << " numSamples=" << numSamples
                    << " channels=" << channels
                    << " imgSizeH=" << imgSizeH
                    << " imgSizeW=" << imgSizeW
                    << " groups=" << groups;
            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
          }
        }
      }
    }
  }
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -146,6 +146,12 @@ public:
}
}
// Lazily creates the buffer for `type`: on the first call a zero-filled
// vector with config_.size() elements is allocated; later calls are no-ops.
void enableBufType(ParameterType type) {
  if (!bufs_[type]) {
    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
    bufs_[type]->zeroMem();
  }
}
void enableIntType(ParameterType type, size_t intStoreSize = 0) {
if (!intBufs_[type]) {
SetDevice device(deviceId_);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pserver/ParameterClient.h"
#include "paddle/pserver/ParameterServer.h"
#include "paddle/parameter/Parameter.h"
#include <Python.h>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace paddle {
struct PyObjectDeleter {
void operator()(PyObject* obj) {
if (obj) {
Py_DECREF(obj);
}
}
};
/**
 * ParameterClient variant whose methods take plain ints and serialized
 * protobuf strings, so that it can be driven from Python.
 *
 * The constructor keeps a private copy of the command line, builds one
 * Parameter per serialized ParameterConfig and initializes the Python
 * interpreter; the destructor finalizes the interpreter again.
 */
class ParameterClientPy : public ParameterClient {
protected:
  typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
  std::vector<ParameterPtr> parameter_;
  int initArgc_;
  // Points at argPointers_.data(); kept as a raw pointer for initMain().
  char** initArgv_;
  // Owns the NUL-terminated copies of the command-line arguments. Ownership
  // is kept separate from the pointer table because the callee may reorder
  // or overwrite entries of the table it is given.
  std::vector<std::vector<char>> argStorage_;
  // Pointer table handed out through initArgv_, terminated by a null entry.
  std::vector<char*> argPointers_;

public:
  /**
   * @param configs serialized ParameterConfig messages, one per parameter.
   * @param argc    number of leading entries of `argv` to use; must lie in
   *                [0, argv.size()].
   * @param argv    command-line arguments later passed to initMain().
   * @param useGpu  forwarded to every created Parameter.
   */
  ParameterClientPy(std::vector<std::string> configs, int argc,
                    std::vector<std::string> argv, bool useGpu) {
    CHECK(argc >= 0 && static_cast<size_t>(argc) <= argv.size())
        << "argc (" << argc << ") must be in [0, " << argv.size() << "]";
    initArgc_ = argc;

    argStorage_.reserve(argc);
    argPointers_.reserve(argc + 1);
    for (int i = 0; i < argc; i++) {
      // Copy the characters and append the terminating NUL explicitly; the
      // previous buffer of argv[i].size() bytes had no room for it.
      argStorage_.push_back(std::vector<char>(argv[i].begin(), argv[i].end()));
      argStorage_.back().push_back('\0');
      argPointers_.push_back(argStorage_.back().data());
    }
    argPointers_.push_back(nullptr);  // conventional argv[argc] == NULL
    initArgv_ = argPointers_.data();

    ParameterConfig pyConfig;
    ParameterPtr param;
    for (auto& config : configs) {
      pyConfig.ParseFromString(config);
      param.reset(new Parameter(pyConfig, useGpu));
      parameter_.push_back(param);
    }
    Py_Initialize();
    CHECK(Py_IsInitialized());
  }

  ~ParameterClientPy() {
    // The argument copies are released by argStorage_ / argPointers_.
    Py_Finalize();
  }

  /// Returns a copy of the idx-th parameter (idx is not range-checked).
  Parameter getParameter(int idx) { return *(parameter_[idx].get()); }

  /// Runs initMain() with the stored command line, then init().
  void initClientPy() {
    initMain(initArgc_, initArgv_);
    CHECK(init(parameter_)) << "Init Client Failed.";
  }

  /// Applies a serialized OptimizationConfig.
  void setConfigPy(std::string config) {
    OptimizationConfig optConfig;
    optConfig.ParseFromString(config);
    setConfig(optConfig);
  }

  // The following wrappers convert plain ints to the corresponding enums.
  bool inStatusPy(int status) { return inStatus(PServerStatus(status)); }
  void setStatusPy(int status) { setStatus(PServerStatus(status)); }
  void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); }

  void sendParameterPy(int updateMode, int parameterType, int numSamples,
                       real cost, bool sendBackParameter) {
    sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType),
                  int64_t(numSamples), real(cost), sendBackParameter);
  }

  /**
   * Issues an asynchronous call and blocks until its callback has run.
   *
   * @param serviceName forwarded to asyncCall().
   * @param funcName    forwarded to asyncCall().
   * @param in          serialized ProtoIn request.
   * @return the serialized ProtoOut response.
   */
  template <class ProtoIn, class ProtoOut>
  std::string asyncCallPy(const char* serviceName, const char* funcName,
                          const std::string in) {
    ProtoIn protoIn;
    ProtoOut protoOut;
    std::string data;
    protoIn.ParseFromString(in);

    // Completion is signalled with a flag guarded by a mutex and a condition
    // variable. The previous scheme unlocked a std::mutex from the callback
    // (possibly another thread) and destroyed it while locked; both are
    // undefined behaviour.
    std::mutex doneMutex;
    std::condition_variable doneCond;
    bool done = false;

    auto callback = [&](ProtoOut* pOut, bool isSuccessful) {
      if (isSuccessful) {
        pOut->SerializeToString(&data);
      } else {
        LOG(INFO) << "Async Talk Failed.";
      }
      // Notify while holding the lock, so that the waiting thread cannot
      // return and destroy doneCond while it is still being used here.
      std::lock_guard<std::mutex> guard(doneMutex);
      done = true;
      doneCond.notify_one();
    };

    ubClient_.asyncCall<ProtoIn, ProtoOut>(serviceName, funcName, protoIn,
                                           &protoOut, callback);

    {
      std::unique_lock<std::mutex> lock(doneMutex);
      doneCond.wait(lock, [&done] { return done; });
    }

    // Kept from the original implementation: the returned string is always
    // the serialization of protoOut after the call has completed.
    protoOut.SerializeToString(&data);
    return data;
  }
};
} // namespace paddle
......@@ -63,7 +63,8 @@ class SparseBinaryScanner(IScanner):
def scan(self, dat):
self.extend_cols(dat)
self.__rows__.append(len(dat))
self.__rows__.append(len(dat) + self.__rows__[-1])
self.__height__ += 1
def extend_cols(self, dat):
self.__cols__.extend(dat)
......
#!/bin/bash
# Installs the macOS build dependencies through Homebrew and pip, then
# builds and installs googletest 1.8.0 from source.

brew update
brew tap homebrew/science
brew install python
sudo pip install --upgrade protobuf==2.6.0
brew install homebrew/versions/protobuf260 --without-python
brew install cmake python glog gflags openblas wget md5sha1sum

# Build googletest from the release tarball.
wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
tar xf gtest.tar.gz
# Stop here if the sources are missing (failed download or extraction);
# otherwise cmake and make install would run in the wrong directory.
cd googletest-release-1.8.0/ || exit 1
cmake .
make install
#!/bin/bash
source ./common.sh
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON
make -j `nproc`
env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`"
CMAKE_EXTRA=""
if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
fi
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA}
# Number of parallel jobs for make and ctest: the CPU count on the known
# Travis platforms, 1 otherwise.
NPROC=1
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
  # Previously assigned to the misspelled NRPOC, which left NPROC at 1.
  NPROC=$(nproc)
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
  NPROC=$(sysctl -n hw.ncpu)
fi
make -j $NPROC
env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
sudo make install
sudo paddle version
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册