Commit 253da41b authored by: Yancey1989

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

@@ -110,14 +110,13 @@ endmacro()
# Get the coverage data.
file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("GCDA files:")
+message("Process GCDA files:")
+message("===============================")
# Get a list of all the object directories needed by gcov
# (The directories the .gcda files and .o files are found in)
# and run gcov on those.
foreach(GCDA ${GCDA_FILES})
-    message("Process: ${GCDA}")
-    message("------------------------------------------------------------------------------")
    get_filename_component(GCDA_DIR ${GCDA} PATH)
    #
@@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
    # If -p is not specified then the file is named only "the_file.c.gcov"
    #
    execute_process(
-        COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
+        COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null"
        WORKING_DIRECTORY ${GCDA_DIR}
    )
endforeach()
@@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
    set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
    # Generate the final JSON for this file.
-    message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
    string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
    set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
......
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
def main():
# init
paddle.init(use_gpu=False, trainer_count=1)
# network config
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
param_attr=paddle.attr.Param(name='w'),
size=1,
act=paddle.activation.Linear(),
bias_attr=paddle.attr.Param(name='b'))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.regression_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
uci_housing.test(), batch_size=2),
reader_dict={'x': 0,
'y': 1})
if event.pass_id % 10 == 0:
print "Test %d, %s" % (event.pass_id, result.metrics)
# training
trainer.train(
reader=paddle.reader.batched(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
reader_dict={'x': 0,
'y': 1},
event_handler=event_handler,
num_passes=30)
if __name__ == '__main__':
main()
import paddle.v2 as paddle
def softmax_regression(img):
predict = paddle.layer.fc(input=img,
size=10,
act=paddle.activation.Softmax())
return predict
def multilayer_perceptron(img):
# The first fully-connected layer
hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
# The second fully-connected layer and the according activation function
hidden2 = paddle.layer.fc(input=hidden1,
size=64,
act=paddle.activation.Relu())
# The third fully-connected layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=hidden2,
size=10,
act=paddle.activation.Softmax())
return predict
def convolutional_neural_network(img):
# first conv layer
conv_pool_1 = paddle.networks.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
num_channel=1,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# second conv layer
conv_pool_2 = paddle.networks.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
num_channel=20,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# The first fully-connected layer
fc1 = paddle.layer.fc(input=conv_pool_2,
size=128,
act=paddle.activation.Tanh())
# The softmax layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=fc1,
size=10,
act=paddle.activation.Softmax())
return predict
def main():
    paddle.init(use_gpu=False, trainer_count=1)
@@ -9,45 +62,58 @@ def main():
        name='pixel', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(
        name='label', type=paddle.data_type.integer_value(10))
-    hidden1 = paddle.layer.fc(input=images, size=200)
-    hidden2 = paddle.layer.fc(input=hidden1, size=200)
-    inference = paddle.layer.fc(input=hidden2,
-                                size=10,
-                                act=paddle.activation.Softmax())
-    cost = paddle.layer.classification_cost(input=inference, label=label)
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncommenting the corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+    cost = paddle.layer.classification_cost(input=predict, label=label)

    parameters = paddle.parameters.create(cost)

-    adam_optimizer = paddle.optimizer.Adam(learning_rate=0.01)
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))

    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
-                                update_equation=adam_optimizer)
+                                update_equation=optimizer)

+    lists = []

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1000 == 0:
-                result = trainer.test(reader=paddle.reader.batched(
-                    paddle.dataset.mnist.test(), batch_size=256))
-                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    result.metrics)
-            else:
-                pass
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.reader.batched(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))

    trainer.train(
        reader=paddle.reader.batched(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=32),
-        event_handler=event_handler)
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)

+    # find the best pass
+    best = sorted(lists, key=lambda list: float(list[1]))[0]
+    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)

    # output is a softmax layer. It returns probabilities.
    # Shape should be (100, 10)
    probs = paddle.infer(
-        output=inference,
+        output=predict,
        parameters=parameters,
        reader=paddle.reader.batched(
            paddle.reader.firstn(
......
@@ -167,8 +167,23 @@ def main():
        paddle.reader.shuffle(
            conll05.test(), buf_size=8192), batch_size=10)

+    reader_dict = {
+        'word_data': 0,
+        'ctx_n2_data': 1,
+        'ctx_n1_data': 2,
+        'ctx_0_data': 3,
+        'ctx_p1_data': 4,
+        'ctx_p2_data': 5,
+        'verb_data': 6,
+        'mark_data': 7,
+        'target': 8
+    }
    trainer.train(
-        reader=trn_reader, event_handler=event_handler, num_passes=10000)
+        reader=trn_reader,
+        event_handler=event_handler,
+        num_passes=10000,
+        reader_dict=reader_dict)

if __name__ == '__main__':
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle.trainer_config_helpers.attrs as attrs
from paddle.trainer_config_helpers.poolings import MaxPooling
import paddle.v2 as paddle
def convolution_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=128,
is_predict=False):
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
conv_3 = paddle.networks.sequence_conv_pool(
input=emb, context_len=3, hidden_size=hid_dim)
conv_4 = paddle.networks.sequence_conv_pool(
input=emb, context_len=4, hidden_size=hid_dim)
output = paddle.layer.fc(input=[conv_3, conv_4],
size=class_dim,
act=paddle.activation.Softmax())
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3,
is_predict=False):
"""
A Wrapper for sentiment classification task.
This network uses a bi-directional recurrent network,
consisting of three LSTM layers. This configuration is based on
the paper at the following URL, but uses fewer layers.
http://www.aclweb.org/anthology/P15-1109
input_dim: here is word dictionary dimension.
class_dim: number of categories.
emb_dim: dimension of word embedding.
hid_dim: dimension of hidden layer.
stacked_num: number of stacked lstm-hidden layer.
is_predict: is predicting or not.
Some layers are not needed in the network when predicting.
"""
assert stacked_num % 2 == 1
layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5)
fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3)
lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.)
para_attr = [fc_para_attr, lstm_para_attr]
bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.)
relu = paddle.activation.Relu()
linear = paddle.activation.Linear()
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
fc1 = paddle.layer.fc(input=emb,
size=hid_dim,
act=linear,
bias_attr=bias_attr)
lstm1 = paddle.layer.lstmemory(
input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = paddle.layer.fc(input=inputs,
size=hid_dim,
act=linear,
param_attr=para_attr,
bias_attr=bias_attr)
lstm = paddle.layer.lstmemory(
input=fc,
reverse=(i % 2) == 0,
act=relu,
bias_attr=bias_attr,
layer_attr=layer_attr)
inputs = [fc, lstm]
fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling())
lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling())
output = paddle.layer.fc(input=[fc_last, lstm_last],
size=class_dim,
act=paddle.activation.Softmax(),
bias_attr=bias_attr,
param_attr=para_attr)
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if __name__ == '__main__':
# init
paddle.init(use_gpu=True, trainer_count=4)
# network config
print 'load dictionary...'
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
# Please choose the way to build the network
# by uncommenting the corresponding line.
cost = convolution_net(dict_dim, class_dim=class_dim)
# cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=2e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# End batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
lambda: paddle.dataset.imdb.test(word_dict),
batch_size=128),
reader_dict={'word': 0,
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=adam_optimizer)
trainer.train(
reader=paddle.reader.batched(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100),
event_handler=event_handler,
reader_dict={'word': 0,
'label': 1},
num_passes=10)
import os
import paddle.v2 as paddle
from seqToseq_net_v2 import seqToseq_net_v2
# Data definition.
# TODO: This code should be merged into the dataset package.
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())
def read_to_dict(dict_path):
with open(dict_path, "r") as fin:
out_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
return out_dict
src_dict = read_to_dict(src_lang_dict)
trg_dict = read_to_dict(trg_lang_dict)
train_list = os.path.join(data_dir, 'train.list')
test_list = os.path.join(data_dir, 'test.list')
UNK_IDX = 2
START = "<s>"
END = "<e>"
def _get_ids(s, dictionary):
words = s.strip().split()
return [dictionary[START]] + \
[dictionary.get(w, UNK_IDX) for w in words] + \
[dictionary[END]]
def train_reader(file_name):
def reader():
with open(file_name, 'r') as f:
for line_count, line in enumerate(f):
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_ids = _get_ids(src_seq, src_dict)
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [trg_dict[END]]
trg_ids = [trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
reader_dict = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2
}
trn_reader = paddle.reader.batched(
paddle.reader.shuffle(
train_reader("data/pre-wmt14/train/train"), buf_size=8192),
batch_size=5)
trainer.train(
reader=trn_reader,
event_handler=event_handler,
num_passes=10000,
reader_dict=reader_dict)
if __name__ == '__main__':
main()
import paddle.v2.activation as activation
import paddle.v2.attr as attr
import paddle.v2.data_type as data_type
import paddle.v2.layer as layer
import paddle.v2.networks as networks
def seqToseq_net_v2(source_dict_dim, target_dict_dim):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id = layer.data(
name='source_language_word',
type=data_type.integer_value_sequence(source_dict_dim))
src_embedding = layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=attr.ParamAttr(name='_source_language_embedding'))
src_forward = networks.simple_gru(input=src_embedding, size=encoder_size)
src_backward = networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = layer.concat(input=[src_forward, src_backward])
#### Decoder
with layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += layer.full_matrix_projection(input=encoded_vector)
backward_first = layer.first_seq(input=src_backward)
with layer.mixed(size=decoder_size, act=activation.Tanh()) as decoder_boot:
decoder_boot += layer.full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += layer.full_matrix_projection(input=context)
decoder_inputs += layer.full_matrix_projection(input=current_word)
gru_step = layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with layer.mixed(
size=target_dict_dim, bias_attr=True,
act=activation.Softmax()) as out:
out += layer.full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input2 = layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = layer.embedding(
input=layer.data(
name='target_language_word',
type=data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with an attention mechanism, during training
# the target embedding (the ground truth) is the data input,
# while the encoded source sequence is accessed as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = layer.data(
name='target_language_next_word',
type=data_type.integer_value_sequence(target_dict_dim))
cost = layer.classification_cost(input=decoder, label=lbl)
return cost
@@ -4,9 +4,10 @@ At training and testing time, PaddlePaddle programs need to read data. To ease t
- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
- A *reader creator* is a function that returns a reader function.
-- A *reader* decorator is a function, which accepts one or more readers, and returns a reader.
+- A *reader decorator* is a function which accepts one or more readers and returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.

-and provide frequently used reader creators and reader decorators.
+and provide a function that converts a reader into a batch reader, as well as frequently used reader creators and reader decorators.

## Data Reader Interface
@@ -37,9 +38,54 @@ def reader_creator_random_imageand_label(widht, height, label):
    return reader
```
## Batch Reader Interface
A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
Here are valid outputs:
```python
# a mini batch of three data items. Each data item consists of three columns of data, each of which is 1.
[(1, 1, 1),
(2, 2, 2),
(3, 3, 3)]
# a mini batch of three data items, each data item is a list (single column).
[([1,1,1],),
([2,2,2],),
([3,3,3],)]
```
Please note that each item inside the list must be a tuple; below is an invalid output:
```python
# wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
# Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
# or three columns of data, each of which is 1.
[[1,1,1],
[2,2,2],
[3,3,3]]
```
It's easy to convert a reader into a batch reader:
```python
mnist_train = paddle.dataset.mnist.train()
mnist_train_batch_reader = paddle.batch(mnist_train, 128)
```
It's also easy to create a custom batch reader:
```python
def custom_batch_reader():
while True:
batch = []
for i in xrange(128):
batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended.
yield batch
mnist_random_image_batch_reader = custom_batch_reader
```
## Usage

-data reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:

```python
# two data layers are created:
@@ -47,8 +93,8 @@ image_layer = paddle.layer.data("image", ...)
label_layer = paddle.layer.data("label", ...)
# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
-paddle.train(paddle.dataset.mnist, {"image":0, "label":1}, 128, 10, ...)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
```

## Data Reader Decorator
@@ -64,7 +110,7 @@ Since reading data may take time and training can not proceed without data. It i
Use `paddle.reader.buffered` to prefetch data:

```python
-buffered_reader = paddle.reader.buffered(paddle.dataset.mnist, 100)
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
```

`buffered_reader` will try to buffer (prefetch) `100` data entries.
@@ -91,10 +137,10 @@ def reader_creator_bool(t):
true_reader = reader_creator_bool(True)
false_reader = reader_creator_bool(False)

-reader = paddle.reader.compose(paddle.dataset.mnist, data_reader_creator_random_image(20, 20), true_reader, false_reader)
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
-# Skipped 1 because paddle.dataset.mnist produces two items per data entry.
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
# And we don't care second item at this time.
-paddle.train(reader, {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```
### Shuffle

@@ -103,16 +149,20 @@ Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader
Example:

```python
-reader = paddle.reader.shuffle(paddle.dataset.mnist, 512)
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
```
## Q & A

-### Why return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, but not a mini batch?

-If a mini batch is returned, data reader need to take care of batch size. But batch size is a concept for training, it makes more sense for user to specify batch size as a parameter for `train`.
-Practically, always return a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns not a single entry but 3 entries, training code will be more complex because it needs to handle cases like batch size 2).
+
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.
+
+### Why do we need a batch reader? Isn't passing reader and batch_size to train sufficient?
+
+In most cases, passing reader and batch_size as arguments to train would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
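A minimal sketch of that flexibility under the batch reader convention above; the helper name `growing_batch_reader` and its schedule are hypothetical, not part of the PaddlePaddle API:

```python
def growing_batch_reader(reader, initial_size=32, max_size=256):
    # Hypothetical helper: wraps a (single entry) reader into a batch reader
    # whose batch size doubles every 1000 emitted batches, up to max_size.
    def batch_reader():
        batch, size, emitted = [], initial_size, 0
        for item in reader():
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
                emitted += 1
                if emitted % 1000 == 0:
                    size = min(size * 2, max_size)
        if batch:
            yield batch

    return batch_reader
```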
### Why use a dictionary but not a list to provide mapping?
@@ -137,7 +187,7 @@ def image_reader_creator(image_path, label_path, n):

# images_reader_creator creates a reader
reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
-paddle.train(reader, {"image":0, "label":1}, ...)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
```

### How is `paddle.train` implemented
@@ -145,17 +195,8 @@ paddle.train(reader, {"image":0, "label":1}, ...)
An example implementation of paddle.train could be:

```python
-def make_minibatch(reader, minibatch_size):
-    def ret():
-        r = reader()
-        buf = [r.next() for x in xrange(minibatch_size)]
-        while len(buf) > 0:
-            yield buf
-            buf = [r.next() for x in xrange(minibatch_size)]
-    return ret
-
-def train(reader, mapping, batch_size, total_pass):
+def train(batch_reader, mapping, batch_size, total_pass):
    for pass_idx in range(total_pass):
-        for mini_batch in make_minibatch(reader): # this loop will never end in online learning.
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
            do_forward_backward(mini_batch, mapping)
```
@@ -132,6 +132,7 @@ def startPaddle(idMap={}, train_args_dict=None):
    logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
    if not os.path.exists(JOB_PATH_OUTPUT):
        os.makedirs(JOB_PATH_OUTPUT)
+    if not os.path.exists(logDir):
        os.mkdir(logDir)
    copyCommand = 'cp -rf ' + JOB_PATH + \
        "/" + str(trainerId) + "/data/*" + " ./data/"
......
@@ -144,9 +144,7 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
}

-float Arguments::sumCosts() const {
-  return paddle::Argument::sumCosts(m->outputs);
-}
+float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }

int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
  auto& a = m->getArg(idx);
......
@@ -453,7 +453,7 @@ public:
                            IVector* vec) throw(RangeError);
  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);

-  float sumCosts() const;
+  float sum() const;

private:
  static Arguments* createByPaddleArgumentVector(void* ptr);
......
@@ -22,7 +22,7 @@ class TestArguments(unittest.TestCase):
        args = swig_paddle.Arguments.createArguments(1)
        args.setSlotValue(0, m)

-        self.assertAlmostEqual(27.0, args.sumCosts())
+        self.assertAlmostEqual(27.0, args.sum())

        mat = args.getSlotValue(0)
        assert isinstance(mat, swig_paddle.Matrix)
......
@@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
  if (weights) {
    outArgs[0].value->dotMul(*outArgs[0].value, *weights);
  }
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
}

real getDiffAndPrint(real newCost1,
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
    std::vector<Argument> args;
    args.push_back(out);
-    EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed";
+    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
      start[seqId] += seqLens[seqId];
    }
@@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf,
    outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights);
  }
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
  LOG(INFO) << " cost " << cost;
  EXPECT_FALSE(std::isnan(cost));
......
@@ -163,7 +163,7 @@ struct Argument {
               : sequenceStartPositions->getData(false);
  }

-  static inline real sumCosts(const std::vector<Argument>& arguments) {
+  static inline real sum(const std::vector<Argument>& arguments) {
    real cost = 0;
    for (auto& arg : arguments) {
      if (arg.value) {
......
@@ -10,9 +10,11 @@ add_test(NAME socket_test
add_unittest_without_exec(test_ProtoServer
    test_ProtoServer.cpp)

-add_test(NAME test_ProtoServer
+IF(NOT ON_TRAVIS)
+  add_test(NAME test_ProtoServer
    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
        ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
+ENDIF(NOT ON_TRAVIS)

# TODO(yuyang18): Run test_ProtoServer when with rdma
# add_test(NAME test_ProtoServerRDMA
......
@@ -72,6 +72,7 @@ setup(name="py_paddle",
      packages=['py_paddle'],
      include_dirs = include_dirs,
      install_requires = [
+        'nltk>=3.2.2',
        'numpy>=1.8.0',     # The numpy is required.
        'protobuf>=3.0.0'   # The paddle protobuf version
      ],
......
@@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
    return 0.0;  // In this case, there is no meaning to calculate cost
  }

-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
}

void Tester::testOnePassBatch(int passId) {
......
@@ -310,7 +310,7 @@ real Trainer::checkGradient() {
  std::vector<Argument> outArgs;

  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
  LOG(INFO) << "original cost=" << cost;
  trainerInternal_.getGradientMachine()->backward();
@@ -340,7 +340,7 @@ real Trainer::checkGradient() {
  parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
  parameter->setValueUpdated();
  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real newCost1 = Argument::sumCosts(outArgs);
+  real newCost1 = Argument::sum(outArgs);

  for (size_t i = 0; i < dim; ++i) {
    newp[i] = oldp[i] - step * d[i];
@@ -349,7 +349,7 @@ real Trainer::checkGradient() {
  parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
  parameter->setValueUpdated();
  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real newCost2 = Argument::sumCosts(outArgs);
+  real newCost2 = Argument::sum(outArgs);

  real trueDelta = 0.5 * (newCost1 - newCost2);
  real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
@@ -575,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch,
  trainerInternal_.getGradientMachine()->forwardBackward(
      inArgs, &outArgs, PASS_TRAIN);

-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);

  offset = 0;
  for (auto& para : parameters) {
......
@@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
  real cost = 0;
  {
    REGISTER_TIMER("sumCost");
-    cost = Argument::sumCosts(*outArgs);
+    cost = Argument::sum(*outArgs);
  }

  if (batchId % intconfig_->log_period == 0) {
......
@@ -65,14 +65,18 @@ def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
    return InputType(dim, seq_type, DataType.SparseValue)

-def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    return InputType(dim, seq_type, DataType.Index)
+def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
+    """Data type of integer.
+    :param value_range: range of this integer.
+    """
+    return InputType(value_range, seq_type, DataType.Index)

dense_vector = dense_slot
sparse_binary_vector = sparse_non_value_slot
sparse_vector = sparse_value_slot
integer_value = index_slot
+integer_value.__doc__ = index_slot.__doc__

def dense_vector_sequence(dim):
@@ -99,8 +103,11 @@ def sparse_vector_sub_sequence(dim):
    return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)

-def integer_value_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SEQUENCE)
+def integer_value_sequence(value_range):
+    """Data type of a sequence of integer.
+    :param value_range: range of each element.
+    """
+    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)

def integer_value_sub_sequence(dim):
@@ -108,6 +115,7 @@ def integer_value_sub_sequence(dim):
integer_sequence = integer_value_sequence
+integer_sequence.__doc__ = integer_value_sequence.__doc__

class SingleSlotWrapper(object):
......
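A small usage sketch of the renamed parameter, reusing declarations that appear in the examples elsewhere in this commit; the dictionary size is an assumption for illustration. `value_range` is the number of distinct integer ids an input may take.

```python
import paddle.v2 as paddle

dict_size = 10000  # assumed dictionary size, for illustration only
# 10-class integer label: values fall in [0, 10)
label = paddle.layer.data(
    name='label', type=paddle.data_type.integer_value(10))
# sequence of word ids drawn from a dictionary of dict_size entries
words = paddle.layer.data(
    name='word', type=paddle.data_type.integer_value_sequence(dict_size))
```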
@@ -25,14 +25,14 @@ from . import dataset
from . import reader
import attr
import pooling
-import inferencer
+import inference
import networks
import py_paddle.swig_paddle as api

__all__ = [
    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
-    'topology', 'networks', 'inferencer', 'infer'
+    'topology', 'networks', 'infer'
]
@@ -44,4 +44,4 @@ def init(**kwargs):
    api.initPaddle(*args)

-infer = inferencer.infer
+infer = inference.infer
@@ -18,5 +18,10 @@ import imdb
import cifar
import movielens
import conll05
+import uci_housing
+import sentiment

-__all__ = ['mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05']
+__all__ = [
+    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
+    'uci_housing'
+]
@@ -16,6 +16,7 @@ import requests
import hashlib
import os
import shutil
+import sys

__all__ = ['DATA_HOME', 'download', 'md5file']
@@ -41,9 +42,24 @@ def download(url, module_name, md5sum):
    filename = os.path.join(dirname, url.split('/')[-1])
    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+        print "Cache file %s not found, downloading %s" % (filename, url)
        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+
+        if total_length is None:
            with open(filename, 'w') as f:
                shutil.copyfileobj(r.raw, f)
+        else:
+            with open(filename, 'w') as f:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()

    return filename
......
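A minimal usage sketch of `download`, borrowing the URL and MD5 from the uci_housing module added later in this commit: the file is fetched (with the progress bar above) only when the cached copy is missing or fails the MD5 check, and the local path is returned.

```python
from paddle.v2.dataset.common import download

URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'

# First call downloads into the cache under DATA_HOME; later calls reuse the cached file.
path = download(URL, 'uci_housing', MD5)
```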
@@ -116,3 +116,8 @@ def test(word_idx):
    return reader_creator(
        re.compile("aclImdb/test/pos/.*\.txt$"),
        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+
+
+def word_dict():
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
@@ -17,7 +17,7 @@ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
import paddle.v2.dataset.common
import tarfile

-__all__ = ['train', 'test']
+__all__ = ['train', 'test', 'build_dict']

URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -37,7 +37,9 @@ def word_count(f, word_freq=None):
    return word_freq

-def build_dict(train_filename, test_filename):
+def build_dict():
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
    with tarfile.open(
            paddle.v2.dataset.common.download(
                paddle.v2.dataset.imikolov.URL, 'imikolov',
@@ -45,27 +47,22 @@ def build_dict(train_filename, test_filename):
        trainf = tf.extractfile(train_filename)
        testf = tf.extractfile(test_filename)
        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']

        TYPO_FREQ = 50
        word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())

-        dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*dictionary))
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
        word_idx = dict(zip(words, xrange(len(words))))
        word_idx['<unk>'] = len(words)

    return word_idx

-word_idx = {}
-
-
-def reader_creator(filename, n):
-    global word_idx
-    if len(word_idx) == 0:
-        word_idx = build_dict('./simple-examples/data/ptb.train.txt',
-                              './simple-examples/data/ptb.valid.txt')
+def reader_creator(filename, word_idx, n):
    def reader():
        with tarfile.open(
                paddle.v2.dataset.common.download(
@@ -84,9 +81,9 @@ def reader_creator(filename, n):
    return reader

-def train(n):
-    return reader_creator('./simple-examples/data/ptb.train.txt', n)
+def train(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)

-def test(n):
-    return reader_creator('./simple-examples/data/ptb.valid.txt', n)
+def test(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
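A minimal sketch of the new imikolov interface (the updated unit test below exercises the same calls): the dictionary is built once and passed explicitly to the readers.

```python
import paddle.v2.dataset.imikolov as imikolov

word_idx = imikolov.build_dict()            # word -> id, with '<unk>' as the last index
train_reader = imikolov.train(word_idx, 5)  # reader over 5-gram windows of word ids
first_window = next(train_reader())         # a window of 5 integer word ids
```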
# /usr/bin/env python
# -*- coding:utf-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script fetches and preprocesses the movie_reviews data set
provided by NLTK
"""
import common
import collections
import nltk
import numpy as np
from itertools import chain
from nltk.corpus import movie_reviews
__all__ = ['train', 'test', 'get_word_dict']
NUM_TRAINING_INSTANCES = 1600
NUM_TOTAL_INSTANCES = 2000
def download_data_if_not_yet():
"""
Download the data set, if it has not been downloaded yet.
"""
try:
# make sure that nltk can find the data
if common.DATA_HOME not in nltk.data.path:
nltk.data.path.append(common.DATA_HOME)
movie_reviews.categories()
except LookupError:
print "Downloading movie_reviews data set, please wait....."
nltk.download('movie_reviews', download_dir=common.DATA_HOME)
print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path
def get_word_dict():
"""
Sort the words by how frequently they occur in the samples
:return:
words_freq_sorted
"""
words_freq_sorted = list()
word_freq_dict = collections.defaultdict(int)
download_data_if_not_yet()
for category in movie_reviews.categories():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = word_freq_dict.items()
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
return words_freq_sorted
def sort_files():
"""
Sort the sample files so that negative and positive samples alternate
:return:
files_list
"""
files_list = list()
neg_file_list = movie_reviews.fileids('neg')
pos_file_list = movie_reviews.fileids('pos')
files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
return files_list
def load_sentiment_data():
"""
Load the data set
:return:
data_set
"""
data_set = list()
download_data_if_not_yet()
words_ids = dict(get_word_dict())
for sample_file in sort_files():
words_list = list()
category = 0 if 'neg' in sample_file else 1
for word in movie_reviews.words(sample_file):
words_list.append(words_ids[word.lower()])
data_set.append((words_list, category))
return data_set
def reader_creator(data):
"""
Reader creator, generate an iterator for data set
:param data:
train data set or test data set
"""
for each in data:
yield each[0], each[1]
def train():
"""
Default train set reader creator
"""
data_set = load_sentiment_data()
return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
def test():
"""
Default test set reader creator
"""
data_set = load_sentiment_data()
return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
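A small usage sketch for the readers defined above (the sentiment unit test later in this commit makes the same calls): each sample is a word-id list paired with a 0/1 label, 0 for negative and 1 for positive reviews.

```python
import paddle.v2.dataset.sentiment as st

# st.train() yields (word_ids, label) pairs over the first 1600 movie reviews.
for word_ids, label in st.train():
    print len(word_ids), label
    break
```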
import paddle.v2.dataset.imikolov
import unittest

+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()


class TestMikolov(unittest.TestCase):
    def check_reader(self, reader, n):
@@ -9,11 +11,15 @@ class TestMikolov(unittest.TestCase):
    def test_train(self):
        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.train(n), n)
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)

    def test_test(self):
        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.test(n), n)
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)


if __name__ == '__main__':
......
# /usr/bin/env python
# -*- coding:utf-8 -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import nltk
import paddle.v2.dataset.sentiment as st
from nltk.corpus import movie_reviews
class TestSentimentMethods(unittest.TestCase):
def test_get_word_dict(self):
word_dict = st.get_word_dict()[0:10]
test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
(u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
(u'is', 8), (u'in', 9)]
for idx, each in enumerate(word_dict):
self.assertEqual(each, test_word_list[idx])
self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
def test_sort_files(self):
last_label = ''
for sample_file in st.sort_files():
current_label = sample_file.split("/")[0]
self.assertNotEqual(current_label, last_label)
last_label = current_label
def test_data_set(self):
data_set = st.load_sentiment_data()
last_label = -1
for each in st.test():
self.assertNotEqual(each[1], last_label)
last_label = each[1]
self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
self.assertEqual(
len(list(st.test())),
(st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
from common import download
__all__ = ['train', 'test']
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
MD5 = 'd4accdce7a25600298819f8e28e8d593'
feature_names = [
'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT'
]
UCI_TRAIN_DATA = None
UCI_TEST_DATA = None
def feature_range(maximums, minimums):
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
feature_num = len(maximums)
ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
ax.set_title('feature scale')
plt.xticks(range(feature_num), feature_names)
plt.xlim([-1, feature_num])
fig.set_figheight(6)
fig.set_figwidth(10)
if not os.path.exists('./image'):
os.makedirs('./image')
fig.savefig('image/ranges.png', dpi=48)
plt.close(fig)
def load_data(filename, feature_num=14, ratio=0.8):
global UCI_TRAIN_DATA, UCI_TEST_DATA
if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
return
data = np.fromfile(filename, sep=' ')
data = data.reshape(data.shape[0] / feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in xrange(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
UCI_TEST_DATA = data[offset:]
def train():
global UCI_TRAIN_DATA
load_data(download(URL, 'uci_housing', MD5))
def reader():
for d in UCI_TRAIN_DATA:
yield d[:-1], d[-1:]
return reader
def test():
global UCI_TEST_DATA
load_data(download(URL, 'uci_housing', MD5))
def reader():
for d in UCI_TEST_DATA:
yield d[:-1], d[-1:]
return reader
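A minimal usage sketch: every record from the readers above is a pair of 13 normalized features and the house price, which is exactly what the fit_a_line example at the top of this commit feeds to `paddle.reader.batched`.

```python
import paddle.v2.dataset.uci_housing as uci_housing

reader = uci_housing.train()
for features, price in reader():
    # features: 13 normalized values; price: a 1-element array with the target
    break
```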
@@ -34,8 +34,9 @@ class WithMetric(object):

class TestResult(WithMetric):
-    def __init__(self, evaluator):
+    def __init__(self, evaluator, cost):
        super(TestResult, self).__init__(evaluator)
+        self.cost = cost


class BeginPass(object):
......
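With `cost` carried on `TestResult`, an event handler can report the averaged test cost directly; this sketch mirrors the updated MNIST example earlier in this commit and assumes `trainer` and `paddle` are in scope as they are there.

```python
def event_handler(event):
    if isinstance(event, paddle.event.EndPass):
        result = trainer.test(reader=paddle.reader.batched(
            paddle.dataset.mnist.test(), batch_size=128))
        print "Test with Pass %d, Cost %f, %s" % (
            event.pass_id, result.cost, result.metrics)
```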
@@ -5,7 +5,7 @@ from data_feeder import DataFeeder
import itertools
import numpy

-__all__ = ['Inference', 'infer']
+__all__ = ['infer']


class Inference(object):
......
@@ -262,7 +262,7 @@ class StaticInputV2(object):
        self.input = input
        self.is_seq = is_seq
        self.size = size
-        # TODO(qiaolongfei): add size
+        # TODO(add size check)
        # assert input.size is not None or size is not None
......
@@ -110,14 +110,14 @@ class DataFeederTest(unittest.TestCase):
            self.assertAlmostEqual(value.all(), w[i].all())

    def test_integer(self):
-        dim = 100
+        value_range = 100
        batch_size = 32
        index = []
        for i in xrange(batch_size):
            each_sample = []
-            each_sample.append(np.random.randint(dim))
+            each_sample.append(np.random.randint(value_range))
            index.append(each_sample)
-        feeder = DataFeeder([('input', data_type.integer_value(dim))],
+        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
                            {'input': 0})
        arg = feeder(index)
        output = arg.getSlotIds(0).copyToNumpyArray()
@@ -125,7 +125,7 @@ class DataFeederTest(unittest.TestCase):
        self.assertEqual(output.all(), index.flatten().all())

    def test_integer_sequence(self):
-        dim = 10000
+        value_range = 10000
        batch_size = 32
        start = [0]
        data = []
@@ -133,10 +133,11 @@ class DataFeederTest(unittest.TestCase):
            each_sample = []
            each_sample.append(
                self.sparse_binary_reader(
-                    dim, 30, non_empty=True))
+                    value_range, 30, non_empty=True))
            data.append(each_sample)
            start.append(len(each_sample[0]) + start[-1])
-        feeder = DataFeeder([('input', data_type.integer_value_sequence(dim))],
+        feeder = DataFeeder(
+            [('input', data_type.integer_value_sequence(value_range))],
            {'input': 0})
        arg = feeder(data)
        output_data = arg.getSlotIds(0).copyToNumpyArray()
......
@@ -17,6 +17,7 @@ import collections
from paddle.proto.ModelConfig_pb2 import ModelConfig
import layer as v2_layer
+from layer import WithExtraParent

__all__ = ['Topology']
@@ -40,7 +41,10 @@ def __bfs_travel__(callback, *layers):
        __break__ = callback(each_layer)
        if __break__:
            return
-        __bfs_travel__(callback, *each_layer.__parent_layers__.values())
+        __layers__ = each_layer.__parent_layers__.values()
+        if isinstance(each_layer, WithExtraParent):
+            __layers__ = __layers__ + each_layer.extra_parent()
+        __bfs_travel__(callback, *__layers__)


class Topology(object):
......
@@ -8,7 +8,7 @@ from . import event as v2_event
from . import optimizer as v2_optimizer
from . import parameters as v2_parameters

-__all__ = ['ITrainer', 'SGD']
+__all__ = ['SGD']


def default_event_handler(event):
@@ -22,26 +22,7 @@ def default_event_handler(event):
    pass


-class ITrainer(object):
-    """
-    The interface of Trainer. The only exposed method is `train`.
-    """
-
-    def train(self, reader, topology, parameters, event_handler=None):
-        """
-        train method.
-
-        :param reader:
-        :param topology:
-        :param parameters:
-        :param event_handler:
-        :return:
-        """
-
-        raise NotImplementedError()
-
-
-class SGD(ITrainer):
+class SGD():
    def __init__(self, cost, parameters, update_equation):
        """
        Simple SGD Trainer.
@@ -120,10 +101,8 @@ class SGD(ITrainer):
                for each_param in self.__gradient_machine__.getNonStaticParameters(
                ):
                    updater.update(each_param)
-                # Get cost. We use numpy to calculate total cost for this batch.
-                cost_vec = out_args.getSlotValue(0)
-                cost_vec = cost_vec.copyToNumpyMat()
-                cost = cost_vec.sum() / len(data_batch)
+                cost_sum = out_args.sumCosts()
+                cost = cost_sum / len(data_batch)
                updater.finishBatch(cost)
                batch_evaluator.finish()
                event_handler(
@@ -152,13 +131,18 @@ class SGD(ITrainer):
        evaluator = self.__gradient_machine__.makeEvaluator()
        out_args = api.Arguments.createArguments(0)
        evaluator.start()
+        total_cost = 0
+        num_samples = 0.0
        for data_batch in reader():
+            num_samples += len(data_batch)
            self.__gradient_machine__.forward(
                feeder(data_batch), out_args, api.PASS_TEST)
+            total_cost += out_args.sumCosts()
            self.__gradient_machine__.eval(evaluator)
        evaluator.finish()
-        return v2_event.TestResult(evaluator=evaluator)
+        return v2_event.TestResult(
+            evaluator=evaluator, cost=total_cost / num_samples)


def __check_train_args__(reader, event_handler, **kwargs):
......