Commit 5c2206c5 authored by Yu Yang, committed by GitHub

Merge pull request #921 from reyoung/feature/refine_demo_dataprovider

Feature/refine demo dataprovider
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......
File mode changed from 100644 to 100755
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 #
 # {'img_size': 32,
-#  'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+#  'settings': a global object,
 #  'color': True,
 #  'mean_img_size': 32,
 #  'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
     settings.logger.info('Image size: %s', settings.img_size)
     settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
-        dense_vector(settings.img_raw_size),  # image feature
-        integer_value(settings.num_classes)
-    ]  # labels
+    settings.input_types = {
+        'image': dense_vector(settings.img_raw_size),
+        'label': integer_value(settings.num_classes)
+    }
     settings.logger.info('DataProvider Initialization finished')
@@ -83,4 +83,7 @@ def processData(settings, file_list):
                     img, settings.img_mean, settings.img_size,
                     settings.is_train, settings.color)
                 label = data['labels'][i]
-                yield img_feat.astype('float32'), int(label)
+                yield {
+                    'image': img_feat.astype('float32'),
+                    'label': int(label)
+                }
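The pattern introduced here, named slots plus dict-shaped records, can be sketched in isolation as follows; the sizes and key names below are illustrative, not taken from the demo:

# Minimal sketch of the dict-based provider convention (illustrative sizes).
from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value

def hook(settings, **kwargs):
    # Keys name the slots; the same keys are used by every yielded record.
    settings.input_types = {
        'image': dense_vector(3072),  # e.g. a flattened 32x32x3 image
        'label': integer_value(10)    # e.g. 10 classes
    }

@provider(init_hook=hook)
def process(settings, file_name):
    # One record per sample, keyed by the slot names declared above.
    yield {'image': [0.0] * 3072, 'label': 0}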
dataprovider.pyc
empty.list
train.log
output
train.list
@@ -17,8 +17,10 @@ import random
 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
+    input_types={'x': dense_vector(1),
+                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
     for i in xrange(2000):
         x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
@@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *

 # 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f:
-    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
     test_list=None,
     module='dataprovider',
     obj='process',
......
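With named slots, the keys in input_types (and in each yielded record) line up with the data_layer names on the network side, which is what lets Paddle route each field to the right layer. A hedged sketch of the matching trainer config for this x/y provider; the layer setup is illustrative:

# Sketch: the data_layer names must match the provider's slot names.
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, size=1, act=LinearActivation())
cost = regression_cost(input=y_predict, label=y)
outputs(cost)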
@@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
+data/pred.list
+data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
     # settings.input_types specifies the data types that the data provider
     # generates.
-    settings.input_types = [
+    settings.input_types = {
         # The first input is a sparse_binary_vector,
         # which means each dimension of the vector is either 0 or 1. It is the
         # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
         # The second input is an integer. It represents the category id of the
         # sample. 2 means there are two labels in the dataset.
         # (1 for positive and 0 for negative)
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }

 # Declaring a data provider. It has an initializer 'data_initializer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
             # Return the features for the current comment. The first is a list
             # of ids representing a 0-1 binary sparse vector of the text,
             # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}


 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}


 # Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
@@ -19,13 +19,13 @@ UNK_IDX = 0
 def initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
         # Define the type of the first input as a sequence of integers.
         # The values of the integers range from 0 to len(dictionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
         # Define the second input for label id
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }


 @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
             label, comment = line.strip().split('\t')
             words = comment.split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}


 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value(
-            len(dictionary), seq_type=SequenceType.SEQUENCE)
-    ]
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}


 @provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
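The predict initializer also adopts the integer_value_sequence shorthand; as the removed lines show, it declares the same type as integer_value with an explicit sequence flag. A small sketch of the equivalence, with an illustrative vocabulary size:

from paddle.trainer.PyDataProvider2 import integer_value, integer_value_sequence, SequenceType

vocab_size = 10000  # illustrative

# Both declarations describe an integer sequence over the same value range:
long_form = integer_value(vocab_size, seq_type=SequenceType.SEQUENCE)
short_form = integer_value_sequence(vocab_size)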
@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
     metas = meta[name]['__meta__']['raw_meta']
     for each_meta in metas:
+        slot_name = each_meta.get('name', '%s_id' % name)
         if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
         elif each_meta['type'] == 'embedding':
             is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                 len(each_meta['dict']),
                 seq_type=SequenceType.SEQUENCE
                 if is_seq else SequenceType.NO_SEQUENCE)
         elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse


+def __list_to_map__(lst):
+    ret_val = dict()
+    for each in lst:
+        k, v = each
+        ret_val[k] = v
+    return ret_val
+

 def hook(settings, meta, **kwargs):
     """
     Init hook is invoked before processing data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
     # second part is user features.
     # final part is rating score.
     # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
-    headers.append(dense_vector(1))  # Score
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
+    settings.movie_names = [h[0] for h in movie_headers]
+    headers = movie_headers
+    user_headers = list(common_utils.meta_to_header(meta, 'user'))
+    settings.user_names = [h[0] for h in user_headers]
+    headers.extend(user_headers)
+    headers.append(("rating", dense_vector(1)))  # Score

     # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
     settings.meta = meta
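The __list_to_map__ helper simply folds a list of (name, value) pairs into a dict, the same result the built-in dict constructor gives for an iterable of pairs. A quick illustration with made-up values:

pairs = [('movie_id', 0), ('user_id', 5), ('rating', [4.0])]
assert __list_to_map__(pairs) == dict(pairs)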
@@ -57,20 +69,20 @@ def process(settings, filename):
            movie_meta = settings.meta['movie'][movie_id]
            user_meta = settings.meta['user'][user_id]
-           outputs = [movie_id - 1]
+           outputs = [('movie_id', movie_id - 1)]

            # Then add movie features
-           for each_meta in movie_meta:
-               outputs.append(each_meta)
+           for i, each_meta in enumerate(movie_meta):
+               outputs.append((settings.movie_names[i + 1], each_meta))

            # Then add user id.
-           outputs.append(user_id - 1)
+           outputs.append(('user_id', user_id - 1))

            # Then add user features.
-           for each_meta in user_meta:
-               outputs.append(each_meta)
+           for i, each_meta in enumerate(user_meta):
+               outputs.append((settings.user_names[i + 1], each_meta))

            # Finally, add score
-           outputs.append([score])
+           outputs.append(('rating', [score]))

            # Return data to paddle
-           yield outputs
+           yield __list_to_map__(outputs)
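Each record handed to Paddle is therefore a map from slot name to value. A hypothetical example of one yielded record; every field name other than movie_id, user_id, and rating comes from the meta file, so the feature names below are purely illustrative:

{
    'movie_id': 0,     # zero-based movie id
    'genre': [2, 7],   # an illustrative movie feature slot
    'user_id': 5,      # zero-based user id
    'age': 3,          # an illustrative user feature slot
    'rating': [4.0]    # the score as a size-1 dense vector
}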
@@ -34,8 +34,8 @@ if __name__ == '__main__':
     network.loadParameters(model_path)
     with open('./data/meta.bin', 'rb') as f:
         meta = pickle.load(f)
-    headers = list(meta_to_header(meta, 'movie'))
-    headers.extend(list(meta_to_header(meta, 'user')))
+    headers = [h[1] for h in meta_to_header(meta, 'movie')]
+    headers.extend([h[1] for h in meta_to_header(meta, 'user')])
     cvt = DataProviderConverter(headers)
     while True:
         movie_id = int(raw_input("Input movie_id: "))
......
@@ -25,7 +25,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+gshuf $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
+data/emb
+data/targetDict.txt
+data/verbDict.txt
+data/wordDict.txt
File mode changed from 100644 to 100755
@@ -69,7 +69,7 @@ def define_py_data_source(file_list,
     """
     if isinstance(file_list, list):
         file_list_name = 'train.list'
-        if isinstance(cls, TestData):
+        if cls == TestData:
             file_list_name = 'test.list'
         with open(file_list_name, 'w') as f:
             f.writelines(file_list)
......
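The one-line fix above corrects a real bug: cls holds the class object itself (TrainData or TestData), not an instance of it, so isinstance(cls, TestData) was always False and the file name never switched to 'test.list'. A stripped-down sketch of the difference:

class TrainData(object):
    pass

class TestData(object):
    pass

cls = TestData
print(isinstance(cls, TestData))  # False: cls is the class, not an instance of it
print(cls == TestData)            # True: compares the class objects directly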
@@ -970,7 +970,7 @@ def pooling_layer(input,
     :param layer_attr: The Extra Attributes for layer, such as dropout.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
-    :rtype: LayerType
+    :rtype: LayerOutput
     """
     extra_dict = dict()
     # noinspection PyUnresolvedReferences
......