Commit 319742c6 authored by Q qijun

format python code in demo, doc, doc_cn and paddle directories

Parent: ef5e483c
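This commit is a pure reformatting pass over the Python sources; no logic changes. The commit does not say which formatter was used, so the snippet below is only a hedged sketch of how such a pass is typically reproduced, assuming yapf with a pep8-based style (the wrapped-argument layout in the hunks below matches yapf's output) and the four directories named in the commit message:

# Hypothetical reproduction of the formatting pass; the tool and style are
# assumptions, not stated anywhere in this commit.
import subprocess

# Directories named in the commit message.
TARGET_DIRS = ("demo", "doc", "doc_cn", "paddle")


def format_python_sources(dirs=TARGET_DIRS):
    """Rewrite every Python file under the given directories in place."""
    for d in dirs:
        subprocess.check_call(
            ["yapf", "--in-place", "--recursive", "--style=pep8", d])


if __name__ == "__main__":
    format_python_sources()

In a shared repository the chosen style would normally be pinned in a .style.yapf or pre-commit configuration so that every contributor produces identical output.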
@@ -16,7 +16,6 @@ import numpy as np
import sys
import os
import PIL.Image as Image
"""
Usage: python process_cifar input_dir output_dir
"""

@@ -30,6 +29,7 @@ def mkdir_not_exist(path):
    if not os.path.exists(path):
        os.mkdir(path)


def create_dir_structure(output_dir):
    """
    Create the directory structure for the directory.

@@ -39,8 +39,8 @@ def create_dir_structure(output_dir):
    mkdir_not_exist(os.path.join(output_dir, "train"))
    mkdir_not_exist(os.path.join(output_dir, "test"))


def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
    """
    Convert CIFAR batch to the structure of Paddle format.
    batch_path: the batch to be converted.

@@ -67,11 +67,23 @@ if __name__ == '__main__':
    output_dir = sys.argv[2]
    num_batch = 5
    create_dir_structure(output_dir)
    label_map = {
        0: "airplane",
        1: "automobile",
        2: "bird",
        3: "cat",
        4: "deer",
        5: "dog",
        6: "frog",
        7: "horse",
        8: "ship",
        9: "truck"
    }
    labels = {}
    for i in range(1, num_batch + 1):
        convert_batch(
            os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
            output_dir, "train")
    convert_batch(
        os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
        "test")
@@ -46,14 +46,14 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
    settings.img_mean = image_util.load_meta(settings.meta_path,
                                             settings.mean_img_size,
                                             settings.img_size, settings.color)

    settings.logger.info('Image size: %s', settings.img_size)
    settings.logger.info('Meta path: %s', settings.meta_path)
    settings.input_types = [
        dense_vector(settings.img_raw_size),  # image feature
        integer_value(settings.num_classes)
    ]  # labels

    settings.logger.info('DataProvider Initialization finished')

@@ -79,8 +79,8 @@ def processData(settings, file_list):
                img = image_util.decode_jpeg(data['images'][i])
            else:
                img = data['images'][i]
            img_feat = image_util.preprocess_img(
                img, settings.img_mean, settings.img_size,
                settings.is_train, settings.color)
            label = data['labels'][i]
            yield img_feat.astype('float32'), int(label)
...@@ -16,17 +16,20 @@ import numpy as np ...@@ -16,17 +16,20 @@ import numpy as np
from PIL import Image from PIL import Image
from cStringIO import StringIO from cStringIO import StringIO
def resize_image(img, target_size): def resize_image(img, target_size):
""" """
Resize an image so that the shorter edge has length target_size. Resize an image so that the shorter edge has length target_size.
img: the input image to be resized. img: the input image to be resized.
target_size: the target resized image size. target_size: the target resized image size.
""" """
percent = (target_size/float(min(img.size[0], img.size[1]))) percent = (target_size / float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent)) resized_size = int(round(img.size[0] * percent)), int(
round(img.size[1] * percent))
img = img.resize(resized_size, Image.ANTIALIAS) img = img.resize(resized_size, Image.ANTIALIAS)
return img return img
def flip(im): def flip(im):
""" """
Return the flipped image. Return the flipped image.
...@@ -38,6 +41,7 @@ def flip(im): ...@@ -38,6 +41,7 @@ def flip(im):
else: else:
return im[:, ::-1] return im[:, ::-1]
def crop_img(im, inner_size, color=True, test=True): def crop_img(im, inner_size, color=True, test=True):
""" """
Return cropped image. Return cropped image.
...@@ -50,20 +54,22 @@ def crop_img(im, inner_size, color=True, test=True): ...@@ -50,20 +54,22 @@ def crop_img(im, inner_size, color=True, test=True):
If True, crop the center of images. If True, crop the center of images.
""" """
if color: if color:
height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2]) height, width = max(inner_size, im.shape[1]), max(inner_size,
im.shape[2])
padded_im = np.zeros((3, height, width)) padded_im = np.zeros((3, height, width))
startY = (height - im.shape[1]) / 2 startY = (height - im.shape[1]) / 2
startX = (width - im.shape[2]) / 2 startX = (width - im.shape[2]) / 2
endY, endX = startY + im.shape[1], startX + im.shape[2] endY, endX = startY + im.shape[1], startX + im.shape[2]
padded_im[:, startY: endY, startX: endX] = im padded_im[:, startY:endY, startX:endX] = im
else: else:
im = im.astype('float32') im = im.astype('float32')
height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1]) height, width = max(inner_size, im.shape[0]), max(inner_size,
im.shape[1])
padded_im = np.zeros((height, width)) padded_im = np.zeros((height, width))
startY = (height - im.shape[0]) / 2 startY = (height - im.shape[0]) / 2
startX = (width - im.shape[1]) / 2 startX = (width - im.shape[1]) / 2
endY, endX = startY + im.shape[0], startX + im.shape[1] endY, endX = startY + im.shape[0], startX + im.shape[1]
padded_im[startY: endY, startX: endX] = im padded_im[startY:endY, startX:endX] = im
if test: if test:
startY = (height - inner_size) / 2 startY = (height - inner_size) / 2
startX = (width - inner_size) / 2 startX = (width - inner_size) / 2
...@@ -72,19 +78,21 @@ def crop_img(im, inner_size, color=True, test=True): ...@@ -72,19 +78,21 @@ def crop_img(im, inner_size, color=True, test=True):
startX = np.random.randint(0, width - inner_size + 1) startX = np.random.randint(0, width - inner_size + 1)
endY, endX = startY + inner_size, startX + inner_size endY, endX = startY + inner_size, startX + inner_size
if color: if color:
pic = padded_im[:, startY: endY, startX: endX] pic = padded_im[:, startY:endY, startX:endX]
else: else:
pic = padded_im[startY: endY, startX: endX] pic = padded_im[startY:endY, startX:endX]
if (not test) and (np.random.randint(2) == 0): if (not test) and (np.random.randint(2) == 0):
pic = flip(pic) pic = flip(pic)
return pic return pic
def decode_jpeg(jpeg_string): def decode_jpeg(jpeg_string):
np_array = np.array(Image.open(StringIO(jpeg_string))) np_array = np.array(Image.open(StringIO(jpeg_string)))
if len(np_array.shape) == 3: if len(np_array.shape) == 3:
np_array = np.transpose(np_array, (2, 0, 1)) np_array = np.transpose(np_array, (2, 0, 1))
return np_array return np_array
def preprocess_img(im, img_mean, crop_size, is_train, color=True): def preprocess_img(im, img_mean, crop_size, is_train, color=True):
""" """
Does data augmentation for images. Does data augmentation for images.
...@@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True): ...@@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True):
pic -= img_mean pic -= img_mean
return pic.flatten() return pic.flatten()
def load_meta(meta_path, mean_img_size, crop_size, color=True): def load_meta(meta_path, mean_img_size, crop_size, color=True):
""" """
Return the loaded meta file. Return the loaded meta file.
...@@ -109,17 +118,18 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True): ...@@ -109,17 +118,18 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
mean = np.load(meta_path)['data_mean'] mean = np.load(meta_path)['data_mean']
border = (mean_img_size - crop_size) / 2 border = (mean_img_size - crop_size) / 2
if color: if color:
assert(mean_img_size * mean_img_size * 3 == mean.shape[0]) assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
mean = mean.reshape(3, mean_img_size, mean_img_size) mean = mean.reshape(3, mean_img_size, mean_img_size)
mean = mean[:, border: border + crop_size, mean = mean[:, border:border + crop_size, border:border +
border: border + crop_size].astype('float32') crop_size].astype('float32')
else: else:
assert(mean_img_size * mean_img_size == mean.shape[0]) assert (mean_img_size * mean_img_size == mean.shape[0])
mean = mean.reshape(mean_img_size, mean_img_size) mean = mean.reshape(mean_img_size, mean_img_size)
mean = mean[border: border + crop_size, mean = mean[border:border + crop_size, border:border +
border: border + crop_size].astype('float32') crop_size].astype('float32')
return mean return mean
def load_image(img_path, is_color=True): def load_image(img_path, is_color=True):
""" """
Load image and return. Load image and return.
...@@ -130,6 +140,7 @@ def load_image(img_path, is_color=True): ...@@ -130,6 +140,7 @@ def load_image(img_path, is_color=True):
img.load() img.load()
return img return img
def oversample(img, crop_dims): def oversample(img, crop_dims):
""" """
image : iterable of (H x W x K) ndarrays image : iterable of (H x W x K) ndarrays
...@@ -152,50 +163,53 @@ def oversample(img, crop_dims): ...@@ -152,50 +163,53 @@ def oversample(img, crop_dims):
for j in w_indices: for j in w_indices:
crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
curr += 1 curr += 1
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([ crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
-crop_dims / 2.0, [-crop_dims / 2.0, crop_dims / 2.0])
crop_dims / 2.0
])
crops_ix = np.tile(crops_ix, (2, 1)) crops_ix = np.tile(crops_ix, (2, 1))
# Extract crops # Extract crops
crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1], crops = np.empty(
im_shape[-1]), dtype=np.float32) (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
dtype=np.float32)
ix = 0 ix = 0
for im in img: for im in img:
for crop in crops_ix: for crop in crops_ix:
crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :] crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
ix += 1 ix += 1
crops[ix-5:ix] = crops[ix-5:ix, :, ::-1, :] # flip for mirrors crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors
return crops return crops
class ImageTransformer: class ImageTransformer:
def __init__(self, transpose = None, def __init__(self,
channel_swap = None, mean = None, is_color = True): transpose=None,
channel_swap=None,
mean=None,
is_color=True):
self.transpose = transpose self.transpose = transpose
self.channel_swap = None self.channel_swap = None
self.mean = None self.mean = None
self.is_color = is_color self.is_color = is_color
def set_transpose(self, order): def set_transpose(self, order):
if self.is_color: if self.is_color:
assert 3 == len(order) assert 3 == len(order)
self.transpose = order self.transpose = order
def set_channel_swap(self, order): def set_channel_swap(self, order):
if self.is_color: if self.is_color:
assert 3 == len(order) assert 3 == len(order)
self.channel_swap = order self.channel_swap = order
def set_mean(self, mean): def set_mean(self, mean):
# mean value, may be one value per channel # mean value, may be one value per channel
if mean.ndim == 1: if mean.ndim == 1:
mean = mean[:, np.newaxis, np.newaxis] mean = mean[:, np.newaxis, np.newaxis]
else: else:
# elementwise mean # elementwise mean
if self.is_color: if self.is_color:
assert len(mean.shape) == 3 assert len(mean.shape) == 3
self.mean = mean self.mean = mean
def transformer(self, data): def transformer(self, data):
if self.transpose is not None: if self.transpose is not None:
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os,sys import os, sys
import numpy as np import numpy as np
import logging import logging
from PIL import Image from PIL import Image
...@@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter ...@@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO) logging.getLogger().setLevel(logging.INFO)
class ImageClassifier(): class ImageClassifier():
def __init__(self, def __init__(self,
train_conf, train_conf,
...@@ -58,18 +60,19 @@ class ImageClassifier(): ...@@ -58,18 +60,19 @@ class ImageClassifier():
self.oversample = oversample self.oversample = oversample
self.is_color = is_color self.is_color = is_color
self.transformer = image_util.ImageTransformer(is_color = is_color) self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2,0,1)) self.transformer.set_transpose((2, 0, 1))
self.mean_file = mean_file self.mean_file = mean_file
mean = np.load(self.mean_file)['data_mean'] mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel self.transformer.set_mean(mean) # mean pixel
gpu = 1 if use_gpu else 0 gpu = 1 if use_gpu else 0
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
conf = parse_config(train_conf, conf_args) conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (gpu)) swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine) assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
...@@ -90,14 +93,14 @@ class ImageClassifier(): ...@@ -90,14 +93,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim # image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim) image = image_util.resize_image(image, self.resize_dim)
image = np.array(image) image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3), input = np.zeros(
dtype=np.float32) (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32) input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims) input = image_util.oversample(input, self.crop_dims)
else: else:
image = image.resize(self.crop_dims, Image.ANTIALIAS) image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), input = np.zeros(
dtype=np.float32) (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32) input[0] = np.array(image).astype(np.float32)
data_in = [] data_in = []
...@@ -133,22 +136,24 @@ class ImageClassifier(): ...@@ -133,22 +136,24 @@ class ImageClassifier():
lab = np.argsort(-prob) lab = np.argsort(-prob)
logging.info("Label of %s is: %d", image, lab[0]) logging.info("Label of %s is: %d", image, lab[0])
if __name__ == '__main__': if __name__ == '__main__':
image_size=32 image_size = 32
crop_size=32 crop_size = 32
multi_crop=True multi_crop = True
config="vgg_16_cifar.py" config = "vgg_16_cifar.py"
output_layer="__fc_layer_1__" output_layer = "__fc_layer_1__"
mean_path="data/cifar-out/batches/batches.meta" mean_path = "data/cifar-out/batches/batches.meta"
model_path=sys.argv[1] model_path = sys.argv[1]
image=sys.argv[2] image = sys.argv[2]
use_gpu=bool(int(sys.argv[3])) use_gpu = bool(int(sys.argv[3]))
obj = ImageClassifier(train_conf=config, obj = ImageClassifier(
model_dir=model_path, train_conf=config,
resize_dim=image_size, model_dir=model_path,
crop_dim=crop_size, resize_dim=image_size,
mean_file=mean_path, crop_dim=crop_size,
use_gpu=use_gpu, mean_file=mean_path,
oversample=multi_crop) use_gpu=use_gpu,
oversample=multi_crop)
obj.predict(image, output_layer) obj.predict(image, output_layer)
@@ -19,24 +19,36 @@ from optparse import OptionParser


def option_parser():
    parser = OptionParser(usage="usage: python preprcoess.py "\
                                "-i data_dir [options]")
    parser.add_option(
        "-i",
        "--input",
        action="store",
        dest="input",
        help="Input data directory.")
    parser.add_option(
        "-s",
        "--size",
        action="store",
        dest="size",
        help="Processed image size.")
    parser.add_option(
        "-c",
        "--color",
        action="store",
        dest="color",
        help="whether to use color images.")
    return parser.parse_args()


if __name__ == '__main__':
    options, args = option_parser()
    data_dir = options.input
    processed_image_size = int(options.size)
    color = options.color == "1"
    data_creator = ImageClassificationDatasetCreater(
        data_dir, processed_image_size, color)
    data_creator.train_list_name = "train.txt"
    data_creator.test_list_name = "test.txt"
    data_creator.num_per_batch = 1000
    data_creator.overwrite = True
    data_creator.create_batches()
@@ -18,36 +18,38 @@ is_predict = get_config_arg("is_predict", bool, False)

####################Data Configuration ##################
if not is_predict:
    data_dir = 'data/cifar-out/batches/'
    meta_path = data_dir + 'batches.meta'
    args = {
        'meta': meta_path,
        'mean_img_size': 32,
        'img_size': 32,
        'num_classes': 10,
        'use_jpeg': 1,
        'color': "color"
    }

    define_py_data_sources2(
        train_list="train.list",
        test_list="train.list",
        module='image_provider',
        obj='processData',
        args=args)

######################Algorithm Configuration #############
settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * 128))

#######################Network Configuration #############
data_size = 3 * 32 * 32
label_size = 10
img = data_layer(name='image', size=data_size)
# small_vgg is predefined in trainer_config_helpers.networks
predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size)

if not is_predict:
    lbl = data_layer(name="label", size=label_size)
...
@@ -15,10 +15,10 @@
from paddle.trainer.PyDataProvider2 import *
import random


# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
def process(settings, input_file):
    for i in xrange(2000):
        x = random.random()
        yield [x], [2 * x + 0.3]
@@ -23,14 +23,17 @@ Usage:
import numpy as np
import os


def load(file_name):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header for float type.
        return np.fromfile(f, dtype=np.float32)


def main():
    print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
                                           load('output/pass-00029/b'))


if __name__ == '__main__':
    main()
@@ -16,9 +16,14 @@ from paddle.trainer_config_helpers import *

# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f:
    f.writelines(' ')
define_py_data_sources2(
    train_list=data_file,
    test_list=None,
    module='dataprovider',
    obj='process',
    args={})

# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())

@@ -26,7 +31,11 @@ settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(
    input=x,
    param_attr=ParamAttr(name='w'),
    size=1,
    act=LinearActivation(),
    bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
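The linear-regression example above (the data provider, the evaluate script, and this trainer configuration) is driven from the command line. The diff does not include the launch command, so the call below is only a sketch, assuming the classic pre-Fluid `paddle train` CLI and the output/pass-00029 directory that the evaluate script reads:

# Hypothetical launcher for the regression example; the flags follow the
# classic `paddle train` command line and are not part of this commit.
import subprocess

subprocess.check_call([
    "paddle", "train",
    "--config=trainer_config.py",  # the configuration shown above
    "--save_dir=./output",  # evaluate script loads output/pass-00029/w and /b
    "--num_passes=30",  # pass-00029 is the last saved pass
    "--use_gpu=false",
])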
@@ -13,9 +13,9 @@
# limitations under the License.

o = open("./" + "train.list", "w")
o.write("./data/raw_data/train" + "\n")
o.close()
o = open("./" + "test.list", "w")
o.write("./data/raw_data/t10k" + "\n")
o.close()
@@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import *


# Define a py data provider
@provider(
    input_types={'pixel': dense_vector(28 * 28),
                 'label': integer_value(10)})
def process(settings, filename):  # settings is not used currently.
    imgf = filename + "-images-idx3-ubyte"
    labelf = filename + "-labels-idx1-ubyte"
...
@@ -18,32 +18,29 @@ is_predict = get_config_arg("is_predict", bool, False)

####################Data Configuration ##################
if not is_predict:
    data_dir = './data/'
    define_py_data_sources2(
        train_list=data_dir + 'train.list',
        test_list=data_dir + 'test.list',
        module='mnist_provider',
        obj='process')

######################Algorithm Configuration #############
settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * 128))

#######################Network Configuration #############
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)

# small_vgg is predined in trainer_config_helpers.network
predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)

if not is_predict:
    lbl = data_layer(name="label", size=label_size)
...
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Example: Example:
python extract_para.py --preModel PREMODEL --preDict PREDICT \ python extract_para.py --preModel PREMODEL --preDict PREDICT \
...@@ -29,6 +28,7 @@ Options: ...@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser from optparse import OptionParser
import struct import struct
def get_row_index(preDict, usrDict): def get_row_index(preDict, usrDict):
""" """
Get the row positions for all words in user dictionary from pre-trained dictionary. Get the row positions for all words in user dictionary from pre-trained dictionary.
...@@ -47,7 +47,9 @@ def get_row_index(preDict, usrDict): ...@@ -47,7 +47,9 @@ def get_row_index(preDict, usrDict):
pos.append(index[word]) pos.append(index[word])
return pos return pos
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim):
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
paraDim):
""" """
Extract desired parameters from a pretrained embedding model based on user dictionary Extract desired parameters from a pretrained embedding model based on user dictionary
""" """
...@@ -70,6 +72,7 @@ def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim) ...@@ -70,6 +72,7 @@ def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim)
print "extract parameters finish, total", len(rowIndex), "lines" print "extract parameters finish, total", len(rowIndex), "lines"
fi.close() fi.close()
def main(): def main():
""" """
Main entry for running paraconvert.py Main entry for running paraconvert.py
...@@ -78,19 +81,33 @@ def main(): ...@@ -78,19 +81,33 @@ def main():
"python %prog --preModel PREMODEL --preDict PREDICT" \ "python %prog --preModel PREMODEL --preDict PREDICT" \
" --usrModel USRMODEL --usrDict USRDICT -d DIM" " --usrModel USRMODEL --usrDict USRDICT -d DIM"
parser = OptionParser(usage) parser = OptionParser(usage)
parser.add_option("--preModel", action="store", dest="preModel", parser.add_option(
help="the name of pretrained embedding model") "--preModel",
parser.add_option("--preDict", action="store", dest="preDict", action="store",
help="the name of pretrained dictionary") dest="preModel",
parser.add_option("--usrModel", action="store", dest="usrModel", help="the name of pretrained embedding model")
help="the name of output usr embedding model") parser.add_option(
parser.add_option("--usrDict", action="store", dest="usrDict", "--preDict",
help="the name of user specified dictionary") action="store",
parser.add_option("-d", action="store", dest="dim", dest="preDict",
help="dimension of parameter") help="the name of pretrained dictionary")
parser.add_option(
"--usrModel",
action="store",
dest="usrModel",
help="the name of output usr embedding model")
parser.add_option(
"--usrDict",
action="store",
dest="usrDict",
help="the name of user specified dictionary")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
extract_parameters_by_usrDict(options.preModel, options.preDict, extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict, int(options.dim)) options.usrModel, options.usrDict,
int(options.dim))
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Example: Example:
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
...@@ -29,6 +28,7 @@ Options: ...@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser from optparse import OptionParser
import struct import struct
def binary2text(input, output, paraDim): def binary2text(input, output, paraDim):
""" """
Convert a binary parameter file of embedding model to be a text file. Convert a binary parameter file of embedding model to be a text file.
...@@ -76,12 +76,13 @@ def binary2text(input, output, paraDim): ...@@ -76,12 +76,13 @@ def binary2text(input, output, paraDim):
fo.close() fo.close()
print "binary2text finish, total", line, "lines" print "binary2text finish, total", line, "lines"
def get_para_count(input): def get_para_count(input):
""" """
Compute the total number of embedding parameters in input text file. Compute the total number of embedding parameters in input text file.
input: the name of input text file input: the name of input text file
""" """
numRows = 1 numRows = 1
paraDim = 0 paraDim = 0
with open(input) as f: with open(input) as f:
line = f.readline() line = f.readline()
...@@ -90,6 +91,7 @@ def get_para_count(input): ...@@ -90,6 +91,7 @@ def get_para_count(input):
numRows += 1 numRows += 1
return numRows * paraDim return numRows * paraDim
def text2binary(input, output, paddle_head=True): def text2binary(input, output, paddle_head=True):
""" """
Convert a text parameter file of embedding model to be a binary file. Convert a text parameter file of embedding model to be a binary file.
...@@ -123,6 +125,7 @@ def text2binary(input, output, paddle_head=True): ...@@ -123,6 +125,7 @@ def text2binary(input, output, paddle_head=True):
fo.close() fo.close()
print "text2binary finish, total", count, "lines" print "text2binary finish, total", count, "lines"
def main(): def main():
""" """
Main entry for running paraconvert.py Main entry for running paraconvert.py
...@@ -131,21 +134,26 @@ def main(): ...@@ -131,21 +134,26 @@ def main():
"python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \ "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
"python %prog --t2b -i INPUT -o OUTPUT" "python %prog --t2b -i INPUT -o OUTPUT"
parser = OptionParser(usage) parser = OptionParser(usage)
parser.add_option("--b2t", action="store_true", parser.add_option(
help="convert parameter file of embedding model from binary to text") "--b2t",
parser.add_option("--t2b", action="store_true", action="store_true",
help="convert parameter file of embedding model from text to binary") help="convert parameter file of embedding model from binary to text")
parser.add_option("-i", action="store", dest="input", parser.add_option(
help="input parameter file name") "--t2b",
parser.add_option("-o", action="store", dest="output", action="store_true",
help="output parameter file name") help="convert parameter file of embedding model from text to binary")
parser.add_option("-d", action="store", dest="dim", parser.add_option(
help="dimension of parameter") "-i", action="store", dest="input", help="input parameter file name")
parser.add_option(
"-o", action="store", dest="output", help="output parameter file name")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
if options.b2t: if options.b2t:
binary2text(options.input, options.output, options.dim) binary2text(options.input, options.output, options.dim)
if options.t2b: if options.t2b:
text2binary(options.input, options.output) text2binary(options.input, options.output)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -26,16 +26,22 @@ from py_paddle import swig_paddle, DataProviderConverter ...@@ -26,16 +26,22 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO) logging.getLogger().setLevel(logging.INFO)
class ImageClassifier(): class ImageClassifier():
def __init__(self, train_conf, model_dir=None, def __init__(self,
resize_dim=256, crop_dim=224, train_conf,
model_dir=None,
resize_dim=256,
crop_dim=224,
use_gpu=True, use_gpu=True,
mean_file=None, mean_file=None,
output_layer=None, output_layer=None,
oversample=False, is_color=True): oversample=False,
is_color=True):
""" """
train_conf: network configure. train_conf: network configure.
model_dir: string, directory of model. model_dir: string, directory of model.
...@@ -62,24 +68,25 @@ class ImageClassifier(): ...@@ -62,24 +68,25 @@ class ImageClassifier():
assert isinstance(self.output_layer, basestring) assert isinstance(self.output_layer, basestring)
self.output_layer = self.output_layer.split(",") self.output_layer = self.output_layer.split(",")
self.transformer = image_util.ImageTransformer(is_color = is_color) self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2,0,1)) self.transformer.set_transpose((2, 0, 1))
self.transformer.set_channel_swap((2,1,0)) self.transformer.set_channel_swap((2, 1, 0))
self.mean_file = mean_file self.mean_file = mean_file
if self.mean_file is not None: if self.mean_file is not None:
mean = np.load(self.mean_file)['data_mean'] mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel self.transformer.set_mean(mean) # mean pixel
else: else:
# if you use three mean value, set like: # if you use three mean value, set like:
# this three mean value is calculated from ImageNet. # this three mean value is calculated from ImageNet.
self.transformer.set_mean(np.array([103.939,116.779,123.68])) self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu)) conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
conf = parse_config(train_conf, conf_args) conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu))) swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine) assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
...@@ -105,14 +112,14 @@ class ImageClassifier(): ...@@ -105,14 +112,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim # image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim) image = image_util.resize_image(image, self.resize_dim)
image = np.array(image) image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3), input = np.zeros(
dtype=np.float32) (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32) input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims) input = image_util.oversample(input, self.crop_dims)
else: else:
image = image.resize(self.crop_dims, Image.ANTIALIAS) image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), input = np.zeros(
dtype=np.float32) (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32) input[0] = np.array(image).astype(np.float32)
data_in = [] data_in = []
...@@ -172,7 +179,7 @@ class ImageClassifier(): ...@@ -172,7 +179,7 @@ class ImageClassifier():
logging.info("Label of %s is: %d", image, lab[0]) logging.info("Label of %s is: %d", image, lab[0])
return results return results
def extract(self, data_file, output_dir, batch_size = 10000): def extract(self, data_file, output_dir, batch_size=10000):
""" """
extract and save features of output layers, which are extract and save features of output layers, which are
specify in Outputs() in network configure. specify in Outputs() in network configure.
...@@ -197,7 +204,7 @@ class ImageClassifier(): ...@@ -197,7 +204,7 @@ class ImageClassifier():
image_feature[file_name] = feature image_feature[file_name] = feature
sample_num += 1 sample_num += 1
if sample_num == batch_size: if sample_num == batch_size:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name) self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num) logging.info('Finish batch %d', batch_num)
batch_num += 1 batch_num += 1
...@@ -206,7 +213,7 @@ class ImageClassifier(): ...@@ -206,7 +213,7 @@ class ImageClassifier():
if idx % 1000 == 0: if idx % 1000 == 0:
logging.info('%d/%d, %s', idx, len(image_files), file_name) logging.info('%d/%d, %s', idx, len(image_files), file_name)
if sample_num > 0: if sample_num > 0:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name) self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num) logging.info('Finish batch %d', batch_num)
logging.info('Done: make image feature batch') logging.info('Done: make image feature batch')
...@@ -215,38 +222,64 @@ class ImageClassifier(): ...@@ -215,38 +222,64 @@ class ImageClassifier():
of = open(file, 'wb') of = open(file, 'wb')
cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL) cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
def option_parser(): def option_parser():
""" """
Main entry for predciting Main entry for predciting
""" """
usage = "%prog -c config -i data_list -w model_dir [options]" usage = "%prog -c config -i data_list -w model_dir [options]"
parser = OptionParser(usage="usage: %s" % usage) parser = OptionParser(usage="usage: %s" % usage)
parser.add_option("-j", "--job", parser.add_option(
action="store", dest="job_type", "-j",
help="job type: predict, extract\ "--job",
action="store",
dest="job_type",
help="job type: predict, extract\
predict: predicting,\ predict: predicting,\
extract: extract features") extract: extract features")
parser.add_option("-c", "--conf", parser.add_option(
action="store", dest="train_conf", "-c",
help="network config") "--conf",
parser.add_option("-i", "--data", action="store",
action="store", dest="data_file", dest="train_conf",
help="image list") help="network config")
parser.add_option("-w", "--model", parser.add_option(
action="store", dest="model_path", "-i", "--data", action="store", dest="data_file", help="image list")
default=None, help="model path") parser.add_option(
parser.add_option("-g", "--use_gpu", action="store", "-w",
dest="use_gpu", default=True, "--model",
help="Whether to use gpu mode.") action="store",
parser.add_option("-o", "--output_dir", dest="model_path",
action="store", dest="output_dir", default=None,
default="output", help="output path") help="model path")
parser.add_option("-m", "--mean", action="store", parser.add_option(
dest="mean", default=None, "-g",
help="mean file.") "--use_gpu",
parser.add_option("-p", "--multi_crop", action="store_true", action="store",
dest="multi_crop", default=False, dest="use_gpu",
help="Wether to use multiple crops on image.") default=True,
help="Whether to use gpu mode.")
parser.add_option(
"-o",
"--output_dir",
action="store",
dest="output_dir",
default="output",
help="output path")
parser.add_option(
"-m",
"--mean",
action="store",
dest="mean",
default=None,
help="mean file.")
parser.add_option(
"-p",
"--multi_crop",
action="store_true",
dest="multi_crop",
default=False,
help="Wether to use multiple crops on image.")
parser.add_option("-l", "--output_layer", action="store", parser.add_option("-l", "--output_layer", action="store",
dest="output_layer", default=None, dest="output_layer", default=None,
help="--job=extract, specify layers to extract "\ help="--job=extract, specify layers to extract "\
...@@ -254,24 +287,26 @@ def option_parser(): ...@@ -254,24 +287,26 @@ def option_parser():
"classification probability, output in resnet.py.") "classification probability, output in resnet.py.")
return parser.parse_args() return parser.parse_args()
def main(): def main():
""" """
1. parse input arguments. 1. parse input arguments.
2. predicting or extract features according job type. 2. predicting or extract features according job type.
""" """
options, args = option_parser() options, args = option_parser()
obj = ImageClassifier(options.train_conf, obj = ImageClassifier(
options.model_path, options.train_conf,
use_gpu=options.use_gpu, options.model_path,
mean_file=options.mean, use_gpu=options.use_gpu,
output_layer=options.output_layer, mean_file=options.mean,
oversample=options.multi_crop) output_layer=options.output_layer,
oversample=options.multi_crop)
if options.job_type == "predict": if options.job_type == "predict":
obj.predict(options.data_file) obj.predict(options.data_file)
elif options.job_type == "extract": elif options.job_type == "extract":
obj.extract(options.data_file, obj.extract(options.data_file, options.output_dir)
options.output_dir)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -16,8 +16,7 @@ from paddle.utils.image_util import *
from paddle.trainer.PyDataProvider2 import *


def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
    """
    Description: Init with a list of data file
    file_list is the name list of input files.

@@ -58,7 +57,7 @@ def hook(settings, image_size, crop_size, color, file_list,
        sz = settings.crop_size * settings.crop_size
        settings.img_mean = np.zeros(sz * 3, dtype=np.single)
        for idx, value in enumerate(settings.mean_value):
            settings.img_mean[idx * sz:(idx + 1) * sz] = value
        settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
                                                      settings.crop_size)

@@ -69,7 +68,8 @@ def hook(settings, image_size, crop_size, color, file_list,
    settings.input_types = [
        dense_vector(settings.img_input_size),  # image feature
        integer_value(1)
    ]  # labels

    settings.logger.info('Image short side: %s', settings.img_size)
    settings.logger.info('Crop size: %s', settings.crop_size)

@@ -97,9 +97,6 @@ def processData(settings, file_list):
            # swap channel
            if settings.is_swap_channel:
                img = img[settings.swap_channel, :, :]
            img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
                                      settings.is_train, settings.color)
            yield img_feat.tolist(), int(lab.strip())
@@ -17,9 +17,11 @@ import sys
import cPickle
import logging

logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)


def load_feature_c(file):
    """
    Load feature extracted by C++ interface.

@@ -30,14 +32,15 @@ def load_feature_c(file):
    f = open(file, 'r')
    for line in f:
        sample = []
        for slot in line.strip().split(";"):
            fea = [float(val) for val in slot.strip().split()]
            if fea:
                sample.append(fea)
        features.append(sample)
    f.close()
    return features


def load_feature_py(feature_dir):
    """
    Load feature extracted by python interface.

@@ -54,6 +57,7 @@ def load_feature_py(feature_dir):
        logging.info('Load feature file %s', file_name)
    return features


if __name__ == '__main__':
    print load_feature_py(sys.argv[1])
    #print load_feature_c(sys.argv[1])
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
# limitations under the License. # limitations under the License.
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
""" """
paper: https://arxiv.org/abs/1512.03385 paper: https://arxiv.org/abs/1512.03385
""" """
...@@ -28,15 +27,19 @@ if not is_predict and data_provider: ...@@ -28,15 +27,19 @@ if not is_predict and data_provider:
# mean.meta size : 3 x 224 x 224. # mean.meta size : 3 x 224 x 224.
# If you use three mean value, set like: # If you use three mean value, set like:
# "mean_value:103.939,116.779,123.68;" # "mean_value:103.939,116.779,123.68;"
args={ args = {
'mean_meta': "model/mean_meta_224/mean.meta", 'mean_meta': "model/mean_meta_224/mean.meta",
'image_size': 224, 'crop_size': 224, 'image_size': 224,
'color': True,'swap_channel:': [2, 1, 0]} 'crop_size': 224,
define_py_data_sources2(train_list, 'color': True,
'example/test.list', 'swap_channel:': [2, 1, 0]
module="example.image_list_provider", }
obj="processData", define_py_data_sources2(
args=args) train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
batch_size = 1 batch_size = 1
learning_rate = 0.1 / batch_size learning_rate = 0.1 / batch_size
...@@ -54,12 +57,16 @@ Settings( ...@@ -54,12 +57,16 @@ Settings(
learning_method='momentum', learning_method='momentum',
learning_rate_decay_a=0.5, learning_rate_decay_a=0.5,
learning_rate_decay_b=1200000 * 10, learning_rate_decay_b=1200000 * 10,
learning_rate_schedule="discexp", learning_rate_schedule="discexp", )
)
def conv_bn_layer(name, input, filter_size, num_filters, def conv_bn_layer(name,
stride, padding, channels=None, input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()): active_type=ReluActivation()):
""" """
A wrapper for conv layer with batch normalization layers. A wrapper for conv layer with batch normalization layers.
...@@ -67,19 +74,18 @@ def conv_bn_layer(name, input, filter_size, num_filters, ...@@ -67,19 +74,18 @@ def conv_bn_layer(name, input, filter_size, num_filters,
conv layer has no activation. conv layer has no activation.
""" """
tmp = img_conv_layer(name=name + "_conv", tmp = img_conv_layer(
input=input, name=name + "_conv",
filter_size=filter_size, input=input,
num_channels=channels, filter_size=filter_size,
num_filters=num_filters, num_channels=channels,
stride=stride, num_filters=num_filters,
padding=padding, stride=stride,
act=LinearActivation(), padding=padding,
bias_attr=False) act=LinearActivation(),
return batch_norm_layer(name=name + "_bn", bias_attr=False)
input=tmp, return batch_norm_layer(
act=active_type, name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
use_global_stats=is_test)
def bottleneck_block(name, input, num_filters1, num_filters2): def bottleneck_block(name, input, num_filters1, num_filters2):
...@@ -88,29 +94,31 @@ def bottleneck_block(name, input, num_filters1, num_filters2): ...@@ -88,29 +94,31 @@ def bottleneck_block(name, input, num_filters1, num_filters2):
Last conv_bn_layer has no activation. Last conv_bn_layer has no activation.
Addto layer has activation of relu. Addto layer has activation of relu.
""" """
last_name = conv_bn_layer(name=name + '_branch2a', last_name = conv_bn_layer(
input=input, name=name + '_branch2a',
filter_size=1, input=input,
num_filters=num_filters1, filter_size=1,
stride=1, num_filters=num_filters1,
padding=0) stride=1,
last_name = conv_bn_layer(name=name + '_branch2b', padding=0)
input=last_name, last_name = conv_bn_layer(
filter_size=3, name=name + '_branch2b',
num_filters=num_filters1, input=last_name,
stride=1, filter_size=3,
padding=1) num_filters=num_filters1,
last_name = conv_bn_layer(name=name + '_branch2c', stride=1,
input=last_name, padding=1)
filter_size=1, last_name = conv_bn_layer(
num_filters=num_filters2, name=name + '_branch2c',
stride=1, input=last_name,
padding=0, filter_size=1,
active_type=LinearActivation()) num_filters=num_filters2,
stride=1,
return addto_layer(name=name + "_addto", padding=0,
input=[input, last_name], active_type=LinearActivation())
act=ReluActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2): def mid_projection(name, input, num_filters1, num_filters2, stride=2):
...@@ -123,38 +131,41 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2): ...@@ -123,38 +131,41 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2):
branch2x: bottleneck building block, shortcuts are identity. branch2x: bottleneck building block, shortcuts are identity.
""" """
# stride = 2 # stride = 2
branch1 = conv_bn_layer(name=name + '_branch1', branch1 = conv_bn_layer(
input=input, name=name + '_branch1',
filter_size=1, input=input,
num_filters=num_filters2, filter_size=1,
stride=stride, num_filters=num_filters2,
padding=0, stride=stride,
active_type=LinearActivation()) padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(name=name + '_branch2a',
input=input, last_name = conv_bn_layer(
filter_size=1, name=name + '_branch2a',
num_filters=num_filters1, input=input,
stride=stride, filter_size=1,
padding=0) num_filters=num_filters1,
last_name = conv_bn_layer(name=name + '_branch2b', stride=stride,
input=last_name, padding=0)
filter_size=3, last_name = conv_bn_layer(
num_filters=num_filters1, name=name + '_branch2b',
stride=1, input=last_name,
padding=1) filter_size=3,
num_filters=num_filters1,
last_name = conv_bn_layer(name=name + '_branch2c', stride=1,
input=last_name, padding=1)
filter_size=1,
num_filters=num_filters2, last_name = conv_bn_layer(
stride=1, name=name + '_branch2c',
padding=0, input=last_name,
active_type=LinearActivation()) filter_size=1,
num_filters=num_filters2,
return addto_layer(name=name + "_addto", stride=1,
input=[branch1, last_name], padding=0,
act=ReluActivation()) active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
...@@ -168,67 +179,67 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): ...@@ -168,67 +179,67 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
# For ImageNet # For ImageNet
# conv1: 112x112 # conv1: 112x112
img = data_layer(name='input', size=224 * 224 * 3) img = data_layer(name='input', size=224 * 224 * 3)
tmp = conv_bn_layer("conv1", img, tmp = conv_bn_layer(
filter_size=7, "conv1",
channels=3, img,
num_filters=64, filter_size=7,
stride=2, channels=3,
padding=3) num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56 # conv2_x: 56x56
tmp = mid_projection(name="res2_1", tmp = mid_projection(
input=tmp, name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
num_filters1=64,
num_filters2=256,
stride=1)
for i in xrange(2, res2_num + 1, 1): for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(name="res2_" + str(i), tmp = bottleneck_block(
input=tmp, name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
num_filters1=64,
num_filters2=256)
# conv3_x: 28x28 # conv3_x: 28x28
tmp = mid_projection(name="res3_1", tmp = mid_projection(
input=tmp, name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
num_filters1=128,
num_filters2=512)
for i in xrange(2, res3_num + 1, 1): for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(name="res3_" + str(i), tmp = bottleneck_block(
input=tmp, num_filters1=128, name="res3_" + str(i),
num_filters2=512) input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14 # conv4_x: 14x14
tmp = mid_projection(name="res4_1", input=tmp, tmp = mid_projection(
num_filters1=256, num_filters2=1024) name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1): for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(name="res4_" + str(i), tmp = bottleneck_block(
input=tmp, name="res4_" + str(i),
num_filters1=256, input=tmp,
num_filters2=1024) num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7 # conv5_x: 7x7
tmp = mid_projection(name="res5_1", input=tmp, tmp = mid_projection(
num_filters1=512, num_filters2=2048) name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1): for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(name="res5_" + str(i), tmp = bottleneck_block(
input=tmp, num_filters1=512, name="res5_" + str(i),
num_filters2=2048) input=tmp,
num_filters1=512,
tmp = img_pool_layer(name='avgpool', num_filters2=2048)
input=tmp,
pool_size=7, tmp = img_pool_layer(
stride=1, name='avgpool',
pool_type=AvgPooling()) input=tmp,
pool_size=7,
output = fc_layer(name='output', stride=1,
input=tmp, pool_type=AvgPooling())
size=1000,
act=SoftmaxActivation()) output = fc_layer(
name='output', input=tmp, size=1000, act=SoftmaxActivation())
if not is_predict: if not is_predict:
classification_cost(input=output, label=data_layer(name='label', classification_cost(
size=1)) input=output, label=data_layer(
name='label', size=1))
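With the default repeat counts (res2_num=3, res3_num=4, res4_num=6, res5_num=3) each stage stacks one mid_projection block followed by identity bottleneck blocks, and the spatial sizes follow the comments above (224 -> 112 -> 56 -> 28 -> 14 -> 7). A quick sanity check, outside the framework, of why this configuration is commonly called ResNet-50:

repeats = [3, 4, 6, 3]          # res2..res5 block counts used above
convs_per_block = 3             # each bottleneck is a 1x1 -> 3x3 -> 1x1 stack
weighted_layers = 1 + convs_per_block * sum(repeats) + 1   # conv1 + blocks + output fc
assert weighted_layers == 50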
def res_net_50(): def res_net_50():
......
...@@ -22,27 +22,32 @@ from py_paddle import DataProviderConverter ...@@ -22,27 +22,32 @@ from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \ from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--train_data", parser.add_argument(
type=str, required=False, help="train data file") "--train_data", type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file") parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument("--config", parser.add_argument(
type=str, required=True, help="config file name") "--config", type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file") parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument("--seq", parser.add_argument(
default=1, type=int, "--seq", default=1, type=int, help="whether use sequence training")
help="whether use sequence training") parser.add_argument(
parser.add_argument("--use_gpu", default=0, type=int, "--use_gpu", default=0, type=int, help="whether use GPU for training")
help="whether use GPU for training") parser.add_argument(
parser.add_argument("--trainer_count", default=1, type=int, "--trainer_count",
help="Number of threads for training") default=1,
parser.add_argument("--num_passes", default=5, type=int, type=int,
help="Number of training passes") help="Number of threads for training")
parser.add_argument(
"--num_passes", default=5, type=int, help="Number of training passes")
return parser.parse_args() return parser.parse_args()
UNK_IDX = 0 UNK_IDX = 0
def load_data(file_name, word_dict): def load_data(file_name, word_dict):
with open(file_name, 'r') as f: with open(file_name, 'r') as f:
for line in f: for line in f:
...@@ -51,6 +56,7 @@ def load_data(file_name, word_dict): ...@@ -51,6 +56,7 @@ def load_data(file_name, word_dict):
word_slot = [word_dict.get(w, UNK_IDX) for w in words] word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label) yield word_slot, int(label)
def load_dict(dict_file): def load_dict(dict_file):
word_dict = dict() word_dict = dict()
with open(dict_file, 'r') as f: with open(dict_file, 'r') as f:
...@@ -59,6 +65,7 @@ def load_dict(dict_file): ...@@ -59,6 +65,7 @@ def load_dict(dict_file):
word_dict[w] = i word_dict[w] = i
return word_dict return word_dict
def main(): def main():
options = parse_arguments() options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu, api.initPaddle("--use_gpu=%s" % options.use_gpu,
...@@ -86,9 +93,9 @@ def main(): ...@@ -86,9 +93,9 @@ def main():
# create a data converter which converts data to PaddlePaddle # create a data converter which converts data to PaddlePaddle
# internal format # internal format
input_types = [ input_types = [
integer_value_sequence(len(word_dict)) if options.seq integer_value_sequence(len(word_dict)) if options.seq else
else sparse_binary_vector(len(word_dict)), sparse_binary_vector(len(word_dict)), integer_value(2)
integer_value(2)] ]
converter = DataProviderConverter(input_types) converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size batch_size = trainer_config.opt_config.batch_size
...@@ -102,7 +109,7 @@ def main(): ...@@ -102,7 +109,7 @@ def main():
trainer.trainOneDataBatch(size, converter(batch)) trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass() trainer.finishTrainPass()
if test_dataset: if test_dataset:
trainer.startTestPeriod(); trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size): for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size) batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos) size = min(batch_size, len(test_dataset) - pos)
...@@ -110,5 +117,6 @@ def main(): ...@@ -110,5 +117,6 @@ def main():
trainer.finishTestPeriod() trainer.finishTestPeriod()
trainer.finishTrain() trainer.finishTrain()
if __name__ == '__main__': if __name__ == '__main__':
main() main()
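The train and test loops above walk the in-memory dataset in fixed-size slices and hand each slice to the DataProviderConverter. The slicing itself is plain Python; a minimal stand-alone sketch of the same pattern on a toy dataset:

import itertools

dataset = [([1, 2, 3], 0), ([4, 5], 1), ([6], 0)]      # toy (word_slot, label) pairs
batch_size = 2
sizes = []
for pos in range(0, len(dataset), batch_size):
    batch = list(itertools.islice(dataset, pos, pos + batch_size))
    sizes.append(min(batch_size, len(dataset) - pos))   # the last batch may be smaller
assert sizes == [2, 1]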
...@@ -17,6 +17,7 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -17,6 +17,7 @@ from paddle.trainer.PyDataProvider2 import *
# id of the word not in dictionary # id of the word not in dictionary
UNK_IDX = 0 UNK_IDX = 0
# initializer is called by the framework during initialization. # initializer is called by the framework during initialization.
# It allows the user to describe the data types and setup the # It allows the user to describe the data types and setup the
# necessary data structure for later use. # necessary data structure for later use.
...@@ -38,7 +39,9 @@ def initializer(settings, dictionary, **kwargs): ...@@ -38,7 +39,9 @@ def initializer(settings, dictionary, **kwargs):
# The second input is an integer. It represents the category id of the # The second input is an integer. It represents the category id of the
# sample. 2 means there are two labels in the dataset. # sample. 2 means there are two labels in the dataset.
# (1 for positive and 0 for negative) # (1 for positive and 0 for negative)
integer_value(2)] integer_value(2)
]
# Declaring a data provider. It has an initializer 'data_initialzer'. # Declaring a data provider. It has an initializer 'data_initialzer'.
# It will cache the generated data of the first pass in memory, so that # It will cache the generated data of the first pass in memory, so that
...@@ -69,9 +72,8 @@ def process(settings, file_name): ...@@ -69,9 +72,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs): def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary settings.word_dict = dictionary
settings.input_types = [ settings.input_types = [sparse_binary_vector(len(dictionary))]
sparse_binary_vector(len(dictionary))
]
# Declaring a data provider for prediction. The difference with process # Declaring a data provider for prediction. The difference with process
# is that label is not generated. # is that label is not generated.
......
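Taken together, the initializer above declares one sparse binary slot over the dictionary plus a two-class label, and the process function turns each line into word ids. A condensed sketch of how such a bag-of-words provider fits together, reusing only pieces shown in these providers (the 'label\t\tcomment' line format is borrowed from a later provider in this diff, so treat the details as illustrative):

from paddle.trainer.PyDataProvider2 import *

UNK_IDX = 0


def initializer(settings, dictionary, **kwargs):
    settings.word_dict = dictionary
    settings.input_types = [
        sparse_binary_vector(len(dictionary)),  # bag of words over the dictionary
        integer_value(2)                        # binary label
    ]


@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
    with open(file_name, 'r') as f:
        for line in f:
            label, comment = line.strip().split('\t\t')
            word_vector = [
                settings.word_dict.get(w, UNK_IDX) for w in comment.split()
            ]
            yield word_vector, int(label)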
...@@ -24,7 +24,8 @@ def initializer(settings, dictionary, **kwargs): ...@@ -24,7 +24,8 @@ def initializer(settings, dictionary, **kwargs):
# The values of the integers range from 0 to len(dictionary)-1 # The values of the integers range from 0 to len(dictionary)-1
integer_value_sequence(len(dictionary)), integer_value_sequence(len(dictionary)),
# Define the second input for label id # Define the second input for label id
integer_value(2)] integer_value(2)
]
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
...@@ -40,7 +41,8 @@ def process(settings, file_name): ...@@ -40,7 +41,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs): def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary settings.word_dict = dictionary
settings.input_types = [ settings.input_types = [
integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE) integer_value(
len(dictionary), seq_type=SequenceType.SEQUENCE)
] ]
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
1. tokenizing (with or without removing HTML first) 1. tokenizing (with or without removing HTML first)
2. pos sample: rating score 5; neg sample: rating score 1-2. 2. pos sample: rating score 5; neg sample: rating score 1-2.
...@@ -35,7 +34,8 @@ import multiprocessing ...@@ -35,7 +34,8 @@ import multiprocessing
batch_size = 5000 batch_size = 5000
word_count = {} word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2) # parse + tokenize + save num_tokenize = max(1,
multiprocessing.cpu_count() - 2) # parse + tokenize + save
max_queue_size = 8 max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize) parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize) tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
......
...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) ...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list' tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict' process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_emb", test_list=tst,
obj=process, module="dataprovider_emb",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
...@@ -39,19 +40,17 @@ settings( ...@@ -39,19 +40,17 @@ settings(
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict)) data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128) emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128) bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=2, output = fc_layer(
bias_attr=bias_attr, input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
act=SoftmaxActivation())
if is_predict: if is_predict:
maxid = maxid_layer(output) maxid = maxid_layer(output)
......
...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) ...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list' tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict' process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_emb", test_list=tst,
obj=process, module="dataprovider_emb",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
...@@ -39,8 +40,7 @@ settings( ...@@ -39,8 +40,7 @@ settings(
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
data = data_layer(name="word", size=len(word_dict)) data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128) embedding = embedding_layer(input=data, size=128)
......
...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) ...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list' tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict' process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_emb", test_list=tst,
obj=process, module="dataprovider_emb",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
...@@ -39,10 +40,9 @@ settings( ...@@ -39,10 +40,9 @@ settings(
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict)) data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128) emb = embedding_layer(input=data, size=128)
...@@ -52,17 +52,18 @@ lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1)) ...@@ -52,17 +52,18 @@ lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0] input_layers = [hidden_0, lstm_0]
for i in range(1,8): for i in range(1, 8):
fc = fc_layer(input=input_layers, size=128) fc = fc_layer(input=input_layers, size=128)
lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1), lstm = lstmemory(
reverse=(i % 2) == 1,) input=fc,
layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1, )
input_layers = [fc, lstm] input_layers = [fc, lstm]
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2, output = fc_layer(
bias_attr=bias_attr, input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
act=SoftmaxActivation())
if is_predict: if is_predict:
maxid = maxid_layer(output) maxid = maxid_layer(output)
......
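In the loop above every odd depth (i % 2 == 1) runs its LSTM over the reversed sequence, so, counting the forward lstm_0 defined earlier, the eight stacked LSTM layers alternate direction before the final max pooling. The schedule is just:

directions = ['reverse' if (i % 2) == 1 else 'forward' for i in range(1, 8)]
assert directions == ['reverse', 'forward', 'reverse', 'forward',
                      'reverse', 'forward', 'reverse']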
...@@ -27,18 +27,16 @@ is_predict = get_config_arg('is_predict', bool, False) ...@@ -27,18 +27,16 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list' tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict' process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_emb", test_list=tst,
obj=process, module="dataprovider_emb",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
batch_size=batch_size, batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
learning_rate=2e-3,
learning_method=AdamOptimizer()
)
data = data_layer(name="word", size=len(word_dict)) data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128) embedding = embedding_layer(input=data, size=128)
......
...@@ -32,11 +32,12 @@ process = 'process' if not is_predict else 'process_predict' ...@@ -32,11 +32,12 @@ process = 'process' if not is_predict else 'process_predict'
# We need to use different process functions for training and prediction. # We need to use different process functions for training and prediction.
# For training, the input data includes both word IDs and labels. # For training, the input data includes both word IDs and labels.
# For prediction, the input data only includes word IDs. # For prediction, the input data only includes word IDs.
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_bow", test_list=tst,
obj=process, module="dataprovider_bow",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
...@@ -44,8 +45,7 @@ settings( ...@@ -44,8 +45,7 @@ settings(
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
# Define the data for text features. The size of the data layer is the number # Define the data for text features. The size of the data layer is the number
# of words in the dictionary. # of words in the dictionary.
......
...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) ...@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list' tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict' process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn, define_py_data_sources2(
test_list=tst, train_list=trn,
module="dataprovider_emb", test_list=tst,
obj=process, module="dataprovider_emb",
args={"dictionary": word_dict}) obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1 batch_size = 128 if not is_predict else 1
settings( settings(
...@@ -39,17 +40,14 @@ settings( ...@@ -39,17 +40,14 @@ settings(
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
data = data_layer(name="word", size=len(word_dict)) data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128) emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(input=emb, size=128, lstm = simple_lstm(
lstm_cell_attr=ExtraAttr(drop_rate=0.25)) input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling()) lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2, output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
act=SoftmaxActivation())
if is_predict: if is_predict:
maxid = maxid_layer(output) maxid = maxid_layer(output)
outputs([maxid, output]) outputs([maxid, output])
......
...@@ -21,8 +21,9 @@ def meta_to_header(meta, name): ...@@ -21,8 +21,9 @@ def meta_to_header(meta, name):
yield integer_value(each_meta['max']) yield integer_value(each_meta['max'])
elif each_meta['type'] == 'embedding': elif each_meta['type'] == 'embedding':
is_seq = each_meta['seq'] == 'sequence' is_seq = each_meta['seq'] == 'sequence'
yield integer_value(len(each_meta['dict']), yield integer_value(
seq_type=SequenceType.SEQUENCE if is_seq len(each_meta['dict']),
else SequenceType.NO_SEQUENCE) seq_type=SequenceType.SEQUENCE
if is_seq else SequenceType.NO_SEQUENCE)
elif each_meta['type'] == 'one_hot_dense': elif each_meta['type'] == 'one_hot_dense':
yield dense_vector(len(each_meta['dict'])) yield dense_vector(len(each_meta['dict']))
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
config_generator.py config_generator.py
...@@ -29,10 +28,7 @@ import json ...@@ -29,10 +28,7 @@ import json
import docopt import docopt
import copy import copy
DEFAULT_FILE = { DEFAULT_FILE = {"type": "split", "delimiter": ","}
"type": "split",
"delimiter": ","
}
DEFAULT_FIELD = { DEFAULT_FIELD = {
"id": { "id": {
...@@ -107,19 +103,16 @@ def main(filename, fmt): ...@@ -107,19 +103,16 @@ def main(filename, fmt):
field = copy.deepcopy(DEFAULT_FIELD[field_key]) field = copy.deepcopy(DEFAULT_FIELD[field_key])
field['pos'] = pos field['pos'] = pos
fields.append(field) fields.append(field)
obj[k] = { obj[k] = {"file": file_dict, "fields": fields}
"file": file_dict, meta = {"meta": obj}
"fields": fields
}
meta = {
"meta": obj
}
# print meta # print meta
if fmt == 'json': if fmt == 'json':
def formatter(x): def formatter(x):
import json import json
return json.dumps(x, indent=2) return json.dumps(x, indent=2)
elif fmt == 'yaml': elif fmt == 'yaml':
def formatter(x): def formatter(x):
import yaml import yaml
return yaml.safe_dump(x, default_flow_style=False) return yaml.safe_dump(x, default_flow_style=False)
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Preprocess Movielens dataset, to get movie/user object. Preprocess Movielens dataset, to get movie/user object.
...@@ -66,8 +65,8 @@ class SortedIDGenerator(object): ...@@ -66,8 +65,8 @@ class SortedIDGenerator(object):
self.__key_set__.add(key) self.__key_set__.add(key)
def finish_scan(self, compare=None, key=None, reverse=False): def finish_scan(self, compare=None, key=None, reverse=False):
self.__key_set__ = sorted(list(self.__key_set__), cmp=compare, self.__key_set__ = sorted(
key=key, reverse=reverse) list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
self.dict = dict() self.dict = dict()
for idx, each_key in enumerate(self.__key_set__): for idx, each_key in enumerate(self.__key_set__):
self.dict[each_key] = idx self.dict[each_key] = idx
...@@ -207,11 +206,10 @@ class EmbeddingFieldParser(object): ...@@ -207,11 +206,10 @@ class EmbeddingFieldParser(object):
self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict( self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
self.seq_type == EmbeddingFieldParser.SEQUENCE) self.seq_type == EmbeddingFieldParser.SEQUENCE)
elif config['dict']['type'] == 'split': elif config['dict']['type'] == 'split':
self.dict = SplitEmbeddingDict( self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
config['dict'].get('delimiter', ','))
elif config['dict']['type'] == 'whole_content': elif config['dict']['type'] == 'whole_content':
self.dict = EmbeddingFieldParser.WholeContentDict( self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][
config['dict']['sort']) 'sort'])
else: else:
print config print config
assert False assert False
...@@ -333,8 +331,8 @@ class ContentExtractorFactory(object): ...@@ -333,8 +331,8 @@ class ContentExtractorFactory(object):
return PositionContentExtractor(config['pos']) return PositionContentExtractor(config['pos'])
else: else:
extra_args = config['regex'] extra_args = config['regex']
return RegexPositionContentExtractor(pos=config['pos'], return RegexPositionContentExtractor(
**extra_args) pos=config['pos'], **extra_args)
class MetaFile(object): class MetaFile(object):
...@@ -364,9 +362,10 @@ class MetaFile(object): ...@@ -364,9 +362,10 @@ class MetaFile(object):
metas = map(lambda x: x.meta_field(), field_parsers) metas = map(lambda x: x.meta_field(), field_parsers)
# print metas # print metas
key_index = filter(lambda x: x is not None, map( key_index = filter(
lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] lambda x: x is not None,
else None, enumerate(metas)))[0] map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
enumerate(metas)))[0]
key_map = [] key_map = []
for i in range(min(key_index, len(metas))): for i in range(min(key_index, len(metas))):
...@@ -374,12 +373,7 @@ class MetaFile(object): ...@@ -374,12 +373,7 @@ class MetaFile(object):
for i in range(key_index + 1, len(metas)): for i in range(key_index + 1, len(metas)):
key_map.append(i) key_map.append(i)
obj = { obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
'__meta__': {
'raw_meta': metas,
'feature_map': key_map
}
}
for each_block in reader.read(): for each_block in reader.read():
idx = field_parsers[key_index].parse(each_block) idx = field_parsers[key_index].parse(each_block)
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Separate movielens 1m dataset to train/test file. Separate movielens 1m dataset to train/test file.
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from paddle.trainer.PyDataProvider2 import * from paddle.trainer.PyDataProvider2 import *
import common_utils # parse import common_utils # parse
def hook(settings, meta, **kwargs): def hook(settings, meta, **kwargs):
""" """
Init hook is invoked before processing data. It will set obj.slots and store Init hook is invoked before processing data. It will set obj.slots and store
...@@ -41,6 +42,7 @@ def hook(settings, meta, **kwargs): ...@@ -41,6 +42,7 @@ def hook(settings, meta, **kwargs):
settings.input_types = headers settings.input_types = headers
settings.meta = meta settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) @provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename): def process(settings, filename):
with open(filename, 'r') as f: with open(filename, 'r') as f:
......
...@@ -28,7 +28,8 @@ if __name__ == '__main__': ...@@ -28,7 +28,8 @@ if __name__ == '__main__':
model_path = sys.argv[1] model_path = sys.argv[1]
swig_paddle.initPaddle('--use_gpu=0') swig_paddle.initPaddle('--use_gpu=0')
conf = parse_config("trainer_config.py", "is_predict=1") conf = parse_config("trainer_config.py", "is_predict=1")
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine) assert isinstance(network, swig_paddle.GradientMachine)
network.loadParameters(model_path) network.loadParameters(model_path)
with open('./data/meta.bin', 'rb') as f: with open('./data/meta.bin', 'rb') as f:
...@@ -39,11 +40,12 @@ if __name__ == '__main__': ...@@ -39,11 +40,12 @@ if __name__ == '__main__':
while True: while True:
movie_id = int(raw_input("Input movie_id: ")) movie_id = int(raw_input("Input movie_id: "))
user_id = int(raw_input("Input user_id: ")) user_id = int(raw_input("Input user_id: "))
movie_meta = meta['movie'][movie_id] # Query Data From Meta. movie_meta = meta['movie'][movie_id] # Query Data From Meta.
user_meta = meta['user'][user_id] user_meta = meta['user'][user_id]
data = [movie_id - 1] data = [movie_id - 1]
data.extend(movie_meta) data.extend(movie_meta)
data.append(user_id - 1) data.append(user_id - 1)
data.extend(user_meta) data.extend(user_meta)
print "Prediction Score is %.2f" % ((network.forwardTest( print "Prediction Score is %.2f" % (
cvt.convert([data]))[0]['value'][0][0] + 5) / 2) (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5)
/ 2)
...@@ -27,8 +27,8 @@ with open(META_FILE, 'rb') as f: ...@@ -27,8 +27,8 @@ with open(META_FILE, 'rb') as f:
# load meta file # load meta file
meta = pickle.load(f) meta = pickle.load(f)
settings(batch_size=1600, learning_rate=1e-3, settings(
learning_method=RMSPropOptimizer()) batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
def construct_feature(name): def construct_feature(name):
...@@ -59,11 +59,10 @@ def construct_feature(name): ...@@ -59,11 +59,10 @@ def construct_feature(name):
slot_name = each_meta.get('name', '%s_id' % name) slot_name = each_meta.get('name', '%s_id' % name)
if type_name == 'id': if type_name == 'id':
slot_dim = each_meta['max'] slot_dim = each_meta['max']
embedding = embedding_layer(input=data_layer(slot_name, embedding = embedding_layer(
size=slot_dim), input=data_layer(
size=256) slot_name, size=slot_dim), size=256)
fusion.append(fc_layer(input=embedding, fusion.append(fc_layer(input=embedding, size=256))
size=256))
elif type_name == 'embedding': elif type_name == 'embedding':
is_seq = each_meta['seq'] == 'sequence' is_seq = each_meta['seq'] == 'sequence'
slot_dim = len(each_meta['dict']) slot_dim = len(each_meta['dict'])
...@@ -71,17 +70,14 @@ def construct_feature(name): ...@@ -71,17 +70,14 @@ def construct_feature(name):
embedding = embedding_layer(input=din, size=256) embedding = embedding_layer(input=din, size=256)
if is_seq: if is_seq:
fusion.append( fusion.append(
text_conv_pool(input=embedding, context_len=5, text_conv_pool(
hidden_size=256)) input=embedding, context_len=5, hidden_size=256))
else: else:
fusion.append(fc_layer(input=embedding, fusion.append(fc_layer(input=embedding, size=256))
size=256))
elif type_name == 'one_hot_dense': elif type_name == 'one_hot_dense':
slot_dim = len(each_meta['dict']) slot_dim = len(each_meta['dict'])
hidden = fc_layer(input=data_layer(slot_name, slot_dim), hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256)
size=256) fusion.append(fc_layer(input=hidden, size=256))
fusion.append(fc_layer(input=hidden,
size=256))
return fc_layer(name="%s_fusion" % name, input=fusion, size=256) return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
...@@ -90,10 +86,16 @@ movie_feature = construct_feature("movie") ...@@ -90,10 +86,16 @@ movie_feature = construct_feature("movie")
user_feature = construct_feature("user") user_feature = construct_feature("user")
similarity = cos_sim(a=movie_feature, b=user_feature) similarity = cos_sim(a=movie_feature, b=user_feature)
if not is_predict: if not is_predict:
outputs(regression_cost(input=similarity, outputs(
label=data_layer('rating', size=1))) regression_cost(
input=similarity, label=data_layer(
define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider', 'rating', size=1)))
obj='process', args={'meta': meta})
define_py_data_sources2(
'data/train.list',
'data/test.list',
module='dataprovider',
obj='process',
args={'meta': meta})
else: else:
outputs(similarity) outputs(similarity)
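The model above is a two-tower network: construct_feature builds a 256-dimensional movie vector and a 256-dimensional user vector, cos_sim scores how well they match, and at training time regression_cost fits that score to the rating. The quantity underneath cos_sim is the ordinary cosine between the two vectors; a plain-Python version for reference (the layer may additionally scale the result, which is why the prediction script rescales its output):

import math


def cosine(a, b):
    # cosine similarity between two equal-length vectors
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)


assert abs(cosine([1.0, 0.0], [1.0, 0.0]) - 1.0) < 1e-9
assert abs(cosine([1.0, 0.0], [0.0, 1.0])) < 1e-9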
...@@ -26,9 +26,9 @@ def hook(settings, word_dict, label_dict, **kwargs): ...@@ -26,9 +26,9 @@ def hook(settings, word_dict, label_dict, **kwargs):
integer_value_sequence(len(word_dict)), integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)), integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)), integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)), integer_value_sequence(len(word_dict)), integer_value_sequence(2),
integer_value_sequence(2), integer_value_sequence(len(label_dict))
integer_value_sequence(len(label_dict))] ]
@provider(init_hook=hook) @provider(init_hook=hook)
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import math import math
import os import os
import sys import sys
...@@ -42,7 +41,7 @@ if not is_predict: ...@@ -42,7 +41,7 @@ if not is_predict:
label_dict[w] = i label_dict[w] = i
if is_test: if is_test:
train_list_file = None train_list_file = None
# define data provider # define data provider
define_py_data_sources2( define_py_data_sources2(
......
...@@ -41,22 +41,16 @@ class Prediction(): ...@@ -41,22 +41,16 @@ class Prediction():
len_dict = len(self.dict) len_dict = len(self.dict)
len_label = len(self.labels) len_label = len(self.labels)
conf = parse_config( conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
train_conf, ',label_len=' + str(len_label) + ',is_predict=True')
'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) +
',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto( self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config) conf.model_config)
self.network.loadParameters(model_dir) self.network.loadParameters(model_dir)
slots = [ slots = [
integer_value_sequence(len_dict), integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict), integer_value_sequence(2)
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
] ]
self.converter = DataProviderConverter(slots) self.converter = DataProviderConverter(slots)
...@@ -110,8 +104,8 @@ class Prediction(): ...@@ -110,8 +104,8 @@ class Prediction():
len_sen = len(sen.split()) len_sen = len(sen.split())
line_labels = lab[index:index + len_sen] line_labels = lab[index:index + len_sen]
index += len_sen index += len_sen
fout.write(sen + '\t' + ' '.join([self.labels_reverse[ fout.write(sen + '\t' + ' '.join(
i] for i in line_labels]) + '\n') [self.labels_reverse[i] for i in line_labels]) + '\n')
def option_parser(): def option_parser():
......
...@@ -17,8 +17,8 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -17,8 +17,8 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dictionary, **kwargs): def hook(settings, dictionary, **kwargs):
settings.word_dict = dictionary settings.word_dict = dictionary
settings.input_types = [ settings.input_types = [
integer_value_sequence(len(settings.word_dict)), integer_value_sequence(len(settings.word_dict)), integer_value(2)
integer_value(2)] ]
settings.logger.info('dict len : %d' % (len(settings.word_dict))) settings.logger.info('dict len : %d' % (len(settings.word_dict)))
...@@ -29,6 +29,7 @@ def process(settings, file_name): ...@@ -29,6 +29,7 @@ def process(settings, file_name):
label, comment = line.strip().split('\t\t') label, comment = line.strip().split('\t\t')
label = int(label) label = int(label)
words = comment.split() words = comment.split()
word_slot = [settings.word_dict[w] for w in words if w in word_slot = [
settings.word_dict] settings.word_dict[w] for w in words if w in settings.word_dict
]
yield word_slot, label yield word_slot, label
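Note that this provider silently drops out-of-vocabulary words, whereas load_data in the api_train example earlier maps them to UNK_IDX. On a toy dictionary the difference looks like this:

word_dict = {'the': 0, 'movie': 1, 'good': 2}
UNK_IDX = 0
words = 'the movie was good'.split()

dropped = [word_dict[w] for w in words if w in word_dict]   # as in process() above
mapped = [word_dict.get(w, UNK_IDX) for w in words]         # as in load_data earlier

assert dropped == [0, 1, 2]
assert mapped == [0, 1, 0, 2]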
...@@ -18,14 +18,14 @@ from optparse import OptionParser ...@@ -18,14 +18,14 @@ from optparse import OptionParser
from py_paddle import swig_paddle, DataProviderConverter from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import integer_value_sequence from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
""" """
Usage: run the following command to show the help message. Usage: run the following command to show the help message.
python predict.py -h python predict.py -h
""" """
class SentimentPrediction(): class SentimentPrediction():
def __init__(self, train_conf, dict_file, model_dir=None, label_file = None): def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
""" """
train_conf: trainer configure. train_conf: trainer configure.
dict_file: word dictionary file name. dict_file: word dictionary file name.
...@@ -44,7 +44,8 @@ class SentimentPrediction(): ...@@ -44,7 +44,8 @@ class SentimentPrediction():
self.load_label(label_file) self.load_label(label_file)
conf = parse_config(train_conf, "is_predict=1") conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
input_types = [integer_value_sequence(self.dict_dim)] input_types = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(input_types) self.converter = DataProviderConverter(input_types)
...@@ -61,7 +62,7 @@ class SentimentPrediction(): ...@@ -61,7 +62,7 @@ class SentimentPrediction():
""" """
Load label. Load label.
""" """
self.label={} self.label = {}
for v in open(label_file, 'r'): for v in open(label_file, 'r'):
self.label[int(v.split('\t')[1])] = v.split('\t')[0] self.label[int(v.split('\t')[1])] = v.split('\t')[0]
...@@ -72,7 +73,9 @@ class SentimentPrediction(): ...@@ -72,7 +73,9 @@ class SentimentPrediction():
with open(data_file, 'r') as fdata: with open(data_file, 'r') as fdata:
for line in fdata: for line in fdata:
words = line.strip().split() words = line.strip().split()
word_slot = [self.word_dict[w] for w in words if w in self.word_dict] word_slot = [
self.word_dict[w] for w in words if w in self.word_dict
]
if not word_slot: if not word_slot:
print "all words are not in dictionary: %s", line print "all words are not in dictionary: %s", line
continue continue
...@@ -89,25 +92,48 @@ class SentimentPrediction(): ...@@ -89,25 +92,48 @@ class SentimentPrediction():
if self.label is None: if self.label is None:
print("%s: predicting label is %d" % (data_file, lab[0][0])) print("%s: predicting label is %d" % (data_file, lab[0][0]))
else: else:
print("%s: predicting label is %s" % (data_file, self.label[lab[0][0]])) print("%s: predicting label is %s" %
(data_file, self.label[lab[0][0]]))
def option_parser(): def option_parser():
usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
parser = OptionParser(usage="usage: %s [options]" % usage) parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option("-n", "--tconf", action="store", parser.add_option(
dest="train_conf", help="network config") "-n",
parser.add_option("-d", "--dict", action="store", "--tconf",
dest="dict_file",help="dictionary file") action="store",
parser.add_option("-b", "--label", action="store", dest="train_conf",
dest="label", default=None, help="network config")
help="dictionary file") parser.add_option(
parser.add_option("-i", "--data", action="store", "-d",
dest="data", help="data file to predict") "--dict",
parser.add_option("-w", "--model", action="store", action="store",
dest="model_path", default=None, dest="dict_file",
help="model path") help="dictionary file")
parser.add_option(
"-b",
"--label",
action="store",
dest="label",
default=None,
help="dictionary file")
parser.add_option(
"-i",
"--data",
action="store",
dest="data",
help="data file to predict")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
return parser.parse_args() return parser.parse_args()
def main(): def main():
options, args = option_parser() options, args = option_parser()
train_conf = options.train_conf train_conf = options.train_conf
...@@ -119,5 +145,6 @@ def main(): ...@@ -119,5 +145,6 @@ def main():
predict = SentimentPrediction(train_conf, dict_file, model_path, label) predict = SentimentPrediction(train_conf, dict_file, model_path, label)
predict.predict(data) predict.predict(data)
if __name__ == '__main__': if __name__ == '__main__':
main() main()
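End to end, the script wires option_parser into SentimentPrediction and calls predict on one input file. A rough usage sketch of the same class from Python (the module import and all paths here are placeholders, not files from this commit):

from py_paddle import swig_paddle
from predict import SentimentPrediction     # assuming the file above is importable as predict

swig_paddle.initPaddle('--use_gpu=0')       # initialize PaddlePaddle as the other demos do
predictor = SentimentPrediction(
    'trainer_config.py',                    # placeholder paths
    'data/pre-imdb/dict.txt',
    model_dir='model_output/pass-00000/',
    label_file='data/pre-imdb/labels.list')
predictor.predict('a_review.txt')           # prints the predicted label for the file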
...@@ -22,13 +22,13 @@ from os.path import join as join_path ...@@ -22,13 +22,13 @@ from os.path import join as join_path
from optparse import OptionParser from optparse import OptionParser
from paddle.utils.preprocess_util import * from paddle.utils.preprocess_util import *
""" """
Usage: run the following command to show the help message. Usage: run the following command to show the help message.
python preprocess.py -h python preprocess.py -h
""" """
def save_dict(dict, filename, is_reverse = True):
def save_dict(dict, filename, is_reverse=True):
""" """
Save dictionary into file. Save dictionary into file.
dict: input dictionary. dict: input dictionary.
...@@ -39,9 +39,10 @@ def save_dict(dict, filename, is_reverse = True): ...@@ -39,9 +39,10 @@ def save_dict(dict, filename, is_reverse = True):
f = open(filename, 'w') f = open(filename, 'w')
for k, v in sorted(dict.items(), key=operator.itemgetter(1),\ for k, v in sorted(dict.items(), key=operator.itemgetter(1),\
reverse=is_reverse): reverse=is_reverse):
f.write('%s\t%s\n'%(k, v)) f.write('%s\t%s\n' % (k, v))
f.close() f.close()
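save_dict writes one 'key<TAB>value' line per entry, ordered by the counts (descending when is_reverse is True). The ordering step on a toy word-count dict:

import operator

word_count = {'good': 3, 'movie': 5, 'bad': 1}
ordered = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)
assert ordered == [('movie', 5), ('good', 3), ('bad', 1)]
# each (k, v) pair is then written out as '%s\t%s\n' % (k, v)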
def tokenize(sentences): def tokenize(sentences):
""" """
Use tokenizer.perl to tokenize input sentences. Use tokenizer.perl to tokenize input sentences.
...@@ -58,6 +59,7 @@ def tokenize(sentences): ...@@ -58,6 +59,7 @@ def tokenize(sentences):
toks = tok_text.split('\n')[:-1] toks = tok_text.split('\n')[:-1]
return toks return toks
def read_lines(path): def read_lines(path):
""" """
path: String, file path. path: String, file path.
...@@ -71,12 +73,17 @@ def read_lines(path): ...@@ -71,12 +73,17 @@ def read_lines(path):
seqs.append(line) seqs.append(line)
return seqs return seqs
class SentimentDataSetCreate(): class SentimentDataSetCreate():
""" """
A class to process data for sentiment analysis task. A class to process data for sentiment analysis task.
""" """
def __init__(self, data_path, output_path,
use_okenizer = True, multi_lines = False): def __init__(self,
data_path,
output_path,
use_okenizer=True,
multi_lines=False):
""" """
data_path: string, training and testing dataset path output_path: string, output path, store processed dataset
output_path: string, output path, store processed dataset output_path: string, output path, store processed dataset
...@@ -164,23 +171,17 @@ class SentimentDataSetCreate(): ...@@ -164,23 +171,17 @@ class SentimentDataSetCreate():
# Preprocess train data. # Preprocess train data.
train_data, train_lab_set = self.data_list(self.train_dir) train_data, train_lab_set = self.data_list(self.train_dir)
print "processing train set..." print "processing train set..."
file_lists = self.save_data(train_data, file_lists = self.save_data(train_data, "train", self.batch_size, True,
"train", True)
self.batch_size,
True,
True)
save_list(file_lists, self.train_list) save_list(file_lists, self.train_list)
# If have test data path, preprocess test data. # If have test data path, preprocess test data.
if os.path.exists(self.test_dir): if os.path.exists(self.test_dir):
test_data, test_lab_set = self.data_list(self.test_dir) test_data, test_lab_set = self.data_list(self.test_dir)
assert(train_lab_set == test_lab_set) assert (train_lab_set == test_lab_set)
print "processing test set..." print "processing test set..."
file_lists = self.save_data(test_data, file_lists = self.save_data(test_data, "test", self.batch_size,
"test", False, self.dict_with_test)
self.batch_size,
False,
self.dict_with_test)
save_list(file_lists, self.test_list) save_list(file_lists, self.test_list)
# save labels set. # save labels set.
...@@ -191,7 +192,9 @@ class SentimentDataSetCreate(): ...@@ -191,7 +192,9 @@ class SentimentDataSetCreate():
save_dict(self.word_count, self.dict_file, True) save_dict(self.word_count, self.dict_file, True)
self.dict_size = len(self.word_count) self.dict_size = len(self.word_count)
def save_data(self, data, prefix = "", def save_data(self,
data,
prefix="",
batch_size=50000, batch_size=50000,
is_shuffle=False, is_shuffle=False,
build_dict=False): build_dict=False):
...@@ -205,7 +208,8 @@ class SentimentDataSetCreate(): ...@@ -205,7 +208,8 @@ class SentimentDataSetCreate():
return: list of batch names return: list of batch names
""" """
if is_shuffle and self.multi_lines: if is_shuffle and self.multi_lines:
return self.save_data_multi_lines(data, prefix, batch_size, build_dict) return self.save_data_multi_lines(data, prefix, batch_size,
build_dict)
if is_shuffle: if is_shuffle:
random.shuffle(data) random.shuffle(data)
...@@ -213,7 +217,7 @@ class SentimentDataSetCreate(): ...@@ -213,7 +217,7 @@ class SentimentDataSetCreate():
batch_names = [] batch_names = []
for i in range(num_batches): for i in range(num_batches):
batch_name = join_path(self.output_path, batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i)) "%s_part_%03d" % (prefix, i))
begin = i * batch_size begin = i * batch_size
end = min((i + 1) * batch_size, len(data)) end = min((i + 1) * batch_size, len(data))
# read a batch of data # read a batch of data
...@@ -246,7 +250,9 @@ class SentimentDataSetCreate(): ...@@ -246,7 +250,9 @@ class SentimentDataSetCreate():
data_list = tokenize(data_list) data_list = tokenize(data_list)
return label_list, data_list return label_list, data_list
def save_data_multi_lines(self, data, prefix = "", def save_data_multi_lines(self,
data,
prefix="",
batch_size=50000, batch_size=50000,
build_dict=False): build_dict=False):
""" """
...@@ -274,14 +280,14 @@ class SentimentDataSetCreate(): ...@@ -274,14 +280,14 @@ class SentimentDataSetCreate():
self.create_dict(data_list) self.create_dict(data_list)
length = len(label_list) length = len(label_list)
perm_list = np.array([ i for i in xrange(length) ]) perm_list = np.array([i for i in xrange(length)])
random.shuffle(perm_list) random.shuffle(perm_list)
num_batches = int(math.ceil(length / float(batch_size))) num_batches = int(math.ceil(length / float(batch_size)))
batch_names = [] batch_names = []
for i in range(num_batches): for i in range(num_batches):
batch_name = join_path(self.output_path, batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i)) "%s_part_%03d" % (prefix, i))
begin = i * batch_size begin = i * batch_size
end = min((i + 1) * batch_size, length) end = min((i + 1) * batch_size, length)
sub_label = [label_list[perm_list[i]] for i in range(begin, end)] sub_label = [label_list[perm_list[i]] for i in range(begin, end)]
...@@ -304,35 +310,50 @@ class SentimentDataSetCreate(): ...@@ -304,35 +310,50 @@ class SentimentDataSetCreate():
f.write('%s\t\t%s\n' % (lab, seq)) f.write('%s\t\t%s\n' % (lab, seq))
f.close() f.close()
def option_parser(): def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\ parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]") "-i data_dir [options]")
parser.add_option("-i", "--data", action="store", parser.add_option(
dest="input", help="Input data directory.") "-i",
parser.add_option("-o", "--output", action="store", "--data",
dest="output", default=None, action="store",
help="Output directory.") dest="input",
parser.add_option("-t", "--tokenizer", action="store", help="Input data directory.")
dest="use_tokenizer", default=True, parser.add_option(
help="Whether to use tokenizer.") "-o",
"--output",
action="store",
dest="output",
default=None,
help="Output directory.")
parser.add_option(
"-t",
"--tokenizer",
action="store",
dest="use_tokenizer",
default=True,
help="Whether to use tokenizer.")
parser.add_option("-m", "--multi_lines", action="store", parser.add_option("-m", "--multi_lines", action="store",
dest="multi_lines", default=False, dest="multi_lines", default=False,
help="If input text files have multi lines and they "\ help="If input text files have multi lines and they "\
"need to be shuffled, you should set -m True,") "need to be shuffled, you should set -m True,")
return parser.parse_args() return parser.parse_args()
def main(): def main():
options, args = option_parser() options, args = option_parser()
data_dir=options.input data_dir = options.input
output_dir=options.output output_dir = options.output
use_tokenizer=options.use_tokenizer use_tokenizer = options.use_tokenizer
multi_lines=options.multi_lines multi_lines = options.multi_lines
if output_dir is None: if output_dir is None:
outname = os.path.basename(options.input) outname = os.path.basename(options.input)
output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
data_creator = SentimentDataSetCreate(data_dir, output_dir, data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer,
use_tokenizer, multi_lines) multi_lines)
data_creator.create_dataset() data_creator.create_dataset()
if __name__ == '__main__': if __name__ == '__main__':
main() main()
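When no -o output directory is given, main derives one next to the input directory by prefixing its basename with 'pre-'. For example (with a hypothetical input path):

import os
from os.path import join as join_path

data_dir = 'data/aclImdb'                  # example input directory
outname = os.path.basename(data_dir)
output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
assert output_dir == os.path.join('data', 'pre-aclImdb')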
...@@ -47,10 +47,12 @@ def sentiment_data(data_dir=None, ...@@ -47,10 +47,12 @@ def sentiment_data(data_dir=None,
for i, line in enumerate(open(dict_file, 'r')): for i, line in enumerate(open(dict_file, 'r')):
word_dict[line.split('\t')[0]] = i word_dict[line.split('\t')[0]] = i
define_py_data_sources2(train_list, test_list, define_py_data_sources2(
module="dataprovider", train_list,
obj="process", test_list,
args={'dictionary': word_dict}) module="dataprovider",
obj="process",
args={'dictionary': word_dict})
return dict_dim, class_dim return dict_dim, class_dim
...@@ -64,8 +66,7 @@ def bidirectional_lstm_net(input_dim, ...@@ -64,8 +66,7 @@ def bidirectional_lstm_net(input_dim,
emb = embedding_layer(input=data, size=emb_dim) emb = embedding_layer(input=data, size=emb_dim)
bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=class_dim, output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
act=SoftmaxActivation())
if not is_predict: if not is_predict:
lbl = data_layer("label", 1) lbl = data_layer("label", 1)
...@@ -109,27 +110,36 @@ def stacked_lstm_net(input_dim, ...@@ -109,27 +110,36 @@ def stacked_lstm_net(input_dim,
data = data_layer("word", input_dim) data = data_layer("word", input_dim)
emb = embedding_layer(input=data, size=emb_dim) emb = embedding_layer(input=data, size=emb_dim)
fc1 = fc_layer(input=emb, size=hid_dim, act=linear, fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
bias_attr=bias_attr) lstm1 = lstmemory(
lstm1 = lstmemory(input=fc1, act=relu, bias_attr=bias_attr, input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
layer_attr=layer_attr)
inputs = [fc1, lstm1] inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1): for i in range(2, stacked_num + 1):
fc = fc_layer(input=inputs, size=hid_dim, act=linear, fc = fc_layer(
param_attr=para_attr, bias_attr=bias_attr) input=inputs,
lstm = lstmemory(input=fc, reverse=(i % 2) == 0, act=relu, size=hid_dim,
bias_attr=bias_attr, layer_attr=layer_attr) act=linear,
param_attr=para_attr,
bias_attr=bias_attr)
lstm = lstmemory(
input=fc,
reverse=(i % 2) == 0,
act=relu,
bias_attr=bias_attr,
layer_attr=layer_attr)
inputs = [fc, lstm] inputs = [fc, lstm]
fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling()) fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling()) lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
output = fc_layer(input=[fc_last, lstm_last], size=class_dim, output = fc_layer(
act=SoftmaxActivation(), input=[fc_last, lstm_last],
bias_attr=bias_attr, param_attr=para_attr) size=class_dim,
act=SoftmaxActivation(),
bias_attr=bias_attr,
param_attr=para_attr)
if is_predict: if is_predict:
outputs(output) outputs(output)
else: else:
outputs( outputs(classification_cost(input=output, label=data_layer('label', 1)))
classification_cost(input=output, label=data_layer('label', 1)))
...@@ -20,20 +20,19 @@ is_test = get_config_arg('is_test', bool, False) ...@@ -20,20 +20,19 @@ is_test = get_config_arg('is_test', bool, False)
# whether this config is used for prediction # whether this config is used for prediction
is_predict = get_config_arg('is_predict', bool, False) is_predict = get_config_arg('is_predict', bool, False)
data_dir = "./data/pre-imdb" data_dir = "./data/pre-imdb"
dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
################## Algorithm Config ##################### ################## Algorithm Config #####################
settings( settings(
batch_size=128, batch_size=128,
learning_rate=2e-3, learning_rate=2e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4), regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25 gradient_clipping_threshold=25)
)
#################### Network Config ###################### #################### Network Config ######################
stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_lstm_net(
stacked_num=3, is_predict=is_predict) dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) # bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
...@@ -30,14 +30,14 @@ def hook(settings, src_dict, trg_dict, file_list, **kwargs): ...@@ -30,14 +30,14 @@ def hook(settings, src_dict, trg_dict, file_list, **kwargs):
if settings.job_mode: if settings.job_mode:
settings.trg_dict = trg_dict settings.trg_dict = trg_dict
settings.slots = [ settings.slots = [
integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)), integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict)) integer_value_sequence(len(settings.trg_dict))
] ]
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else: else:
settings.slots = [ settings.slots = [
integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(open(file_list[0], "r").readlines())) integer_value_sequence(len(open(file_list[0], "r").readlines()))
] ]
...@@ -62,8 +62,7 @@ def process(settings, file_name): ...@@ -62,8 +62,7 @@ def process(settings, file_name):
if settings.job_mode: if settings.job_mode:
trg_seq = line_split[1] # one target sequence trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split() trg_words = trg_seq.split()
trg_ids = [settings.trg_dict.get(w, UNK_IDX) trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
for w in trg_words]
# remove sequence whose length > 80 in training mode # remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80: if len(src_ids) > 80 or len(trg_ids) > 80:
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Example: Example:
python preprocess.py -i INPUT [-d DICTSIZE] [-m] python preprocess.py -i INPUT [-d DICTSIZE] [-m]
...@@ -24,12 +23,13 @@ Options: ...@@ -24,12 +23,13 @@ Options:
-m --mergeDict merge source and target dictionary -m --mergeDict merge source and target dictionary
""" """
import os import os
import sys import sys
import string import string
from optparse import OptionParser from optparse import OptionParser
from paddle.utils.preprocess_util import save_list, DatasetCreater from paddle.utils.preprocess_util import save_list, DatasetCreater
class SeqToSeqDatasetCreater(DatasetCreater): class SeqToSeqDatasetCreater(DatasetCreater):
""" """
A class to process data for sequence to sequence application. A class to process data for sequence to sequence application.
...@@ -75,7 +75,7 @@ class SeqToSeqDatasetCreater(DatasetCreater): ...@@ -75,7 +75,7 @@ class SeqToSeqDatasetCreater(DatasetCreater):
if not os.path.exists(output): if not os.path.exists(output):
os.system(cmd + '> ' + output) os.system(cmd + '> ' + output)
def build_dict(self, file_path, dict_path, dict_size = -1): def build_dict(self, file_path, dict_path, dict_size=-1):
""" """
Create the dictionary for the file. Note that Create the dictionary for the file. Note that
1. Valid characters include all printable characters 1. Valid characters include all printable characters
...@@ -99,20 +99,23 @@ class SeqToSeqDatasetCreater(DatasetCreater): ...@@ -99,20 +99,23 @@ class SeqToSeqDatasetCreater(DatasetCreater):
for word in words: for word in words:
if word not in dictory: if word not in dictory:
dictory[word] = 1 dictory[word] = 1
else: else:
dictory[word] += 1 dictory[word] += 1
output = open(dict_path, "w+") output = open(dict_path, "w+")
output.write('<s>\n<e>\n<unk>\n') output.write('<s>\n<e>\n<unk>\n')
count = 3 count = 3
for key, value in sorted(dictory.items(), key = lambda d:d[1], reverse = True): for key, value in sorted(
dictory.items(), key=lambda d: d[1], reverse=True):
output.write(key + "\n") output.write(key + "\n")
count += 1 count += 1
if count == dict_size: if count == dict_size:
break break
self.dict_size = count self.dict_size = count
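A note on build_dict above: it writes the three reserved tokens <s>, <e>, <unk> first and then emits words by descending frequency until dict_size is reached. A stripped-down sketch of the same counting-and-sorting idea (file handling and the printable-character filtering omitted; the toy corpus is invented):

from collections import Counter

toy_corpus = "the cat sat on the mat the cat".split()
counts = Counter(toy_corpus)

vocab = ['<s>', '<e>', '<unk>']                    # reserved ids 0, 1, 2
vocab += [w for w, _ in counts.most_common()]      # most frequent words first
print(vocab[:5])                                   # -> ['<s>', '<e>', '<unk>', 'the', 'cat']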
def create_dataset(self, dict_size = -1, mergeDict = False, def create_dataset(self,
suffixes = ['.src', '.trg']): dict_size=-1,
mergeDict=False,
suffixes=['.src', '.trg']):
""" """
Create seqToseq dataset Create seqToseq dataset
""" """
...@@ -135,13 +138,14 @@ class SeqToSeqDatasetCreater(DatasetCreater): ...@@ -135,13 +138,14 @@ class SeqToSeqDatasetCreater(DatasetCreater):
# check that the dataset forms parallel corpora # check that the dataset forms parallel corpora
suffix_len = len(suffixes[0]) suffix_len = len(suffixes[0])
for dataset in dataset_list: for dataset in dataset_list:
file_list = os.listdir(dataset) file_list = os.listdir(dataset)
if len(file_list) % 2 == 1: if len(file_list) % 2 == 1:
raise RuntimeError("dataset should be parallel corpora") raise RuntimeError("dataset should be parallel corpora")
file_list.sort() file_list.sort()
for i in range(0, len(file_list), 2): for i in range(0, len(file_list), 2):
if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
raise RuntimeError("source and target file name should be equal") raise RuntimeError(
"source and target file name should be equal")
# cat all the files with the same suffix in dataset # cat all the files with the same suffix in dataset
for suffix in suffixes: for suffix in suffixes:
...@@ -155,16 +159,18 @@ class SeqToSeqDatasetCreater(DatasetCreater): ...@@ -155,16 +159,18 @@ class SeqToSeqDatasetCreater(DatasetCreater):
list = ['train.list', 'test.list', 'gen.list'] list = ['train.list', 'test.list', 'gen.list']
for dataset in dataset_list: for dataset in dataset_list:
outname = os.path.basename(dataset) outname = os.path.basename(dataset)
self.concat_file(dataset, outname + suffixes[0], self.concat_file(dataset, outname + suffixes[0],
outname + suffixes[1], dir_list[id], outname) outname + suffixes[1], dir_list[id], outname)
save_list([os.path.join(dir_list[id], outname)], save_list([os.path.join(dir_list[id], outname)],
os.path.join(self.output_path, list[id])) os.path.join(self.output_path, list[id]))
id += 1 id += 1
# build dictionary for train data # build dictionary for train data
dict = ['src.dict', 'trg.dict'] dict = ['src.dict', 'trg.dict']
dict_path = [os.path.join(self.output_path, dict[0]), dict_path = [
os.path.join(self.output_path, dict[1])] os.path.join(self.output_path, dict[0]),
os.path.join(self.output_path, dict[1])
]
if mergeDict: if mergeDict:
outname = os.path.join(train_dir, train_dataset.split('/')[-1]) outname = os.path.join(train_dir, train_dataset.split('/')[-1])
print 'build src dictionary for train data' print 'build src dictionary for train data'
...@@ -173,22 +179,30 @@ class SeqToSeqDatasetCreater(DatasetCreater): ...@@ -173,22 +179,30 @@ class SeqToSeqDatasetCreater(DatasetCreater):
os.system('cp ' + dict_path[0] + ' ' + dict_path[1]) os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
else: else:
outname = os.path.join(train_dataset, self.train_dir_name) outname = os.path.join(train_dataset, self.train_dir_name)
for id in range(0,2): for id in range(0, 2):
suffix = suffixes[id] suffix = suffixes[id]
print 'build ' + suffix[1:] + ' dictionary for train data' print 'build ' + suffix[1:] + ' dictionary for train data'
self.build_dict(outname + suffix, dict_path[id], dict_size) self.build_dict(outname + suffix, dict_path[id], dict_size)
print 'dictionary size is', self.dict_size print 'dictionary size is', self.dict_size
def main(): def main():
usage = "usage: \n" \ usage = "usage: \n" \
"python %prog -i INPUT [-d DICTSIZE] [-m]" "python %prog -i INPUT [-d DICTSIZE] [-m]"
parser = OptionParser(usage) parser = OptionParser(usage)
parser.add_option("-i", action="store", dest="input", parser.add_option(
help="input original dataset path") "-i", action="store", dest="input", help="input original dataset path")
parser.add_option("-d", action="store", dest="dictsize", parser.add_option(
help="specified word count of dictionary") "-d",
parser.add_option("-m", "--mergeDict", action="store_true", dest="mergeDict", action="store",
help="merge source and target dictionary") dest="dictsize",
help="specified word count of dictionary")
parser.add_option(
"-m",
"--mergeDict",
action="store_true",
dest="mergeDict",
help="merge source and target dictionary")
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
if options.input[-1] == os.path.sep: if options.input[-1] == os.path.sep:
options.input = options.input[:-1] options.input = options.input[:-1]
...@@ -200,5 +214,6 @@ def main(): ...@@ -200,5 +214,6 @@ def main():
data_creator = SeqToSeqDatasetCreater(options.input, output_path) data_creator = SeqToSeqDatasetCreater(options.input, output_path)
data_creator.create_dataset(dictsize, options.mergeDict) data_creator.create_dataset(dictsize, options.mergeDict)
if __name__ == "__main__": if __name__ == "__main__":
main(); main()
...@@ -50,16 +50,21 @@ def seq_to_seq_data(data_dir, ...@@ -50,16 +50,21 @@ def seq_to_seq_data(data_dir,
trg_dict = None trg_dict = None
else: else:
train_list = os.path.join(data_dir, train_list) train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir,test_list) test_list = os.path.join(data_dir, test_list)
define_py_data_sources2(train_list, test_list, define_py_data_sources2(
module = "dataprovider", train_list,
obj = "process", test_list,
args = {"src_dict": src_dict, module="dataprovider",
"trg_dict": trg_dict}) obj="process",
args={"src_dict": src_dict,
"trg_dict": trg_dict})
return {"src_dict_path": src_lang_dict, "trg_dict_path": trg_lang_dict, return {
"gen_result": gen_result} "src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"gen_result": gen_result
}
def gru_encoder_decoder(data_conf, def gru_encoder_decoder(data_conf,
...@@ -90,51 +95,55 @@ def gru_encoder_decoder(data_conf, ...@@ -90,51 +95,55 @@ def gru_encoder_decoder(data_conf,
size=word_vector_dim, size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding')) param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size) src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding, src_backward = simple_gru(
size=encoder_size, input=src_embedding, size=encoder_size, reverse=True)
reverse=True)
encoded_vector = concat_layer(input=[src_forward, src_backward]) encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj: with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(input=encoded_vector) encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward) backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size, with mixed_layer(
act=TanhActivation(), ) as decoder_boot: size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(input=backward_first) decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word): def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder', decoder_mem = memory(
size=decoder_size, name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
boot_layer=decoder_boot)
context = simple_attention(encoded_sequence=enc_vec, context = simple_attention(
encoded_proj=enc_proj, encoded_sequence=enc_vec,
decoder_state=decoder_mem, ) encoded_proj=enc_proj,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs: with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word) decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder', gru_step = gru_step_layer(
input=decoder_inputs, name='gru_decoder',
output_mem=decoder_mem, input=decoder_inputs,
size=decoder_size) output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(size=target_dict_dim, with mixed_layer(
bias_attr=True, size=target_dict_dim, bias_attr=True,
act=SoftmaxActivation()) as out: act=SoftmaxActivation()) as out:
out += full_matrix_projection(input=gru_step) out += full_matrix_projection(input=gru_step)
return out return out
decoder_group_name = "decoder_group" decoder_group_name = "decoder_group"
group_inputs=[StaticInput(input=encoded_vector,is_seq=True), group_inputs = [
StaticInput(input=encoded_proj,is_seq=True)] StaticInput(
input=encoded_vector, is_seq=True), StaticInput(
input=encoded_proj, is_seq=True)
]
if not is_generating: if not is_generating:
trg_embedding = embedding_layer( trg_embedding = embedding_layer(
input=data_layer(name='target_language_word', input=data_layer(
size=target_dict_dim), name='target_language_word', size=target_dict_dim),
size=word_vector_dim, size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding')) param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
...@@ -144,12 +153,12 @@ def gru_encoder_decoder(data_conf, ...@@ -144,12 +153,12 @@ def gru_encoder_decoder(data_conf,
# while the encoded source sequence is accessed as an unbounded memory. # while the encoded source sequence is accessed as an unbounded memory.
# Here, the StaticInput defines a read-only memory # Here, the StaticInput defines a read-only memory
# for the recurrent_group. # for the recurrent_group.
decoder = recurrent_group(name=decoder_group_name, decoder = recurrent_group(
step=gru_decoder_with_attention, name=decoder_group_name,
input=group_inputs) step=gru_decoder_with_attention,
input=group_inputs)
lbl = data_layer(name='target_language_next_word', lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl) cost = classification_cost(input=decoder, label=lbl)
outputs(cost) outputs(cost)
else: else:
...@@ -168,16 +177,19 @@ def gru_encoder_decoder(data_conf, ...@@ -168,16 +177,19 @@ def gru_encoder_decoder(data_conf,
embedding_size=word_vector_dim) embedding_size=word_vector_dim)
group_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name, beam_gen = beam_search(
step=gru_decoder_with_attention, name=decoder_group_name,
input=group_inputs, step=gru_decoder_with_attention,
bos_id=0, input=group_inputs,
eos_id=1, bos_id=0,
beam_size=beam_size, eos_id=1,
max_length=max_length) beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1), seqtext_printer_evaluator(
dict_file=trg_dict_path, input=beam_gen,
result_file=gen_trans_file) id_input=data_layer(
name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen) outputs(beam_gen)
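The comment in the decoder branch above (StaticInput as a read-only memory for the recurrent_group) is the crux of the attention setup: the encoded source is visible in full at every decoder step, while the target embedding is consumed one element per step. A framework-free sketch of that distinction, with all names invented for illustration:

def run_decoder(static_memory, step_inputs, step_fn, boot_state):
    # static_memory plays the role of a StaticInput: every step can read the
    # whole encoded sequence. step_inputs plays the role of a normal sequence
    # input: step t only sees element t. step_fn stands in for the step
    # function handed to recurrent_group.
    state, outputs = boot_state, []
    for current_word in step_inputs:
        state, out = step_fn(static_memory, current_word, state)
        outputs.append(out)
    return outputs

# toy usage: "attend" by summing the static memory, then mix in the step input
outs = run_decoder([1.0, 2.0, 3.0], [0.1, 0.2],
                   lambda mem, word, state: (state + word, sum(mem) + word + state),
                   0.0)
print(outs)   # -> approximately [6.1, 6.3]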
...@@ -17,8 +17,7 @@ import gzip ...@@ -17,8 +17,7 @@ import gzip
import logging import logging
logging.basicConfig( logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
)
logger = logging.getLogger('paddle') logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
...@@ -32,59 +31,58 @@ num_original_columns = 3 ...@@ -32,59 +31,58 @@ num_original_columns = 3
# [[-1,0], [0,0]] means previous token at column 0 and current token at # [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature. # column 0 are combined as one feature.
patterns = [ patterns = [
[[-2,0]], [[-2, 0]],
[[-1,0]], [[-1, 0]],
[[0,0]], [[0, 0]],
[[1,0]], [[1, 0]],
[[2,0]], [[2, 0]],
[[-1, 0], [0, 0]],
[[-1,0], [0,0]], [[0, 0], [1, 0]],
[[0,0], [1,0]], [[-2, 1]],
[[-1, 1]],
[[-2,1]], [[0, 1]],
[[-1,1]], [[1, 1]],
[[0,1]], [[2, 1]],
[[1,1]], [[-2, 1], [-1, 1]],
[[2,1]], [[-1, 1], [0, 1]],
[[-2,1], [-1,1]], [[0, 1], [1, 1]],
[[-1,1], [0,1]], [[1, 1], [2, 1]],
[[0,1], [1,1]], [[-2, 1], [-1, 1], [0, 1]],
[[1,1], [2,1]], [[-1, 1], [0, 1], [1, 1]],
[[0, 1], [1, 1], [2, 1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
] ]
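To make the pattern notation above concrete: each inner pair is (token offset, column index), and the referenced cells are joined with '/' into a single feature string. A small hypothetical walk-through in plain Python (sequence contents and helper name invented; the right-boundary marker is an assumption, the left one follows get_features below):

# toy sequence: one row per timestep, column 0 = word, column 1 = POS tag
sequence = [["He", "PRP"], ["reckons", "VBZ"], ["the", "DT"]]

def cell(pos, col):
    if pos < 0:
        return '#B%s' % -pos                         # left boundary, as in get_features
    if pos >= len(sequence):
        return '#E%s' % (pos - len(sequence) + 1)    # assumed right-boundary marker
    return sequence[pos][col]

pattern = [[-1, 0], [0, 0]]          # previous word combined with current word
i = 1                                # current timestep: "reckons"
fname = '/'.join(cell(i + offset, col) for offset, col in pattern)
print(fname)                         # -> He/reckons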
dict_label = { dict_label = {
'B-ADJP': 0, 'B-ADJP': 0,
'I-ADJP': 1, 'I-ADJP': 1,
'B-ADVP': 2, 'B-ADVP': 2,
'I-ADVP': 3, 'I-ADVP': 3,
'B-CONJP': 4, 'B-CONJP': 4,
'I-CONJP': 5, 'I-CONJP': 5,
'B-INTJ': 6, 'B-INTJ': 6,
'I-INTJ': 7, 'I-INTJ': 7,
'B-LST': 8, 'B-LST': 8,
'I-LST': 9, 'I-LST': 9,
'B-NP': 10, 'B-NP': 10,
'I-NP': 11, 'I-NP': 11,
'B-PP': 12, 'B-PP': 12,
'I-PP': 13, 'I-PP': 13,
'B-PRT': 14, 'B-PRT': 14,
'I-PRT': 15, 'I-PRT': 15,
'B-SBAR': 16, 'B-SBAR': 16,
'I-SBAR': 17, 'I-SBAR': 17,
'B-UCP': 18, 'B-UCP': 18,
'I-UCP': 19, 'I-UCP': 19,
'B-VP': 20, 'B-VP': 20,
'I-VP': 21, 'I-VP': 21,
'O': 22 'O': 22
} }
def make_features(sequence): def make_features(sequence):
length = len(sequence) length = len(sequence)
num_features = len(sequence[0]) num_features = len(sequence[0])
def get_features(pos): def get_features(pos):
if pos < 0: if pos < 0:
return ['#B%s' % -pos] * num_features return ['#B%s' % -pos] * num_features
...@@ -94,9 +92,10 @@ def make_features(sequence): ...@@ -94,9 +92,10 @@ def make_features(sequence):
for i in xrange(length): for i in xrange(length):
for pattern in patterns: for pattern in patterns:
fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern]) fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
sequence[i].append(fname) sequence[i].append(fname)
''' '''
Source file format: Source file format:
Each line is for one timestep. The features are separated by space. Each line is for one timestep. The features are separated by space.
...@@ -109,6 +108,8 @@ i-th column. ...@@ -109,6 +108,8 @@ i-th column.
return a list of dicts, one for each column return a list of dicts, one for each column
''' '''
def create_dictionaries(filename, cutoff, oov_policy): def create_dictionaries(filename, cutoff, oov_policy):
def add_to_dict(sequence, dicts): def add_to_dict(sequence, dicts):
num_features = len(dicts) num_features = len(dicts)
...@@ -140,7 +141,6 @@ def create_dictionaries(filename, cutoff, oov_policy): ...@@ -140,7 +141,6 @@ def create_dictionaries(filename, cutoff, oov_policy):
features = line.split(' ') features = line.split(' ')
sequence.append(features) sequence.append(features)
for i in xrange(num_features): for i in xrange(num_features):
dct = dicts[i] dct = dicts[i]
n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
...@@ -151,7 +151,7 @@ def create_dictionaries(filename, cutoff, oov_policy): ...@@ -151,7 +151,7 @@ def create_dictionaries(filename, cutoff, oov_policy):
else: else:
dct[k] = n dct[k] = n
n += 1 n += 1
if oov_policy[i] == OOV_POLICY_USE: if oov_policy[i] == OOV_POLICY_USE:
# placeholder so that len(dct) will be the number of features # placeholder so that len(dct) will be the number of features
# including OOV # including OOV
...@@ -187,12 +187,15 @@ def initializer(settings, **xargs): ...@@ -187,12 +187,15 @@ def initializer(settings, **xargs):
logger.info("feature size=%s" % dim) logger.info("feature size=%s" % dim)
settings.input_types = input_types settings.input_types = input_types
''' '''
if oov_policy[i] == OOV_POLICY_USE, features in the i-th column that do not if oov_policy[i] == OOV_POLICY_USE, features in the i-th column that do not
exist in dicts[i] will be assigned to id 0. exist in dicts[i] will be assigned to id 0.
if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
in dicts[i]. in dicts[i].
''' '''
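A tiny sketch of what that contract means when a single feature is looked up (the constants and dictionary contents are invented for illustration; the real values live in the provider):

OOV_POLICY_USE, OOV_POLICY_ERROR = 1, 2       # illustrative stand-ins

dct = {'<OOV>': 0, 'confidence': 1, 'in': 2}  # id 0 reserved as the OOV placeholder

def lookup(feature, policy):
    if feature in dct:
        return dct[feature]
    if policy == OOV_POLICY_USE:
        return 0                              # unknown features collapse onto id 0
    raise ValueError("Unknown token: %s" % feature)

print(lookup('confidence', OOV_POLICY_USE))   # -> 1
print(lookup('pound', OOV_POLICY_USE))        # -> 0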
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename): def process(settings, filename):
input_file = filename input_file = filename
...@@ -231,7 +234,7 @@ def process(settings, filename): ...@@ -231,7 +234,7 @@ def process(settings, filename):
logger.fatal("Unknown token: %s" % features[i]) logger.fatal("Unknown token: %s" % features[i])
else: else:
vec.ids.append(dim + 0) vec.ids.append(dim + 0)
dim += len(dicts[i]) dim += len(dicts[i])
sample[-1].append(vec) sample[-1].append(vec)
return sample return sample
...@@ -255,4 +258,3 @@ def process(settings, filename): ...@@ -255,4 +258,3 @@ def process(settings, filename):
f.close() f.close()
logger.info("num_sequences=%s" % num_sequences) logger.info("num_sequences=%s" % num_sequences)
...@@ -16,11 +16,11 @@ from paddle.trainer_config_helpers import * ...@@ -16,11 +16,11 @@ from paddle.trainer_config_helpers import *
import math import math
define_py_data_sources2(train_list="data/train.list", define_py_data_sources2(
test_list="data/test.list", train_list="data/train.list",
module="dataprovider", test_list="data/test.list",
obj="process") module="dataprovider",
obj="process")
batch_size = 1 batch_size = 1
settings( settings(
...@@ -30,14 +30,15 @@ settings( ...@@ -30,14 +30,15 @@ settings(
average_window=0.5, average_window=0.5,
learning_rate=1e-1, learning_rate=1e-1,
learning_rate_decay_a=1e-5, learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25, learning_rate_decay_b=0.25, )
)
num_label_types = 23
num_label_types=23
def get_simd_size(size): def get_simd_size(size):
return int(math.ceil(float(size) / 8)) * 8 return int(math.ceil(float(size) / 8)) * 8
# Currently, in order to use sparse_update=True, # Currently, in order to use sparse_update=True,
# the size has to be aligned. # the size has to be aligned.
num_label_types = get_simd_size(num_label_types) num_label_types = get_simd_size(num_label_types)
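The alignment note above is easy to verify in isolation: get_simd_size just rounds a size up to the next multiple of 8, so the 23 chunk labels become 24 aligned slots. A minimal standalone check:

import math

def get_simd_size(size):
    # round up to the next multiple of 8, as in the config above
    return int(math.ceil(float(size) / 8)) * 8

assert get_simd_size(23) == 24   # 23 labels padded to 24
assert get_simd_size(24) == 24   # already-aligned sizes are unchanged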
...@@ -45,40 +46,37 @@ num_label_types = get_simd_size(num_label_types) ...@@ -45,40 +46,37 @@ num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328) features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778) word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44) pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk", chunk = data_layer(name="chunk", size=num_label_types)
size=num_label_types)
crf_input = fc_layer( crf_input = fc_layer(
input=features, input=features,
size=num_label_types, size=num_label_types,
act=LinearActivation(), act=LinearActivation(),
bias_attr=False, bias_attr=False,
param_attr=ParamAttr(initial_std=0, sparse_update=True)) param_attr=ParamAttr(
initial_std=0, sparse_update=True))
crf=crf_layer( crf = crf_layer(
input=crf_input, input=crf_input,
label=chunk, label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0), param_attr=ParamAttr(
) name="crfw", initial_std=0), )
crf_decoding=crf_decoding_layer( crf_decoding = crf_decoding_layer(
size=num_label_types, size=num_label_types,
input=crf_input, input=crf_input,
label=chunk, label=chunk,
param_attr=ParamAttr(name="crfw"), param_attr=ParamAttr(name="crfw"), )
)
sum_evaluator( sum_evaluator(
name="error", name="error",
input=crf_decoding, input=crf_decoding, )
)
chunk_evaluator( chunk_evaluator(
name="chunk_f1", name="chunk_f1",
input =[crf_decoding, chunk], input=[crf_decoding, chunk],
chunk_scheme="IOB", chunk_scheme="IOB",
num_chunk_types=11, num_chunk_types=11, )
)
inputs(word, pos, chunk, features) inputs(word, pos, chunk, features)
outputs(crf) outputs(crf)
...@@ -16,10 +16,11 @@ from paddle.trainer_config_helpers import * ...@@ -16,10 +16,11 @@ from paddle.trainer_config_helpers import *
import math import math
define_py_data_sources2(train_list="data/train.list", define_py_data_sources2(
test_list="data/test.list", train_list="data/train.list",
module="dataprovider", test_list="data/test.list",
obj="process") module="dataprovider",
obj="process")
batch_size = 16 batch_size = 16
settings( settings(
...@@ -27,29 +28,27 @@ settings( ...@@ -27,29 +28,27 @@ settings(
batch_size=batch_size, batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-5), regularization=L2Regularization(batch_size * 1e-5),
average_window=0.5, average_window=0.5,
learning_rate = 2e-3, learning_rate=2e-3,
learning_rate_decay_a = 5e-7, learning_rate_decay_a=5e-7,
learning_rate_decay_b = 0.5, learning_rate_decay_b=0.5, )
)
word_dim=128 word_dim = 128
hidden_dim = 128 hidden_dim = 128
with_rnn = True with_rnn = True
initial_std=1/math.sqrt(hidden_dim) initial_std = 1 / math.sqrt(hidden_dim)
param_attr=ParamAttr(initial_std=initial_std) param_attr = ParamAttr(initial_std=initial_std)
cpu_layer_attr=ExtraLayerAttribute(device=-1) cpu_layer_attr = ExtraLayerAttribute(device=-1)
default_device(0) default_device(0)
num_label_types=23 num_label_types = 23
features = data_layer(name="features", size=76328) features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778) word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44) pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk", chunk = data_layer(
size=num_label_types, name="chunk", size=num_label_types, layer_attr=cpu_layer_attr)
layer_attr=cpu_layer_attr)
emb = embedding_layer( emb = embedding_layer(
input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
...@@ -58,73 +57,64 @@ hidden1 = mixed_layer( ...@@ -58,73 +57,64 @@ hidden1 = mixed_layer(
size=hidden_dim, size=hidden_dim,
act=STanhActivation(), act=STanhActivation(),
bias_attr=True, bias_attr=True,
input=[full_matrix_projection(emb), input=[
table_projection(pos, param_attr=param_attr)] full_matrix_projection(emb), table_projection(
) pos, param_attr=param_attr)
])
if with_rnn: if with_rnn:
rnn1 = recurrent_layer( rnn1 = recurrent_layer(
act=ReluActivation(), act=ReluActivation(),
bias_attr=True, bias_attr=True,
input=hidden1, input=hidden1,
param_attr=ParamAttr(initial_std=0), param_attr=ParamAttr(initial_std=0), )
)
hidden2 = mixed_layer( hidden2 = mixed_layer(
size=hidden_dim, size=hidden_dim,
act=STanhActivation(), act=STanhActivation(),
bias_attr=True, bias_attr=True,
input=[full_matrix_projection(hidden1) input=[full_matrix_projection(hidden1)] +
] + ([ ([full_matrix_projection(
full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0)) rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
] if with_rnn else []),
)
if with_rnn: if with_rnn:
rnn2=recurrent_layer( rnn2 = recurrent_layer(
reverse=True, reverse=True,
act=ReluActivation(), act=ReluActivation(),
bias_attr=True, bias_attr=True,
input=hidden2, input=hidden2,
param_attr=ParamAttr(initial_std=0), param_attr=ParamAttr(initial_std=0), )
)
crf_input = mixed_layer( crf_input = mixed_layer(
size=num_label_types, size=num_label_types,
bias_attr=False, bias_attr=False,
input=[ input=[full_matrix_projection(hidden2), ] +
full_matrix_projection(hidden2), ([full_matrix_projection(
] + ([ rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
crf = crf_layer( crf = crf_layer(
input=crf_input, input=crf_input,
label=chunk, label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0), param_attr=ParamAttr(
layer_attr=cpu_layer_attr, name="crfw", initial_std=0),
) layer_attr=cpu_layer_attr, )
crf_decoding = crf_decoding_layer( crf_decoding = crf_decoding_layer(
size=num_label_types, size=num_label_types,
input=crf_input, input=crf_input,
label=chunk, label=chunk,
param_attr=ParamAttr(name="crfw"), param_attr=ParamAttr(name="crfw"),
layer_attr=cpu_layer_attr, layer_attr=cpu_layer_attr, )
)
sum_evaluator( sum_evaluator(
name="error", name="error",
input=crf_decoding, input=crf_decoding, )
)
chunk_evaluator( chunk_evaluator(
name="chunk_f1", name="chunk_f1",
input =[crf_decoding, chunk], input=[crf_decoding, chunk],
chunk_scheme="IOB", chunk_scheme="IOB",
num_chunk_types=11, num_chunk_types=11, )
)
inputs(word, pos, chunk, features) inputs(word, pos, chunk, features)
outputs(crf) outputs(crf)
...@@ -16,82 +16,113 @@ from py_paddle import swig_paddle, DataProviderConverter ...@@ -16,82 +16,113 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, TEST_DATA = [[[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.886275, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.670588, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 0.992157, 0.611765, 0.054902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157,
0.529412, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, 0, 0, 0, 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157,
0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725,
0.992157, 0.992157, 0.603922, 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0,
0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, 0.992157, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0,
0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, 0.992157, 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0,
0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0,
0, 0.070588, 0.992157, 0.992157, 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922,
0, 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157,
0.992157, 0.713725, 0, 0, 0, 0, 0.627451, 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157,
0.776471, 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, 0.968627, 0.168627, 0, 0, 0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0,
0, 0.423529, 0.992157, 0.992157, 0.364706, 0, 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, 0.466667, 0.992157, 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098,
0.988235, 0.976471, 0.992157, 0.992157, 0.788235, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157,
0.882353, 0.364706, 0, 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392, 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157,
0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0,
0.733333, 0.976471, 0.811765, 0.713725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0,
0.321569, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451,
0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.968627, 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471,
0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157,
0.25098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [ 0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235,
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, 0.333333, 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.027451, 0.223529, 0.776471, 0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.964706, 0.988235, 0.988235, 0.988235, 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, 0.87451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0,
0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.188235, 0.647059, 0.988235, 0.988235, 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, 0.941176, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.039216, 0.639216, 0.933333, 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, 0.992157, 0.988235, 0.815686, 0, 0, 0, 0, 0
0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]], [[
0.211765, 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0.698039, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.890196, 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.745098, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.2, 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 0.47451, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, 0.082353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255,
0.329412, 0.376471, 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, 0.219608, 0, 0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235,
0, 0, 0.529412, 0.988235, 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157, 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, 0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529, 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0,
0.988235, 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235,
0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137,
0, 0, 0, 0, 0, 0, 0]]] 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235,
0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333,
0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765,
0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0,
0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2,
0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0,
0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0,
0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118,
0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471,
0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294,
0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235,
0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235,
0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157,
0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157,
0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529,
0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235,
0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627,
0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0
]]]
def main(): def main():
conf = parse_config("./mnist_model/trainer_config.py", "") conf = parse_config("./mnist_model/trainer_config.py", "")
print conf.data_config.load_data_args print conf.data_config.load_data_args
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine) # For code hint. assert isinstance(network, swig_paddle.GradientMachine) # For code hint.
network.loadParameters("./mnist_model/") network.loadParameters("./mnist_model/")
converter = DataProviderConverter([dense_vector(784)]) converter = DataProviderConverter([dense_vector(784)])
......
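For orientation, TEST_DATA above is just two 28x28 grayscale digits flattened into 784 floats in [0, 1], matching the dense_vector(784) declaration passed to DataProviderConverter. A small sketch of producing one such row (numpy only; the pixel values are invented):

import numpy as np

img = np.zeros((28, 28), dtype=np.float32)   # a dummy all-black digit
img[10:18, 12:16] = 0.992157                 # a bright vertical stroke
row = img.flatten().tolist()                 # 784 floats, the layout TEST_DATA uses
assert len(row) == 784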
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list', define_py_data_sources2(
test_list='test.list', train_list='train.list',
module='provider', test_list='test.list',
obj='process') module='provider',
obj='process')
settings( settings(
batch_size=128, batch_size=128,
learning_rate=1e-3, learning_rate=1e-3,
learning_method=AdamOptimizer(), learning_method=AdamOptimizer(),
regularization=L2Regularization(0.5) regularization=L2Regularization(0.5))
)
img = data_layer(name='pixel', size=28 * 28) img = data_layer(name='pixel', size=28 * 28)
hidden1 = simple_img_conv_pool(input=img, filter_size=3, num_filters=32, pool_size=3, hidden1 = simple_img_conv_pool(
num_channel=1) input=img, filter_size=3, num_filters=32, pool_size=3, num_channel=1)
hidden2 = fc_layer(input=hidden1, size=200, act=TanhActivation(), hidden2 = fc_layer(
layer_attr=ExtraAttr(drop_rate=0.5)) input=hidden1,
size=200,
act=TanhActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) outputs(
classification_cost(
input=predict, label=data_layer(
name='label', size=10)))
... # the settings and define data provider is omitted. ... # the settings and define data provider is omitted.
DICT_DIM=3000 # dictionary dimension. DICT_DIM = 3000 # dictionary dimension.
word_ids=data_layer('word_ids', size=DICT_DIM) word_ids = data_layer('word_ids', size=DICT_DIM)
emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) emb = embedding_layer(
input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM))) outputs(
\ No newline at end of file classification_cost(
input=predict, label=data_layer(
'label', size=DICT_DIM)))
DICT_DIM=3000 DICT_DIM = 3000
@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) @provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
def process(settings, filename): def process(settings, filename):
with open(filename) as f: with open(filename) as f:
# yield word ids to predict inner word id # yield word ids to predict inner word id
# such as [28, 29, 10, 4], 4 # such as [28, 29, 10, 4], 4
# It means the sentence is 28, 29, 4, 10, 4. # It means the sentence is 28, 29, 4, 10, 4.
yield read_next_from_file(f) yield read_next_from_file(f)
\ No newline at end of file
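The comment above is terse, so here is a hypothetical helper (not the elided read_next_from_file, whose body is not shown) that produces such (context, inner-word) pairs from a sentence of word ids:

def make_inner_word_samples(word_ids):
    # drop one inner position at a time: the remaining ids are the context,
    # the dropped id is the word to predict
    for pos in range(1, len(word_ids) - 1):
        yield word_ids[:pos] + word_ids[pos + 1:], word_ids[pos]

samples = list(make_inner_word_samples([28, 29, 4, 10, 4]))
print(samples[1])   # -> ([28, 29, 10, 4], 4), the example from the comment above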
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list', define_py_data_sources2(
test_list=None, train_list='train.list',
module='mnist_provider', test_list=None,
obj='process') module='mnist_provider',
obj='process')
img = data_layer(name='pixel', size=784) img = data_layer(name='pixel', size=784)
label = data_layer(name='label', size=10) label = data_layer(name='label', size=10)
...@@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider # Define a py data provider
@provider(input_types={ @provider(
'pixel': dense_vector(28 * 28), input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10) 'label': integer_value(10)})
})
def process(settings, filename): # settings is not used currently. def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file f = open(filename, 'r') # open one of training file
......
...@@ -2,10 +2,7 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -2,10 +2,7 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider # Define a py data provider
@provider(input_types=[ @provider(input_types=[dense_vector(28 * 28), integer_value(10)])
dense_vector(28 * 28),
integer_value(10)
])
def process(settings, filename): # settings is not used currently. def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file f = open(filename, 'r') # open one of training file
......
...@@ -3,9 +3,12 @@ from paddle.trainer_config_helpers import * ...@@ -3,9 +3,12 @@ from paddle.trainer_config_helpers import *
dictionary = dict() dictionary = dict()
... # read dictionary from outside ... # read dictionary from outside
define_py_data_sources2(train_list='train.list', test_list=None, define_py_data_sources2(
module='sentimental_provider', obj='process', train_list='train.list',
# above codes same as mnist sample. test_list=None,
args={ # pass to provider. module='sentimental_provider',
'dictionary': dictionary obj='process',
}) # above codes same as mnist sample.
args={ # pass to provider.
'dictionary': dictionary
})
...@@ -12,7 +12,8 @@ def on_init(settings, dictionary, **kwargs): ...@@ -12,7 +12,8 @@ def on_init(settings, dictionary, **kwargs):
# The text is a sequence of integer values, and each value is a word id. # The text is a sequence of integer values, and each value is a word id.
# The whole sequence is the sentence whose sentiment we want to # The whole sequence is the sentence whose sentiment we want to
# predict. # predict.
integer_value(len(dictionary), seq_type=SequenceType), # text input integer_value(
len(dictionary), seq_type=SequenceType), # text input
# label positive/negative # label positive/negative
integer_value(2) integer_value(2)
......
...@@ -11,4 +11,3 @@ ...@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
...@@ -29,7 +29,10 @@ try: ...@@ -29,7 +29,10 @@ try:
whole_start = "" whole_start = ""
whole_end = "" whole_end = ""
LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"] LIB_DIRS = [
"math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver",
"trainer"
]
PARENT_LIB_DIRS = ['proto'] PARENT_LIB_DIRS = ['proto']
class PaddleLDFlag(object): class PaddleLDFlag(object):
...@@ -55,19 +58,20 @@ try: ...@@ -55,19 +58,20 @@ try:
self.curt = CUDA_LIBRARIES self.curt = CUDA_LIBRARIES
def ldflag_str(self): def ldflag_str(self):
return " ".join([self.libs_dir_str(), return " ".join(
self.parent_dir_str(), [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])
self.libs_str()])
def libs_dir_str(self): def libs_dir_str(self):
libdirs = LIB_DIRS libdirs = LIB_DIRS
return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x), return " ".join(
libdirs)) map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
libdirs))
def parent_dir_str(self): def parent_dir_str(self):
libdirs = PARENT_LIB_DIRS libdirs = PARENT_LIB_DIRS
return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x), return " ".join(
libdirs)) map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
libdirs))
def libs_str(self): def libs_str(self):
libs = [ libs = [
...@@ -113,10 +117,10 @@ try: ...@@ -113,10 +117,10 @@ try:
return cmake_flag return cmake_flag
elif cmake_flag.startswith("-l"): # normal link command elif cmake_flag.startswith("-l"): # normal link command
return cmake_flag return cmake_flag
elif cmake_flag in ["gflags-shared", elif cmake_flag in [
"gflags-static", "gflags-shared", "gflags-static", "gflags_nothreads-shared",
"gflags_nothreads-shared", "gflags_nothreads-static"
"gflags_nothreads-static"]: # special for gflags ]: # special for gflags
assert PaddleLDFlag.cmake_bool(self.gflags_location) assert PaddleLDFlag.cmake_bool(self.gflags_location)
return self.gflags_location return self.gflags_location
elif len(cmake_flag) != 0: elif len(cmake_flag) != 0:
...@@ -132,18 +136,22 @@ try: ...@@ -132,18 +136,22 @@ try:
:type cmake_str: str :type cmake_str: str
:rtype: bool :rtype: bool
""" """
if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith("-NOTFOUND"): if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith(
"-NOTFOUND"):
return False return False
else: else:
return True return True
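The helper above encodes a small but easy-to-miss convention: CMake's false-y strings and anything ending in -NOTFOUND mean "not available", while anything else (typically a library path) counts as true. A standalone restatement with example inputs:

def cmake_bool(cmake_str):
    # mirrors the method above
    if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith("-NOTFOUND"):
        return False
    return True

assert cmake_bool("OFF") is False
assert cmake_bool("GFLAGS_LIBRARY-NOTFOUND") is False
assert cmake_bool("/usr/lib/libgflags.a") is True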
def c_flag(self): def c_flag(self):
if self.with_coverage: if self.with_coverage:
return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"] return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
else: else:
return None return None
except ImportError: except ImportError:
class PaddleLDFlag(object): class PaddleLDFlag(object):
def ldflag_str(self): def ldflag_str(self):
pass pass
def c_flag(self): def c_flag(self):
pass pass
...@@ -32,7 +32,7 @@ class TestArguments(unittest.TestCase): ...@@ -32,7 +32,7 @@ class TestArguments(unittest.TestCase):
iv = args.getSlotIds(0) iv = args.getSlotIds(0)
assert isinstance(iv, swig_paddle.IVector) assert isinstance(iv, swig_paddle.IVector)
np_arr = iv.toNumpyArrayInplace() np_arr = iv.toNumpyArrayInplace()
self.assertEqual(np_arr.shape, (6,)) self.assertEqual(np_arr.shape, (6, ))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -30,8 +30,8 @@ class TestGradientMachine(unittest.TestCase): ...@@ -30,8 +30,8 @@ class TestGradientMachine(unittest.TestCase):
self.assertIsNotNone(model_config) self.assertIsNotNone(model_config)
machine = swig_paddle.GradientMachine.createByModelConfig( machine = swig_paddle.GradientMachine.createByModelConfig(
model_config, swig_paddle.CREATE_MODE_NORMAL, model_config, swig_paddle.CREATE_MODE_NORMAL,
swig_paddle.ParameterOptimizer.create( swig_paddle.ParameterOptimizer.create(opt_config).getParameterTypes(
opt_config).getParameterTypes()) ))
self.assertIsNotNone(machine) self.assertIsNotNone(machine)
ipt, _ = util.loadMNISTTrainData() ipt, _ = util.loadMNISTTrainData()
output = swig_paddle.Arguments.createArguments(0) output = swig_paddle.Arguments.createArguments(0)
...@@ -43,7 +43,7 @@ class TestGradientMachine(unittest.TestCase): ...@@ -43,7 +43,7 @@ class TestGradientMachine(unittest.TestCase):
assert isinstance(param, swig_paddle.Parameter) assert isinstance(param, swig_paddle.Parameter)
val = param.getBuf(swig_paddle.PARAMETER_VALUE) val = param.getBuf(swig_paddle.PARAMETER_VALUE)
assert isinstance(val, swig_paddle.Vector) assert isinstance(val, swig_paddle.Vector)
arr = numpy.full((len(val),), 0.1, dtype="float32") arr = numpy.full((len(val), ), 0.1, dtype="float32")
val.copyFromNumpyArray(arr) val.copyFromNumpyArray(arr)
param_config = param.getConfig().toProto() param_config = param.getConfig().toProto()
assert isinstance(param_config, assert isinstance(param_config,
......
...@@ -69,7 +69,8 @@ class TestMatrix(unittest.TestCase): ...@@ -69,7 +69,8 @@ class TestMatrix(unittest.TestCase):
def test_numpy(self): def test_numpy(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32") numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat) m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape) self.assertEqual(
(int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
# the numpy matrix and paddle matrix shared the same memory. # the numpy matrix and paddle matrix shared the same memory.
numpy_mat[0, 1] = 342.23 numpy_mat[0, 1] = 342.23
......
...@@ -98,7 +98,8 @@ def main(): ...@@ -98,7 +98,8 @@ def main():
cost_vec = outArgs.getSlotValue(0) cost_vec = outArgs.getSlotValue(0)
assert isinstance(cost_vec, swig_paddle.Matrix) assert isinstance(cost_vec, swig_paddle.Matrix)
cost_vec = cost_vec.copyToNumpyMat() cost_vec = cost_vec.copyToNumpyMat()
print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum() / batch_size print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum(
) / batch_size
batch_id += 1 batch_id += 1
for optimizer in optimizers: for optimizer in optimizers:
......
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
settings( settings(batch_size=100, learning_method=AdamOptimizer())
batch_size=100,
learning_method=AdamOptimizer()
)
din = data_layer(name='input', size=784) din = data_layer(name='input', size=784)
......
...@@ -17,9 +17,9 @@ from paddle.trainer.config_parser import logger ...@@ -17,9 +17,9 @@ from paddle.trainer.config_parser import logger
from py_paddle import swig_paddle from py_paddle import swig_paddle
import util import util
def main(): def main():
trainer_config = parse_config( trainer_config = parse_config("./testTrainConfig.py", "")
"./testTrainConfig.py", "")
model = swig_paddle.GradientMachine.createFromConfigProto( model = swig_paddle.GradientMachine.createFromConfigProto(
trainer_config.model_config) trainer_config.model_config)
trainer = swig_paddle.Trainer.create(trainer_config, model) trainer = swig_paddle.Trainer.create(trainer_config, model)
...@@ -56,7 +56,7 @@ def main(): ...@@ -56,7 +56,7 @@ def main():
logger.info('test cost=%f' % (cost / num)) logger.info('test cost=%f' % (cost / num))
trainer.finishTrain() trainer.finishTrain()
if __name__ == '__main__': if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1") swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
......
...@@ -112,5 +112,6 @@ class TestVector(unittest.TestCase): ...@@ -112,5 +112,6 @@ class TestVector(unittest.TestCase):
if __name__ == '__main__': if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=1" if swig_paddle.isGpuVersion() else "--use_gpu=0") swig_paddle.initPaddle("--use_gpu=1"
if swig_paddle.isGpuVersion() else "--use_gpu=0")
unittest.main() unittest.main()
...@@ -11,4 +11,3 @@ ...@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
...@@ -16,72 +16,79 @@ import numpy ...@@ -16,72 +16,79 @@ import numpy
import struct import struct
import traceback import traceback
def header_creator(): def header_creator():
ret = "" ret = ""
ret += struct.pack('i', 3) # slot num ret += struct.pack('i', 3) # slot num
ret += struct.pack('i', 1) # sequence flag ret += struct.pack('i', 1) # sequence flag
ret += struct.pack('i', 0) # slot0 dense type ret += struct.pack('i', 0) # slot0 dense type
ret += struct.pack('i', 3) # slot0 dim ret += struct.pack('i', 3) # slot0 dim
ret += struct.pack('i', 1) # slot1 sparse non value type ret += struct.pack('i', 1) # slot1 sparse non value type
ret += struct.pack('i', 7) # slot1 dim ret += struct.pack('i', 7) # slot1 dim
ret += struct.pack('i', 3) # slot2 index type ret += struct.pack('i', 3) # slot2 index type
ret += struct.pack('i', 2) # slot2 dim ret += struct.pack('i', 2) # slot2 dim
return ret return ret
def dense_value_creator(sample_num): def dense_value_creator(sample_num):
ret = "" ret = ""
ret += struct.pack('i', sample_num) # slot0 sample num ret += struct.pack('i', sample_num) # slot0 sample num
for i in range(sample_num): # slot0 value for i in range(sample_num): # slot0 value
ret += struct.pack('f', 1.0) ret += struct.pack('f', 1.0)
ret += struct.pack('f', 2.0) ret += struct.pack('f', 2.0)
ret += struct.pack('f', 3.0) ret += struct.pack('f', 3.0)
return ret return ret
def sparse_value_creator(sample_num): def sparse_value_creator(sample_num):
ret = "" ret = ""
ret += struct.pack('i', sample_num) # slot1 sample num ret += struct.pack('i', sample_num) # slot1 sample num
for i in range(sample_num): # slot1 index for i in range(sample_num): # slot1 index
ret += struct.pack('i', i * 2) ret += struct.pack('i', i * 2)
ret += struct.pack('i', sample_num * 2) #slot1 length ret += struct.pack('i', sample_num * 2) #slot1 length
for i in range(sample_num): # slot1 value for i in range(sample_num): # slot1 value
ret += struct.pack('i', 1) ret += struct.pack('i', 1)
ret += struct.pack('i', 2) ret += struct.pack('i', 2)
return ret return ret
def index_value_creator(sample_num): def index_value_creator(sample_num):
ret = "" ret = ""
ret += struct.pack('i', sample_num) # slot2 sample num ret += struct.pack('i', sample_num) # slot2 sample num
for i in range(sample_num): # slot2 value for i in range(sample_num): # slot2 value
ret += struct.pack('i', 0) ret += struct.pack('i', 0)
return ret return ret
def sequenceStartPositions_creator(): def sequenceStartPositions_creator():
ret = "" ret = ""
ret += struct.pack('i', 2) # slot0 sequence num ret += struct.pack('i', 2) # slot0 sequence num
ret += struct.pack('i', 0) # slot0 sequence value1 ret += struct.pack('i', 0) # slot0 sequence value1
ret += struct.pack('i', 1) # slot0 sequence value2 ret += struct.pack('i', 1) # slot0 sequence value2
ret += struct.pack('i', 1) # slot1 sequence num ret += struct.pack('i', 1) # slot1 sequence num
ret += struct.pack('i', 0) # slot1 sequence value1 ret += struct.pack('i', 0) # slot1 sequence value1
ret += struct.pack('i', 2) # slot2 sequence num ret += struct.pack('i', 2) # slot2 sequence num
ret += struct.pack('i', 0) # slot2 sequence value1 ret += struct.pack('i', 0) # slot2 sequence value1
ret += struct.pack('i', 1) # slot2 sequence value2 ret += struct.pack('i', 1) # slot2 sequence value2
return ret return ret
def subSequenceStartPositions_creator(): def subSequenceStartPositions_creator():
ret = "" ret = ""
ret += struct.pack('i', 3) # slot0 subsequence num ret += struct.pack('i', 3) # slot0 subsequence num
ret += struct.pack('i', 0) # slot0 subsequence value1 ret += struct.pack('i', 0) # slot0 subsequence value1
ret += struct.pack('i', 1) # slot0 subsequence value2 ret += struct.pack('i', 1) # slot0 subsequence value2
ret += struct.pack('i', 2) # slot0 subsequence value3 ret += struct.pack('i', 2) # slot0 subsequence value3
ret += struct.pack('i', 2) # slot1 subsequence num ret += struct.pack('i', 2) # slot1 subsequence num
ret += struct.pack('i', 0) # slot1 subsequence value1 ret += struct.pack('i', 0) # slot1 subsequence value1
ret += struct.pack('i', 1) # slot1 subsequence value2 ret += struct.pack('i', 1) # slot1 subsequence value2
ret += struct.pack('i', 3) # slot2 subsequence num ret += struct.pack('i', 3) # slot2 subsequence num
ret += struct.pack('i', 0) # slot2 subsequence value1 ret += struct.pack('i', 0) # slot2 subsequence value1
ret += struct.pack('i', 1) # slot2 subsequence value2 ret += struct.pack('i', 1) # slot2 subsequence value2
ret += struct.pack('i', 2) # slot2 subsequence value3 ret += struct.pack('i', 2) # slot2 subsequence value3
return ret return ret
class SimpleDataProvider: class SimpleDataProvider:
def __init__(self, *file_list): def __init__(self, *file_list):
self.file_list = file_list self.file_list = file_list
...@@ -93,17 +100,18 @@ class SimpleDataProvider: ...@@ -93,17 +100,18 @@ class SimpleDataProvider:
pass pass
def getHeader(self): def getHeader(self):
return header_creator() return header_creator()
def getNextBatch(self, batch_size): def getNextBatch(self, batch_size):
ret = "" ret = ""
ret += struct.pack('i', 2) # batch size ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(2) # slot0 ret += dense_value_creator(2) # slot0
ret += sparse_value_creator(2) # slot1 ret += sparse_value_creator(2) # slot1
ret += index_value_creator(2) # slot2 ret += index_value_creator(2) # slot2
ret += sequenceStartPositions_creator() ret += sequenceStartPositions_creator()
return ret return ret
class SimpleNestDataProvider: class SimpleNestDataProvider:
def __init__(self, *file_list): def __init__(self, *file_list):
self.file_list = file_list self.file_list = file_list
...@@ -119,14 +127,15 @@ class SimpleNestDataProvider: ...@@ -119,14 +127,15 @@ class SimpleNestDataProvider:
def getNextBatch(self, batch_size): def getNextBatch(self, batch_size):
ret = "" ret = ""
ret += struct.pack('i', 2) # batch size ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(4) # slot0 ret += dense_value_creator(4) # slot0
ret += sparse_value_creator(4) # slot1 ret += sparse_value_creator(4) # slot1
ret += index_value_creator(4) # slot2 ret += index_value_creator(4) # slot2
ret += sequenceStartPositions_creator() ret += sequenceStartPositions_creator()
ret += subSequenceStartPositions_creator() ret += subSequenceStartPositions_creator()
return ret return ret
if __name__ == "__main__": if __name__ == "__main__":
# test code # test code
data_provider = SimpleDataProvider('./test_batch') data_provider = SimpleDataProvider('./test_batch')
......
...@@ -22,18 +22,20 @@ data = [ ...@@ -22,18 +22,20 @@ data = [
[[[0, 2], [2, 5], [0, 1, 2]], 1], [[[0, 2], [2, 5], [0, 1, 2]], 1],
] ]
# Used for sequence_nest_rnn.conf # Used for sequence_nest_rnn.conf
@provider(input_types=[integer_value_sub_sequence(10), @provider(
integer_value(3)], input_types=[integer_value_sub_sequence(10), integer_value(3)],
should_shuffle=False) should_shuffle=False)
def process_subseq(settings, file_name): def process_subseq(settings, file_name):
for d in data: for d in data:
yield d yield d
# Used for sequence_rnn.conf # Used for sequence_rnn.conf
@provider(input_types=[integer_value_sequence(10), @provider(
integer_value(3)], input_types=[integer_value_sequence(10), integer_value(3)],
should_shuffle=False) should_shuffle=False)
def process_seq(settings, file_name): def process_seq(settings, file_name):
for d in data: for d in data:
seq = [] seq = []
...@@ -41,18 +43,20 @@ def process_seq(settings, file_name): ...@@ -41,18 +43,20 @@ def process_seq(settings, file_name):
seq += subseq seq += subseq
yield seq, d[1] yield seq, d[1]
# Used for sequence_nest_rnn_multi_input.conf # Used for sequence_nest_rnn_multi_input.conf
@provider(input_types=[integer_value_sub_sequence(10), @provider(
integer_value(3)], input_types=[integer_value_sub_sequence(10), integer_value(3)],
should_shuffle=False) should_shuffle=False)
def process_subseq2(settings, file_name): def process_subseq2(settings, file_name):
for d in data: for d in data:
yield d yield d
# Used for sequence_rnn_multi_input.conf # Used for sequence_rnn_multi_input.conf
@provider(input_types=[integer_value_sequence(10), @provider(
integer_value(3)], input_types=[integer_value_sequence(10), integer_value(3)],
should_shuffle=False) should_shuffle=False)
def process_seq2(settings, file_name): def process_seq2(settings, file_name):
for d in data: for d in data:
seq = [] seq = []
...@@ -60,31 +64,34 @@ def process_seq2(settings, file_name): ...@@ -60,31 +64,34 @@ def process_seq2(settings, file_name):
seq += subseq seq += subseq
yield seq, d[1] yield seq, d[1]
########################################################### ###########################################################
data2 = [ data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]], 0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], [[[0, 2], [2, 5], [0, 1, 2]], [[1, 5], [4], [2, 3, 6, 1]], 1],
] ]
# Used for sequence_nest_rnn_multi_unequalength_inputs.conf # Used for sequence_nest_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sub_sequence(10), @provider(
integer_value_sub_sequence(10), input_types=[
integer_value(2)], integer_value_sub_sequence(10), integer_value_sub_sequence(10),
should_shuffle=False) integer_value(2)
],
should_shuffle=False)
def process_unequalength_subseq(settings, file_name): def process_unequalength_subseq(settings, file_name):
for d in data2: for d in data2:
yield d yield d
# Used for sequence_rnn_multi_unequalength_inputs.conf # Used for sequence_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sequence(10), @provider(
integer_value_sequence(10), input_types=[
integer_value(2)], integer_value_sequence(10), integer_value_sequence(10), integer_value(2)
should_shuffle=False) ],
should_shuffle=False)
def process_unequalength_seq(settings, file_name): def process_unequalength_seq(settings, file_name):
for d in data2: for d in data2:
words1=reduce(lambda x,y: x+y, d[0]) words1 = reduce(lambda x, y: x + y, d[0])
words2=reduce(lambda x,y: x+y, d[1]) words2 = reduce(lambda x, y: x + y, d[1])
yield words1, words2, d[2] yield words1, words2, d[2]
...@@ -20,8 +20,9 @@ from paddle.trainer.PyDataProvider2 import * ...@@ -20,8 +20,9 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dict_file, **kwargs): def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)), settings.input_types = [
integer_value(3)] integer_value_sequence(len(settings.word_dict)), integer_value(3)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict))) settings.logger.info('dict len : %d' % (len(settings.word_dict)))
...@@ -32,16 +33,19 @@ def process(settings, file_name): ...@@ -32,16 +33,19 @@ def process(settings, file_name):
label, comment = line.strip().split('\t') label, comment = line.strip().split('\t')
label = int(''.join(label.split())) label = int(''.join(label.split()))
words = comment.split() words = comment.split()
word_slot = [settings.word_dict[w] for w in words if word_slot = [
w in settings.word_dict] settings.word_dict[w] for w in words if w in settings.word_dict
]
yield word_slot, label yield word_slot, label
## for hierarchical sequence network ## for hierarchical sequence network
def hook2(settings, dict_file, **kwargs): def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), settings.input_types = [
integer_value_sequence(3)] integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sequence(3)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict))) settings.logger.info('dict len : %d' % (len(settings.word_dict)))
...@@ -55,8 +59,10 @@ def process2(settings, file_name): ...@@ -55,8 +59,10 @@ def process2(settings, file_name):
label, comment = line.strip().split('\t') label, comment = line.strip().split('\t')
label = int(''.join(label.split())) label = int(''.join(label.split()))
words = comment.split() words = comment.split()
word_slot = [settings.word_dict[w] for w in words if word_slot = [
w in settings.word_dict] settings.word_dict[w] for w in words
if w in settings.word_dict
]
label_list.append(label) label_list.append(label)
word_slot_list.append(word_slot) word_slot_list.append(word_slot)
else: else:
......
...@@ -21,15 +21,16 @@ dict_file = dict() ...@@ -21,15 +21,16 @@ dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")): for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count dict_file[line.strip()] = line_count
define_py_data_sources2(train_list='gserver/tests/Sequence/train.list', define_py_data_sources2(
test_list=None, train_list='gserver/tests/Sequence/train.list',
module='sequenceGen', test_list=None,
obj='process', module='sequenceGen',
args={"dict_file":dict_file}) obj='process',
args={"dict_file": dict_file})
settings(batch_size=5) settings(batch_size=5)
######################## network configure ################################ ######################## network configure ################################
dict_dim = len(open(dict_path,'r').readlines()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128 word_dim = 128
hidden_dim = 256 hidden_dim = 256
label_dim = 3 label_dim = 3
...@@ -39,21 +40,24 @@ data = data_layer(name="word", size=dict_dim) ...@@ -39,21 +40,24 @@ data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(input=data, size=word_dim) emb = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory # (lstm_input + lstm) is equal to lstmemory
with mixed_layer(size=hidden_dim*4) as lstm_input: with mixed_layer(size=hidden_dim * 4) as lstm_input:
lstm_input += full_matrix_projection(input=emb) lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory_group(input=lstm_input, lstm = lstmemory_group(
size=hidden_dim, input=lstm_input,
act=TanhActivation(), size=hidden_dim,
gate_act=SigmoidActivation(), act=TanhActivation(),
state_act=TanhActivation(), gate_act=SigmoidActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm_last = last_seq(input=lstm) lstm_last = last_seq(input=lstm)
with mixed_layer(size=label_dim, with mixed_layer(
act=SoftmaxActivation(), size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last) output += full_matrix_projection(input=lstm_last)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
...@@ -21,15 +21,16 @@ dict_file = dict() ...@@ -21,15 +21,16 @@ dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")): for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count dict_file[line.strip()] = line_count
define_py_data_sources2(train_list='gserver/tests/Sequence/train.list.nest', define_py_data_sources2(
test_list=None, train_list='gserver/tests/Sequence/train.list.nest',
module='sequenceGen', test_list=None,
obj='process2', module='sequenceGen',
args={"dict_file":dict_file}) obj='process2',
args={"dict_file": dict_file})
settings(batch_size=2) settings(batch_size=2)
######################## network configure ################################ ######################## network configure ################################
dict_dim = len(open(dict_path,'r').readlines()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128 word_dim = 128
hidden_dim = 256 hidden_dim = 256
label_dim = 3 label_dim = 3
...@@ -38,37 +39,46 @@ data = data_layer(name="word", size=dict_dim) ...@@ -38,37 +39,46 @@ data = data_layer(name="word", size=dict_dim)
emb_group = embedding_layer(input=data, size=word_dim) emb_group = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory # (lstm_input + lstm) is equal to lstmemory
def lstm_group(lstm_group_input): def lstm_group(lstm_group_input):
with mixed_layer(size=hidden_dim*4) as group_input: with mixed_layer(size=hidden_dim * 4) as group_input:
group_input += full_matrix_projection(input=lstm_group_input) group_input += full_matrix_projection(input=lstm_group_input)
lstm_output = lstmemory_group(input=group_input, lstm_output = lstmemory_group(
name="lstm_group", input=group_input,
size=hidden_dim, name="lstm_group",
act=TanhActivation(), size=hidden_dim,
gate_act=SigmoidActivation(), act=TanhActivation(),
state_act=TanhActivation(), gate_act=SigmoidActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
return lstm_output return lstm_output
lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
step=lstm_group, lstm_nest_group = recurrent_group(
name="lstm_nest_group") input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
# hasSubseq ->(seqlastins) seq # hasSubseq ->(seqlastins) seq
lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) lstm_last = last_seq(
input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
# seq ->(expand) hasSubseq # seq ->(expand) hasSubseq
lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) lstm_expand = expand_layer(
input=lstm_last,
expand_as=emb_group,
expand_level=ExpandLevel.FROM_SEQUENCE)
# hasSubseq ->(average) seq # hasSubseq ->(average) seq
lstm_average = pooling_layer(input=lstm_expand, lstm_average = pooling_layer(
pooling_type=AvgPooling(), input=lstm_expand,
agg_level=AggregateLevel.EACH_SEQUENCE) pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
with mixed_layer(size=label_dim, with mixed_layer(
act=SoftmaxActivation(), size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
bias_attr=True) as output:
output += full_matrix_projection(input=lstm_average) output += full_matrix_projection(input=lstm_average)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
...@@ -33,16 +33,19 @@ def test_init_hooker(setting, value, **kwargs): ...@@ -33,16 +33,19 @@ def test_init_hooker(setting, value, **kwargs):
setting.value = value setting.value = value
@provider(input_types=[dense_vector(20, seq_type=SequenceType.NO_SEQUENCE)], @provider(
init_hook=test_init_hooker) input_types=[dense_vector(
20, seq_type=SequenceType.NO_SEQUENCE)],
init_hook=test_init_hooker)
def test_init_hook(setting, filename): def test_init_hook(setting, filename):
for i in xrange(200): for i in xrange(200):
yield setting.value yield setting.value
@provider( @provider(input_types=[
input_types=[ sparse_binary_vector(
sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) 30000, seq_type=SequenceType.NO_SEQUENCE)
])
def test_sparse_non_value_no_seq(setting, filename): def test_sparse_non_value_no_seq(setting, filename):
for i in xrange(200): for i in xrange(200):
yield [(i + 1) * (j + 1) for j in xrange(10)] yield [(i + 1) * (j + 1) for j in xrange(10)]
...@@ -77,28 +80,28 @@ def test_min_pool_size(setting, filename): ...@@ -77,28 +80,28 @@ def test_min_pool_size(setting, filename):
yield random.randint(0, 100 - 1) yield random.randint(0, 100 - 1)
@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)], @provider(
can_over_batch_size=False, input_types=[index_slot(
calc_batch_size=lambda x: len(x[0])) 100, seq_type=SequenceType.SEQUENCE)],
can_over_batch_size=False,
calc_batch_size=lambda x: len(x[0]))
def test_can_over_batch_size(setting, filename): def test_can_over_batch_size(setting, filename):
for _ in xrange(1 << 10): for _ in xrange(1 << 10):
seq_len = random.randint(0, 99) seq_len = random.randint(0, 99)
yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)}) @provider(input_types={'input1': index_slot(10), 'input2': index_slot(10)})
def test_input_order(setting, filename): def test_input_order(setting, filename):
for _ in xrange(1000): for _ in xrange(1000):
yield { yield {'input1': 0, 'input2': 1}
'input1': 0,
'input2': 1
}
@provider(input_types=[index_slot(10)], @provider(
check=True, input_types=[index_slot(10)],
check_fail_continue=True, check=True,
should_shuffle="123") # also test should shuffle check_fail_continue=True,
should_shuffle="123") # also test should shuffle
def test_check(settings, filename): def test_check(settings, filename):
yield_good_value = False yield_good_value = False
...@@ -108,4 +111,3 @@ def test_check(settings, filename): ...@@ -108,4 +111,3 @@ def test_check(settings, filename):
if i < 10: if i < 10:
yield_good_value = True yield_good_value = True
yield i yield i
...@@ -15,9 +15,10 @@ ...@@ -15,9 +15,10 @@
from util import DataProviderWrapperConverter from util import DataProviderWrapperConverter
from dataprovider_converter import DataProviderConverter from dataprovider_converter import DataProviderConverter
__all__ = ['paddle', __all__ = [
'DataProviderConverter', 'paddle',
'DataProviderWrapperConverter', # for deprecated usage. 'DataProviderConverter',
'loadParameterFile'] 'DataProviderWrapperConverter', # for deprecated usage.
'loadParameterFile'
]
util.monkeypatches() util.monkeypatches()
...@@ -45,10 +45,8 @@ class DenseScanner(IScanner): ...@@ -45,10 +45,8 @@ class DenseScanner(IScanner):
def finish_scan(self, argument): def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments) assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType) assert isinstance(self.input_type, dp2.InputType)
m = swig_paddle.Matrix.createDense(self.__mat__, m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__,
self.__height__, self.input_type.dim, False)
self.input_type.dim,
False)
argument.setSlotValue(self.pos, m) argument.setSlotValue(self.pos, m)
...@@ -141,8 +139,10 @@ class DataProviderConverter(object): ...@@ -141,8 +139,10 @@ class DataProviderConverter(object):
assert isinstance(argument, swig_paddle.Arguments) assert isinstance(argument, swig_paddle.Arguments)
argument.resize(len(self.input_types)) argument.resize(len(self.input_types))
scanners = [DataProviderConverter.create_scanner(i, each_type) scanners = [
for i, each_type in enumerate(self.input_types)] DataProviderConverter.create_scanner(i, each_type)
for i, each_type in enumerate(self.input_types)
]
for each_sample in dat: for each_sample in dat:
for each_step, scanner in zip(each_sample, scanners): for each_step, scanner in zip(each_sample, scanners):
...@@ -171,11 +171,14 @@ class DataProviderConverter(object): ...@@ -171,11 +171,14 @@ class DataProviderConverter(object):
assert retv is not None assert retv is not None
if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
retv = SequenceScanner(each, i, retv, lambda a, p, seq: retv = SequenceScanner(
a.setSlotSubSequenceStartPositions(p, seq)) each, i, retv,
lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq))
if each.seq_type in [dp2.SequenceType.SUB_SEQUENCE,
dp2.SequenceType.SEQUENCE]: if each.seq_type in [
retv = SequenceScanner(each, i, retv, lambda a, p, seq: dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
a.setSlotSequenceStartPositions(p, seq)) ]:
retv = SequenceScanner(
each, i, retv,
lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq))
return retv return retv
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
Some Useful method for py_paddle. Some Useful method for py_paddle.
""" """
...@@ -79,6 +78,7 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback): ...@@ -79,6 +78,7 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback):
else: else:
return __ParameterCallbackWrapper__(callback).__disown__() return __ParameterCallbackWrapper__(callback).__disown__()
def __arguments_to_numpy__(i, arg): def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments) assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i) value = arg.getSlotValue(i)
...@@ -89,10 +89,8 @@ def __arguments_to_numpy__(i, arg): ...@@ -89,10 +89,8 @@ def __arguments_to_numpy__(i, arg):
if ids is not None: if ids is not None:
assert isinstance(ids, swig_paddle.IVector) assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray() ids = ids.copyToNumpyArray()
return { return {"value": value, "id": ids}
"value": value,
"id": ids
}
def __monkeypatch_gradient_machine__(): def __monkeypatch_gradient_machine__():
""" """
...@@ -102,7 +100,6 @@ def __monkeypatch_gradient_machine__(): ...@@ -102,7 +100,6 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.loadFromConfigFile = \ swig_paddle.GradientMachine.loadFromConfigFile = \
staticmethod(loadGradientMachine) staticmethod(loadGradientMachine)
def __matrix_to_numpy__(m): def __matrix_to_numpy__(m):
if isinstance(m, swig_paddle.Matrix): if isinstance(m, swig_paddle.Matrix):
return m.copyToNumpyMat() return m.copyToNumpyMat()
...@@ -113,9 +110,11 @@ def __monkeypatch_gradient_machine__(): ...@@ -113,9 +110,11 @@ def __monkeypatch_gradient_machine__():
def createFromConfigProto(protoObj, def createFromConfigProto(protoObj,
createMode=swig_paddle.CREATE_MODE_NORMAL, createMode=swig_paddle.CREATE_MODE_NORMAL,
paramTypes=[swig_paddle.PARAMETER_VALUE, paramTypes=[
swig_paddle.PARAMETER_GRADIENT, swig_paddle.PARAMETER_VALUE,
swig_paddle.PARAMETER_MOMENTUM]): swig_paddle.PARAMETER_GRADIENT,
swig_paddle.PARAMETER_MOMENTUM
]):
""" """
Create Gradient Machine From Proto object. Create Gradient Machine From Proto object.
:param protoObj: Model config :param protoObj: Model config
...@@ -145,8 +144,10 @@ def __monkeypatch_gradient_machine__(): ...@@ -145,8 +144,10 @@ def __monkeypatch_gradient_machine__():
""" """
outArgs = swig_paddle.Arguments.createArguments(0) outArgs = swig_paddle.Arguments.createArguments(0)
self.forward(inArgs, outArgs, swig_paddle.PASS_TEST) self.forward(inArgs, outArgs, swig_paddle.PASS_TEST)
return [__arguments_to_numpy__(i, outArgs) for i in xrange( return [
outArgs.getSlotNum())] __arguments_to_numpy__(i, outArgs)
for i in xrange(outArgs.getSlotNum())
]
swig_paddle.GradientMachine.forwardTest = forwardTest swig_paddle.GradientMachine.forwardTest = forwardTest
...@@ -167,7 +168,10 @@ def __monkeypatch_gradient_machine__(): ...@@ -167,7 +168,10 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.__forwardBackward__ = \ swig_paddle.GradientMachine.__forwardBackward__ = \
swig_paddle.GradientMachine.forwardBackward swig_paddle.GradientMachine.forwardBackward
def forwardBackward(self, inArgs, outArgs, passType, def forwardBackward(self,
inArgs,
outArgs,
passType,
callback=swig_paddle.UpdateCallback()): callback=swig_paddle.UpdateCallback()):
""" """
GradientMachine forward backward. GradientMachine forward backward.
...@@ -315,9 +319,8 @@ class DataProviderWrapperConverter(object): ...@@ -315,9 +319,8 @@ class DataProviderWrapperConverter(object):
self.cols += other self.cols += other
def __call__(self, slot_idx, arg): def __call__(self, slot_idx, arg):
mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1, mat = swig_paddle.Matrix.createSparse(
self.dim, len(self.indices) - 1, self.dim, len(self.cols), True)
len(self.cols), True)
assert isinstance(mat, swig_paddle.Matrix) assert isinstance(mat, swig_paddle.Matrix)
mat.sparseCopyFrom(self.indices, self.cols) mat.sparseCopyFrom(self.indices, self.cols)
self.putIntoArg(slot_idx, arg, mat) self.putIntoArg(slot_idx, arg, mat)
...@@ -341,9 +344,8 @@ class DataProviderWrapperConverter(object): ...@@ -341,9 +344,8 @@ class DataProviderWrapperConverter(object):
self.values += map(lambda x: x[1], other) self.values += map(lambda x: x[1], other)
def __call__(self, slot_idx, arg): def __call__(self, slot_idx, arg):
mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1, mat = swig_paddle.Matrix.createSparse(
self.dim, len(self.indices) - 1, self.dim, len(self.cols), False)
len(self.cols), False)
assert isinstance(mat, swig_paddle.Matrix) assert isinstance(mat, swig_paddle.Matrix)
mat.sparseCopyFrom(self.indices, self.cols, self.values) mat.sparseCopyFrom(self.indices, self.cols, self.values)
self.putIntoArg(slot_idx, arg, mat) self.putIntoArg(slot_idx, arg, mat)
...@@ -352,8 +354,9 @@ class DataProviderWrapperConverter(object): ...@@ -352,8 +354,9 @@ class DataProviderWrapperConverter(object):
paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter, paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter,
paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter, paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter,
paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot: paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot:
SparseNonValueConverter, SparseNonValueConverter,
paddle.trainer.PyDataProviderWrapper.SparseValueSlot: SparseValueConverter paddle.trainer.PyDataProviderWrapper.SparseValueSlot:
SparseValueConverter
} }
def __init__(self, use_seq, header): def __init__(self, use_seq, header):
...@@ -381,10 +384,9 @@ class DataProviderWrapperConverter(object): ...@@ -381,10 +384,9 @@ class DataProviderWrapperConverter(object):
assert isinstance(argument, swig_paddle.Arguments) assert isinstance(argument, swig_paddle.Arguments)
argument.resize(len(self.__header__)) argument.resize(len(self.__header__))
values = map(lambda x: values = map(
DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[ lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x),
x.__class__](x), self.__header__)
self.__header__)
if self.__use_seq__: if self.__use_seq__:
seq_dim = [[] for _ in xrange(self.__header__.__len__())] seq_dim = [[] for _ in xrange(self.__header__.__len__())]
...@@ -394,14 +396,13 @@ class DataProviderWrapperConverter(object): ...@@ -394,14 +396,13 @@ class DataProviderWrapperConverter(object):
for slot_idx, sequence in enumerate(each_sample): for slot_idx, sequence in enumerate(each_sample):
for raw_data in sequence: for raw_data in sequence:
values[slot_idx].append(raw_data) values[slot_idx].append(raw_data)
seq_start_pos[slot_idx].append( seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] +
seq_start_pos[slot_idx][-1] + len(sequence)) len(sequence))
seq_dim[slot_idx].append(len(sequence)) seq_dim[slot_idx].append(len(sequence))
for slot_idx in xrange(len(self.__header__)): for slot_idx in xrange(len(self.__header__)):
argument.setSlotSequenceDim(slot_idx, argument.setSlotSequenceDim(
swig_paddle.IVector.create( slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx]))
seq_dim[slot_idx]))
argument.setSlotSequenceStartPositions( argument.setSlotSequenceStartPositions(
slot_idx, slot_idx,
swig_paddle.IVector.create(seq_start_pos[slot_idx])) swig_paddle.IVector.create(seq_start_pos[slot_idx]))
...@@ -422,7 +423,6 @@ class DataProviderWrapperConverter(object): ...@@ -422,7 +423,6 @@ class DataProviderWrapperConverter(object):
return self.convert(wrapper_data, argument) return self.convert(wrapper_data, argument)
def __monkey_patch_protobuf_objects__(): def __monkey_patch_protobuf_objects__():
def ParameterConfig_toProto(self): def ParameterConfig_toProto(self):
""" """
...@@ -459,8 +459,7 @@ def __monkey_patch_protobuf_objects__(): ...@@ -459,8 +459,7 @@ def __monkey_patch_protobuf_objects__():
:return: paddle.OptimizationConfig :return: paddle.OptimizationConfig
""" """
assert isinstance(protoObj, assert isinstance(protoObj, paddle.proto.OptimizationConfig)
paddle.proto.OptimizationConfig)
return swig_paddle.OptimizationConfig.createFromProtoString( return swig_paddle.OptimizationConfig.createFromProtoString(
protoObj.SerializeToString()) protoObj.SerializeToString())
...@@ -475,8 +474,7 @@ def __monkey_patch_protobuf_objects__(): ...@@ -475,8 +474,7 @@ def __monkey_patch_protobuf_objects__():
:param protoObj: proto.TrainerConfig :param protoObj: proto.TrainerConfig
:return: paddle.TrainerConfig :return: paddle.TrainerConfig
""" """
assert isinstance(protoObj, assert isinstance(protoObj, paddle.proto.TrainerConfig)
paddle.proto.TrainerConfig)
return swig_paddle.TrainerConfig.createFromProtoString( return swig_paddle.TrainerConfig.createFromProtoString(
protoObj.SerializeToString()) protoObj.SerializeToString())
...@@ -537,6 +535,7 @@ def __monkey_patch_trainer__(): ...@@ -537,6 +535,7 @@ def __monkey_patch_trainer__():
assert isinstance(model, swig_paddle.GradientMachine) assert isinstance(model, swig_paddle.GradientMachine)
return swig_paddle.Trainer.__create__( return swig_paddle.Trainer.__create__(
swig_paddle.TrainerConfig.createFromProto(config), model) swig_paddle.TrainerConfig.createFromProto(config), model)
swig_paddle.Trainer.create = staticmethod(Trainer_create) swig_paddle.Trainer.create = staticmethod(Trainer_create)
swig_paddle.Trainer.__getForwardOutput__ = \ swig_paddle.Trainer.__getForwardOutput__ = \
...@@ -551,14 +550,19 @@ def __monkey_patch_trainer__(): ...@@ -551,14 +550,19 @@ def __monkey_patch_trainer__():
numpy.ndarray. numpy.ndarray.
""" """
outArgs = self.__getForwardOutput__() outArgs = self.__getForwardOutput__()
return [__arguments_to_numpy__(i, outArgs) for i in xrange( return [
outArgs.getSlotNum())] __arguments_to_numpy__(i, outArgs)
for i in xrange(outArgs.getSlotNum())
]
swig_paddle.Trainer.getForwardOutput = getForwardOutput swig_paddle.Trainer.getForwardOutput = getForwardOutput
def monkeypatches(): def monkeypatches():
patches = [__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, patches = [
__monkey_patch_protobuf_objects__, __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
__monkey_patch_parameter__, __monkey_patch_trainer__] __monkey_patch_protobuf_objects__, __monkey_patch_parameter__,
__monkey_patch_trainer__
]
for patch in patches: for patch in patches:
patch() patch()
...@@ -13,17 +13,14 @@ ...@@ -13,17 +13,14 @@
# limitations under the License. # limitations under the License.
HOSTS = [ HOSTS = [
"root@192.168.100.17", "root@192.168.100.17",
"root@192.168.100.18", "root@192.168.100.18",
] ]
''' '''
workspace configuration workspace configuration
''' '''
#root dir for workspace, can be set as any director with real user account #root dir for workspace, can be set as any director with real user account
ROOT_DIR = "/home/paddle" ROOT_DIR = "/home/paddle"
''' '''
network configuration network configuration
''' '''
...@@ -37,4 +34,4 @@ PADDLE_PORTS_NUM = 2 ...@@ -37,4 +34,4 @@ PADDLE_PORTS_NUM = 2
PADDLE_PORTS_NUM_FOR_SPARSE = 2 PADDLE_PORTS_NUM_FOR_SPARSE = 2
#environments setting for all processes in cluster job #environments setting for all processes in cluster job
LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" module for launching cluster job """ """ module for launching cluster job """
import os import os
...@@ -23,13 +21,13 @@ import copy ...@@ -23,13 +21,13 @@ import copy
import time import time
import signal import signal
from fabric.api import run, put, settings, env, prefix from fabric.api import run, put, settings, env, prefix
from fabric.tasks import execute from fabric.tasks import execute
#configuration for cluster #configuration for cluster
import conf import conf
def refine_unknown_args(cmd_args): def refine_unknown_args(cmd_args):
''' '''
refine unknown parameters to handle some special parameters refine unknown parameters to handle some special parameters
...@@ -37,7 +35,7 @@ def refine_unknown_args(cmd_args): ...@@ -37,7 +35,7 @@ def refine_unknown_args(cmd_args):
new_args = [] new_args = []
for arg in cmd_args: for arg in cmd_args:
if arg.startswith("--") and arg.find("=") != -1: if arg.startswith("--") and arg.find("=") != -1:
equal_pos = arg.find("=") #find first = pos equal_pos = arg.find("=") #find first = pos
arglist = list(arg) arglist = list(arg)
arglist[equal_pos] = " " arglist[equal_pos] = " "
arg = "".join(arglist) arg = "".join(arglist)
...@@ -50,6 +48,7 @@ def refine_unknown_args(cmd_args): ...@@ -50,6 +48,7 @@ def refine_unknown_args(cmd_args):
new_args.append(arg) new_args.append(arg)
return new_args return new_args
def kill_process(): def kill_process():
''' '''
kill comments threads kill comments threads
...@@ -60,6 +59,7 @@ def kill_process(): ...@@ -60,6 +59,7 @@ def kill_process():
| awk '{print $2}' \ | awk '{print $2}' \
| xargs kill > /dev/null 2>&1") | xargs kill > /dev/null 2>&1")
def job_prepare(jobdir, data=None): def job_prepare(jobdir, data=None):
''' '''
prepare job related workspace data prepare job related workspace data
...@@ -70,6 +70,7 @@ def job_prepare(jobdir, data=None): ...@@ -70,6 +70,7 @@ def job_prepare(jobdir, data=None):
This function just prepare all related model and other resources This function just prepare all related model and other resources
needed at runtime. needed at runtime.
''' '''
def job_create_workspace(jobdir, data=None): def job_create_workspace(jobdir, data=None):
''' '''
prepare job workspace, common file, etc. prepare job workspace, common file, etc.
...@@ -94,7 +95,8 @@ def job_prepare(jobdir, data=None): ...@@ -94,7 +95,8 @@ def job_prepare(jobdir, data=None):
execute(set_nodefile, i, hosts=conf.HOSTS[i]) execute(set_nodefile, i, hosts=conf.HOSTS[i])
#clean rubbish caused by exception #clean rubbish caused by exception
with settings(warn_only=True): with settings(warn_only=True):
execute(kill_process, hosts=conf.HOSTS) execute(kill_process, hosts=conf.HOSTS)
def job_pserver(jobdir, pids=None): def job_pserver(jobdir, pids=None):
''' '''
...@@ -124,9 +126,8 @@ def job_pserver(jobdir, pids=None): ...@@ -124,9 +126,8 @@ def job_pserver(jobdir, pids=None):
execute(start_pserver, jobdir, pargs, hosts=conf.HOSTS) execute(start_pserver, jobdir, pargs, hosts=conf.HOSTS)
def job_trainer(jobdir,
train_args_dict, def job_trainer(jobdir, train_args_dict, pids=None):
pids=None):
''' '''
start paddle trainer start paddle trainer
''' '''
...@@ -171,9 +172,8 @@ def job_trainer(jobdir, ...@@ -171,9 +172,8 @@ def job_trainer(jobdir,
train_args += " --trainer_id=" + str(i) train_args += " --trainer_id=" + str(i)
execute(start_trainer, jobdir, train_args, hosts=conf.HOSTS[i]) execute(start_trainer, jobdir, train_args, hosts=conf.HOSTS[i])
def job_all(job_package,
jobdir=None, def job_all(job_package, jobdir=None, train_args_dict=None):
train_args_dict=None):
''' '''
param job_package param job_package
param train_args_dict param train_args_dict
...@@ -183,41 +183,52 @@ def job_all(job_package, ...@@ -183,41 +183,52 @@ def job_all(job_package,
jobdir = conf.ROOT_DIR + "/JOB" + timestamp jobdir = conf.ROOT_DIR + "/JOB" + timestamp
job_prepare(jobdir, job_package) job_prepare(jobdir, job_package)
job_pserver(jobdir) job_pserver(jobdir)
time.sleep(5) #wait until pservers completely start time.sleep(5) #wait until pservers completely start
job_trainer(jobdir, train_args_dict) job_trainer(jobdir, train_args_dict)
job_clean() job_clean()
def job_clean(): def job_clean():
''' '''
if starting job failed from paddle internal, the framework always if starting job failed from paddle internal, the framework always
is launched successfully since these process are daemon processes. is launched successfully since these process are daemon processes.
so this job_clean can alway clean job rubbish process with ctrl+c. so this job_clean can alway clean job rubbish process with ctrl+c.
''' '''
def signal_handler(signal, frame): def signal_handler(signal, frame):
''' '''
SIGINT handler SIGINT handler
''' '''
def kill_process(): def kill_process():
run("ps aux \ run("ps aux \
| grep paddle_process_by_paddle \ | grep paddle_process_by_paddle \
| grep -v grep \ | grep -v grep \
| awk '{print $2}' \ | awk '{print $2}' \
| xargs kill > /dev/null 2>&1") | xargs kill > /dev/null 2>&1")
with settings(warn_only=True): with settings(warn_only=True):
execute(kill_process, hosts=conf.HOSTS) execute(kill_process, hosts=conf.HOSTS)
signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler)
signal.pause() signal.pause()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="paddle.py", parser = argparse.ArgumentParser(
description='simple tool for cluster training') prog="paddle.py", description='simple tool for cluster training')
parser.add_argument('-j', '--job_workspace', parser.add_argument(
required=False, default=None, '-j',
help='job workspace') '--job_workspace',
parser.add_argument('-p', '--job_dispatch_package', required=False,
required=False, default=None, default=None,
help='job package for dispatching to all other nodes') help='job workspace')
parser.add_argument(
'-p',
'--job_dispatch_package',
required=False,
default=None,
help='job package for dispatching to all other nodes')
args, train_args_list = parser.parse_known_args() args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list) train_args = refine_unknown_args(train_args_list)
...@@ -227,14 +238,10 @@ if __name__ == '__main__': ...@@ -227,14 +238,10 @@ if __name__ == '__main__':
#if assigned workspace, do not need to dispatch data, #if assigned workspace, do not need to dispatch data,
#so job_local_package should be None #so job_local_package should be None
assert args.job_dispatch_package is None assert args.job_dispatch_package is None
job_all(None, job_all(None, args.job_workspace, train_args_dict)
args.job_workspace,
train_args_dict)
elif args.job_dispatch_package is not None: elif args.job_dispatch_package is not None:
assert args.job_workspace is None assert args.job_workspace is None
assert os.path.isdir(args.job_dispatch_package) assert os.path.isdir(args.job_dispatch_package)
job_all(args.job_dispatch_package, job_all(args.job_dispatch_package, None, train_args_dict)
None,
train_args_dict)
else: else:
print "--job_workspace or --job_dispatch_package should be set" print "--job_workspace or --job_dispatch_package should be set"
...@@ -11,4 +11,3 @@ ...@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
...@@ -17,6 +17,6 @@ from paddle.trainer.config_parser import parse_config_and_serialize ...@@ -17,6 +17,6 @@ from paddle.trainer.config_parser import parse_config_and_serialize
if __name__ == '__main__': if __name__ == '__main__':
parse_config_and_serialize('trainer/tests/test_config.conf', '') parse_config_and_serialize('trainer/tests/test_config.conf', '')
parse_config_and_serialize( parse_config_and_serialize(
'trainer/tests/sample_trainer_config.conf', 'trainer/tests/sample_trainer_config.conf',
'extension_module_name=paddle.trainer.config_parser_extension') 'extension_module_name=paddle.trainer.config_parser_extension')
parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '') parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '')
...@@ -21,8 +21,7 @@ import logging ...@@ -21,8 +21,7 @@ import logging
import pprint import pprint
logging.basicConfig( logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
)
logger = logging.getLogger('paddle') logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
...@@ -36,33 +35,32 @@ num_original_columns = 3 ...@@ -36,33 +35,32 @@ num_original_columns = 3
# [[-1,0], [0,0]] means previous token at column 0 and current token at # [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature. # column 0 are combined as one feature.
patterns = [ patterns = [
[[-2,0]], [[-2, 0]],
[[-1,0]], [[-1, 0]],
[[0,0]], [[0, 0]],
[[1,0]], [[1, 0]],
[[2,0]], [[2, 0]],
[[-1, 0], [0, 0]],
[[-1,0], [0,0]], [[0, 0], [1, 0]],
[[0,0], [1,0]], [[-2, 1]],
[[-1, 1]],
[[-2,1]], [[0, 1]],
[[-1,1]], [[1, 1]],
[[0,1]], [[2, 1]],
[[1,1]], [[-2, 1], [-1, 1]],
[[2,1]], [[-1, 1], [0, 1]],
[[-2,1], [-1,1]], [[0, 1], [1, 1]],
[[-1,1], [0,1]], [[1, 1], [2, 1]],
[[0,1], [1,1]], [[-2, 1], [-1, 1], [0, 1]],
[[1,1], [2,1]], [[-1, 1], [0, 1], [1, 1]],
[[0, 1], [1, 1], [2, 1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
] ]
def make_features(sequence): def make_features(sequence):
length = len(sequence) length = len(sequence)
num_features = len(sequence[0]) num_features = len(sequence[0])
def get_features(pos): def get_features(pos):
if pos < 0: if pos < 0:
return ['#B%s' % -pos] * num_features return ['#B%s' % -pos] * num_features
...@@ -72,9 +70,10 @@ def make_features(sequence): ...@@ -72,9 +70,10 @@ def make_features(sequence):
for i in xrange(length): for i in xrange(length):
for pattern in patterns: for pattern in patterns:
fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern]) fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
sequence[i].append(fname) sequence[i].append(fname)
''' '''
Source file format: Source file format:
Each line is for one timestep. The features are separated by space. Each line is for one timestep. The features are separated by space.
...@@ -87,6 +86,8 @@ i-th column. ...@@ -87,6 +86,8 @@ i-th column.
return a list of dict for each column return a list of dict for each column
''' '''
def create_dictionaries(filename, cutoff, oov_policy): def create_dictionaries(filename, cutoff, oov_policy):
def add_to_dict(sequence, dicts): def add_to_dict(sequence, dicts):
num_features = len(dicts) num_features = len(dicts)
...@@ -118,7 +119,6 @@ def create_dictionaries(filename, cutoff, oov_policy): ...@@ -118,7 +119,6 @@ def create_dictionaries(filename, cutoff, oov_policy):
features = line.split(' ') features = line.split(' ')
sequence.append(features) sequence.append(features)
for i in xrange(num_features): for i in xrange(num_features):
dct = dicts[i] dct = dicts[i]
n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
...@@ -161,12 +161,9 @@ existed in dicts[i] will be assigned to id 0. ...@@ -161,12 +161,9 @@ existed in dicts[i] will be assigned to id 0.
if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
in dicts[i]. in dicts[i].
''' '''
def gen_proto_file(
input_file,
dicts,
oov_policy,
output_file):
def gen_proto_file(input_file, dicts, oov_policy, output_file):
def write_sequence(out, sequence): def write_sequence(out, sequence):
num_features = len(dicts) num_features = len(dicts)
is_beginning = True is_beginning = True
...@@ -213,8 +210,8 @@ def gen_proto_file( ...@@ -213,8 +210,8 @@ def gen_proto_file(
if patterns: if patterns:
slot_def = header.slot_defs.add() slot_def = header.slot_defs.add()
slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
slot_def.dim = sum([len(dicts[i]) slot_def.dim = sum(
for i in xrange(num_original_columns, len(dicts))]) [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
logger.info("feature_dim=%s" % slot_def.dim) logger.info("feature_dim=%s" % slot_def.dim)
for i in xrange(num_original_columns): for i in xrange(num_original_columns):
...@@ -242,30 +239,31 @@ def gen_proto_file( ...@@ -242,30 +239,31 @@ def gen_proto_file(
logger.info("num_sequences=%s" % num_sequences) logger.info("num_sequences=%s" % num_sequences)
dict2 = { dict2 = {
'B-ADJP': 0, 'B-ADJP': 0,
'I-ADJP': 1, 'I-ADJP': 1,
'B-ADVP': 2, 'B-ADVP': 2,
'I-ADVP': 3, 'I-ADVP': 3,
'B-CONJP': 4, 'B-CONJP': 4,
'I-CONJP': 5, 'I-CONJP': 5,
'B-INTJ': 6, 'B-INTJ': 6,
'I-INTJ': 7, 'I-INTJ': 7,
'B-LST': 8, 'B-LST': 8,
'I-LST': 9, 'I-LST': 9,
'B-NP': 10, 'B-NP': 10,
'I-NP': 11, 'I-NP': 11,
'B-PP': 12, 'B-PP': 12,
'I-PP': 13, 'I-PP': 13,
'B-PRT': 14, 'B-PRT': 14,
'I-PRT': 15, 'I-PRT': 15,
'B-SBAR': 16, 'B-SBAR': 16,
'I-SBAR': 17, 'I-SBAR': 17,
'B-UCP': 18, 'B-UCP': 18,
'I-UCP': 19, 'I-UCP': 19,
'B-VP': 20, 'B-VP': 20,
'I-VP': 21, 'I-VP': 21,
'O': 22 'O': 22
} }
if __name__ == '__main__': if __name__ == '__main__':
...@@ -273,16 +271,9 @@ if __name__ == '__main__': ...@@ -273,16 +271,9 @@ if __name__ == '__main__':
cutoff += [3] * len(patterns) cutoff += [3] * len(patterns)
oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
oov_policy += [OOV_POLICY_IGNORE] * len(patterns) oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
dicts = create_dictionaries( dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
'trainer/tests/train.txt', cutoff, oov_policy)
dicts[2] = dict2 dicts[2] = dict2
gen_proto_file( gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
'trainer/tests/train.txt', 'trainer/tests/train_proto.bin')
dicts, gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
oov_policy, 'trainer/tests/test_proto.bin')
'trainer/tests/train_proto.bin')
gen_proto_file(
'trainer/tests/test.txt',
dicts,
oov_policy,
'trainer/tests/test_proto.bin')
...@@ -21,7 +21,10 @@ import json ...@@ -21,7 +21,10 @@ import json
import string import string
@provider(slots=[SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), IndexSlot(3)]) @provider(slots=[
SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
IndexSlot(3)
])
def processNonSequenceData(obj, filename): def processNonSequenceData(obj, filename):
with open(filename, "rb") as f: with open(filename, "rb") as f:
for line in f: for line in f:
...@@ -50,6 +53,7 @@ val_randomer = lambda: random.uniform(-1.0, 1.0) ...@@ -50,6 +53,7 @@ val_randomer = lambda: random.uniform(-1.0, 1.0)
seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT) seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT)
str_count_randomer = lambda: random.randrange(1, STRING_LIMIT) str_count_randomer = lambda: random.randrange(1, STRING_LIMIT)
class IDRandomer(): # A random generator, return unique id class IDRandomer(): # A random generator, return unique id
def __init__(self): def __init__(self):
self.id_set = set() self.id_set = set()
...@@ -61,38 +65,57 @@ class IDRandomer(): # A random generator, return unique id ...@@ -61,38 +65,57 @@ class IDRandomer(): # A random generator, return unique id
return idx return idx
else: else:
return self.__call__() return self.__call__()
# SparseValueSlot # SparseValueSlot
def sparse_value_creator(_): def sparse_value_creator(_):
rand = IDRandomer() rand = IDRandomer()
return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())] return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())]
sparse_value = map(sparse_value_creator, range(seq_count_randomer())) sparse_value = map(sparse_value_creator, range(seq_count_randomer()))
# DenseSlot # DenseSlot
def dense_creator(_): def dense_creator(_):
return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)] return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)]
dense = map(dense_creator, range(seq_count_randomer())) dense = map(dense_creator, range(seq_count_randomer()))
# SparseNonValueSlot # SparseNonValueSlot
def sparse_creator(_): def sparse_creator(_):
rand = IDRandomer() rand = IDRandomer()
return [rand() for _ in xrange(sparse_count_randomer())] return [rand() for _ in xrange(sparse_count_randomer())]
sparse_nonvalue = map(sparse_creator, range(seq_count_randomer())) sparse_nonvalue = map(sparse_creator, range(seq_count_randomer()))
# IndexSlot # IndexSlot
ids = [sparse_id_randomer() for _ in range(seq_count_randomer())] ids = [sparse_id_randomer() for _ in range(seq_count_randomer())]
# StringSlot # StringSlot
def random_str(size = 8, chars=string.ascii_letters + string.digits): def random_str(size=8, chars=string.ascii_letters + string.digits):
return ''.join(random.choice(chars) for _ in range(size)) return ''.join(random.choice(chars) for _ in range(size))
strs = [random_str(str_count_randomer()) for _ in range(seq_count_randomer())] strs = [random_str(str_count_randomer()) for _ in range(seq_count_randomer())]
def processSeqAndGenerateDataInit(obj, *args, **kwargs): def processSeqAndGenerateDataInit(obj, *args, **kwargs):
obj.json_filename = kwargs.get("load_data_args", "test_data.json") obj.json_filename = kwargs.get("load_data_args", "test_data.json")
@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), @provider(
StringSlot(SPARSE_ID_LIMIT)], slots=[
use_seq=True, init_hook=processSeqAndGenerateDataInit) SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)
],
use_seq=True,
init_hook=processSeqAndGenerateDataInit)
def processSeqAndGenerateData(obj, name): def processSeqAndGenerateData(obj, name):
retv = [sparse_value, dense, sparse_nonvalue, ids, strs] retv = [sparse_value, dense, sparse_nonvalue, ids, strs]
# Write to protoseq. # Write to protoseq.
...@@ -104,10 +127,15 @@ def processSeqAndGenerateData(obj, name): ...@@ -104,10 +127,15 @@ def processSeqAndGenerateData(obj, name):
def processSubSeqAndGenerateDataInit(obj, *args, **kwargs): def processSubSeqAndGenerateDataInit(obj, *args, **kwargs):
obj.json_filename = kwargs.get("load_data_args", "test_data.json") obj.json_filename = kwargs.get("load_data_args", "test_data.json")
@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), @provider(
StringSlot(SPARSE_ID_LIMIT)], slots=[
use_seq=True, init_hook=processSubSeqAndGenerateDataInit) SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)
],
use_seq=True,
init_hook=processSubSeqAndGenerateDataInit)
def processSubSeqAndGenerateData(obj, name): def processSubSeqAndGenerateData(obj, name):
retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs] retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs]
retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]] retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]]
...@@ -116,6 +144,7 @@ def processSubSeqAndGenerateData(obj, name): ...@@ -116,6 +144,7 @@ def processSubSeqAndGenerateData(obj, name):
json.dump(retv_json, f) json.dump(retv_json, f)
yield retv_wrapper yield retv_wrapper
if __name__ == "__main__": if __name__ == "__main__":
pvd = processNonSequenceData("test.txt") pvd = processNonSequenceData("test.txt")
print pvd.getNextBatch(100) print pvd.getNextBatch(100)
......
import os import os
def __activate_virtual_env__(): def __activate_virtual_env__():
__path__ = os.getenv('VIRTUAL_ENV') __path__ = os.getenv('VIRTUAL_ENV')
if __path__ is None: if __path__ is None:
return return
__script__ = os.path.join(__path__, 'bin', 'activate_this.py') __script__ = os.path.join(__path__, 'bin', 'activate_this.py')
execfile(__script__, {'__file__': __script__}) execfile(__script__, {'__file__': __script__})
__activate_virtual_env__() __activate_virtual_env__()