Commit 319742c6 authored by: Q qijun

format python code in demo, doc, doc_cn and paddle directories

Parent ef5e483c
......@@ -16,7 +16,6 @@ import numpy as np
import sys
import os
import PIL.Image as Image
"""
Usage: python process_cifar input_dir output_dir
"""
......@@ -30,6 +29,7 @@ def mkdir_not_exist(path):
if not os.path.exists(path):
os.mkdir(path)
def create_dir_structure(output_dir):
"""
Create the directory structure for the output directory.
......@@ -39,8 +39,8 @@ def create_dir_structure(output_dir):
mkdir_not_exist(os.path.join(output_dir, "train"))
mkdir_not_exist(os.path.join(output_dir, "test"))
def convert_batch(batch_path, label_set, label_map,
output_dir, data_split):
def convert_batch(batch_path, label_set, label_map, output_dir, data_split):
"""
Convert a CIFAR batch into the directory structure of the Paddle format.
batch_path: the batch to be converted.
......@@ -67,11 +67,23 @@ if __name__ == '__main__':
output_dir = sys.argv[2]
num_batch = 5
create_dir_structure(output_dir)
label_map = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer",
5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"}
label_map = {
0: "airplane",
1: "automobile",
2: "bird",
3: "cat",
4: "deer",
5: "dog",
6: "frog",
7: "horse",
8: "ship",
9: "truck"
}
labels = {}
for i in range(1, num_batch + 1):
convert_batch(os.path.join(input_dir, "data_batch_%d" % i), labels,
label_map, output_dir, "train")
convert_batch(os.path.join(input_dir, "test_batch"), {},
label_map, output_dir, "test")
\ No newline at end of file
convert_batch(
os.path.join(input_dir, "data_batch_%d" % i), labels, label_map,
output_dir, "train")
convert_batch(
os.path.join(input_dir, "test_batch"), {}, label_map, output_dir,
"test")
......@@ -46,14 +46,14 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
settings.img_mean = image_util.load_meta(settings.meta_path,
settings.mean_img_size,
settings.img_size,
settings.color)
settings.img_size, settings.color)
settings.logger.info('Image size: %s', settings.img_size)
settings.logger.info('Meta path: %s', settings.meta_path)
settings.input_types = [
dense_vector(settings.img_raw_size), # image feature
integer_value(settings.num_classes)] # labels
integer_value(settings.num_classes)
] # labels
settings.logger.info('DataProvider Initialization finished')
......@@ -79,8 +79,8 @@ def processData(settings, file_list):
img = image_util.decode_jpeg(data['images'][i])
else:
img = data['images'][i]
img_feat = image_util.preprocess_img(img, settings.img_mean,
settings.img_size, settings.is_train,
settings.color)
img_feat = image_util.preprocess_img(
img, settings.img_mean, settings.img_size,
settings.is_train, settings.color)
label = data['labels'][i]
yield img_feat.astype('float32'), int(label)
......@@ -16,17 +16,20 @@ import numpy as np
from PIL import Image
from cStringIO import StringIO
def resize_image(img, target_size):
"""
Resize an image so that the shorter edge has length target_size.
img: the input image to be resized.
target_size: the target resized image size.
"""
percent = (target_size/float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent))
percent = (target_size / float(min(img.size[0], img.size[1])))
resized_size = int(round(img.size[0] * percent)), int(
round(img.size[1] * percent))
img = img.resize(resized_size, Image.ANTIALIAS)
return img
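A quick sanity check of the aspect-ratio-preserving resize above; a minimal sketch, assuming this file is importable as image_util (the name prediction.py uses for it):

    from PIL import Image
    from image_util import resize_image

    img = Image.new("RGB", (400, 200))  # hypothetical 400x200 input
    resized = resize_image(img, 100)    # shorter edge becomes 100
    assert resized.size == (200, 100)   # the longer edge scales by the same factor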
def flip(im):
"""
Return the flipped image.
......@@ -38,6 +41,7 @@ def flip(im):
else:
return im[:, ::-1]
def crop_img(im, inner_size, color=True, test=True):
"""
Return cropped image.
......@@ -50,20 +54,22 @@ def crop_img(im, inner_size, color=True, test=True):
If True, crop the center of images.
"""
if color:
height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2])
height, width = max(inner_size, im.shape[1]), max(inner_size,
im.shape[2])
padded_im = np.zeros((3, height, width))
startY = (height - im.shape[1]) / 2
startX = (width - im.shape[2]) / 2
endY, endX = startY + im.shape[1], startX + im.shape[2]
padded_im[:, startY: endY, startX: endX] = im
padded_im[:, startY:endY, startX:endX] = im
else:
im = im.astype('float32')
height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1])
height, width = max(inner_size, im.shape[0]), max(inner_size,
im.shape[1])
padded_im = np.zeros((height, width))
startY = (height - im.shape[0]) / 2
startX = (width - im.shape[1]) / 2
endY, endX = startY + im.shape[0], startX + im.shape[1]
padded_im[startY: endY, startX: endX] = im
padded_im[startY:endY, startX:endX] = im
if test:
startY = (height - inner_size) / 2
startX = (width - inner_size) / 2
......@@ -72,19 +78,21 @@ def crop_img(im, inner_size, color=True, test=True):
startX = np.random.randint(0, width - inner_size + 1)
endY, endX = startY + inner_size, startX + inner_size
if color:
pic = padded_im[:, startY: endY, startX: endX]
pic = padded_im[:, startY:endY, startX:endX]
else:
pic = padded_im[startY: endY, startX: endX]
pic = padded_im[startY:endY, startX:endX]
if (not test) and (np.random.randint(2) == 0):
pic = flip(pic)
return pic
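A small sketch of the center-crop path of crop_img, again assuming the image_util import; color images are expected in CHW layout, matching the shape handling above:

    import numpy as np
    from image_util import crop_img

    im = np.zeros((3, 40, 40))                     # hypothetical color image, CHW
    pic = crop_img(im, 32, color=True, test=True)  # test=True takes the center crop
    assert pic.shape == (3, 32, 32)                # no random flip in test mode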
def decode_jpeg(jpeg_string):
np_array = np.array(Image.open(StringIO(jpeg_string)))
if len(np_array.shape) == 3:
np_array = np.transpose(np_array, (2, 0, 1))
return np_array
def preprocess_img(im, img_mean, crop_size, is_train, color=True):
"""
Does data augmentation for images.
......@@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True):
pic -= img_mean
return pic.flatten()
def load_meta(meta_path, mean_img_size, crop_size, color=True):
"""
Return the loaded meta file.
......@@ -109,17 +118,18 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True):
mean = np.load(meta_path)['data_mean']
border = (mean_img_size - crop_size) / 2
if color:
assert(mean_img_size * mean_img_size * 3 == mean.shape[0])
assert (mean_img_size * mean_img_size * 3 == mean.shape[0])
mean = mean.reshape(3, mean_img_size, mean_img_size)
mean = mean[:, border: border + crop_size,
border: border + crop_size].astype('float32')
mean = mean[:, border:border + crop_size, border:border +
crop_size].astype('float32')
else:
assert(mean_img_size * mean_img_size == mean.shape[0])
assert (mean_img_size * mean_img_size == mean.shape[0])
mean = mean.reshape(mean_img_size, mean_img_size)
mean = mean[border: border + crop_size,
border: border + crop_size].astype('float32')
mean = mean[border:border + crop_size, border:border +
crop_size].astype('float32')
return mean
def load_image(img_path, is_color=True):
"""
Load image and return.
......@@ -130,6 +140,7 @@ def load_image(img_path, is_color=True):
img.load()
return img
def oversample(img, crop_dims):
"""
image : iterable of (H x W x K) ndarrays
......@@ -152,50 +163,53 @@ def oversample(img, crop_dims):
for j in w_indices:
crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1])
curr += 1
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([
-crop_dims / 2.0,
crop_dims / 2.0
])
crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate(
[-crop_dims / 2.0, crop_dims / 2.0])
crops_ix = np.tile(crops_ix, (2, 1))
# Extract crops
crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1],
im_shape[-1]), dtype=np.float32)
crops = np.empty(
(10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]),
dtype=np.float32)
ix = 0
for im in img:
for crop in crops_ix:
crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :]
ix += 1
crops[ix-5:ix] = crops[ix-5:ix, :, ::-1, :] # flip for mirrors
crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors
return crops
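oversample follows the usual 10-crop scheme (four corners plus center, each mirrored); a minimal sketch of the resulting shape with a hypothetical all-zero batch of one HWC image, mirroring how prediction.py calls it:

    import numpy as np
    from image_util import oversample

    imgs = np.zeros((1, 256, 256, 3), dtype=np.float32)  # one H x W x K image
    crops = oversample(imgs, (224, 224))
    assert crops.shape == (10, 224, 224, 3)              # 5 crops x 2 (originals + mirrors)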
class ImageTransformer:
def __init__(self, transpose = None,
channel_swap = None, mean = None, is_color = True):
def __init__(self,
transpose=None,
channel_swap=None,
mean=None,
is_color=True):
self.transpose = transpose
self.channel_swap = None
self.mean = None
self.is_color = is_color
self.is_color = is_color
def set_transpose(self, order):
def set_transpose(self, order):
if self.is_color:
assert 3 == len(order)
assert 3 == len(order)
self.transpose = order
def set_channel_swap(self, order):
def set_channel_swap(self, order):
if self.is_color:
assert 3 == len(order)
assert 3 == len(order)
self.channel_swap = order
def set_mean(self, mean):
# mean value, may be one value per channel
if mean.ndim == 1:
mean = mean[:, np.newaxis, np.newaxis]
else:
mean = mean[:, np.newaxis, np.newaxis]
else:
# elementwise mean
if self.is_color:
assert len(mean.shape) == 3
self.mean = mean
self.mean = mean
def transformer(self, data):
if self.transpose is not None:
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os,sys
import os, sys
import numpy as np
import logging
from PIL import Image
......@@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
class ImageClassifier():
def __init__(self,
train_conf,
......@@ -58,18 +60,19 @@ class ImageClassifier():
self.oversample = oversample
self.is_color = is_color
self.transformer = image_util.ImageTransformer(is_color = is_color)
self.transformer.set_transpose((2,0,1))
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.mean_file = mean_file
mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
self.transformer.set_mean(mean) # mean pixel
gpu = 1 if use_gpu else 0
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu)
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (gpu))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
......@@ -90,14 +93,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3),
dtype=np.float32)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3),
dtype=np.float32)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
......@@ -133,22 +136,24 @@ class ImageClassifier():
lab = np.argsort(-prob)
logging.info("Label of %s is: %d", image, lab[0])
if __name__ == '__main__':
image_size=32
crop_size=32
multi_crop=True
config="vgg_16_cifar.py"
output_layer="__fc_layer_1__"
mean_path="data/cifar-out/batches/batches.meta"
model_path=sys.argv[1]
image=sys.argv[2]
use_gpu=bool(int(sys.argv[3]))
obj = ImageClassifier(train_conf=config,
model_dir=model_path,
resize_dim=image_size,
crop_dim=crop_size,
mean_file=mean_path,
use_gpu=use_gpu,
oversample=multi_crop)
image_size = 32
crop_size = 32
multi_crop = True
config = "vgg_16_cifar.py"
output_layer = "__fc_layer_1__"
mean_path = "data/cifar-out/batches/batches.meta"
model_path = sys.argv[1]
image = sys.argv[2]
use_gpu = bool(int(sys.argv[3]))
obj = ImageClassifier(
train_conf=config,
model_dir=model_path,
resize_dim=image_size,
crop_dim=crop_size,
mean_file=mean_path,
use_gpu=use_gpu,
oversample=multi_crop)
obj.predict(image, output_layer)
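The __main__ block above is equivalent to constructing the classifier directly; a hedged sketch with hypothetical model and image paths:

    classifier = ImageClassifier(
        train_conf="vgg_16_cifar.py",
        model_dir="output/pass-00299",  # hypothetical trained model directory
        resize_dim=32,
        crop_dim=32,
        mean_file="data/cifar-out/batches/batches.meta",
        use_gpu=False,
        oversample=True)
    classifier.predict("example.png", "__fc_layer_1__")  # image path is hypothetical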
......@@ -19,24 +19,36 @@ from optparse import OptionParser
def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]")
parser.add_option("-i", "--input", action="store",
dest="input", help="Input data directory.")
parser.add_option("-s", "--size", action="store",
dest="size", help="Processed image size.")
parser.add_option("-c", "--color", action="store",
dest="color", help="whether to use color images.")
parser.add_option(
"-i",
"--input",
action="store",
dest="input",
help="Input data directory.")
parser.add_option(
"-s",
"--size",
action="store",
dest="size",
help="Processed image size.")
parser.add_option(
"-c",
"--color",
action="store",
dest="color",
help="whether to use color images.")
return parser.parse_args()
if __name__ == '__main__':
options, args = option_parser()
data_dir = options.input
processed_image_size = int(options.size)
color = options.color == "1"
data_creator = ImageClassificationDatasetCreater(data_dir,
processed_image_size,
color)
data_creator.train_list_name = "train.txt"
data_creator.test_list_name = "test.txt"
data_creator.num_per_batch = 1000
data_creator.overwrite = True
data_creator.create_batches()
options, args = option_parser()
data_dir = options.input
processed_image_size = int(options.size)
color = options.color == "1"
data_creator = ImageClassificationDatasetCreater(
data_dir, processed_image_size, color)
data_creator.train_list_name = "train.txt"
data_creator.test_list_name = "test.txt"
data_creator.num_per_batch = 1000
data_creator.overwrite = True
data_creator.create_batches()
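Note that -s is read back as a string and -c is compared against the literal "1"; a minimal sketch of exercising option_parser with a hypothetical argv:

    import sys

    sys.argv = ["preprocess.py", "-i", "data/cifar-out", "-s", "32", "-c", "1"]
    options, args = option_parser()
    assert options.input == "data/cifar-out"
    assert int(options.size) == 32   # size arrives as the string "32"
    assert options.color == "1"      # color is treated as on only when exactly "1" is passed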
......@@ -18,36 +18,38 @@ is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir='data/cifar-out/batches/'
meta_path=data_dir+'batches.meta'
args = {'meta':meta_path,'mean_img_size': 32,
'img_size': 32,'num_classes': 10,
'use_jpeg': 1,'color': "color"}
define_py_data_sources2(train_list="train.list",
test_list="train.list",
module='image_provider',
obj='processData',
args=args)
data_dir = 'data/cifar-out/batches/'
meta_path = data_dir + 'batches.meta'
args = {
'meta': meta_path,
'mean_img_size': 32,
'img_size': 32,
'num_classes': 10,
'use_jpeg': 1,
'color': "color"
}
define_py_data_sources2(
train_list="train.list",
test_list="train.list",
module='image_provider',
obj='processData',
args=args)
######################Algorithm Configuration #############
settings(
batch_size = 128,
learning_rate = 0.1 / 128.0,
learning_method = MomentumOptimizer(0.9),
regularization = L2Regularization(0.0005 * 128)
)
batch_size=128,
learning_rate=0.1 / 128.0,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128))
#######################Network Configuration #############
data_size=3*32*32
label_size=10
img = data_layer(name='image',
size=data_size)
data_size = 3 * 32 * 32
label_size = 10
img = data_layer(name='image', size=data_size)
# small_vgg is predefined in trainer_config_helpers.networks
predict = small_vgg(input_image=img,
num_channels=3,
num_classes=label_size)
predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
......
......@@ -15,10 +15,10 @@
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
yield [x], [2 * x + 0.3]
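Restated outside the provider decorator: each sample pairs a one-element feature list [x] with the noiseless target [2*x + 0.3], so a well-trained model should recover w close to 2.0 and b close to 0.3; a tiny standalone sketch:

    import random

    random.seed(0)                        # hypothetical seed, only for a reproducible peek
    x = random.random()
    feature, label = [x], [2 * x + 0.3]   # exactly what process() yields per sample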
......@@ -23,14 +23,17 @@ Usage:
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
def main():
print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
load('output/pass-00029/b'))
load('output/pass-00029/b'))
if __name__ == '__main__':
main()
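A short sketch of what load() returns here, assuming training has produced the pass-00029 parameter files referenced above (each file is a 16-byte header followed by float32 values):

    w = load('output/pass-00029/w')   # one-element float32 array, expected near 2.0
    b = load('output/pass-00029/b')   # one-element float32 array, expected near 0.3
    print 'recovered w=%.6f, b=%.6f' % (w, b)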
......@@ -16,9 +16,14 @@ from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
with open(data_file, 'w') as f:
f.writelines(' ')
define_py_data_sources2(
train_list=data_file,
test_list=None,
module='dataprovider',
obj='process',
args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
......@@ -26,7 +31,11 @@ settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
y_predict = fc_layer(
input=x,
param_attr=ParamAttr(name='w'),
size=1,
act=LinearActivation(),
bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
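For reference, with size=1, LinearActivation and the named w/b parameters, the fc_layer above computes y_predict = w*x + b, which regression_cost fits to the provider's y = 2x + 0.3 targets; a plain-Python restatement:

    w, b = 2.0, 0.3          # the values the trainer should converge to
    x = 0.5                  # hypothetical input
    y_predict = w * x + b    # = 1.3, matching the data provider's 2*x + 0.3 target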
......@@ -13,9 +13,9 @@
# limitations under the License.
o = open("./" + "train.list", "w")
o.write("./data/raw_data/train" +"\n")
o.write("./data/raw_data/train" + "\n")
o.close()
o = open("./" + "test.list", "w")
o.write("./data/raw_data/t10k" +"\n")
o.close()
\ No newline at end of file
o.write("./data/raw_data/t10k" + "\n")
o.close()
......@@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types={
'pixel': dense_vector(28 * 28),
'label': integer_value(10)
})
@provider(
input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10)})
def process(settings, filename): # settings is not used currently.
imgf = filename + "-images-idx3-ubyte"
labelf = filename + "-labels-idx1-ubyte"
......
......@@ -18,32 +18,29 @@ is_predict = get_config_arg("is_predict", bool, False)
####################Data Configuration ##################
if not is_predict:
data_dir='./data/'
define_py_data_sources2(train_list= data_dir + 'train.list',
test_list= data_dir + 'test.list',
module='mnist_provider',
obj='process')
data_dir = './data/'
define_py_data_sources2(
train_list=data_dir + 'train.list',
test_list=data_dir + 'test.list',
module='mnist_provider',
obj='process')
######################Algorithm Configuration #############
settings(
batch_size = 128,
learning_rate = 0.1 / 128.0,
learning_method = MomentumOptimizer(0.9),
regularization = L2Regularization(0.0005 * 128)
)
batch_size=128,
learning_rate=0.1 / 128.0,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128))
#######################Network Configuration #############
data_size=1*28*28
label_size=10
data_size = 1 * 28 * 28
label_size = 10
img = data_layer(name='pixel', size=data_size)
# small_vgg is predefined in trainer_config_helpers.networks
predict = small_vgg(input_image=img,
num_channels=1,
num_classes=label_size)
predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
if not is_predict:
lbl = data_layer(name="label", size=label_size)
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python extract_para.py --preModel PREMODEL --preDict PREDICT \
......@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser
import struct
def get_row_index(preDict, usrDict):
"""
Get the row positions for all words in user dictionary from pre-trained dictionary.
......@@ -47,7 +47,9 @@ def get_row_index(preDict, usrDict):
pos.append(index[word])
return pos
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim):
def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
paraDim):
"""
Extract desired parameters from a pretrained embedding model based on user dictionary
"""
......@@ -70,6 +72,7 @@ def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim)
print "extract parameters finish, total", len(rowIndex), "lines"
fi.close()
def main():
"""
Main entry for running paraconvert.py
......@@ -78,19 +81,33 @@ def main():
"python %prog --preModel PREMODEL --preDict PREDICT" \
" --usrModel USRMODEL --usrDict USRDICT -d DIM"
parser = OptionParser(usage)
parser.add_option("--preModel", action="store", dest="preModel",
help="the name of pretrained embedding model")
parser.add_option("--preDict", action="store", dest="preDict",
help="the name of pretrained dictionary")
parser.add_option("--usrModel", action="store", dest="usrModel",
help="the name of output usr embedding model")
parser.add_option("--usrDict", action="store", dest="usrDict",
help="the name of user specified dictionary")
parser.add_option("-d", action="store", dest="dim",
help="dimension of parameter")
parser.add_option(
"--preModel",
action="store",
dest="preModel",
help="the name of pretrained embedding model")
parser.add_option(
"--preDict",
action="store",
dest="preDict",
help="the name of pretrained dictionary")
parser.add_option(
"--usrModel",
action="store",
dest="usrModel",
help="the name of output usr embedding model")
parser.add_option(
"--usrDict",
action="store",
dest="usrDict",
help="the name of user specified dictionary")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict, int(options.dim))
extract_parameters_by_usrDict(options.preModel, options.preDict,
options.usrModel, options.usrDict,
int(options.dim))
if __name__ == '__main__':
main()
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
......@@ -29,6 +28,7 @@ Options:
from optparse import OptionParser
import struct
def binary2text(input, output, paraDim):
"""
Convert a binary parameter file of an embedding model into a text file.
......@@ -76,12 +76,13 @@ def binary2text(input, output, paraDim):
fo.close()
print "binary2text finish, total", line, "lines"
def get_para_count(input):
"""
Compute the total number of embedding parameters in input text file.
input: the name of input text file
"""
numRows = 1
numRows = 1
paraDim = 0
with open(input) as f:
line = f.readline()
......@@ -90,6 +91,7 @@ def get_para_count(input):
numRows += 1
return numRows * paraDim
def text2binary(input, output, paddle_head=True):
"""
Convert a text parameter file of an embedding model into a binary file.
......@@ -123,6 +125,7 @@ def text2binary(input, output, paddle_head=True):
fo.close()
print "text2binary finish, total", count, "lines"
def main():
"""
Main entry for running paraconvert.py
......@@ -131,21 +134,26 @@ def main():
"python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
"python %prog --t2b -i INPUT -o OUTPUT"
parser = OptionParser(usage)
parser.add_option("--b2t", action="store_true",
help="convert parameter file of embedding model from binary to text")
parser.add_option("--t2b", action="store_true",
help="convert parameter file of embedding model from text to binary")
parser.add_option("-i", action="store", dest="input",
help="input parameter file name")
parser.add_option("-o", action="store", dest="output",
help="output parameter file name")
parser.add_option("-d", action="store", dest="dim",
help="dimension of parameter")
parser.add_option(
"--b2t",
action="store_true",
help="convert parameter file of embedding model from binary to text")
parser.add_option(
"--t2b",
action="store_true",
help="convert parameter file of embedding model from text to binary")
parser.add_option(
"-i", action="store", dest="input", help="input parameter file name")
parser.add_option(
"-o", action="store", dest="output", help="output parameter file name")
parser.add_option(
"-d", action="store", dest="dim", help="dimension of parameter")
(options, args) = parser.parse_args()
if options.b2t:
binary2text(options.input, options.output, options.dim)
if options.t2b:
text2binary(options.input, options.output)
if __name__ == '__main__':
main()
......@@ -26,16 +26,22 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
class ImageClassifier():
def __init__(self, train_conf, model_dir=None,
resize_dim=256, crop_dim=224,
def __init__(self,
train_conf,
model_dir=None,
resize_dim=256,
crop_dim=224,
use_gpu=True,
mean_file=None,
output_layer=None,
oversample=False, is_color=True):
oversample=False,
is_color=True):
"""
train_conf: network config file.
model_dir: string, directory of model.
......@@ -62,24 +68,25 @@ class ImageClassifier():
assert isinstance(self.output_layer, basestring)
self.output_layer = self.output_layer.split(",")
self.transformer = image_util.ImageTransformer(is_color = is_color)
self.transformer.set_transpose((2,0,1))
self.transformer.set_channel_swap((2,1,0))
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.transformer.set_channel_swap((2, 1, 0))
self.mean_file = mean_file
if self.mean_file is not None:
mean = np.load(self.mean_file)['data_mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
self.transformer.set_mean(mean) # mean pixel
else:
# if you use three mean values, set them like:
# these three mean values are calculated from ImageNet.
self.transformer.set_mean(np.array([103.939,116.779,123.68]))
self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
......@@ -105,14 +112,14 @@ class ImageClassifier():
# image_util.resize_image: short side is self.resize_dim
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros((1, image.shape[0], image.shape[1], 3),
dtype=np.float32)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3),
dtype=np.float32)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
......@@ -172,7 +179,7 @@ class ImageClassifier():
logging.info("Label of %s is: %d", image, lab[0])
return results
def extract(self, data_file, output_dir, batch_size = 10000):
def extract(self, data_file, output_dir, batch_size=10000):
"""
extract and save features of output layers, which are
specified in Outputs() in the network config.
......@@ -197,7 +204,7 @@ class ImageClassifier():
image_feature[file_name] = feature
sample_num += 1
if sample_num == batch_size:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num))
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
batch_num += 1
......@@ -206,7 +213,7 @@ class ImageClassifier():
if idx % 1000 == 0:
logging.info('%d/%d, %s', idx, len(image_files), file_name)
if sample_num > 0:
batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num))
batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
self.save_file(image_feature, batch_name)
logging.info('Finish batch %d', batch_num)
logging.info('Done: make image feature batch')
......@@ -215,38 +222,64 @@ class ImageClassifier():
of = open(file, 'wb')
cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
def option_parser():
"""
Main entry for predicting
"""
usage = "%prog -c config -i data_list -w model_dir [options]"
parser = OptionParser(usage="usage: %s" % usage)
parser.add_option("-j", "--job",
action="store", dest="job_type",
help="job type: predict, extract\
parser.add_option(
"-j",
"--job",
action="store",
dest="job_type",
help="job type: predict, extract\
predict: predicting,\
extract: extract features")
parser.add_option("-c", "--conf",
action="store", dest="train_conf",
help="network config")
parser.add_option("-i", "--data",
action="store", dest="data_file",
help="image list")
parser.add_option("-w", "--model",
action="store", dest="model_path",
default=None, help="model path")
parser.add_option("-g", "--use_gpu", action="store",
dest="use_gpu", default=True,
help="Whether to use gpu mode.")
parser.add_option("-o", "--output_dir",
action="store", dest="output_dir",
default="output", help="output path")
parser.add_option("-m", "--mean", action="store",
dest="mean", default=None,
help="mean file.")
parser.add_option("-p", "--multi_crop", action="store_true",
dest="multi_crop", default=False,
help="Wether to use multiple crops on image.")
parser.add_option(
"-c",
"--conf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-i", "--data", action="store", dest="data_file", help="image list")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
parser.add_option(
"-g",
"--use_gpu",
action="store",
dest="use_gpu",
default=True,
help="Whether to use gpu mode.")
parser.add_option(
"-o",
"--output_dir",
action="store",
dest="output_dir",
default="output",
help="output path")
parser.add_option(
"-m",
"--mean",
action="store",
dest="mean",
default=None,
help="mean file.")
parser.add_option(
"-p",
"--multi_crop",
action="store_true",
dest="multi_crop",
default=False,
help="Wether to use multiple crops on image.")
parser.add_option("-l", "--output_layer", action="store",
dest="output_layer", default=None,
help="--job=extract, specify layers to extract "\
......@@ -254,24 +287,26 @@ def option_parser():
"classification probability, output in resnet.py.")
return parser.parse_args()
def main():
"""
1. parse input arguments.
2. predict or extract features according to the job type.
"""
options, args = option_parser()
obj = ImageClassifier(options.train_conf,
options.model_path,
use_gpu=options.use_gpu,
mean_file=options.mean,
output_layer=options.output_layer,
oversample=options.multi_crop)
obj = ImageClassifier(
options.train_conf,
options.model_path,
use_gpu=options.use_gpu,
mean_file=options.mean,
output_layer=options.output_layer,
oversample=options.multi_crop)
if options.job_type == "predict":
obj.predict(options.data_file)
elif options.job_type == "extract":
obj.extract(options.data_file,
options.output_dir)
obj.extract(options.data_file, options.output_dir)
if __name__ == '__main__':
main()
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -16,8 +16,7 @@ from paddle.utils.image_util import *
from paddle.trainer.PyDataProvider2 import *
def hook(settings, image_size, crop_size, color, file_list,
is_train, **kwargs):
def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
"""
Description: Init with a list of data files
file_list is the name list of input files.
......@@ -58,7 +57,7 @@ def hook(settings, image_size, crop_size, color, file_list,
sz = settings.crop_size * settings.crop_size
settings.img_mean = np.zeros(sz * 3, dtype=np.single)
for idx, value in enumerate(settings.mean_value):
settings.img_mean[idx * sz: (idx + 1) * sz] = value
settings.img_mean[idx * sz:(idx + 1) * sz] = value
settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
settings.crop_size)
......@@ -69,7 +68,8 @@ def hook(settings, image_size, crop_size, color, file_list,
settings.input_types = [
dense_vector(settings.img_input_size), # image feature
integer_value(1)] # labels
integer_value(1)
] # labels
settings.logger.info('Image short side: %s', settings.img_size)
settings.logger.info('Crop size: %s', settings.crop_size)
......@@ -97,9 +97,6 @@ def processData(settings, file_list):
# swap channel
if settings.is_swap_channel:
img = img[settings.swap_channel, :, :]
img_feat = preprocess_img(img,
settings.img_mean,
settings.crop_size,
settings.is_train,
settings.color)
img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
settings.is_train, settings.color)
yield img_feat.tolist(), int(lab.strip())
......@@ -17,9 +17,11 @@ import sys
import cPickle
import logging
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
def load_feature_c(file):
"""
Load feature extracted by C++ interface.
......@@ -30,14 +32,15 @@ def load_feature_c(file):
f = open(file, 'r')
for line in f:
sample = []
for slot in line.strip().split(";"):
fea = [float(val) for val in slot.strip().split()]
for slot in line.strip().split(";"):
fea = [float(val) for val in slot.strip().split()]
if fea:
sample.append(fea)
features.append(sample)
f.close()
return features
def load_feature_py(feature_dir):
"""
Load feature extracted by python interface.
......@@ -54,6 +57,7 @@ def load_feature_py(feature_dir):
logging.info('Load feature file %s', file_name)
return features
if __name__ == '__main__':
print load_feature_py(sys.argv[1])
print load_feature_py(sys.argv[1])
#print load_feature_c(sys.argv[1])
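A hedged usage sketch of the two loaders (paths are hypothetical): load_feature_py reads the cPickle batch_* files written by prediction.py's extract mode, while load_feature_c parses the text dump of the C++ interface, with slots separated by ';' and values by spaces:

    features = load_feature_py("output")          # directory containing batch_0, batch_1, ...
    # features = load_feature_c("features.txt")   # alternative: C++-style text dump
    print len(features)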
......@@ -13,7 +13,6 @@
# limitations under the License.
from paddle.trainer_config_helpers import *
"""
paper: https://arxiv.org/abs/1512.03385
"""
......@@ -28,15 +27,19 @@ if not is_predict and data_provider:
# mean.meta size : 3 x 224 x 224.
# If you use three mean values, set like:
# "mean_value:103.939,116.779,123.68;"
args={
args = {
'mean_meta': "model/mean_meta_224/mean.meta",
'image_size': 224, 'crop_size': 224,
'color': True,'swap_channel:': [2, 1, 0]}
define_py_data_sources2(train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
'image_size': 224,
'crop_size': 224,
'color': True,
'swap_channel:': [2, 1, 0]
}
define_py_data_sources2(
train_list,
'example/test.list',
module="example.image_list_provider",
obj="processData",
args=args)
batch_size = 1
learning_rate = 0.1 / batch_size
......@@ -54,12 +57,16 @@ Settings(
learning_method='momentum',
learning_rate_decay_a=0.5,
learning_rate_decay_b=1200000 * 10,
learning_rate_schedule="discexp",
)
learning_rate_schedule="discexp", )
def conv_bn_layer(name, input, filter_size, num_filters,
stride, padding, channels=None,
def conv_bn_layer(name,
input,
filter_size,
num_filters,
stride,
padding,
channels=None,
active_type=ReluActivation()):
"""
A wrapper for conv layer with batch normalization layers.
......@@ -67,19 +74,18 @@ def conv_bn_layer(name, input, filter_size, num_filters,
conv layer has no activation.
"""
tmp = img_conv_layer(name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(name=name + "_bn",
input=tmp,
act=active_type,
use_global_stats=is_test)
tmp = img_conv_layer(
name=name + "_conv",
input=input,
filter_size=filter_size,
num_channels=channels,
num_filters=num_filters,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(
name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
def bottleneck_block(name, input, num_filters1, num_filters2):
......@@ -88,29 +94,31 @@ def bottleneck_block(name, input, num_filters1, num_filters2):
Last conv_bn_layer has no activation.
Addto layer has activation of relu.
"""
last_name = conv_bn_layer(name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(name=name + "_addto",
input=[input, last_name],
act=ReluActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=1,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[input, last_name], act=ReluActivation())
def mid_projection(name, input, num_filters1, num_filters2, stride=2):
......@@ -123,38 +131,41 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2):
branch2x: bottleneck building block, shortcuts are identity.
"""
# stride = 2
branch1 = conv_bn_layer(name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(name=name + "_addto",
input=[branch1, last_name],
act=ReluActivation())
branch1 = conv_bn_layer(
name=name + '_branch1',
input=input,
filter_size=1,
num_filters=num_filters2,
stride=stride,
padding=0,
active_type=LinearActivation())
last_name = conv_bn_layer(
name=name + '_branch2a',
input=input,
filter_size=1,
num_filters=num_filters1,
stride=stride,
padding=0)
last_name = conv_bn_layer(
name=name + '_branch2b',
input=last_name,
filter_size=3,
num_filters=num_filters1,
stride=1,
padding=1)
last_name = conv_bn_layer(
name=name + '_branch2c',
input=last_name,
filter_size=1,
num_filters=num_filters2,
stride=1,
padding=0,
active_type=LinearActivation())
return addto_layer(
name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
......@@ -168,67 +179,67 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
# For ImageNet
# conv1: 112x112
img = data_layer(name='input', size=224 * 224 * 3)
tmp = conv_bn_layer("conv1", img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = conv_bn_layer(
"conv1",
img,
filter_size=7,
channels=3,
num_filters=64,
stride=2,
padding=3)
tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
# conv2_x: 56x56
tmp = mid_projection(name="res2_1",
input=tmp,
num_filters1=64,
num_filters2=256,
stride=1)
tmp = mid_projection(
name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
for i in xrange(2, res2_num + 1, 1):
tmp = bottleneck_block(name="res2_" + str(i),
input=tmp,
num_filters1=64,
num_filters2=256)
tmp = bottleneck_block(
name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
# conv3_x: 28x28
tmp = mid_projection(name="res3_1",
input=tmp,
num_filters1=128,
num_filters2=512)
tmp = mid_projection(
name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
for i in xrange(2, res3_num + 1, 1):
tmp = bottleneck_block(name="res3_" + str(i),
input=tmp, num_filters1=128,
num_filters2=512)
tmp = bottleneck_block(
name="res3_" + str(i),
input=tmp,
num_filters1=128,
num_filters2=512)
# conv4_x: 14x14
tmp = mid_projection(name="res4_1", input=tmp,
num_filters1=256, num_filters2=1024)
tmp = mid_projection(
name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
for i in xrange(2, res4_num + 1, 1):
tmp = bottleneck_block(name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
tmp = bottleneck_block(
name="res4_" + str(i),
input=tmp,
num_filters1=256,
num_filters2=1024)
# conv5_x: 7x7
tmp = mid_projection(name="res5_1", input=tmp,
num_filters1=512, num_filters2=2048)
tmp = mid_projection(
name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
for i in xrange(2, res5_num + 1, 1):
tmp = bottleneck_block(name="res5_" + str(i),
input=tmp, num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
output = fc_layer(name='output',
input=tmp,
size=1000,
act=SoftmaxActivation())
tmp = bottleneck_block(
name="res5_" + str(i),
input=tmp,
num_filters1=512,
num_filters2=2048)
tmp = img_pool_layer(
name='avgpool',
input=tmp,
pool_size=7,
stride=1,
pool_type=AvgPooling())
output = fc_layer(
name='output', input=tmp, size=1000, act=SoftmaxActivation())
if not is_predict:
classification_cost(input=output, label=data_layer(name='label',
size=1))
classification_cost(
input=output, label=data_layer(
name='label', size=1))
def res_net_50():
......
......@@ -22,27 +22,32 @@ from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--train_data",
type=str, required=False, help="train data file")
parser.add_argument(
"--train_data", type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument("--config",
type=str, required=True, help="config file name")
parser.add_argument(
"--config", type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument("--seq",
default=1, type=int,
help="whether use sequence training")
parser.add_argument("--use_gpu", default=0, type=int,
help="whether use GPU for training")
parser.add_argument("--trainer_count", default=1, type=int,
help="Number of threads for training")
parser.add_argument("--num_passes", default=5, type=int,
help="Number of training passes")
parser.add_argument(
"--seq", default=1, type=int, help="whether use sequence training")
parser.add_argument(
"--use_gpu", default=0, type=int, help="whether use GPU for training")
parser.add_argument(
"--trainer_count",
default=1,
type=int,
help="Number of threads for training")
parser.add_argument(
"--num_passes", default=5, type=int, help="Number of training passes")
return parser.parse_args()
UNK_IDX = 0
def load_data(file_name, word_dict):
with open(file_name, 'r') as f:
for line in f:
......@@ -51,6 +56,7 @@ def load_data(file_name, word_dict):
word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
......@@ -59,6 +65,7 @@ def load_dict(dict_file):
word_dict[w] = i
return word_dict
def main():
options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu,
......@@ -86,9 +93,9 @@ def main():
# create a data converter which converts data to PaddlePaddle
# internal format
input_types = [
integer_value_sequence(len(word_dict)) if options.seq
else sparse_binary_vector(len(word_dict)),
integer_value(2)]
integer_value_sequence(len(word_dict)) if options.seq else
sparse_binary_vector(len(word_dict)), integer_value(2)
]
converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size
......@@ -102,7 +109,7 @@ def main():
trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass()
if test_dataset:
trainer.startTestPeriod();
trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos)
......@@ -110,5 +117,6 @@ def main():
trainer.finishTestPeriod()
trainer.finishTrain()
if __name__ == '__main__':
main()
......@@ -17,6 +17,7 @@ from paddle.trainer.PyDataProvider2 import *
# id of the word not in dictionary
UNK_IDX = 0
# initializer is called by the framework during initialization.
# It allows the user to describe the data types and setup the
# necessary data structure for later use.
......@@ -38,7 +39,9 @@ def initializer(settings, dictionary, **kwargs):
# The second input is an integer. It represents the category id of the
# sample. 2 means there are two labels in the dataset.
# (1 for positive and 0 for negative)
integer_value(2)]
integer_value(2)
]
# Declaring a data provider. It has an initializer 'data_initialzer'.
# It will cache the generated data of the first pass in memory, so that
......@@ -69,9 +72,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
sparse_binary_vector(len(dictionary))
]
settings.input_types = [sparse_binary_vector(len(dictionary))]
# Declaring a data provider for prediction. The difference with process
# is that label is not generated.
......
......@@ -24,7 +24,8 @@ def initializer(settings, dictionary, **kwargs):
# The values of the integers range from 0 to len(dictionary)-1
integer_value_sequence(len(dictionary)),
# Define the second input for label id
integer_value(2)]
integer_value(2)
]
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
......@@ -40,7 +41,8 @@ def process(settings, file_name):
def predict_initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE)
integer_value(
len(dictionary), seq_type=SequenceType.SEQUENCE)
]
......
......@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. (remove HTML before or not) tokenizing
2. pos sample : rating score 5; neg sample: rating score 1-2.
......@@ -35,7 +34,8 @@ import multiprocessing
batch_size = 5000
word_count = {}
num_tokenize = max(1, multiprocessing.cpu_count() - 2) # parse + tokenize + save
num_tokenize = max(1,
multiprocessing.cpu_count() - 2) # parse + tokenize + save
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
......
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,19 +40,17 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
bi_lstm = bidirectional_lstm(input=emb, size=128)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
output = fc_layer(
input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
......
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,8 +40,7 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
......
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,10 +40,9 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
......@@ -52,17 +52,18 @@ lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
input_layers = [hidden_0, lstm_0]
for i in range(1,8):
for i in range(1, 8):
fc = fc_layer(input=input_layers, size=128)
lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1,)
lstm = lstmemory(
input=fc,
layer_attr=ExtraAttr(drop_rate=0.1),
reverse=(i % 2) == 1, )
input_layers = [fc, lstm]
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
act=SoftmaxActivation())
output = fc_layer(
input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
......
......@@ -27,18 +27,16 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
batch_size=batch_size,
learning_rate=2e-3,
learning_method=AdamOptimizer()
)
batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
data = data_layer(name="word", size=len(word_dict))
embedding = embedding_layer(input=data, size=128)
......
......@@ -32,11 +32,12 @@ process = 'process' if not is_predict else 'process_predict'
# We need to use a different process function for training and prediction.
# For training, the input data includes both word IDs and labels.
# For prediction, the input data only includes word IDs.
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_bow",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_bow",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -44,8 +45,7 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
# Define the data for text features. The size of the data layer is the number
# of words in the dictionary.
......
......@@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
tst = 'data/test.list' if not is_predict else 'data/pred.list'
process = 'process' if not is_predict else 'process_predict'
define_py_data_sources2(train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
define_py_data_sources2(
train_list=trn,
test_list=tst,
module="dataprovider_emb",
obj=process,
args={"dictionary": word_dict})
batch_size = 128 if not is_predict else 1
settings(
......@@ -39,17 +40,14 @@ settings(
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
gradient_clipping_threshold=25)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(input=emb, size=128,
lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm = simple_lstm(
input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2,
act=SoftmaxActivation())
output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
outputs([maxid, output])
......
......@@ -21,8 +21,9 @@ def meta_to_header(meta, name):
yield integer_value(each_meta['max'])
elif each_meta['type'] == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
yield integer_value(len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE if is_seq
else SequenceType.NO_SEQUENCE)
yield integer_value(
len(each_meta['dict']),
seq_type=SequenceType.SEQUENCE
if is_seq else SequenceType.NO_SEQUENCE)
elif each_meta['type'] == 'one_hot_dense':
yield dense_vector(len(each_meta['dict']))
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
config_generator.py
......@@ -29,10 +28,7 @@ import json
import docopt
import copy
DEFAULT_FILE = {
"type": "split",
"delimiter": ","
}
DEFAULT_FILE = {"type": "split", "delimiter": ","}
DEFAULT_FIELD = {
"id": {
......@@ -107,19 +103,16 @@ def main(filename, fmt):
field = copy.deepcopy(DEFAULT_FIELD[field_key])
field['pos'] = pos
fields.append(field)
obj[k] = {
"file": file_dict,
"fields": fields
}
meta = {
"meta": obj
}
obj[k] = {"file": file_dict, "fields": fields}
meta = {"meta": obj}
# print meta
if fmt == 'json':
def formatter(x):
import json
return json.dumps(x, indent=2)
elif fmt == 'yaml':
def formatter(x):
import yaml
return yaml.safe_dump(x, default_flow_style=False)
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Movielens dataset, to get movie/user object.
......@@ -66,8 +65,8 @@ class SortedIDGenerator(object):
self.__key_set__.add(key)
def finish_scan(self, compare=None, key=None, reverse=False):
self.__key_set__ = sorted(list(self.__key_set__), cmp=compare,
key=key, reverse=reverse)
self.__key_set__ = sorted(
list(self.__key_set__), cmp=compare, key=key, reverse=reverse)
self.dict = dict()
for idx, each_key in enumerate(self.__key_set__):
self.dict[each_key] = idx
......@@ -207,11 +206,10 @@ class EmbeddingFieldParser(object):
self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict(
self.seq_type == EmbeddingFieldParser.SEQUENCE)
elif config['dict']['type'] == 'split':
self.dict = SplitEmbeddingDict(
config['dict'].get('delimiter', ','))
self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ','))
elif config['dict']['type'] == 'whole_content':
self.dict = EmbeddingFieldParser.WholeContentDict(
config['dict']['sort'])
self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][
'sort'])
else:
print config
assert False
......@@ -333,8 +331,8 @@ class ContentExtractorFactory(object):
return PositionContentExtractor(config['pos'])
else:
extra_args = config['regex']
return RegexPositionContentExtractor(pos=config['pos'],
**extra_args)
return RegexPositionContentExtractor(
pos=config['pos'], **extra_args)
class MetaFile(object):
......@@ -364,9 +362,10 @@ class MetaFile(object):
metas = map(lambda x: x.meta_field(), field_parsers)
# print metas
key_index = filter(lambda x: x is not None, map(
lambda (idx, meta): idx if 'is_key' in meta and meta['is_key']
else None, enumerate(metas)))[0]
key_index = filter(
lambda x: x is not None,
map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None,
enumerate(metas)))[0]
key_map = []
for i in range(min(key_index, len(metas))):
......@@ -374,12 +373,7 @@ class MetaFile(object):
for i in range(key_index + 1, len(metas)):
key_map.append(i)
obj = {
'__meta__': {
'raw_meta': metas,
'feature_map': key_map
}
}
obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}}
for each_block in reader.read():
idx = field_parsers[key_index].parse(each_block)
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Separate movielens 1m dataset to train/test file.
......
......@@ -15,6 +15,7 @@
from paddle.trainer.PyDataProvider2 import *
import common_utils # parse
def hook(settings, meta, **kwargs):
"""
Init hook is invoked before process data. It will set obj.slots and store
......@@ -41,6 +42,7 @@ def hook(settings, meta, **kwargs):
settings.input_types = headers
settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
with open(filename, 'r') as f:
......
......@@ -28,7 +28,8 @@ if __name__ == '__main__':
model_path = sys.argv[1]
swig_paddle.initPaddle('--use_gpu=0')
conf = parse_config("trainer_config.py", "is_predict=1")
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine)
network.loadParameters(model_path)
with open('./data/meta.bin', 'rb') as f:
......@@ -39,11 +40,12 @@ if __name__ == '__main__':
while True:
movie_id = int(raw_input("Input movie_id: "))
user_id = int(raw_input("Input user_id: "))
movie_meta = meta['movie'][movie_id] # Query Data From Meta.
movie_meta = meta['movie'][movie_id] # Query Data From Meta.
user_meta = meta['user'][user_id]
data = [movie_id - 1]
data.extend(movie_meta)
data.append(user_id - 1)
data.extend(user_meta)
print "Prediction Score is %.2f" % ((network.forwardTest(
cvt.convert([data]))[0]['value'][0][0] + 5) / 2)
print "Prediction Score is %.2f" % (
(network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5)
/ 2)
......@@ -27,8 +27,8 @@ with open(META_FILE, 'rb') as f:
# load meta file
meta = pickle.load(f)
settings(batch_size=1600, learning_rate=1e-3,
learning_method=RMSPropOptimizer())
settings(
batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer())
def construct_feature(name):
......@@ -59,11 +59,10 @@ def construct_feature(name):
slot_name = each_meta.get('name', '%s_id' % name)
if type_name == 'id':
slot_dim = each_meta['max']
embedding = embedding_layer(input=data_layer(slot_name,
size=slot_dim),
size=256)
fusion.append(fc_layer(input=embedding,
size=256))
embedding = embedding_layer(
input=data_layer(
slot_name, size=slot_dim), size=256)
fusion.append(fc_layer(input=embedding, size=256))
elif type_name == 'embedding':
is_seq = each_meta['seq'] == 'sequence'
slot_dim = len(each_meta['dict'])
......@@ -71,17 +70,14 @@ def construct_feature(name):
embedding = embedding_layer(input=din, size=256)
if is_seq:
fusion.append(
text_conv_pool(input=embedding, context_len=5,
hidden_size=256))
text_conv_pool(
input=embedding, context_len=5, hidden_size=256))
else:
fusion.append(fc_layer(input=embedding,
size=256))
fusion.append(fc_layer(input=embedding, size=256))
elif type_name == 'one_hot_dense':
slot_dim = len(each_meta['dict'])
hidden = fc_layer(input=data_layer(slot_name, slot_dim),
size=256)
fusion.append(fc_layer(input=hidden,
size=256))
hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256)
fusion.append(fc_layer(input=hidden, size=256))
return fc_layer(name="%s_fusion" % name, input=fusion, size=256)
......@@ -90,10 +86,16 @@ movie_feature = construct_feature("movie")
user_feature = construct_feature("user")
similarity = cos_sim(a=movie_feature, b=user_feature)
if not is_predict:
outputs(regression_cost(input=similarity,
label=data_layer('rating', size=1)))
define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider',
obj='process', args={'meta': meta})
outputs(
regression_cost(
input=similarity, label=data_layer(
'rating', size=1)))
define_py_data_sources2(
'data/train.list',
'data/test.list',
module='dataprovider',
obj='process',
args={'meta': meta})
else:
outputs(similarity)
......@@ -26,9 +26,9 @@ def hook(settings, word_dict, label_dict, **kwargs):
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(2),
integer_value_sequence(len(label_dict))]
integer_value_sequence(len(word_dict)), integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]
@provider(init_hook=hook)
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import sys
......@@ -42,7 +41,7 @@ if not is_predict:
label_dict[w] = i
if is_test:
train_list_file = None
train_list_file = None
#define data provider
define_py_data_sources2(
......
......@@ -41,22 +41,16 @@ class Prediction():
len_dict = len(self.dict)
len_label = len(self.labels)
conf = parse_config(
train_conf,
'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) +
',is_predict=True')
conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) + ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)
......@@ -110,8 +104,8 @@ class Prediction():
len_sen = len(sen.split())
line_labels = lab[index:index + len_sen]
index += len_sen
fout.write(sen + '\t' + ' '.join([self.labels_reverse[
i] for i in line_labels]) + '\n')
fout.write(sen + '\t' + ' '.join(
[self.labels_reverse[i] for i in line_labels]) + '\n')
def option_parser():
......
......@@ -17,8 +17,8 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
integer_value_sequence(len(settings.word_dict)),
integer_value(2)]
integer_value_sequence(len(settings.word_dict)), integer_value(2)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -29,6 +29,7 @@ def process(settings, file_name):
label, comment = line.strip().split('\t\t')
label = int(label)
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if w in
settings.word_dict]
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
yield word_slot, label
......@@ -18,14 +18,14 @@ from optparse import OptionParser
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config
"""
Usage: run following command to show help message.
python predict.py -h
"""
class SentimentPrediction():
def __init__(self, train_conf, dict_file, model_dir=None, label_file = None):
def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
......@@ -44,7 +44,8 @@ class SentimentPrediction():
self.load_label(label_file)
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(self.model_dir)
input_types = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
......@@ -61,7 +62,7 @@ class SentimentPrediction():
"""
Load label.
"""
self.label={}
self.label = {}
for v in open(label_file, 'r'):
self.label[int(v.split('\t')[1])] = v.split('\t')[0]
......@@ -72,7 +73,9 @@ class SentimentPrediction():
with open(data_file, 'r') as fdata:
for line in fdata:
words = line.strip().split()
word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
word_slot = [
self.word_dict[w] for w in words if w in self.word_dict
]
if not word_slot:
print "all words are not in dictionary: %s", line
continue
......@@ -89,25 +92,48 @@ class SentimentPrediction():
if self.label is None:
print("%s: predicting label is %d" % (data_file, lab[0][0]))
else:
print("%s: predicting label is %s" % (data_file, self.label[lab[0][0]]))
print("%s: predicting label is %s" %
(data_file, self.label[lab[0][0]]))
def option_parser():
usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option("-n", "--tconf", action="store",
dest="train_conf", help="network config")
parser.add_option("-d", "--dict", action="store",
dest="dict_file",help="dictionary file")
parser.add_option("-b", "--label", action="store",
dest="label", default=None,
help="dictionary file")
parser.add_option("-i", "--data", action="store",
dest="data", help="data file to predict")
parser.add_option("-w", "--model", action="store",
dest="model_path", default=None,
help="model path")
parser.add_option(
"-n",
"--tconf",
action="store",
dest="train_conf",
help="network config")
parser.add_option(
"-d",
"--dict",
action="store",
dest="dict_file",
help="dictionary file")
parser.add_option(
"-b",
"--label",
action="store",
dest="label",
default=None,
help="dictionary file")
parser.add_option(
"-i",
"--data",
action="store",
dest="data",
help="data file to predict")
parser.add_option(
"-w",
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
return parser.parse_args()
def main():
options, args = option_parser()
train_conf = options.train_conf
......@@ -119,5 +145,6 @@ def main():
predict = SentimentPrediction(train_conf, dict_file, model_path, label)
predict.predict(data)
if __name__ == '__main__':
main()
......@@ -22,13 +22,13 @@ from os.path import join as join_path
from optparse import OptionParser
from paddle.utils.preprocess_util import *
"""
Usage: run following command to show help message.
python preprocess.py -h
"""
def save_dict(dict, filename, is_reverse = True):
def save_dict(dict, filename, is_reverse=True):
"""
Save dictionary into file.
dict: input dictionary.
......@@ -39,9 +39,10 @@ def save_dict(dict, filename, is_reverse = True):
f = open(filename, 'w')
for k, v in sorted(dict.items(), key=operator.itemgetter(1),\
reverse=is_reverse):
f.write('%s\t%s\n'%(k, v))
f.write('%s\t%s\n' % (k, v))
f.close()
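As the surrounding code shows, entries are written one per line as key<TAB>count, most frequent first when is_reverse is True. A self-contained sketch of the resulting file layout (the counts and file name below are made up for illustration):

import operator

# Toy word counts written most-frequent first, mirroring save_dict above.
counts = {'the': 10, 'cat': 3, 'sat': 1}
with open('word_dict.txt', 'w') as f:
    for k, v in sorted(counts.items(), key=operator.itemgetter(1), reverse=True):
        f.write('%s\t%s\n' % (k, v))
# word_dict.txt now contains:
# the	10
# cat	3
# sat	1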
def tokenize(sentences):
"""
Use tokenizer.perl to tokenize input sentences.
......@@ -58,6 +59,7 @@ def tokenize(sentences):
toks = tok_text.split('\n')[:-1]
return toks
def read_lines(path):
"""
path: String, file path.
......@@ -71,12 +73,17 @@ def read_lines(path):
seqs.append(line)
return seqs
class SentimentDataSetCreate():
"""
A class to process data for sentiment analysis task.
"""
def __init__(self, data_path, output_path,
use_okenizer = True, multi_lines = False):
def __init__(self,
data_path,
output_path,
use_okenizer=True,
multi_lines=False):
"""
        data_path: string, training and testing dataset path
output_path: string, output path, store processed dataset
......@@ -164,23 +171,17 @@ class SentimentDataSetCreate():
# Preprocess train data.
train_data, train_lab_set = self.data_list(self.train_dir)
print "processing train set..."
file_lists = self.save_data(train_data,
"train",
self.batch_size,
True,
True)
file_lists = self.save_data(train_data, "train", self.batch_size, True,
True)
save_list(file_lists, self.train_list)
# If have test data path, preprocess test data.
if os.path.exists(self.test_dir):
test_data, test_lab_set = self.data_list(self.test_dir)
assert(train_lab_set == test_lab_set)
assert (train_lab_set == test_lab_set)
print "processing test set..."
file_lists = self.save_data(test_data,
"test",
self.batch_size,
False,
self.dict_with_test)
file_lists = self.save_data(test_data, "test", self.batch_size,
False, self.dict_with_test)
save_list(file_lists, self.test_list)
# save labels set.
......@@ -191,7 +192,9 @@ class SentimentDataSetCreate():
save_dict(self.word_count, self.dict_file, True)
self.dict_size = len(self.word_count)
def save_data(self, data, prefix = "",
def save_data(self,
data,
prefix="",
batch_size=50000,
is_shuffle=False,
build_dict=False):
......@@ -205,7 +208,8 @@ class SentimentDataSetCreate():
return: list of batch names
"""
if is_shuffle and self.multi_lines:
return self.save_data_multi_lines(data, prefix, batch_size, build_dict)
return self.save_data_multi_lines(data, prefix, batch_size,
build_dict)
if is_shuffle:
random.shuffle(data)
......@@ -213,7 +217,7 @@ class SentimentDataSetCreate():
batch_names = []
for i in range(num_batches):
batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i))
"%s_part_%03d" % (prefix, i))
begin = i * batch_size
end = min((i + 1) * batch_size, len(data))
# read a batch of data
......@@ -246,7 +250,9 @@ class SentimentDataSetCreate():
data_list = tokenize(data_list)
return label_list, data_list
def save_data_multi_lines(self, data, prefix = "",
def save_data_multi_lines(self,
data,
prefix="",
batch_size=50000,
build_dict=False):
"""
......@@ -274,14 +280,14 @@ class SentimentDataSetCreate():
self.create_dict(data_list)
length = len(label_list)
perm_list = np.array([ i for i in xrange(length) ])
perm_list = np.array([i for i in xrange(length)])
random.shuffle(perm_list)
num_batches = int(math.ceil(length / float(batch_size)))
batch_names = []
for i in range(num_batches):
batch_name = join_path(self.output_path,
"%s_part_%03d" %(prefix, i))
"%s_part_%03d" % (prefix, i))
begin = i * batch_size
end = min((i + 1) * batch_size, length)
sub_label = [label_list[perm_list[i]] for i in range(begin, end)]
......@@ -304,35 +310,50 @@ class SentimentDataSetCreate():
f.write('%s\t\t%s\n' % (lab, seq))
f.close()
def option_parser():
parser = OptionParser(usage="usage: python preprcoess.py "\
"-i data_dir [options]")
parser.add_option("-i", "--data", action="store",
dest="input", help="Input data directory.")
parser.add_option("-o", "--output", action="store",
dest="output", default=None,
help="Output directory.")
parser.add_option("-t", "--tokenizer", action="store",
dest="use_tokenizer", default=True,
help="Whether to use tokenizer.")
parser.add_option(
"-i",
"--data",
action="store",
dest="input",
help="Input data directory.")
parser.add_option(
"-o",
"--output",
action="store",
dest="output",
default=None,
help="Output directory.")
parser.add_option(
"-t",
"--tokenizer",
action="store",
dest="use_tokenizer",
default=True,
help="Whether to use tokenizer.")
parser.add_option("-m", "--multi_lines", action="store",
dest="multi_lines", default=False,
help="If input text files have multi lines and they "\
"need to be shuffled, you should set -m True,")
return parser.parse_args()
def main():
options, args = option_parser()
data_dir=options.input
output_dir=options.output
use_tokenizer=options.use_tokenizer
multi_lines=options.multi_lines
data_dir = options.input
output_dir = options.output
use_tokenizer = options.use_tokenizer
multi_lines = options.multi_lines
if output_dir is None:
outname = os.path.basename(options.input)
output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname)
data_creator = SentimentDataSetCreate(data_dir, output_dir,
use_tokenizer, multi_lines)
data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer,
multi_lines)
data_creator.create_dataset()
if __name__ == '__main__':
main()
......@@ -47,10 +47,12 @@ def sentiment_data(data_dir=None,
for i, line in enumerate(open(dict_file, 'r')):
word_dict[line.split('\t')[0]] = i
define_py_data_sources2(train_list, test_list,
module="dataprovider",
obj="process",
args={'dictionary': word_dict})
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={'dictionary': word_dict})
return dict_dim, class_dim
......@@ -64,8 +66,7 @@ def bidirectional_lstm_net(input_dim,
emb = embedding_layer(input=data, size=emb_dim)
bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=class_dim,
act=SoftmaxActivation())
output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
if not is_predict:
lbl = data_layer("label", 1)
......@@ -109,27 +110,36 @@ def stacked_lstm_net(input_dim,
data = data_layer("word", input_dim)
emb = embedding_layer(input=data, size=emb_dim)
fc1 = fc_layer(input=emb, size=hid_dim, act=linear,
bias_attr=bias_attr)
lstm1 = lstmemory(input=fc1, act=relu, bias_attr=bias_attr,
layer_attr=layer_attr)
fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
lstm1 = lstmemory(
input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = fc_layer(input=inputs, size=hid_dim, act=linear,
param_attr=para_attr, bias_attr=bias_attr)
lstm = lstmemory(input=fc, reverse=(i % 2) == 0, act=relu,
bias_attr=bias_attr, layer_attr=layer_attr)
fc = fc_layer(
input=inputs,
size=hid_dim,
act=linear,
param_attr=para_attr,
bias_attr=bias_attr)
lstm = lstmemory(
input=fc,
reverse=(i % 2) == 0,
act=relu,
bias_attr=bias_attr,
layer_attr=layer_attr)
inputs = [fc, lstm]
fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
output = fc_layer(input=[fc_last, lstm_last], size=class_dim,
act=SoftmaxActivation(),
bias_attr=bias_attr, param_attr=para_attr)
output = fc_layer(
input=[fc_last, lstm_last],
size=class_dim,
act=SoftmaxActivation(),
bias_attr=bias_attr,
param_attr=para_attr)
if is_predict:
outputs(output)
else:
outputs(
classification_cost(input=output, label=data_layer('label', 1)))
outputs(classification_cost(input=output, label=data_layer('label', 1)))
......@@ -20,20 +20,19 @@ is_test = get_config_arg('is_test', bool, False)
# whether this config is used for prediction
is_predict = get_config_arg('is_predict', bool, False)
data_dir = "./data/pre-imdb"
data_dir = "./data/pre-imdb"
dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
################## Algorithm Config #####################
settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25
)
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),
gradient_clipping_threshold=25)
#################### Network Config ######################
stacked_lstm_net(dict_dim, class_dim=class_dim,
stacked_num=3, is_predict=is_predict)
stacked_lstm_net(
dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
# bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
......@@ -30,14 +30,14 @@ def hook(settings, src_dict, trg_dict, file_list, **kwargs):
if settings.job_mode:
settings.trg_dict = trg_dict
settings.slots = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))
]
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
settings.slots = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(open(file_list[0], "r").readlines()))
]
......@@ -62,8 +62,7 @@ def process(settings, file_name):
if settings.job_mode:
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [settings.trg_dict.get(w, UNK_IDX)
for w in trg_words]
trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
......
......@@ -12,7 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
python preprocess.py -i INPUT [-d DICTSIZE] [-m]
......@@ -24,12 +23,13 @@ Options:
-m --mergeDict merge source and target dictionary
"""
import os
import sys
import sys
import string
from optparse import OptionParser
from paddle.utils.preprocess_util import save_list, DatasetCreater
class SeqToSeqDatasetCreater(DatasetCreater):
"""
A class to process data for sequence to sequence application.
......@@ -75,7 +75,7 @@ class SeqToSeqDatasetCreater(DatasetCreater):
if not os.path.exists(output):
os.system(cmd + '> ' + output)
def build_dict(self, file_path, dict_path, dict_size = -1):
def build_dict(self, file_path, dict_path, dict_size=-1):
"""
Create the dictionary for the file, Note that
1. Valid characters include all printable characters
......@@ -99,20 +99,23 @@ class SeqToSeqDatasetCreater(DatasetCreater):
for word in words:
if word not in dictory:
dictory[word] = 1
else:
else:
dictory[word] += 1
output = open(dict_path, "w+")
output.write('<s>\n<e>\n<unk>\n')
count = 3
for key, value in sorted(dictory.items(), key = lambda d:d[1], reverse = True):
for key, value in sorted(
dictory.items(), key=lambda d: d[1], reverse=True):
output.write(key + "\n")
count += 1
if count == dict_size:
break
self.dict_size = count
def create_dataset(self, dict_size = -1, mergeDict = False,
suffixes = ['.src', '.trg']):
def create_dataset(self,
dict_size=-1,
mergeDict=False,
suffixes=['.src', '.trg']):
"""
Create seqToseq dataset
"""
......@@ -135,13 +138,14 @@ class SeqToSeqDatasetCreater(DatasetCreater):
# checkout dataset should be parallel corpora
suffix_len = len(suffixes[0])
for dataset in dataset_list:
file_list = os.listdir(dataset)
if len(file_list) % 2 == 1:
raise RuntimeError("dataset should be parallel corpora")
file_list.sort()
for i in range(0, len(file_list), 2):
if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
raise RuntimeError("source and target file name should be equal")
file_list = os.listdir(dataset)
if len(file_list) % 2 == 1:
raise RuntimeError("dataset should be parallel corpora")
file_list.sort()
for i in range(0, len(file_list), 2):
if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]:
raise RuntimeError(
"source and target file name should be equal")
# cat all the files with the same suffix in dataset
for suffix in suffixes:
......@@ -155,16 +159,18 @@ class SeqToSeqDatasetCreater(DatasetCreater):
list = ['train.list', 'test.list', 'gen.list']
for dataset in dataset_list:
outname = os.path.basename(dataset)
self.concat_file(dataset, outname + suffixes[0],
self.concat_file(dataset, outname + suffixes[0],
outname + suffixes[1], dir_list[id], outname)
save_list([os.path.join(dir_list[id], outname)],
save_list([os.path.join(dir_list[id], outname)],
os.path.join(self.output_path, list[id]))
id += 1
# build dictionary for train data
dict = ['src.dict', 'trg.dict']
dict_path = [os.path.join(self.output_path, dict[0]),
os.path.join(self.output_path, dict[1])]
dict_path = [
os.path.join(self.output_path, dict[0]),
os.path.join(self.output_path, dict[1])
]
if mergeDict:
outname = os.path.join(train_dir, train_dataset.split('/')[-1])
print 'build src dictionary for train data'
......@@ -173,22 +179,30 @@ class SeqToSeqDatasetCreater(DatasetCreater):
os.system('cp ' + dict_path[0] + ' ' + dict_path[1])
else:
outname = os.path.join(train_dataset, self.train_dir_name)
for id in range(0,2):
for id in range(0, 2):
suffix = suffixes[id]
print 'build ' + suffix[1:] + ' dictionary for train data'
self.build_dict(outname + suffix, dict_path[id], dict_size)
print 'dictionary size is', self.dict_size
def main():
usage = "usage: \n" \
"python %prog -i INPUT [-d DICTSIZE] [-m]"
parser = OptionParser(usage)
parser.add_option("-i", action="store", dest="input",
help="input original dataset path")
parser.add_option("-d", action="store", dest="dictsize",
help="specified word count of dictionary")
parser.add_option("-m", "--mergeDict", action="store_true", dest="mergeDict",
help="merge source and target dictionary")
parser.add_option(
"-i", action="store", dest="input", help="input original dataset path")
parser.add_option(
"-d",
action="store",
dest="dictsize",
help="specified word count of dictionary")
parser.add_option(
"-m",
"--mergeDict",
action="store_true",
dest="mergeDict",
help="merge source and target dictionary")
(options, args) = parser.parse_args()
if options.input[-1] == os.path.sep:
options.input = options.input[:-1]
......@@ -200,5 +214,6 @@ def main():
data_creator = SeqToSeqDatasetCreater(options.input, output_path)
data_creator.create_dataset(dictsize, options.mergeDict)
if __name__ == "__main__":
main();
main()
......@@ -50,16 +50,21 @@ def seq_to_seq_data(data_dir,
trg_dict = None
else:
train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir,test_list)
test_list = os.path.join(data_dir, test_list)
define_py_data_sources2(train_list, test_list,
module = "dataprovider",
obj = "process",
args = {"src_dict": src_dict,
"trg_dict": trg_dict})
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict,
"trg_dict": trg_dict})
return {"src_dict_path": src_lang_dict, "trg_dict_path": trg_lang_dict,
"gen_result": gen_result}
return {
"src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"gen_result": gen_result
}
def gru_encoder_decoder(data_conf,
......@@ -90,51 +95,55 @@ def gru_encoder_decoder(data_conf,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
with mixed_layer(
size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
context = simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
gru_step = gru_step_layer(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation()) as out:
with mixed_layer(
size=target_dict_dim, bias_attr=True,
act=SoftmaxActivation()) as out:
out += full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_inputs = [
StaticInput(
input=encoded_vector, is_seq=True), StaticInput(
input=encoded_proj, is_seq=True)
]
if not is_generating:
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
input=data_layer(
name='target_language_word', size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
......@@ -144,12 +153,12 @@ def gru_encoder_decoder(data_conf,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = data_layer(name='target_language_next_word',
size=target_dict_dim)
lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl)
outputs(cost)
else:
......@@ -168,16 +177,19 @@ def gru_encoder_decoder(data_conf,
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
beam_gen = beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(
input=beam_gen,
id_input=data_layer(
name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
......@@ -17,8 +17,7 @@ import gzip
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
)
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)
......@@ -32,59 +31,58 @@ num_original_columns = 3
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
[[-2,0]],
[[-1,0]],
[[0,0]],
[[1,0]],
[[2,0]],
[[-1,0], [0,0]],
[[0,0], [1,0]],
[[-2,1]],
[[-1,1]],
[[0,1]],
[[1,1]],
[[2,1]],
[[-2,1], [-1,1]],
[[-1,1], [0,1]],
[[0,1], [1,1]],
[[1,1], [2,1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
[[-2, 0]],
[[-1, 0]],
[[0, 0]],
[[1, 0]],
[[2, 0]],
[[-1, 0], [0, 0]],
[[0, 0], [1, 0]],
[[-2, 1]],
[[-1, 1]],
[[0, 1]],
[[1, 1]],
[[2, 1]],
[[-2, 1], [-1, 1]],
[[-1, 1], [0, 1]],
[[0, 1], [1, 1]],
[[1, 1], [2, 1]],
[[-2, 1], [-1, 1], [0, 1]],
[[-1, 1], [0, 1], [1, 1]],
[[0, 1], [1, 1], [2, 1]],
]
dict_label = {
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
}
def make_features(sequence):
length = len(sequence)
num_features = len(sequence[0])
def get_features(pos):
if pos < 0:
return ['#B%s' % -pos] * num_features
......@@ -94,9 +92,10 @@ def make_features(sequence):
for i in xrange(length):
for pattern in patterns:
fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
sequence[i].append(fname)
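The comment above the patterns table explains that a pattern such as [[-1, 0], [0, 0]] combines the previous and current token at column 0 into one feature, and make_features joins the chosen attributes with '/'. A rough standalone sketch of that expansion (the toy tokens and the end-of-sequence padding rule are assumptions for illustration, not taken from the demo):

# Each token is a list of attributes, e.g. [word, pos_tag]; a pattern is a
# list of [offset, column] pairs relative to the current position i.
def expand_pattern(sequence, i, pattern):
    feats = []
    for offset, column in pattern:
        pos = i + offset
        if pos < 0:
            feats.append('#B%s' % -pos)  # padding before the sequence
        elif pos >= len(sequence):
            feats.append('#E%s' % (pos - len(sequence) + 1))  # padding after it (assumed)
        else:
            feats.append(sequence[pos][column])
    return '/'.join(feats)

toy_tokens = [['He', 'PRP'], ['reckons', 'VBZ'], ['the', 'DT'], ['deficit', 'NN']]
print(expand_pattern(toy_tokens, 1, [[-1, 0], [0, 0]]))   # He/reckons
print(expand_pattern(toy_tokens, 0, [[-2, 1], [-1, 1]]))  # #B2/#B1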
'''
Source file format:
Each line is for one timestep. The features are separated by space.
......@@ -109,6 +108,8 @@ i-th column.
return a list of dict for each column
'''
def create_dictionaries(filename, cutoff, oov_policy):
def add_to_dict(sequence, dicts):
num_features = len(dicts)
......@@ -140,7 +141,6 @@ def create_dictionaries(filename, cutoff, oov_policy):
features = line.split(' ')
sequence.append(features)
for i in xrange(num_features):
dct = dicts[i]
n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
......@@ -151,7 +151,7 @@ def create_dictionaries(filename, cutoff, oov_policy):
else:
dct[k] = n
n += 1
if oov_policy[i] == OOV_POLICY_USE:
# placeholder so that len(dct) will be the number of features
# including OOV
......@@ -187,12 +187,15 @@ def initializer(settings, **xargs):
logger.info("feature size=%s" % dim)
settings.input_types = input_types
'''
if oov_policy[i] == OOV_POLICY_USE, features in the i-th column that do not
exist in dicts[i] will be assigned to id 0.
if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
in dicts[i].
'''
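A minimal sketch of the two policies described in this note; the constant values and the tiny dictionary below are placeholders, not the ones defined by the provider:

OOV_POLICY_USE = 1      # unknown features map to id 0
OOV_POLICY_ERROR = 2    # unknown features are a fatal error

def lookup(feature, dct, policy):
    if feature in dct:
        return dct[feature]
    if policy == OOV_POLICY_USE:
        return 0
    raise KeyError('Unknown token: %s' % feature)

word_dict = {'the': 1, 'cat': 2}   # id 0 reserved for OOV
print(lookup('dog', word_dict, OOV_POLICY_USE))   # 0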
@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
input_file = filename
......@@ -231,7 +234,7 @@ def process(settings, filename):
logger.fatal("Unknown token: %s" % features[i])
else:
vec.ids.append(dim + 0)
dim += len(dicts[i])
sample[-1].append(vec)
return sample
......@@ -255,4 +258,3 @@ def process(settings, filename):
f.close()
logger.info("num_sequences=%s" % num_sequences)
......@@ -16,11 +16,11 @@ from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
define_py_data_sources2(
train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 1
settings(
......@@ -30,14 +30,15 @@ settings(
average_window=0.5,
learning_rate=1e-1,
learning_rate_decay_a=1e-5,
learning_rate_decay_b=0.25,
)
learning_rate_decay_b=0.25, )
num_label_types = 23
num_label_types=23
def get_simd_size(size):
return int(math.ceil(float(size) / 8)) * 8
# Currently, in order to use sparse_update=True,
# the size has to be aligned.
num_label_types = get_simd_size(num_label_types)
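Concretely, the 23 chunk labels defined in the data provider are padded up to the next multiple of 8 (the helper is repeated here so the snippet runs on its own):

import math

def get_simd_size(size):
    # round up to the next multiple of 8 for SIMD-aligned sparse_update
    return int(math.ceil(float(size) / 8)) * 8

print(get_simd_size(23))   # 24: 23 chunk labels rounded up to a multiple of 8
print(get_simd_size(24))   # 24: already aligned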
......@@ -45,40 +46,37 @@ num_label_types = get_simd_size(num_label_types)
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types)
chunk = data_layer(name="chunk", size=num_label_types)
crf_input = fc_layer(
input=features,
size=num_label_types,
act=LinearActivation(),
bias_attr=False,
param_attr=ParamAttr(initial_std=0, sparse_update=True))
param_attr=ParamAttr(
initial_std=0, sparse_update=True))
crf=crf_layer(
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
)
param_attr=ParamAttr(
name="crfw", initial_std=0), )
crf_decoding=crf_decoding_layer(
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
)
param_attr=ParamAttr(name="crfw"), )
sum_evaluator(
name="error",
input=crf_decoding,
)
input=crf_decoding, )
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
input=[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
num_chunk_types=11, )
inputs(word, pos, chunk, features)
outputs(crf)
......@@ -16,10 +16,11 @@ from paddle.trainer_config_helpers import *
import math
define_py_data_sources2(train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
define_py_data_sources2(
train_list="data/train.list",
test_list="data/test.list",
module="dataprovider",
obj="process")
batch_size = 16
settings(
......@@ -27,29 +28,27 @@ settings(
batch_size=batch_size,
regularization=L2Regularization(batch_size * 1e-5),
average_window=0.5,
learning_rate = 2e-3,
learning_rate_decay_a = 5e-7,
learning_rate_decay_b = 0.5,
)
learning_rate=2e-3,
learning_rate_decay_a=5e-7,
learning_rate_decay_b=0.5, )
word_dim=128
word_dim = 128
hidden_dim = 128
with_rnn = True
initial_std=1/math.sqrt(hidden_dim)
param_attr=ParamAttr(initial_std=initial_std)
cpu_layer_attr=ExtraLayerAttribute(device=-1)
initial_std = 1 / math.sqrt(hidden_dim)
param_attr = ParamAttr(initial_std=initial_std)
cpu_layer_attr = ExtraLayerAttribute(device=-1)
default_device(0)
num_label_types=23
num_label_types = 23
features = data_layer(name="features", size=76328)
word = data_layer(name="word", size=6778)
pos = data_layer(name="pos", size=44)
chunk = data_layer(name="chunk",
size=num_label_types,
layer_attr=cpu_layer_attr)
chunk = data_layer(
name="chunk", size=num_label_types, layer_attr=cpu_layer_attr)
emb = embedding_layer(
input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
......@@ -58,73 +57,64 @@ hidden1 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(emb),
table_projection(pos, param_attr=param_attr)]
)
input=[
full_matrix_projection(emb), table_projection(
pos, param_attr=param_attr)
])
if with_rnn:
rnn1 = recurrent_layer(
act=ReluActivation(),
bias_attr=True,
input=hidden1,
param_attr=ParamAttr(initial_std=0),
)
param_attr=ParamAttr(initial_std=0), )
hidden2 = mixed_layer(
size=hidden_dim,
act=STanhActivation(),
bias_attr=True,
input=[full_matrix_projection(hidden1)
] + ([
full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
input=[full_matrix_projection(hidden1)] +
([full_matrix_projection(
rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
if with_rnn:
rnn2=recurrent_layer(
rnn2 = recurrent_layer(
reverse=True,
act=ReluActivation(),
bias_attr=True,
input=hidden2,
param_attr=ParamAttr(initial_std=0),
)
param_attr=ParamAttr(initial_std=0), )
crf_input = mixed_layer(
size=num_label_types,
bias_attr=False,
input=[
full_matrix_projection(hidden2),
] + ([
full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
] if with_rnn else []),
)
input=[full_matrix_projection(hidden2), ] +
([full_matrix_projection(
rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
crf = crf_layer(
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw", initial_std=0),
layer_attr=cpu_layer_attr,
)
param_attr=ParamAttr(
name="crfw", initial_std=0),
layer_attr=cpu_layer_attr, )
crf_decoding = crf_decoding_layer(
size=num_label_types,
input=crf_input,
label=chunk,
param_attr=ParamAttr(name="crfw"),
layer_attr=cpu_layer_attr,
)
layer_attr=cpu_layer_attr, )
sum_evaluator(
name="error",
input=crf_decoding,
)
input=crf_decoding, )
chunk_evaluator(
name="chunk_f1",
input =[crf_decoding, chunk],
input=[crf_decoding, chunk],
chunk_scheme="IOB",
num_chunk_types=11,
)
num_chunk_types=11, )
inputs(word, pos, chunk, features)
outputs(crf)
......@@ -16,82 +16,113 @@ from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686,
0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.886275,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.670588, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.141176,
0.992157, 0.992157, 0.611765, 0.054902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157,
0.529412, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.603922, 0.992157,
0.992157, 0.992157, 0.603922, 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157,
0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, 0.992157, 0.992157, 0.992157,
0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, 0.992157,
0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.070588, 0.992157, 0.992157, 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471,
0, 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157,
0.992157, 0.713725, 0, 0, 0, 0, 0.627451, 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157,
0.776471, 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, 0.968627, 0.168627, 0, 0,
0, 0.423529, 0.992157, 0.992157, 0.364706, 0, 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, 0.466667, 0.992157,
0.988235, 0.976471, 0.992157, 0.992157, 0.788235, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275,
0.882353, 0.364706, 0, 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392,
0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, 0, 0, 0, 0, 0, 0, 0, 0.105882,
0.733333, 0.976471, 0.811765, 0.713725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157,
0.321569, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.968627,
0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039,
0.25098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, 0.333333,
0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.027451, 0.223529, 0.776471,
0.964706, 0.988235, 0.988235, 0.988235, 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, 0.87451,
0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.188235, 0.647059, 0.988235, 0.988235, 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235,
0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, 0.941176,
0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.039216, 0.639216, 0.933333, 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137,
0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, 0.992157, 0.988235, 0.815686,
0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.211765, 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0.698039,
0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.890196, 0.988235, 0.988235,
0.745098, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0.2, 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, 0, 0, 0, 0.447059,
0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.988235, 0.988235, 0.988235,
0.992157, 0.47451, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, 0.082353, 0, 0, 0, 0, 0, 0,
0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725,
0.329412, 0.376471, 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, 0.219608, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, 0.988235, 0.741176, 0.309804, 0, 0, 0, 0,
0, 0, 0.529412, 0.988235, 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157,
0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, 0.882353, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529, 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235,
0.988235, 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]]]
TEST_DATA = [[[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157,
0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922,
0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157,
0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157,
0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0,
0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098,
0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157,
0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157,
0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0,
0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451,
0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471,
0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157,
0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0,
0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922,
0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235,
0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0,
0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392,
0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569,
0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0
]], [[
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255,
0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235,
0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961,
0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235,
0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235,
0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157,
0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157,
0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333,
0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137,
0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235,
0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333,
0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765,
0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0,
0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2,
0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0,
0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0,
0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118,
0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471,
0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294,
0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235,
0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235,
0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157,
0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157,
0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529,
0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235,
0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627,
0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0
]]]
def main():
conf = parse_config("./mnist_model/trainer_config.py", "")
print conf.data_config.load_data_args
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(network, swig_paddle.GradientMachine) # For code hint.
network.loadParameters("./mnist_model/")
converter = DataProviderConverter([dense_vector(784)])
......
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list='test.list',
module='provider',
obj='process')
define_py_data_sources2(
train_list='train.list',
test_list='test.list',
module='provider',
obj='process')
settings(
batch_size=128,
learning_rate=1e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(0.5)
)
regularization=L2Regularization(0.5))
img = data_layer(name='pixel', size=28 * 28)
hidden1 = simple_img_conv_pool(input=img, filter_size=3, num_filters=32, pool_size=3,
num_channel=1)
hidden1 = simple_img_conv_pool(
input=img, filter_size=3, num_filters=32, pool_size=3, num_channel=1)
hidden2 = fc_layer(input=hidden1, size=200, act=TanhActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
hidden2 = fc_layer(
input=hidden1,
size=200,
act=TanhActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
outputs(
classification_cost(
input=predict, label=data_layer(
name='label', size=10)))
... # the settings and define data provider is omitted.
DICT_DIM=3000 # dictionary dimension.
word_ids=data_layer('word_ids', size=DICT_DIM)
... # the settings and define data provider is omitted.
DICT_DIM = 3000 # dictionary dimension.
word_ids = data_layer('word_ids', size=DICT_DIM)
emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
emb = embedding_layer(
input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM)))
\ No newline at end of file
outputs(
classification_cost(
input=predict, label=data_layer(
'label', size=DICT_DIM)))
DICT_DIM=3000
DICT_DIM = 3000
@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
def process(settings, filename):
with open(filename) as f:
# yield word ids to predict inner word id
# such as [28, 29, 10, 4], 4
        # It means the sentence is 28, 29, 4, 10, 4.
yield read_next_from_file(f)
\ No newline at end of file
with open(filename) as f:
# yield word ids to predict inner word id
# such as [28, 29, 10, 4], 4
        # It means the sentence is 28, 29, 4, 10, 4.
yield read_next_from_file(f)
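read_next_from_file is left undefined in this snippet. One plausible reading of the comment above, where every word of a sentence is predicted from the remaining word ids, can be sketched as follows (the helper name and looping scheme are assumptions for illustration):

# Hypothetical helper: each word is predicted from the other words,
# matching the ([28, 29, 10, 4], 4) example in the comment.
def context_target_pairs(sentence_ids):
    for i, target in enumerate(sentence_ids):
        yield sentence_ids[:i] + sentence_ids[i + 1:], target

for context, target in context_target_pairs([28, 29, 4, 10, 4]):
    print((context, target))
# ([29, 4, 10, 4], 28), ([28, 4, 10, 4], 29), ([28, 29, 10, 4], 4), ...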
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list=None,
module='mnist_provider',
obj='process')
define_py_data_sources2(
train_list='train.list',
test_list=None,
module='mnist_provider',
obj='process')
img = data_layer(name='pixel', size=784)
label = data_layer(name='label', size=10)
......@@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types={
'pixel': dense_vector(28 * 28),
'label': integer_value(10)
})
@provider(
input_types={'pixel': dense_vector(28 * 28),
'label': integer_value(10)})
def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file
......
......@@ -2,10 +2,7 @@ from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types=[
dense_vector(28 * 28),
integer_value(10)
])
@provider(input_types=[dense_vector(28 * 28), integer_value(10)])
def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file
......
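The body of process is elided in both variants above; a minimal hypothetical sketch of how it could finish, assuming each line of the training file looks like "label;p1 p2 ... p784" (this file format is an assumption, not taken from the commit):

def process(settings, filename):  # hypothetical completion of the elided body
    with open(filename, 'r') as f:
        for line in f:
            label, pixel_str = line.split(';')
            pixels = [float(x) for x in pixel_str.split()]
            # dict-style input_types expect a dict keyed by slot name;
            # list-style input_types expect a plain tuple: yield pixels, int(label)
            yield {'pixel': pixels, 'label': int(label)}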
......@@ -3,9 +3,12 @@ from paddle.trainer_config_helpers import *
dictionary = dict()
... # read dictionary from outside
define_py_data_sources2(train_list='train.list', test_list=None,
module='sentimental_provider', obj='process',
# the code above is the same as in the mnist sample.
args={ # pass to provider.
'dictionary': dictionary
})
define_py_data_sources2(
train_list='train.list',
test_list=None,
module='sentimental_provider',
obj='process',
    # the code above is the same as in the mnist sample.
args={ # pass to provider.
'dictionary': dictionary
})
......@@ -12,7 +12,8 @@ def on_init(settings, dictionary, **kwargs):
# The text is a sequence of integer values, and each value is a word id.
        # The whole sequence is the sentence whose sentiment we want to predict.
integer_value(len(dictionary), seq_type=SequenceType), # text input
integer_value(
len(dictionary), seq_type=SequenceType), # text input
# label positive/negative
integer_value(2)
......
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -29,7 +29,10 @@ try:
whole_start = ""
whole_end = ""
LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
LIB_DIRS = [
"math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver",
"trainer"
]
PARENT_LIB_DIRS = ['proto']
class PaddleLDFlag(object):
......@@ -55,19 +58,20 @@ try:
self.curt = CUDA_LIBRARIES
def ldflag_str(self):
return " ".join([self.libs_dir_str(),
self.parent_dir_str(),
self.libs_str()])
return " ".join(
[self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])
def libs_dir_str(self):
libdirs = LIB_DIRS
return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
libdirs))
return " ".join(
map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
libdirs))
def parent_dir_str(self):
libdirs = PARENT_LIB_DIRS
return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
libdirs))
return " ".join(
map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
libdirs))
def libs_str(self):
libs = [
......@@ -113,10 +117,10 @@ try:
return cmake_flag
elif cmake_flag.startswith("-l"): # normal link command
return cmake_flag
elif cmake_flag in ["gflags-shared",
"gflags-static",
"gflags_nothreads-shared",
"gflags_nothreads-static"]: # special for gflags
elif cmake_flag in [
"gflags-shared", "gflags-static", "gflags_nothreads-shared",
"gflags_nothreads-static"
]: # special for gflags
assert PaddleLDFlag.cmake_bool(self.gflags_location)
return self.gflags_location
elif len(cmake_flag) != 0:
......@@ -132,18 +136,22 @@ try:
:type cmake_str: str
:rtype: bool
"""
if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith("-NOTFOUND"):
if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith(
"-NOTFOUND"):
return False
else:
return True
def c_flag(self):
if self.with_coverage:
return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
else:
return None
except ImportError:
class PaddleLDFlag(object):
def ldflag_str(self):
pass
def c_flag(self):
pass
......@@ -32,7 +32,7 @@ class TestArguments(unittest.TestCase):
iv = args.getSlotIds(0)
assert isinstance(iv, swig_paddle.IVector)
np_arr = iv.toNumpyArrayInplace()
self.assertEqual(np_arr.shape, (6,))
self.assertEqual(np_arr.shape, (6, ))
if __name__ == '__main__':
......
......@@ -30,8 +30,8 @@ class TestGradientMachine(unittest.TestCase):
self.assertIsNotNone(model_config)
machine = swig_paddle.GradientMachine.createByModelConfig(
model_config, swig_paddle.CREATE_MODE_NORMAL,
swig_paddle.ParameterOptimizer.create(
opt_config).getParameterTypes())
swig_paddle.ParameterOptimizer.create(opt_config).getParameterTypes(
))
self.assertIsNotNone(machine)
ipt, _ = util.loadMNISTTrainData()
output = swig_paddle.Arguments.createArguments(0)
......@@ -43,7 +43,7 @@ class TestGradientMachine(unittest.TestCase):
assert isinstance(param, swig_paddle.Parameter)
val = param.getBuf(swig_paddle.PARAMETER_VALUE)
assert isinstance(val, swig_paddle.Vector)
arr = numpy.full((len(val),), 0.1, dtype="float32")
arr = numpy.full((len(val), ), 0.1, dtype="float32")
val.copyFromNumpyArray(arr)
param_config = param.getConfig().toProto()
assert isinstance(param_config,
......
......@@ -69,7 +69,8 @@ class TestMatrix(unittest.TestCase):
def test_numpy(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
self.assertEqual(
(int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
        # the numpy matrix and the paddle matrix share the same memory.
numpy_mat[0, 1] = 342.23
......
......@@ -98,7 +98,8 @@ def main():
cost_vec = outArgs.getSlotValue(0)
assert isinstance(cost_vec, swig_paddle.Matrix)
cost_vec = cost_vec.copyToNumpyMat()
print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum() / batch_size
print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum(
) / batch_size
batch_id += 1
for optimizer in optimizers:
......
from paddle.trainer_config_helpers import *
settings(
batch_size=100,
learning_method=AdamOptimizer()
)
settings(batch_size=100, learning_method=AdamOptimizer())
din = data_layer(name='input', size=784)
......
......@@ -17,9 +17,9 @@ from paddle.trainer.config_parser import logger
from py_paddle import swig_paddle
import util
def main():
trainer_config = parse_config(
"./testTrainConfig.py", "")
trainer_config = parse_config("./testTrainConfig.py", "")
model = swig_paddle.GradientMachine.createFromConfigProto(
trainer_config.model_config)
trainer = swig_paddle.Trainer.create(trainer_config, model)
......@@ -56,7 +56,7 @@ def main():
logger.info('test cost=%f' % (cost / num))
trainer.finishTrain()
if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
......
......@@ -112,5 +112,6 @@ class TestVector(unittest.TestCase):
if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=1" if swig_paddle.isGpuVersion() else "--use_gpu=0")
swig_paddle.initPaddle("--use_gpu=1"
if swig_paddle.isGpuVersion() else "--use_gpu=0")
unittest.main()
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -16,72 +16,79 @@ import numpy
import struct
import traceback
def header_creator():
ret = ""
ret += struct.pack('i', 3) # slot num
ret += struct.pack('i', 1) # sequence flag
ret += struct.pack('i', 0) # slot0 dense type
ret += struct.pack('i', 3) # slot0 dim
ret += struct.pack('i', 1) # slot1 sparse non value type
ret += struct.pack('i', 7) # slot1 dim
ret += struct.pack('i', 3) # slot2 index type
ret += struct.pack('i', 2) # slot2 dim
ret += struct.pack('i', 3) # slot num
ret += struct.pack('i', 1) # sequence flag
ret += struct.pack('i', 0) # slot0 dense type
ret += struct.pack('i', 3) # slot0 dim
ret += struct.pack('i', 1) # slot1 sparse non value type
ret += struct.pack('i', 7) # slot1 dim
ret += struct.pack('i', 3) # slot2 index type
ret += struct.pack('i', 2) # slot2 dim
return ret
def dense_value_creator(sample_num):
ret = ""
ret += struct.pack('i', sample_num) # slot0 sample num
for i in range(sample_num): # slot0 value
ret += struct.pack('i', sample_num) # slot0 sample num
for i in range(sample_num): # slot0 value
ret += struct.pack('f', 1.0)
ret += struct.pack('f', 2.0)
ret += struct.pack('f', 3.0)
return ret
def sparse_value_creator(sample_num):
ret = ""
ret += struct.pack('i', sample_num) # slot1 sample num
for i in range(sample_num): # slot1 index
ret += struct.pack('i', sample_num) # slot1 sample num
for i in range(sample_num): # slot1 index
ret += struct.pack('i', i * 2)
ret += struct.pack('i', sample_num * 2) #slot1 length
for i in range(sample_num): # slot1 value
ret += struct.pack('i', sample_num * 2) #slot1 length
for i in range(sample_num): # slot1 value
ret += struct.pack('i', 1)
ret += struct.pack('i', 2)
return ret
def index_value_creator(sample_num):
ret = ""
ret += struct.pack('i', sample_num) # slot2 sample num
for i in range(sample_num): # slot2 value
ret += struct.pack('i', sample_num) # slot2 sample num
for i in range(sample_num): # slot2 value
ret += struct.pack('i', 0)
return ret
def sequenceStartPositions_creator():
ret = ""
ret += struct.pack('i', 2) # slot0 sequence num
ret += struct.pack('i', 0) # slot0 sequence value1
ret += struct.pack('i', 1) # slot0 sequence value2
ret += struct.pack('i', 1) # slot1 sequence num
ret += struct.pack('i', 0) # slot1 sequence value1
ret += struct.pack('i', 2) # slot2 sequence num
ret += struct.pack('i', 0) # slot2 sequence value1
ret += struct.pack('i', 1) # slot2 sequence value2
ret += struct.pack('i', 2) # slot0 sequence num
ret += struct.pack('i', 0) # slot0 sequence value1
ret += struct.pack('i', 1) # slot0 sequence value2
ret += struct.pack('i', 1) # slot1 sequence num
ret += struct.pack('i', 0) # slot1 sequence value1
ret += struct.pack('i', 2) # slot2 sequence num
ret += struct.pack('i', 0) # slot2 sequence value1
ret += struct.pack('i', 1) # slot2 sequence value2
return ret
def subSequenceStartPositions_creator():
ret = ""
ret += struct.pack('i', 3) # slot0 subsequence num
ret += struct.pack('i', 0) # slot0 subsequence value1
ret += struct.pack('i', 1) # slot0 subsequence value2
ret += struct.pack('i', 2) # slot0 subsequence value3
ret += struct.pack('i', 2) # slot1 subsequence num
ret += struct.pack('i', 0) # slot1 subsequence value1
ret += struct.pack('i', 1) # slot1 subsequence value2
ret += struct.pack('i', 3) # slot2 subsequence num
ret += struct.pack('i', 0) # slot2 subsequence value1
ret += struct.pack('i', 1) # slot2 subsequence value2
ret += struct.pack('i', 2) # slot2 subsequence value3
ret += struct.pack('i', 3) # slot0 subsequence num
ret += struct.pack('i', 0) # slot0 subsequence value1
ret += struct.pack('i', 1) # slot0 subsequence value2
ret += struct.pack('i', 2) # slot0 subsequence value3
ret += struct.pack('i', 2) # slot1 subsequence num
ret += struct.pack('i', 0) # slot1 subsequence value1
ret += struct.pack('i', 1) # slot1 subsequence value2
ret += struct.pack('i', 3) # slot2 subsequence num
ret += struct.pack('i', 0) # slot2 subsequence value1
ret += struct.pack('i', 1) # slot2 subsequence value2
ret += struct.pack('i', 2) # slot2 subsequence value3
return ret
class SimpleDataProvider:
def __init__(self, *file_list):
self.file_list = file_list
......@@ -93,17 +100,18 @@ class SimpleDataProvider:
pass
def getHeader(self):
return header_creator()
return header_creator()
def getNextBatch(self, batch_size):
ret = ""
ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(2) # slot0
ret += sparse_value_creator(2) # slot1
ret += index_value_creator(2) # slot2
ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(2) # slot0
ret += sparse_value_creator(2) # slot1
ret += index_value_creator(2) # slot2
ret += sequenceStartPositions_creator()
return ret
class SimpleNestDataProvider:
def __init__(self, *file_list):
self.file_list = file_list
......@@ -119,14 +127,15 @@ class SimpleNestDataProvider:
def getNextBatch(self, batch_size):
ret = ""
ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(4) # slot0
ret += sparse_value_creator(4) # slot1
ret += index_value_creator(4) # slot2
ret += struct.pack('i', 2) # batch size
ret += dense_value_creator(4) # slot0
ret += sparse_value_creator(4) # slot1
ret += index_value_creator(4) # slot2
ret += sequenceStartPositions_creator()
ret += subSequenceStartPositions_creator()
return ret
if __name__ == "__main__":
# test code
data_provider = SimpleDataProvider('./test_batch')
......
......@@ -22,18 +22,20 @@ data = [
[[[0, 2], [2, 5], [0, 1, 2]], 1],
]
# Used for sequence_nest_rnn.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)],
should_shuffle=False)
@provider(
input_types=[integer_value_sub_sequence(10), integer_value(3)],
should_shuffle=False)
def process_subseq(settings, file_name):
for d in data:
yield d
# Used for sequence_rnn.conf
@provider(input_types=[integer_value_sequence(10),
integer_value(3)],
should_shuffle=False)
@provider(
input_types=[integer_value_sequence(10), integer_value(3)],
should_shuffle=False)
def process_seq(settings, file_name):
for d in data:
seq = []
......@@ -41,18 +43,20 @@ def process_seq(settings, file_name):
seq += subseq
yield seq, d[1]
# Used for sequence_nest_rnn_multi_input.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value(3)],
should_shuffle=False)
@provider(
input_types=[integer_value_sub_sequence(10), integer_value(3)],
should_shuffle=False)
def process_subseq2(settings, file_name):
for d in data:
yield d
# Used for sequence_rnn_multi_input.conf
@provider(input_types=[integer_value_sequence(10),
integer_value(3)],
should_shuffle=False)
@provider(
input_types=[integer_value_sequence(10), integer_value(3)],
should_shuffle=False)
def process_seq2(settings, file_name):
for d in data:
seq = []
......@@ -60,31 +64,34 @@ def process_seq2(settings, file_name):
seq += subseq
yield seq, d[1]
###########################################################
data2 = [
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]], 0],
[[[0, 2], [2, 5], [0, 1, 2]], [[1, 5], [4], [2, 3, 6, 1]], 1],
]
# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sub_sequence(10),
integer_value_sub_sequence(10),
integer_value(2)],
should_shuffle=False)
@provider(
input_types=[
integer_value_sub_sequence(10), integer_value_sub_sequence(10),
integer_value(2)
],
should_shuffle=False)
def process_unequalength_subseq(settings, file_name):
for d in data2:
yield d
# Used for sequence_rnn_multi_unequalength_inputs.conf
@provider(input_types=[integer_value_sequence(10),
integer_value_sequence(10),
integer_value(2)],
should_shuffle=False)
@provider(
input_types=[
integer_value_sequence(10), integer_value_sequence(10), integer_value(2)
],
should_shuffle=False)
def process_unequalength_seq(settings, file_name):
for d in data2:
words1=reduce(lambda x,y: x+y, d[0])
words2=reduce(lambda x,y: x+y, d[1])
words1 = reduce(lambda x, y: x + y, d[0])
words2 = reduce(lambda x, y: x + y, d[1])
yield words1, words2, d[2]
......@@ -20,8 +20,9 @@ from paddle.trainer.PyDataProvider2 import *
def hook(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sequence(len(settings.word_dict)),
integer_value(3)]
settings.input_types = [
integer_value_sequence(len(settings.word_dict)), integer_value(3)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -32,16 +33,19 @@ def process(settings, file_name):
label, comment = line.strip().split('\t')
label = int(''.join(label.split()))
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
yield word_slot, label
## for hierarchical sequence network
def hook2(settings, dict_file, **kwargs):
settings.word_dict = dict_file
settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sequence(3)]
settings.input_types = [
integer_value_sub_sequence(len(settings.word_dict)),
integer_value_sequence(3)
]
settings.logger.info('dict len : %d' % (len(settings.word_dict)))
......@@ -55,8 +59,10 @@ def process2(settings, file_name):
label, comment = line.strip().split('\t')
label = int(''.join(label.split()))
words = comment.split()
word_slot = [settings.word_dict[w] for w in words if
w in settings.word_dict]
word_slot = [
settings.word_dict[w] for w in words
if w in settings.word_dict
]
label_list.append(label)
word_slot_list.append(word_slot)
else:
......
......@@ -21,15 +21,16 @@ dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file":dict_file})
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path,'r').readlines())
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
......@@ -39,21 +40,24 @@ data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory
with mixed_layer(size=hidden_dim*4) as lstm_input:
with mixed_layer(size=hidden_dim * 4) as lstm_input:
lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory_group(input=lstm_input,
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm = lstmemory_group(
input=lstm_input,
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm_last = last_seq(input=lstm)
with mixed_layer(size=label_dim,
act=SoftmaxActivation(),
bias_attr=True) as output:
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
......@@ -21,15 +21,16 @@ dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(train_list='gserver/tests/Sequence/train.list.nest',
test_list=None,
module='sequenceGen',
obj='process2',
args={"dict_file":dict_file})
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list.nest',
test_list=None,
module='sequenceGen',
obj='process2',
args={"dict_file": dict_file})
settings(batch_size=2)
######################## network configure ################################
dict_dim = len(open(dict_path,'r').readlines())
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
......@@ -38,37 +39,46 @@ data = data_layer(name="word", size=dict_dim)
emb_group = embedding_layer(input=data, size=word_dim)
# (lstm_input + lstm) is equal to lstmemory
def lstm_group(lstm_group_input):
with mixed_layer(size=hidden_dim*4) as group_input:
group_input += full_matrix_projection(input=lstm_group_input)
with mixed_layer(size=hidden_dim * 4) as group_input:
group_input += full_matrix_projection(input=lstm_group_input)
lstm_output = lstmemory_group(input=group_input,
name="lstm_group",
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm_output = lstmemory_group(
input=group_input,
name="lstm_group",
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
return lstm_output
lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
step=lstm_group,
name="lstm_nest_group")
lstm_nest_group = recurrent_group(
input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
# hasSubseq ->(seqlastins) seq
lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
lstm_last = last_seq(
input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
# seq ->(expand) hasSubseq
lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE)
lstm_expand = expand_layer(
input=lstm_last,
expand_as=emb_group,
expand_level=ExpandLevel.FROM_SEQUENCE)
# hasSubseq ->(average) seq
lstm_average = pooling_layer(input=lstm_expand,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
lstm_average = pooling_layer(
input=lstm_expand,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
with mixed_layer(size=label_dim,
act=SoftmaxActivation(),
bias_attr=True) as output:
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_average)
outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
......@@ -33,16 +33,19 @@ def test_init_hooker(setting, value, **kwargs):
setting.value = value
@provider(input_types=[dense_vector(20, seq_type=SequenceType.NO_SEQUENCE)],
init_hook=test_init_hooker)
@provider(
input_types=[dense_vector(
20, seq_type=SequenceType.NO_SEQUENCE)],
init_hook=test_init_hooker)
def test_init_hook(setting, filename):
for i in xrange(200):
yield setting.value
@provider(
input_types=[
sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
@provider(input_types=[
sparse_binary_vector(
30000, seq_type=SequenceType.NO_SEQUENCE)
])
def test_sparse_non_value_no_seq(setting, filename):
for i in xrange(200):
yield [(i + 1) * (j + 1) for j in xrange(10)]
......@@ -77,28 +80,28 @@ def test_min_pool_size(setting, filename):
yield random.randint(0, 100 - 1)
@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)],
can_over_batch_size=False,
calc_batch_size=lambda x: len(x[0]))
@provider(
input_types=[index_slot(
100, seq_type=SequenceType.SEQUENCE)],
can_over_batch_size=False,
calc_batch_size=lambda x: len(x[0]))
def test_can_over_batch_size(setting, filename):
for _ in xrange(1 << 10):
seq_len = random.randint(0, 99)
yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
@provider(input_types={'input1': index_slot(10), 'input2': index_slot(10)})
def test_input_order(setting, filename):
for _ in xrange(1000):
yield {
'input1': 0,
'input2': 1
}
yield {'input1': 0, 'input2': 1}
@provider(input_types=[index_slot(10)],
check=True,
check_fail_continue=True,
should_shuffle="123") # also test should shuffle
@provider(
input_types=[index_slot(10)],
check=True,
check_fail_continue=True,
should_shuffle="123") # also test should shuffle
def test_check(settings, filename):
yield_good_value = False
......@@ -108,4 +111,3 @@ def test_check(settings, filename):
if i < 10:
yield_good_value = True
yield i
......@@ -15,9 +15,10 @@
from util import DataProviderWrapperConverter
from dataprovider_converter import DataProviderConverter
__all__ = ['paddle',
'DataProviderConverter',
'DataProviderWrapperConverter', # for deprecated usage.
'loadParameterFile']
__all__ = [
'paddle',
'DataProviderConverter',
'DataProviderWrapperConverter', # for deprecated usage.
'loadParameterFile'
]
util.monkeypatches()
......@@ -45,10 +45,8 @@ class DenseScanner(IScanner):
def finish_scan(self, argument):
assert isinstance(argument, swig_paddle.Arguments)
assert isinstance(self.input_type, dp2.InputType)
m = swig_paddle.Matrix.createDense(self.__mat__,
self.__height__,
self.input_type.dim,
False)
m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__,
self.input_type.dim, False)
argument.setSlotValue(self.pos, m)
......@@ -141,8 +139,10 @@ class DataProviderConverter(object):
assert isinstance(argument, swig_paddle.Arguments)
argument.resize(len(self.input_types))
scanners = [DataProviderConverter.create_scanner(i, each_type)
for i, each_type in enumerate(self.input_types)]
scanners = [
DataProviderConverter.create_scanner(i, each_type)
for i, each_type in enumerate(self.input_types)
]
for each_sample in dat:
for each_step, scanner in zip(each_sample, scanners):
......@@ -171,11 +171,14 @@ class DataProviderConverter(object):
assert retv is not None
if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
retv = SequenceScanner(each, i, retv, lambda a, p, seq:
a.setSlotSubSequenceStartPositions(p, seq))
if each.seq_type in [dp2.SequenceType.SUB_SEQUENCE,
dp2.SequenceType.SEQUENCE]:
retv = SequenceScanner(each, i, retv, lambda a, p, seq:
a.setSlotSequenceStartPositions(p, seq))
retv = SequenceScanner(
each, i, retv,
lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq))
if each.seq_type in [
dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
]:
retv = SequenceScanner(
each, i, retv,
lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq))
return retv
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Some Useful method for py_paddle.
"""
......@@ -79,6 +78,7 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback):
else:
return __ParameterCallbackWrapper__(callback).__disown__()
def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i)
......@@ -89,10 +89,8 @@ def __arguments_to_numpy__(i, arg):
if ids is not None:
assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray()
return {
"value": value,
"id": ids
}
return {"value": value, "id": ids}
def __monkeypatch_gradient_machine__():
"""
......@@ -102,7 +100,6 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.loadFromConfigFile = \
staticmethod(loadGradientMachine)
def __matrix_to_numpy__(m):
if isinstance(m, swig_paddle.Matrix):
return m.copyToNumpyMat()
......@@ -113,9 +110,11 @@ def __monkeypatch_gradient_machine__():
def createFromConfigProto(protoObj,
createMode=swig_paddle.CREATE_MODE_NORMAL,
paramTypes=[swig_paddle.PARAMETER_VALUE,
swig_paddle.PARAMETER_GRADIENT,
swig_paddle.PARAMETER_MOMENTUM]):
paramTypes=[
swig_paddle.PARAMETER_VALUE,
swig_paddle.PARAMETER_GRADIENT,
swig_paddle.PARAMETER_MOMENTUM
]):
"""
Create Gradient Machine From Proto object.
:param protoObj: Model config
......@@ -145,8 +144,10 @@ def __monkeypatch_gradient_machine__():
"""
outArgs = swig_paddle.Arguments.createArguments(0)
self.forward(inArgs, outArgs, swig_paddle.PASS_TEST)
return [__arguments_to_numpy__(i, outArgs) for i in xrange(
outArgs.getSlotNum())]
return [
__arguments_to_numpy__(i, outArgs)
for i in xrange(outArgs.getSlotNum())
]
swig_paddle.GradientMachine.forwardTest = forwardTest
......@@ -167,7 +168,10 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.__forwardBackward__ = \
swig_paddle.GradientMachine.forwardBackward
def forwardBackward(self, inArgs, outArgs, passType,
def forwardBackward(self,
inArgs,
outArgs,
passType,
callback=swig_paddle.UpdateCallback()):
"""
GradientMachine forward backward.
......@@ -315,9 +319,8 @@ class DataProviderWrapperConverter(object):
self.cols += other
def __call__(self, slot_idx, arg):
mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1,
self.dim,
len(self.cols), True)
mat = swig_paddle.Matrix.createSparse(
len(self.indices) - 1, self.dim, len(self.cols), True)
assert isinstance(mat, swig_paddle.Matrix)
mat.sparseCopyFrom(self.indices, self.cols)
self.putIntoArg(slot_idx, arg, mat)
......@@ -341,9 +344,8 @@ class DataProviderWrapperConverter(object):
self.values += map(lambda x: x[1], other)
def __call__(self, slot_idx, arg):
mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1,
self.dim,
len(self.cols), False)
mat = swig_paddle.Matrix.createSparse(
len(self.indices) - 1, self.dim, len(self.cols), False)
assert isinstance(mat, swig_paddle.Matrix)
mat.sparseCopyFrom(self.indices, self.cols, self.values)
self.putIntoArg(slot_idx, arg, mat)
......@@ -352,8 +354,9 @@ class DataProviderWrapperConverter(object):
paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter,
paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter,
paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot:
SparseNonValueConverter,
paddle.trainer.PyDataProviderWrapper.SparseValueSlot: SparseValueConverter
SparseNonValueConverter,
paddle.trainer.PyDataProviderWrapper.SparseValueSlot:
SparseValueConverter
}
def __init__(self, use_seq, header):
......@@ -381,10 +384,9 @@ class DataProviderWrapperConverter(object):
assert isinstance(argument, swig_paddle.Arguments)
argument.resize(len(self.__header__))
values = map(lambda x:
DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[
x.__class__](x),
self.__header__)
values = map(
lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x),
self.__header__)
if self.__use_seq__:
seq_dim = [[] for _ in xrange(self.__header__.__len__())]
......@@ -394,14 +396,13 @@ class DataProviderWrapperConverter(object):
for slot_idx, sequence in enumerate(each_sample):
for raw_data in sequence:
values[slot_idx].append(raw_data)
seq_start_pos[slot_idx].append(
seq_start_pos[slot_idx][-1] + len(sequence))
seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] +
len(sequence))
seq_dim[slot_idx].append(len(sequence))
for slot_idx in xrange(len(self.__header__)):
argument.setSlotSequenceDim(slot_idx,
swig_paddle.IVector.create(
seq_dim[slot_idx]))
argument.setSlotSequenceDim(
slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx]))
argument.setSlotSequenceStartPositions(
slot_idx,
swig_paddle.IVector.create(seq_start_pos[slot_idx]))
......@@ -422,7 +423,6 @@ class DataProviderWrapperConverter(object):
return self.convert(wrapper_data, argument)
def __monkey_patch_protobuf_objects__():
def ParameterConfig_toProto(self):
"""
......@@ -459,8 +459,7 @@ def __monkey_patch_protobuf_objects__():
:return: paddle.OptimizationConfig
"""
assert isinstance(protoObj,
paddle.proto.OptimizationConfig)
assert isinstance(protoObj, paddle.proto.OptimizationConfig)
return swig_paddle.OptimizationConfig.createFromProtoString(
protoObj.SerializeToString())
......@@ -475,8 +474,7 @@ def __monkey_patch_protobuf_objects__():
:param protoObj: proto.TrainerConfig
:return: paddle.TrainerConfig
"""
assert isinstance(protoObj,
paddle.proto.TrainerConfig)
assert isinstance(protoObj, paddle.proto.TrainerConfig)
return swig_paddle.TrainerConfig.createFromProtoString(
protoObj.SerializeToString())
......@@ -537,6 +535,7 @@ def __monkey_patch_trainer__():
assert isinstance(model, swig_paddle.GradientMachine)
return swig_paddle.Trainer.__create__(
swig_paddle.TrainerConfig.createFromProto(config), model)
swig_paddle.Trainer.create = staticmethod(Trainer_create)
swig_paddle.Trainer.__getForwardOutput__ = \
......@@ -551,14 +550,19 @@ def __monkey_patch_trainer__():
numpy.ndarray.
"""
outArgs = self.__getForwardOutput__()
return [__arguments_to_numpy__(i, outArgs) for i in xrange(
outArgs.getSlotNum())]
return [
__arguments_to_numpy__(i, outArgs)
for i in xrange(outArgs.getSlotNum())
]
swig_paddle.Trainer.getForwardOutput = getForwardOutput
def monkeypatches():
patches = [__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
__monkey_patch_protobuf_objects__,
__monkey_patch_parameter__, __monkey_patch_trainer__]
patches = [
__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
__monkey_patch_protobuf_objects__, __monkey_patch_parameter__,
__monkey_patch_trainer__
]
for patch in patches:
patch()
......@@ -13,17 +13,14 @@
# limitations under the License.
HOSTS = [
"root@192.168.100.17",
"root@192.168.100.18",
]
"root@192.168.100.17",
"root@192.168.100.18",
]
'''
workspace configuration
'''
#root dir for workspace, can be set to any directory accessible to a real user account
ROOT_DIR = "/home/paddle"
'''
network configuration
'''
......@@ -37,4 +34,4 @@ PADDLE_PORTS_NUM = 2
PADDLE_PORTS_NUM_FOR_SPARSE = 2
#environment settings for all processes in the cluster job
LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
......@@ -12,8 +12,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" module for launching cluster job """
import os
......@@ -23,13 +21,13 @@ import copy
import time
import signal
from fabric.api import run, put, settings, env, prefix
from fabric.tasks import execute
#configuration for cluster
import conf
def refine_unknown_args(cmd_args):
'''
refine unknown parameters to handle some special parameters
......@@ -37,7 +35,7 @@ def refine_unknown_args(cmd_args):
new_args = []
for arg in cmd_args:
if arg.startswith("--") and arg.find("=") != -1:
equal_pos = arg.find("=") #find first = pos
equal_pos = arg.find("=") #find first = pos
arglist = list(arg)
arglist[equal_pos] = " "
arg = "".join(arglist)
......@@ -50,6 +48,7 @@ def refine_unknown_args(cmd_args):
new_args.append(arg)
return new_args
def kill_process():
'''
    kill residual paddle processes
......@@ -60,6 +59,7 @@ def kill_process():
| awk '{print $2}' \
| xargs kill > /dev/null 2>&1")
def job_prepare(jobdir, data=None):
'''
prepare job related workspace data
......@@ -70,6 +70,7 @@ def job_prepare(jobdir, data=None):
This function just prepare all related model and other resources
needed at runtime.
'''
def job_create_workspace(jobdir, data=None):
'''
prepare job workspace, common file, etc.
......@@ -94,7 +95,8 @@ def job_prepare(jobdir, data=None):
execute(set_nodefile, i, hosts=conf.HOSTS[i])
#clean rubbish caused by exception
with settings(warn_only=True):
execute(kill_process, hosts=conf.HOSTS)
execute(kill_process, hosts=conf.HOSTS)
def job_pserver(jobdir, pids=None):
'''
......@@ -124,9 +126,8 @@ def job_pserver(jobdir, pids=None):
execute(start_pserver, jobdir, pargs, hosts=conf.HOSTS)
def job_trainer(jobdir,
train_args_dict,
pids=None):
def job_trainer(jobdir, train_args_dict, pids=None):
'''
start paddle trainer
'''
......@@ -171,9 +172,8 @@ def job_trainer(jobdir,
train_args += " --trainer_id=" + str(i)
execute(start_trainer, jobdir, train_args, hosts=conf.HOSTS[i])
def job_all(job_package,
jobdir=None,
train_args_dict=None):
def job_all(job_package, jobdir=None, train_args_dict=None):
'''
param job_package
param train_args_dict
......@@ -183,41 +183,52 @@ def job_all(job_package,
jobdir = conf.ROOT_DIR + "/JOB" + timestamp
job_prepare(jobdir, job_package)
job_pserver(jobdir)
time.sleep(5) #wait until pservers completely start
time.sleep(5) #wait until pservers completely start
job_trainer(jobdir, train_args_dict)
job_clean()
def job_clean():
'''
    if starting the job fails inside Paddle, the framework processes may still
    be running because they are daemon processes, so this job_clean can always
    clean up leftover job processes with ctrl+c.
'''
def signal_handler(signal, frame):
'''
SIGINT handler
'''
def kill_process():
run("ps aux \
run("ps aux \
| grep paddle_process_by_paddle \
| grep -v grep \
| awk '{print $2}' \
| xargs kill > /dev/null 2>&1")
with settings(warn_only=True):
execute(kill_process, hosts=conf.HOSTS)
execute(kill_process, hosts=conf.HOSTS)
signal.signal(signal.SIGINT, signal_handler)
signal.pause()
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="paddle.py",
description='simple tool for cluster training')
parser.add_argument('-j', '--job_workspace',
required=False, default=None,
help='job workspace')
parser.add_argument('-p', '--job_dispatch_package',
required=False, default=None,
help='job package for dispatching to all other nodes')
parser = argparse.ArgumentParser(
prog="paddle.py", description='simple tool for cluster training')
parser.add_argument(
'-j',
'--job_workspace',
required=False,
default=None,
help='job workspace')
parser.add_argument(
'-p',
'--job_dispatch_package',
required=False,
default=None,
help='job package for dispatching to all other nodes')
args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list)
......@@ -227,14 +238,10 @@ if __name__ == '__main__':
#if assigned workspace, do not need to dispatch data,
#so job_local_package should be None
assert args.job_dispatch_package is None
job_all(None,
args.job_workspace,
train_args_dict)
job_all(None, args.job_workspace, train_args_dict)
elif args.job_dispatch_package is not None:
assert args.job_workspace is None
assert os.path.isdir(args.job_dispatch_package)
job_all(args.job_dispatch_package,
None,
train_args_dict)
job_all(args.job_dispatch_package, None, train_args_dict)
else:
print "--job_workspace or --job_dispatch_package should be set"
......@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -17,6 +17,6 @@ from paddle.trainer.config_parser import parse_config_and_serialize
if __name__ == '__main__':
parse_config_and_serialize('trainer/tests/test_config.conf', '')
parse_config_and_serialize(
'trainer/tests/sample_trainer_config.conf',
'trainer/tests/sample_trainer_config.conf',
'extension_module_name=paddle.trainer.config_parser_extension')
parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '')
......@@ -21,8 +21,7 @@ import logging
import pprint
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
)
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)
......@@ -36,33 +35,32 @@ num_original_columns = 3
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
[[-2,0]],
[[-1,0]],
[[0,0]],
[[1,0]],
[[2,0]],
[[-1,0], [0,0]],
[[0,0], [1,0]],
[[-2,1]],
[[-1,1]],
[[0,1]],
[[1,1]],
[[2,1]],
[[-2,1], [-1,1]],
[[-1,1], [0,1]],
[[0,1], [1,1]],
[[1,1], [2,1]],
[[-2,1], [-1,1], [0,1]],
[[-1,1], [0,1], [1,1]],
[[0,1], [1,1], [2,1]],
[[-2, 0]],
[[-1, 0]],
[[0, 0]],
[[1, 0]],
[[2, 0]],
[[-1, 0], [0, 0]],
[[0, 0], [1, 0]],
[[-2, 1]],
[[-1, 1]],
[[0, 1]],
[[1, 1]],
[[2, 1]],
[[-2, 1], [-1, 1]],
[[-1, 1], [0, 1]],
[[0, 1], [1, 1]],
[[1, 1], [2, 1]],
[[-2, 1], [-1, 1], [0, 1]],
[[-1, 1], [0, 1], [1, 1]],
[[0, 1], [1, 1], [2, 1]],
]
def make_features(sequence):
length = len(sequence)
num_features = len(sequence[0])
def get_features(pos):
if pos < 0:
return ['#B%s' % -pos] * num_features
......@@ -72,9 +70,10 @@ def make_features(sequence):
for i in xrange(length):
for pattern in patterns:
fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
sequence[i].append(fname)
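For clarity (not part of this commit), a small worked illustration of the feature strings make_features appends, assuming a toy two-column sequence of (word, POS) tokens:

# Hypothetical illustration of make_features on a toy sequence.
seq = [['He', 'PRP'], ['reckons', 'VBZ'], ['the', 'DT']]
make_features(seq)
# At position i = 1, for example, the appended features include:
#   pattern [[-1, 0]]          -> 'He'
#   pattern [[0, 0]]           -> 'reckons'
#   pattern [[-1, 0], [0, 0]]  -> 'He/reckons'
#   pattern [[0, 1], [1, 1]]   -> 'VBZ/DT'
#   pattern [[-2, 1]]          -> '#B1'  (negative positions become boundary markers)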
'''
Source file format:
Each line is for one timestep. The features are separated by space.
......@@ -87,6 +86,8 @@ i-th column.
return a list of dict for each column
'''
def create_dictionaries(filename, cutoff, oov_policy):
def add_to_dict(sequence, dicts):
num_features = len(dicts)
......@@ -118,7 +119,6 @@ def create_dictionaries(filename, cutoff, oov_policy):
features = line.split(' ')
sequence.append(features)
for i in xrange(num_features):
dct = dicts[i]
n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
......@@ -161,12 +161,9 @@ existed in dicts[i] will be assigned to id 0.
if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
in dicts[i].
'''
def gen_proto_file(
input_file,
dicts,
oov_policy,
output_file):
def gen_proto_file(input_file, dicts, oov_policy, output_file):
def write_sequence(out, sequence):
num_features = len(dicts)
is_beginning = True
......@@ -213,8 +210,8 @@ def gen_proto_file(
if patterns:
slot_def = header.slot_defs.add()
slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
slot_def.dim = sum([len(dicts[i])
for i in xrange(num_original_columns, len(dicts))])
slot_def.dim = sum(
[len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
logger.info("feature_dim=%s" % slot_def.dim)
for i in xrange(num_original_columns):
......@@ -242,30 +239,31 @@ def gen_proto_file(
logger.info("num_sequences=%s" % num_sequences)
dict2 = {
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
'B-ADJP': 0,
'I-ADJP': 1,
'B-ADVP': 2,
'I-ADVP': 3,
'B-CONJP': 4,
'I-CONJP': 5,
'B-INTJ': 6,
'I-INTJ': 7,
'B-LST': 8,
'I-LST': 9,
'B-NP': 10,
'I-NP': 11,
'B-PP': 12,
'I-PP': 13,
'B-PRT': 14,
'I-PRT': 15,
'B-SBAR': 16,
'I-SBAR': 17,
'B-UCP': 18,
'I-UCP': 19,
'B-VP': 20,
'I-VP': 21,
'O': 22
}
if __name__ == '__main__':
......@@ -273,16 +271,9 @@ if __name__ == '__main__':
cutoff += [3] * len(patterns)
oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
dicts = create_dictionaries(
'trainer/tests/train.txt', cutoff, oov_policy)
dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
dicts[2] = dict2
gen_proto_file(
'trainer/tests/train.txt',
dicts,
oov_policy,
'trainer/tests/train_proto.bin')
gen_proto_file(
'trainer/tests/test.txt',
dicts,
oov_policy,
'trainer/tests/test_proto.bin')
gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
'trainer/tests/train_proto.bin')
gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
'trainer/tests/test_proto.bin')
......@@ -21,7 +21,10 @@ import json
import string
@provider(slots=[SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), IndexSlot(3)])
@provider(slots=[
SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
IndexSlot(3)
])
def processNonSequenceData(obj, filename):
with open(filename, "rb") as f:
for line in f:
......@@ -50,6 +53,7 @@ val_randomer = lambda: random.uniform(-1.0, 1.0)
seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT)
str_count_randomer = lambda: random.randrange(1, STRING_LIMIT)
class IDRandomer():  # A random generator that returns unique ids
def __init__(self):
self.id_set = set()
......@@ -61,38 +65,57 @@ class IDRandomer(): # A random generator, return unique id
return idx
else:
return self.__call__()
# SparseValueSlot
def sparse_value_creator(_):
rand = IDRandomer()
return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())]
sparse_value = map(sparse_value_creator, range(seq_count_randomer()))
# DenseSlot
def dense_creator(_):
return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)]
dense = map(dense_creator, range(seq_count_randomer()))
# SparseNonValueSlot
def sparse_creator(_):
rand = IDRandomer()
return [rand() for _ in xrange(sparse_count_randomer())]
sparse_nonvalue = map(sparse_creator, range(seq_count_randomer()))
# IndexSlot
ids = [sparse_id_randomer() for _ in range(seq_count_randomer())]
# StringSlot
def random_str(size = 8, chars=string.ascii_letters + string.digits):
def random_str(size=8, chars=string.ascii_letters + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
strs = [random_str(str_count_randomer()) for _ in range(seq_count_randomer())]
def processSeqAndGenerateDataInit(obj, *args, **kwargs):
obj.json_filename = kwargs.get("load_data_args", "test_data.json")
@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)],
use_seq=True, init_hook=processSeqAndGenerateDataInit)
@provider(
slots=[
SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)
],
use_seq=True,
init_hook=processSeqAndGenerateDataInit)
def processSeqAndGenerateData(obj, name):
retv = [sparse_value, dense, sparse_nonvalue, ids, strs]
# Write to protoseq.
......@@ -104,10 +127,15 @@ def processSeqAndGenerateData(obj, name):
def processSubSeqAndGenerateDataInit(obj, *args, **kwargs):
obj.json_filename = kwargs.get("load_data_args", "test_data.json")
@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)],
use_seq=True, init_hook=processSubSeqAndGenerateDataInit)
@provider(
slots=[
SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
StringSlot(SPARSE_ID_LIMIT)
],
use_seq=True,
init_hook=processSubSeqAndGenerateDataInit)
def processSubSeqAndGenerateData(obj, name):
retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs]
retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]]
......@@ -116,6 +144,7 @@ def processSubSeqAndGenerateData(obj, name):
json.dump(retv_json, f)
yield retv_wrapper
if __name__ == "__main__":
pvd = processNonSequenceData("test.txt")
print pvd.getNextBatch(100)
......
import os
def __activate_virtual_env__():
__path__ = os.getenv('VIRTUAL_ENV')
if __path__ is None:
return
__script__ = os.path.join(__path__, 'bin', 'activate_this.py')
execfile(__script__, {'__file__': __script__})
__path__ = os.getenv('VIRTUAL_ENV')
if __path__ is None:
return
__script__ = os.path.join(__path__, 'bin', 'activate_this.py')
execfile(__script__, {'__file__': __script__})
__activate_virtual_env__()