提交 911e0bb3 编写于 作者: X xiaohang

Merge branch 'develop' of https://github.com/PaddlePaddle/models into ctc_reader

......@@ -24,11 +24,22 @@ unittest(){
trap 'abort' 0
set -e
for proj in */ ; do
for proj in * ; do
if [ -d $proj ]; then
unittest $proj
if [ $? != 0 ]; then
exit 1
if [ "$proj" = "fluid" ]; then
for proj in fluid/* ; do
if [ -d $proj ]; then
unittest $proj
if [ $? != 0 ]; then
exit 1
fi
fi
done
else
unittest $proj
if [ $? != 0 ]; then
exit 1
fi
fi
fi
done
......
16.2845556399 11.6891798673
17.21509949 12.3788567902
18.1143704548 14.9912618017
19.2335963752 18.5419556172
19.9266772451 21.2768220522
19.8245737202 21.2347210705
19.5432940972 20.2784036567
19.4631271754 20.2934452329
19.3929919324 20.457971868
19.2924788362 20.3626439234
18.9207244502 19.9196569759
18.7202605641 19.5920276899
18.4844279398 19.2068349019
18.2670948624 18.8716893824
18.0929628855 18.5439666541
17.8428896026 18.0255891747
17.6646850635 17.473764296
17.4955705896 16.8966859471
17.3706720293 16.4294027467
17.2530867792 16.0514717623
17.1304341172 15.7234699057
17.0038353287 15.4344471514
16.902550309 15.1603287337
16.8375590047 14.9304337826
16.816287853 14.9119310513
16.828838265 15.0930023024
16.8602209498 15.3771992423
16.9101763812 15.6897991789
16.9466065143 15.9364556489
16.9486061956 16.0699417826
16.9041374104 16.0796970272
16.8410093699 16.0111444599
16.7045718836 15.7991985601
16.51128489 15.5208920129
16.3253910608 15.2603181921
16.1297317333 14.9499965958
15.903428372 14.5958280409
15.6131718105 14.2709618
15.1395035533 13.9993939893
14.4298229999 13.3841189151
0.0034970565424 0.246184766149
0.00501284154705 0.238484972472
0.00605942680019 0.269064381708
0.00687266156243 0.319479238011
0.00734065019253 0.371947383205
0.00718807218417 0.384426479694
0.00652195540212 0.384676838281
0.00660416525951 0.395543910317
0.00680202057642 0.400803979681
0.00659144183007 0.393228973031
0.00605294530423 0.385021118038
0.00590452969394 0.361763039625
0.00612315374687 0.346777773373
0.00582354093973 0.335802403976
0.00574556002554 0.320733728218
0.00612254485891 0.310153103033
0.00626733043219 0.299854747445
0.00567398408041 0.293353685493
0.00519236700706 0.287668810947
0.00529581474367 0.281479660772
0.00479019484082 0.27451415777
0.00486381039428 0.266294391154
0.00491126372868 0.258105116126
0.00452105305011 0.252926328298
0.00531483334271 0.250910887373
0.00546572110469 0.253302256977
0.00479544857908 0.258484183394
0.00422106426297 0.264582900173
0.00401824135188 0.268467945623
0.0041705465252 0.269699480291
0.00405239564143 0.270406162975
0.0040059737566 0.270407601782
0.00406426729317 0.267951582656
0.00416613791013 0.264543833042
0.00427847607653 0.26247798891
0.00428050903034 0.259635263243
0.00454842971786 0.255829377617
0.00393747552387 0.253802307025
0.00374143688909 0.251011478787
0.00335475310258 0.236543650856
0.000373194755312 0.0419494800709
0.000230909648678 0.0394102370205
0.000150840015851 0.0414956922398
8.44401840771e-05 0.0460502231327
-6.24759314572e-06 0.0528049937739
-8.82957758148e-05 0.055711244886
1.16795791952e-05 0.0563188428833
-1.68716267856e-05 0.0575232763711
-0.000112625308645 0.057979929947
-0.000122619090002 0.0564126233493
1.73569637319e-05 0.05522573909
6.49872782342e-05 0.0507353361334
4.17746389178e-05 0.0479568131253
5.13884475653e-05 0.0461253238047
1.8860115143e-05 0.0436860476919
-5.64317701105e-05 0.042516381059
-0.000136859948115 0.0413574820205
-7.00847019726e-05 0.0409516370727
-5.39392223336e-05 0.040441504085
-9.24897162815e-05 0.0397800398173
4.7104970622e-05 0.039046286243
6.24805896165e-06 0.0380185986602
-2.35272813418e-05 0.036851063786
5.88344154127e-05 0.0361640489242
-8.39162076993e-05 0.0357639427311
-0.000108702805776 0.0358774639538
3.22013961834e-06 0.0363644530435
9.43501518394e-05 0.0370309934774
0.000134406229423 0.0374972993343
3.84007008533e-05 0.037676222515
3.05989328157e-05 0.0379111939182
9.52201629091e-05 0.0380927209106
0.000102126083729 0.0379925358499
6.98628072264e-05 0.0377276252241
4.55782256339e-05 0.0375165468654
4.76370987786e-05 0.0371482526345
-2.24128832709e-05 0.0366810742947
0.000125621306953 0.036628355271
0.000134568666093 0.0364860461759
0.000159858844464 0.0345583593149
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import unittest
import numpy as np
import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.augmentor.trans_add_delta as trans_add_delta
import data_utils.augmentor.trans_splice as trans_splice
class TestTransMeanVarianceNorm(unittest.TestCase):
"""unit test for TransMeanVarianceNorm
"""
def setUp(self):
self._file_path = "./data_utils/augmentor/tests/data/" \
"global_mean_var_search26kHr"
def test(self):
feature = np.zeros((2, 120), dtype="float32")
feature.fill(1)
trans = trans_mean_variance_norm.TransMeanVarianceNorm(self._file_path)
(feature1, label1) = trans.perform_trans((feature, None))
(mean, var) = trans.get_mean_var()
feature_flat1 = feature1.flatten()
feature_flat = feature.flatten()
one = np.ones((1), dtype="float32")
for idx, val in enumerate(feature_flat1):
cur_idx = idx % 120
self.assertAlmostEqual(val, (one[0] - mean[cur_idx]) * var[cur_idx])
class TestTransAddDelta(unittest.TestCase):
"""unit test TestTransAddDelta
"""
def test_regress(self):
"""test regress
"""
feature = np.zeros((14, 120), dtype="float32")
feature[0:5, 0:40].fill(1)
feature[0 + 5, 0:40].fill(1)
feature[1 + 5, 0:40].fill(2)
feature[2 + 5, 0:40].fill(3)
feature[3 + 5, 0:40].fill(4)
feature[8:14, 0:40].fill(4)
trans = trans_add_delta.TransAddDelta()
feature = feature.reshape((14 * 120))
trans._regress(feature, 5 * 120, feature, 5 * 120 + 40, 40, 4, 120)
trans._regress(feature, 5 * 120 + 40, feature, 5 * 120 + 80, 40, 4, 120)
feature = feature.reshape((14, 120))
tmp_feature = feature[5:5 + 4, :]
self.assertAlmostEqual(1.0, tmp_feature[0][0])
self.assertAlmostEqual(0.24, tmp_feature[0][119])
self.assertAlmostEqual(2.0, tmp_feature[1][0])
self.assertAlmostEqual(0.13, tmp_feature[1][119])
self.assertAlmostEqual(3.0, tmp_feature[2][0])
self.assertAlmostEqual(-0.13, tmp_feature[2][119])
self.assertAlmostEqual(4.0, tmp_feature[3][0])
self.assertAlmostEqual(-0.24, tmp_feature[3][119])
def test_perform(self):
"""test perform
"""
feature = np.zeros((4, 40), dtype="float32")
feature[0, 0:40].fill(1)
feature[1, 0:40].fill(2)
feature[2, 0:40].fill(3)
feature[3, 0:40].fill(4)
trans = trans_add_delta.TransAddDelta()
(feature, label) = trans.perform_trans((feature, None))
self.assertAlmostEqual(feature.shape[0], 4)
self.assertAlmostEqual(feature.shape[1], 120)
self.assertAlmostEqual(1.0, feature[0][0])
self.assertAlmostEqual(0.24, feature[0][119])
self.assertAlmostEqual(2.0, feature[1][0])
self.assertAlmostEqual(0.13, feature[1][119])
self.assertAlmostEqual(3.0, feature[2][0])
self.assertAlmostEqual(-0.13, feature[2][119])
self.assertAlmostEqual(4.0, feature[3][0])
self.assertAlmostEqual(-0.24, feature[3][119])
class TestTransSplict(unittest.TestCase):
"""unit test Test TransSplict
"""
def test_perfrom(self):
feature = np.zeros((8, 10), dtype="float32")
for i in xrange(feature.shape[0]):
feature[i, :].fill(i)
trans = trans_splice.TransSplice()
(feature, label) = trans.perform_trans((feature, None))
self.assertEqual(feature.shape[1], 110)
for i in xrange(8):
nzero_num = 5 - i
cur_val = 0.0
if nzero_num < 0:
cur_val = i - 5 - 1
for j in xrange(11):
if j <= nzero_num:
for k in xrange(10):
self.assertAlmostEqual(feature[i][j * 10 + k], cur_val)
else:
if cur_val < 7:
cur_val += 1.0
for k in xrange(10):
self.assertAlmostEqual(feature[i][j * 10 + k], cur_val)
if __name__ == '__main__':
unittest.main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import math
import copy
class TransAddDelta(object):
""" add delta of feature data
trans feature for shape(a, b) to shape(a, b * 3)
Attributes:
_norder(int):
_window(int):
"""
def __init__(self, norder=2, nwindow=2):
""" init construction
Args:
norder: default 2
nwindow: default 2
"""
self._norder = norder
self._nwindow = nwindow
def perform_trans(self, sample):
""" add delta for feature
trans feature shape from (a,b) to (a, b * 3)
Args:
sample(object,tuple): contain feature numpy and label numpy
Returns:
(feature, label)
"""
(feature, label) = sample
frame_dim = feature.shape[1]
d_frame_dim = frame_dim * 3
head_filled = 5
tail_filled = 5
mat = np.zeros(
(feature.shape[0] + head_filled + tail_filled, d_frame_dim),
dtype="float32")
#copy first frame
for i in xrange(head_filled):
np.copyto(mat[i, 0:frame_dim], feature[0, :])
np.copyto(mat[head_filled:head_filled + feature.shape[0], 0:frame_dim],
feature[:, :])
# copy last frame
for i in xrange(head_filled + feature.shape[0], mat.shape[0], 1):
np.copyto(mat[i, 0:frame_dim], feature[feature.shape[0] - 1, :])
nframe = feature.shape[0]
start = head_filled
tmp_shape = mat.shape
mat = mat.reshape((tmp_shape[0] * tmp_shape[1]))
self._regress(mat, start * d_frame_dim, mat,
start * d_frame_dim + frame_dim, frame_dim, nframe,
d_frame_dim)
self._regress(mat, start * d_frame_dim + frame_dim, mat,
start * d_frame_dim + 2 * frame_dim, frame_dim, nframe,
d_frame_dim)
mat.shape = tmp_shape
return (mat[head_filled:mat.shape[0] - tail_filled, :], label)
def _regress(self, data_in, start_in, data_out, start_out, size, n, step):
""" regress
Args:
data_in: in data
start_in: start index of data_in
data_out: out data
start_out: start index of data_out
size: frame dimentional
n: frame num
step: 3 * (frame num)
Returns:
None
"""
sigma_t2 = 0.0
delta_window = self._nwindow
for t in xrange(1, delta_window + 1):
sigma_t2 += t * t
sigma_t2 *= 2.0
for i in xrange(n):
fp1 = start_in
fp2 = start_out
for j in xrange(size):
back = fp1
forw = fp1
sum = 0.0
for t in xrange(1, delta_window + 1):
back -= step
forw += step
sum += t * (data_in[forw] - data_in[back])
data_out[fp2] = sum / sigma_t2
fp1 += 1
fp2 += 1
start_in += step
start_out += step
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import math
class TransMeanVarianceNorm(object):
""" normalization of mean variance for feature data
Attributes:
_mean(numpy.array): the feature mean vector
_var(numpy.array): the feature variance
"""
def __init__(self, snorm_path):
"""init construction
Args:
snorm_path: the path of mean and variance
"""
self._mean = None
self._var = None
self._load_norm(snorm_path)
def _load_norm(self, snorm_path):
""" load mean var file
Args:
snorm_path(str):the file path
"""
lLines = open(snorm_path).readlines()
nLen = len(lLines)
self._mean = np.zeros((nLen), dtype="float32")
self._var = np.zeros((nLen), dtype="float32")
self._nLen = nLen
for nidx, l in enumerate(lLines):
s = l.split()
assert len(s) == 2
self._mean[nidx] = float(s[0])
self._var[nidx] = 1.0 / math.sqrt(float(s[1]))
if self._var[nidx] > 100000.0:
self._var[nidx] = 100000.0
def get_mean_var(self):
""" get mean and var
Args:
Returns:
(mean, var)
"""
return (self._mean, self._var)
def perform_trans(self, sample):
""" feature = (feature - mean) * var
Args:
sample(object):input sample, contain feature numpy and label numpy
Returns:
(feature, label)
"""
(feature, label) = sample
shape = feature.shape
assert len(shape) == 2
nfeature_len = shape[0] * shape[1]
assert nfeature_len % self._nLen == 0
ncur_idx = 0
feature = feature.reshape((nfeature_len))
while ncur_idx < nfeature_len:
block = feature[ncur_idx:ncur_idx + self._nLen]
block = (block - self._mean) * self._var
feature[ncur_idx:ncur_idx + self._nLen] = block
ncur_idx += self._nLen
feature = feature.reshape(shape)
return (feature, label)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import math
class TransSplice(object):
""" copy feature context to construct new feature
expand feature data from shape (frame_num, frame_dim)
to shape (frame_num, frame_dim * 11)
Attributes:
_nleft_context(int): copy left context number
_nright_context(int): copy right context number
"""
def __init__(self, nleft_context=5, nright_context=5):
""" init construction
Args:
nleft_context(int):
nright_context(int):
"""
self._nleft_context = nleft_context
self._nright_context = nright_context
def perform_trans(self, sample):
""" copy feature context
Args:
sample(object): input sample(feature, label)
Return:
(feature, label)
"""
(feature, label) = sample
nframe_num = feature.shape[0]
nframe_dim = feature.shape[1]
nnew_frame_dim = nframe_dim * (
self._nleft_context + self._nright_context + 1)
mat = np.zeros(
(nframe_num + self._nleft_context + self._nright_context,
nframe_dim),
dtype="float32")
ret = np.zeros((nframe_num, nnew_frame_dim), dtype="float32")
#copy left
for i in xrange(self._nleft_context):
mat[i, :] = feature[0, :]
#copy middle
mat[self._nleft_context:self._nleft_context +
nframe_num, :] = feature[:, :]
#copy right
for i in xrange(self._nright_context):
mat[i + self._nleft_context + nframe_num, :] = feature[-1, :]
mat = mat.reshape(mat.shape[0] * mat.shape[1])
ret = ret.reshape(ret.shape[0] * ret.shape[1])
for i in xrange(nframe_num):
np.copyto(ret[i * nnew_frame_dim:(i + 1) * nnew_frame_dim],
mat[i * nframe_dim:i * nframe_dim + nnew_frame_dim])
ret = ret.reshape((nframe_num, nnew_frame_dim))
return (ret, label)
"""This model read the sample from disk.
use multiprocessing to reading samples
push samples from one block to multiprocessing queue
Todos:
1. multiprocess read block from disk
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import Queue
import numpy as np
import struct
import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.augmentor.trans_add_delta as trans_add_delta
class OneBlock(object):
""" struct for one block :
contain label, label desc, feature, feature_desc
Attributes:
label(str) : label path of one block
label_desc(str) : label description path of one block
feature(str) : feature path of on block
feature_desc(str) : feature description path of on block
"""
def __init__(self):
"""the constructor."""
self.label = "label"
self.label_desc = "label_desc"
self.feature = "feature"
self.feature_desc = "feature_desc"
class DataRead(object):
"""
Attributes:
_lblock(obj:`OneBlock`) : the list of OneBlock
_ndrop_sentence_len(int): dropout the sentence which's frame_num large than _ndrop_sentence_len
_que_sample(obj:`Queue`): sample buffer
_nframe_dim(int): the batch sample frame_dim(todo remove)
_nstart_block_idx(int): the start block id
_nload_block_num(int): the block num
"""
def __init__(self, sfeature_lst, slabel_lst, ndrop_sentence_len=512):
"""
Args:
sfeature_lst(str):feature lst path
slabel_lst(str):label lst path
Returns:
None
"""
self._lblock = []
self._ndrop_sentence_len = ndrop_sentence_len
self._que_sample = Queue.Queue()
self._nframe_dim = 120 * 11
self._nstart_block_idx = 0
self._nload_block_num = 1
self._ndrop_frame_len = 256
self._load_list(sfeature_lst, slabel_lst)
def _load_list(self, sfeature_lst, slabel_lst):
""" load list and shuffle
Args:
sfeature_lst(str):feature lst path
slabel_lst(str):label lst path
Returns:
None
"""
lfeature = open(sfeature_lst).readlines()
llabel = open(slabel_lst).readlines()
assert len(llabel) == len(lfeature)
for i in range(0, len(lfeature), 2):
one_block = OneBlock()
one_block.label = llabel[i]
one_block.label_desc = llabel[i + 1]
one_block.feature = lfeature[i]
one_block.feature_desc = lfeature[i + 1]
self._lblock.append(one_block)
random.shuffle(self._lblock)
def _load_one_block(self, lsample, id):
"""read one block by id and push load sample in list lsample
Args:
lsample(list): return sample list
id(int): block id
Returns:
None
"""
if id >= len(self._lblock):
return
slabel_path = self._lblock[id].label.strip()
slabel_desc_path = self._lblock[id].label_desc.strip()
sfeature_path = self._lblock[id].feature.strip()
sfeature_desc_path = self._lblock[id].feature_desc.strip()
llabel_line = open(slabel_desc_path).readlines()
lfeature_line = open(sfeature_desc_path).readlines()
file_lable_bin = open(slabel_path, "r")
file_feature_bin = open(sfeature_path, "r")
sample_num = int(llabel_line[0].split()[1])
assert sample_num == int(lfeature_line[0].split()[1])
llabel_line = llabel_line[1:]
lfeature_line = lfeature_line[1:]
for i in range(sample_num):
# read label
llabel_split = llabel_line[i].split()
nlabel_start = int(llabel_split[2])
nlabel_size = int(llabel_split[3])
nlabel_frame_num = int(llabel_split[4])
file_lable_bin.seek(nlabel_start, 0)
label_bytes = file_lable_bin.read(nlabel_size)
assert nlabel_frame_num * 4 == len(label_bytes)
label_array = struct.unpack('I' * nlabel_frame_num, label_bytes)
label_data = np.array(label_array, dtype="int64")
label_data = label_data.reshape((nlabel_frame_num, 1))
# read feature
lfeature_split = lfeature_line[i].split()
nfeature_start = int(lfeature_split[2])
nfeature_size = int(lfeature_split[3])
nfeature_frame_num = int(lfeature_split[4])
nfeature_frame_dim = int(lfeature_split[5])
file_feature_bin.seek(nfeature_start, 0)
feature_bytes = file_feature_bin.read(nfeature_size)
assert nfeature_frame_num * nfeature_frame_dim * 4 == len(
feature_bytes)
feature_array = struct.unpack('f' * nfeature_frame_num *
nfeature_frame_dim, feature_bytes)
feature_data = np.array(feature_array, dtype="float32")
feature_data = feature_data.reshape(
(nfeature_frame_num, nfeature_frame_dim))
#drop long sentence
if self._ndrop_frame_len < feature_data.shape[0]:
continue
lsample.append((feature_data, label_data))
def get_one_batch(self, nbatch_size):
"""construct one batch(feature, label), batch size is nbatch_size
Args:
nbatch_size(int): batch size
Returns:
None
"""
if self._que_sample.empty():
lsample = self._load_block(
range(self._nstart_block_idx, self._nstart_block_idx +
self._nload_block_num, 1))
self._move_sample(lsample)
self._nstart_block_idx += self._nload_block_num
if self._que_sample.empty():
self._nstart_block_idx = 0
return None
#cal all frame num
ncur_len = 0
lod = [0]
samples = []
bat_feature = np.zeros((nbatch_size, self._nframe_dim))
for i in range(nbatch_size):
# empty clear zero
if self._que_sample.empty():
self._nstart_block_idx = 0
# copy
else:
(one_feature, one_label) = self._que_sample.get()
samples.append((one_feature, one_label))
ncur_len += one_feature.shape[0]
lod.append(ncur_len)
bat_feature = np.zeros((ncur_len, self._nframe_dim), dtype="float32")
bat_label = np.zeros((ncur_len, 1), dtype="int64")
ncur_len = 0
for sample in samples:
one_feature = sample[0]
one_label = sample[1]
nframe_num = one_feature.shape[0]
nstart = ncur_len
nend = ncur_len + nframe_num
bat_feature[nstart:nend, :] = one_feature
bat_label[nstart:nend, :] = one_label
ncur_len += nframe_num
return (bat_feature, bat_label, lod)
def set_trans(self, ltrans):
""" set transform list
Args:
ltrans(list): data tranform list
Returns:
None
"""
self._ltrans = ltrans
def _load_block(self, lblock_id):
"""read blocks
"""
lsample = []
for id in lblock_id:
self._load_one_block(lsample, id)
# transform sample
for (nidx, sample) in enumerate(lsample):
for trans in self._ltrans:
sample = trans.perform_trans(sample)
lsample[nidx] = sample
return lsample
def load_block(self, lblock_id):
"""read blocks
Args:
lblock_id(list):the block list id
Returns:
None
"""
lsample = []
for id in lblock_id:
self._load_one_block(lsample, id)
# transform sample
for (nidx, sample) in enumerate(lsample):
for trans in self._ltrans:
sample = trans.perform_trans(sample)
lsample[nidx] = sample
return lsample
def _move_sample(self, lsample):
"""move sample to queue
Args:
lsample(list): one block of samples read from disk
Returns:
None
"""
# random
random.shuffle(lsample)
for sample in lsample:
self._que_sample.put(sample)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def to_lodtensor(data, place):
"""convert tensor to lodtensor
"""
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = numpy.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def lodtensor_to_ndarray(lod_tensor):
"""conver lodtensor to ndarray
"""
dims = lod_tensor.get_dims()
ret = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ret.ravel()[i] = lod_tensor.get_float_element(i)
return ret, lod_tensor.lod()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import time
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import data_utils.trans_mean_variance_norm as trans_mean_variance_norm
import data_utils.trans_add_delta as trans_add_delta
import data_utils.trans_splice as trans_splice
import data_utils.data_reader as reader
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
parser.add_argument(
'--batch_size',
type=int,
default=32,
help='The sequence number of a batch data. (default: %(default)d)')
parser.add_argument(
'--stacked_num',
type=int,
default=5,
help='Number of lstm layers to stack. (default: %(default)d)')
parser.add_argument(
'--proj_dim',
type=int,
default=512,
help='Project size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--hidden_dim',
type=int,
default=1024,
help='Hidden size of lstm unit. (default: %(default)d)')
parser.add_argument(
'--pass_num',
type=int,
default=100,
help='Epoch number to train. (default: %(default)d)')
parser.add_argument(
'--learning_rate',
type=float,
default=0.002,
help='Learning rate used to train. (default: %(default)f)')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type. (default: %(default)s)')
parser.add_argument(
'--infer_only', action='store_true', help='If set, run forward only.')
parser.add_argument(
'--use_cprof', action='store_true', help='If set, use cProfile.')
parser.add_argument(
'--use_nvprof',
action='store_true',
help='If set, use nvprof for CUDA.')
parser.add_argument('--mean_var', type=str, help='mean var path')
parser.add_argument('--feature_lst', type=str, help='mean var path')
parser.add_argument('--label_lst', type=str, help='mean var path')
args = parser.parse_args()
return args
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def dynamic_lstmp_model(hidden_dim,
proj_dim,
stacked_num,
class_num=1749,
is_train=True):
feature = fluid.layers.data(
name="feature", shape=[-1, 120 * 11], dtype="float32", lod_level=1)
seq_conv1 = fluid.layers.sequence_conv(
input=feature,
num_filters=1024,
filter_size=3,
filter_stride=1,
bias_attr=True)
bn1 = fluid.layers.batch_norm(
input=seq_conv1,
act="sigmoid",
is_test=False,
momentum=0.9,
epsilon=1e-05,
data_layout='NCHW')
stack_input = bn1
for i in range(stacked_num):
fc = fluid.layers.fc(input=stack_input,
size=hidden_dim * 4,
bias_attr=True)
proj, cell = fluid.layers.dynamic_lstmp(
input=fc,
size=hidden_dim * 4,
proj_size=proj_dim,
bias_attr=True,
use_peepholes=True,
is_reverse=False,
cell_activation="tanh",
proj_activation="tanh")
bn = fluid.layers.batch_norm(
input=proj,
act="sigmoid",
is_test=False,
momentum=0.9,
epsilon=1e-05,
data_layout='NCHW')
stack_input = bn
prediction = fluid.layers.fc(input=stack_input,
size=class_num,
act='softmax')
if not is_train: return feature, prediction
label = fluid.layers.data(
name="label", shape=[-1, 1], dtype="int64", lod_level=1)
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return prediction, label, avg_cost
def train(args):
if args.use_cprof:
pr = cProfile.Profile()
pr.enable()
prediction, label, avg_cost = dynamic_lstmp_model(
args.hidden_dim, args.proj_dim, args.stacked_num)
adam_optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
# clone from default main program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
ltrans = [
trans_add_delta.TransAddDelta(2, 2),
trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
trans_splice.TransSplice()
]
data_reader = reader.DataRead(args.feature_lst, args.label_lst)
data_reader.set_trans(ltrans)
res_feature = fluid.LoDTensor()
res_label = fluid.LoDTensor()
for pass_id in xrange(args.pass_num):
pass_start_time = time.time()
words_seen = 0
accuracy.reset(exe)
batch_id = 0
while True:
# load_data
one_batch = data_reader.get_one_batch(args.batch_size)
if one_batch == None:
break
(bat_feature, bat_label, lod) = one_batch
res_feature.set(bat_feature, place)
res_feature.set_lod([lod])
res_label.set(bat_label, place)
res_label.set_lod([lod])
batch_id += 1
words_seen += lod[-1]
loss, acc = exe.run(
fluid.default_main_program(),
feed={"feature": res_feature,
"label": res_label},
fetch_list=[avg_cost] + accuracy.metrics,
return_numpy=False)
train_acc = accuracy.eval(exe)
print("acc:", lodtensor_to_ndarray(loss))
pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
words_per_sec = words_seen / time_consumed
def lodtensor_to_ndarray(lod_tensor):
dims = lod_tensor.get_dims()
ret = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
ret.ravel()[i] = lod_tensor.get_float_element(i)
return ret, lod_tensor.lod()
if __name__ == '__main__':
args = parse_args()
print_arguments(args)
if args.infer_only:
pass
else:
if args.use_nvprof and args.device == 'GPU':
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
train(args)
else:
train(args)
此差异已折叠。
此差异已折叠。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import argparse
import os
import cPickle
from utils import logger
"""
This script will output 2 files:
1. feature_dict.pkl
2. item_freq.pkl
"""
class FeatureGenerator(object):
"""
Encode feature values with low-frequency filtering.
"""
def __init__(self, feat_appear_limit=20):
"""
@feat_appear_limit: int
"""
self._dic = None # feature value --> id
self._count = None # numbers of appearances of feature values
self._feat_appear_limit = feat_appear_limit
def add_feat_val(self, feat_val):
"""
Add feature values and count numbers of its appearance.
"""
if self._count is None:
self._count = {'<unk>': 0}
if feat_val == "NULL":
feat_val = '<unk>'
if feat_val not in self._count:
self._count[feat_val] = 1
else:
self._count[feat_val] += 1
self._count['<unk>'] += 1
def _filter_feat(self):
"""
Filter low-frequency feature values.
"""
self._items = filter(lambda x: x[1] > self._feat_appear_limit,
self._count.items())
self._items.sort(key=lambda x: x[1], reverse=True)
def _build_dict(self):
"""
Build feature values --> ids dict.
"""
self._dic = {}
self._filter_feat()
for i in xrange(len(self._items)):
self._dic[self._items[i][0]] = i
self.dim = len(self._dic)
def get_feat_id(self, feat_val):
"""
Get id of feature value after encoding.
"""
# build dict
if self._dic is None:
self._build_dict()
# find id
if feat_val in self._dic:
return self._dic[feat_val]
else:
return self._dic['<unk>']
def get_dim(self):
"""
Get dim.
"""
# build dict
if self._dic is None:
self._build_dict()
return len(self._dic)
def get_dict(self):
"""
Get dict.
"""
# build dict
if self._dic is None:
self._build_dict()
return self._dic
def get_total_count(self):
"""
Compute total num of count.
"""
total_count = 0
for i in xrange(len(self._items)):
feat_val = self._items[i][0]
c = self._items[i][1]
total_count += c
return total_count
def count_iterator(self):
"""
Iterate feature values and its num of appearance.
"""
for i in xrange(len(self._items)):
yield self._items[i][0], self._items[i][1]
def __repr__(self):
"""
"""
return '<FeatureGenerator %d>' % self._dim
def scan_build_dict(data_path, features_dict):
"""
Scan the raw data and add all feature values.
"""
logger.info('scan data set')
with open(data_path, 'r') as f:
for (line_id, line) in enumerate(f):
fields = line.strip('\n').split('\t')
user_id = fields[0]
province = fields[1]
features_dict['province'].add_feat_val(province)
city = fields[2]
features_dict['city'].add_feat_val(city)
item_infos = fields[3]
phone = fields[4]
features_dict['phone'].add_feat_val(phone)
for item_info in item_infos.split(";"):
item_info_array = item_info.split(":")
item = item_info_array[0]
features_dict['history_clicked_items'].add_feat_val(item)
features_dict['user_id'].add_feat_val(user_id)
category = item_info_array[1]
features_dict['history_clicked_categories'].add_feat_val(
category)
tags = item_info_array[2]
for tag in tags.split("_"):
features_dict['history_clicked_tags'].add_feat_val(tag)
def parse_args():
"""
parse arguments
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--train_set_path',
type=str,
required=True,
help="path of the train set")
parser.add_argument(
'--output_dir', type=str, required=True, help="directory to output")
parser.add_argument(
'--feat_appear_limit',
type=int,
default=20,
help="the minimum number of feature values appears (default: 20)")
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
# check argument
assert os.path.exists(
args.train_set_path), 'The train set path does not exist.'
# features used
features = [
'user_id', 'province', 'city', 'phone', 'history_clicked_items',
'history_clicked_tags', 'history_clicked_categories'
]
# init feature generators
features_dict = {}
for feature in features:
features_dict[feature] = FeatureGenerator(
feat_appear_limit=args.feat_appear_limit)
# scan data for building dict
scan_build_dict(args.train_set_path, features_dict)
# generate feature_dict.pkl
feature_encoding_dict = {}
for feature in features:
d = features_dict[feature].get_dict()
feature_encoding_dict[feature] = d
logger.info('Feature:%s, dimension is %d' % (feature, len(d)))
output_dict_path = os.path.join(args.output_dir, 'feature_dict.pkl')
with open(output_dict_path, "w") as f:
cPickle.dump(feature_encoding_dict, f, -1)
# generate item_freq.pkl
item_freq_list = []
g = features_dict['history_clicked_items']
total_count = g.get_total_count()
for feat_val, feat_count in g.count_iterator():
item_freq_list.append(float(feat_count) / total_count)
logger.info('item_freq, dimension is %d' % (len(item_freq_list)))
output_item_freq_path = os.path.join(args.output_dir, 'item_freq.pkl')
with open(output_item_freq_path, "w") as f:
cPickle.dump(item_freq_list, f, -1)
logger.info('Complete!')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import paddle.v2 as paddle
import argparse
import cPickle
from reader import Reader
from network_conf import DNNmodel
from utils import logger
def parse_args():
"""
parse arguments
:return:
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--infer_set_path',
type=str,
required=True,
help="path of the infer set")
parser.add_argument(
'--model_path', type=str, required=True, help="path of the model")
parser.add_argument(
'--feature_dict',
type=str,
required=True,
help="path of feature_dict.pkl")
parser.add_argument(
'--batch_size',
type=int,
default=50,
help="size of mini-batch (default:50)")
return parser.parse_args()
def infer():
"""
infer
"""
args = parse_args()
# check argument
assert os.path.exists(
args.infer_set_path), 'The infer_set_path path does not exist.'
assert os.path.exists(
args.model_path), 'The model_path path does not exist.'
assert os.path.exists(
args.feature_dict), 'The feature_dict path does not exist.'
paddle.init(use_gpu=False, trainer_count=1)
with open(args.feature_dict) as f:
feature_dict = cPickle.load(f)
nid_dict = feature_dict['history_clicked_items']
nid_to_word = dict((v, k) for k, v in nid_dict.items())
# load the trained model.
with gzip.open(args.model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# build model
prediction_layer, fc = DNNmodel(
dnn_layer_dims=[256, 31], feature_dict=feature_dict,
is_infer=True).model_cost
inferer = paddle.inference.Inference(
output_layer=[prediction_layer, fc], parameters=parameters)
reader = Reader(feature_dict)
test_batch = []
for idx, item in enumerate(reader.infer(args.infer_set_path)):
test_batch.append(item)
if len(test_batch) == args.batch_size:
infer_a_batch(inferer, test_batch, nid_to_word)
test_batch = []
if len(test_batch):
infer_a_batch(inferer, test_batch, nid_to_word)
def infer_a_batch(inferer, test_batch, nid_to_word):
"""
input a batch of data and infer
"""
feeding = {
'user_id': 0,
'province': 1,
'city': 2,
'history_clicked_items': 3,
'history_clicked_categories': 4,
'history_clicked_tags': 5,
'phone': 6
}
probs = inferer.infer(
input=test_batch,
feeding=feeding,
field=["value"],
flatten_result=False)
for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
softmax_output = res[1]
sort_nid = res[1].argsort()
# print top 30 recommended item
ret = ""
for j in range(1, 30):
item_id = sort_nid[-1 * j]
item_id_to_word = nid_to_word[item_id]
ret += "%s:%.6f," \
% (item_id_to_word, softmax_output[item_id])
print ret.rstrip(",")
if __name__ == "__main__":
infer()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import paddle.v2 as paddle
import argparse
import cPickle
from reader import Reader
from network_conf import DNNmodel
from utils import logger
import numpy as np
def parse_args():
"""
parse arguments
:return:
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--model_path', type=str, required=True, help="path of the model")
parser.add_argument(
'--feature_dict',
type=str,
required=True,
help="path of feature_dict.pkl")
return parser.parse_args()
def infer_user():
"""
infer_user
"""
args = parse_args()
# check argument
assert os.path.exists(
args.model_path), 'The model_path path does not exist.'
assert os.path.exists(
args.feature_dict), 'The feature_dict path does not exist.'
paddle.init(use_gpu=False, trainer_count=1)
with open(args.feature_dict) as f:
feature_dict = cPickle.load(f)
nid_dict = feature_dict['history_clicked_items']
nid_to_word = dict((v, k) for k, v in nid_dict.items())
# load the trained model.
with gzip.open(args.model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
parameters.set('_proj_province', \
np.zeros(shape=parameters.get('_proj_province').shape))
parameters.set('_proj_city', \
np.zeros(shape=parameters.get('_proj_city').shape))
parameters.set('_proj_phone', \
np.zeros(shape=parameters.get('_proj_phone').shape))
parameters.set('_proj_history_clicked_items', \
np.zeros(shape= parameters.get('_proj_history_clicked_items').shape))
parameters.set('_proj_history_clicked_categories', \
np.zeros(shape= parameters.get('_proj_history_clicked_categories').shape))
parameters.set('_proj_history_clicked_tags', \
np.zeros(shape= parameters.get('_proj_history_clicked_tags').shape))
# build model
prediction_layer, fc = DNNmodel(
dnn_layer_dims=[256, 31], feature_dict=feature_dict,
is_infer=True).model_cost
inferer = paddle.inference.Inference(
output_layer=[prediction_layer, fc], parameters=parameters)
reader = Reader(feature_dict)
test_batch = []
for idx, item in enumerate(
reader.infer_user(['USER_ID_0', 'USER_ID_981', 'USER_ID_310806'])):
test_batch.append(item)
infer_a_batch(inferer, test_batch, nid_to_word)
def infer_a_batch(inferer, test_batch, nid_to_word):
"""
input a batch of data and infer
"""
feeding = {
'user_id': 0,
'province': 1,
'city': 2,
'history_clicked_items': 3,
'history_clicked_categories': 4,
'history_clicked_tags': 5,
'phone': 6
}
probs = inferer.infer(
input=test_batch,
feeding=feeding,
field=["value"],
flatten_result=False)
for i, res in enumerate(zip(test_batch, probs[0], probs[1])):
softmax_output = res[1]
sort_nid = res[1].argsort()
# print top 30 recommended item
ret = ""
for j in range(1, 30):
item_id = sort_nid[-1 * j]
item_id_to_word = nid_to_word[item_id]
ret += "%s:%.6f," \
% (item_id_to_word, softmax_output[item_id])
print ret.rstrip(",")
if __name__ == "__main__":
infer_user()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import paddle.v2 as paddle
import argparse
import cPickle
from reader import Reader
from network_conf import DNNmodel
from utils import logger
import numpy as np
import math
def parse_args():
"""
parse arguments
:return:
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--model_path', type=str, required=True, help="path of the model")
parser.add_argument(
'--feature_dict',
type=str,
required=True,
help="path of feature_dict.pkl")
return parser.parse_args()
def get_item_vec_from_softmax(nce_w, nce_b):
"""
get item vectors from softmax parameter
"""
if nce_w is None or nce_b is None:
return None
vector = []
total_items_num = nce_w.shape[0]
if total_items_num != nce_b.shape[1]:
return None
dim_vector = nce_w.shape[1] + 1
for i in range(0, total_items_num):
vector.append([])
vector[i].append(nce_b[0][i])
for j in range(1, dim_vector):
vector[i].append(nce_w[i][j - 1])
return vector
def convt_simple_lsh(vector):
"""
do simple lsh conversion
"""
max_norm = 0
num_of_vec = len(vector)
for i in range(0, num_of_vec):
norm = np.linalg.norm(vector[i])
if norm > max_norm:
max_norm = norm
for i in range(0, num_of_vec):
vector[i].append(
math.sqrt(
math.pow(max_norm, 2) - math.pow(np.linalg.norm(vector[i]), 2)))
return vector
def item_vector():
"""
get item vectors
"""
args = parse_args()
# check argument
assert os.path.exists(
args.model_path), 'The model_path path does not exist.'
assert os.path.exists(
args.feature_dict), 'The feature_dict path does not exist.'
paddle.init(use_gpu=False, trainer_count=1)
with open(args.feature_dict) as f:
feature_dict = cPickle.load(f)
# load the trained model.
with gzip.open(args.model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
nid_dict = feature_dict['history_clicked_items']
nid_to_word = dict((v, k) for k, v in nid_dict.items())
nce_w = parameters.get("nce_w")
nce_b = parameters.get("nce_b")
item_vector = convt_simple_lsh(get_item_vec_from_softmax(nce_w, nce_b))
for i in range(0, len(item_vector)):
itemid = nid_to_word[i]
print itemid + "\t" + ",".join(map(str, item_vector[i]))
if __name__ == "__main__":
item_vector()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import paddle.v2 as paddle
import cPickle
class DNNmodel(object):
"""
Deep Neural Networks for YouTube candidate generation
"""
def __init__(self,
dnn_layer_dims=None,
feature_dict=None,
item_freq=None,
is_infer=False):
"""
initialize model
@dnn_layer_dims: dimension of each hidden layer
@feature_dict: dictionary of encoded feature
@item_freq: dictionary of feature values and its frequency
@is_infer: if infer mode
"""
self._dnn_layer_dims = dnn_layer_dims
self._feature_dict = feature_dict
self._item_freq = item_freq
self._is_infer = is_infer
# build model
self._build_input_layer()
self._build_embedding_layer()
self.model_cost = self._build_dnn_model()
def _build_input_layer(self):
"""
build input layer
"""
self._history_clicked_items = paddle.layer.data(
name="history_clicked_items",
type=paddle.data_type.integer_value_sequence(
len(self._feature_dict['history_clicked_items'])))
self._history_clicked_categories = paddle.layer.data(
name="history_clicked_categories",
type=paddle.data_type.integer_value_sequence(
len(self._feature_dict['history_clicked_categories'])))
self._history_clicked_tags = paddle.layer.data(
name="history_clicked_tags",
type=paddle.data_type.integer_value_sequence(
len(self._feature_dict['history_clicked_tags'])))
self._user_id = paddle.layer.data(
name="user_id",
type=paddle.data_type.integer_value(
len(self._feature_dict['user_id'])))
self._province = paddle.layer.data(
name="province",
type=paddle.data_type.integer_value(
len(self._feature_dict['province'])))
self._city = paddle.layer.data(
name="city",
type=paddle.data_type.integer_value(
len(self._feature_dict['city'])))
self._phone = paddle.layer.data(
name="phone",
type=paddle.data_type.integer_value(
len(self._feature_dict['phone'])))
self._target_item = paddle.layer.data(
name="target_item",
type=paddle.data_type.integer_value(
len(self._feature_dict['history_clicked_items'])))
def _create_emb_attr(self, name):
"""
create embedding parameter
"""
return paddle.attr.Param(
name=name,
initial_std=0.001,
learning_rate=1,
l2_rate=0,
sparse_update=False)
def _build_embedding_layer(self):
"""
build embedding layer
"""
self._user_id_emb = paddle.layer.embedding(
input=self._user_id,
size=64,
param_attr=self._create_emb_attr('_proj_user_id'))
self._province_emb = paddle.layer.embedding(
input=self._province,
size=8,
param_attr=self._create_emb_attr('_proj_province'))
self._city_emb = paddle.layer.embedding(
input=self._city,
size=16,
param_attr=self._create_emb_attr('_proj_city'))
self._phone_emb = paddle.layer.embedding(
input=self._phone,
size=16,
param_attr=self._create_emb_attr('_proj_phone'))
self._history_clicked_items_emb = paddle.layer.embedding(
input=self._history_clicked_items,
size=64,
param_attr=self._create_emb_attr('_proj_history_clicked_items'))
self._history_clicked_categories_emb = paddle.layer.embedding(
input=self._history_clicked_categories,
size=8,
param_attr=self._create_emb_attr(
'_proj_history_clicked_categories'))
self._history_clicked_tags_emb = paddle.layer.embedding(
input=self._history_clicked_tags,
size=64,
param_attr=self._create_emb_attr('_proj_history_clicked_tags'))
def _build_dnn_model(self):
"""
build dnn model
"""
self._rnn_cell = paddle.networks.simple_lstm(
input=self._history_clicked_items_emb, size=64)
self._lstm_last = paddle.layer.pooling(
input=self._rnn_cell, pooling_type=paddle.pooling.Max())
self._avg_emb_cats = paddle.layer.pooling(
input=self._history_clicked_categories_emb,
pooling_type=paddle.pooling.Avg())
self._avg_emb_tags = paddle.layer.pooling(
input=self._history_clicked_tags_emb,
pooling_type=paddle.pooling.Avg())
self._fc_0 = paddle.layer.fc(
name="Relu1",
input=[
self._lstm_last, self._user_id_emb, self._province_emb,
self._city_emb, self._avg_emb_cats, self._avg_emb_tags,
self._phone_emb
],
size=self._dnn_layer_dims[0],
act=paddle.activation.Relu())
self._fc_1 = paddle.layer.fc(name="Relu2",
input=self._fc_0,
size=self._dnn_layer_dims[1],
act=paddle.activation.Relu())
if not self._is_infer:
return paddle.layer.nce(
input=self._fc_1,
label=self._target_item,
num_classes=len(self._feature_dict['history_clicked_items']),
param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"),
num_neg_samples=5,
neg_distribution=self._item_freq)
else:
self.prediction_layer = paddle.layer.mixed(
size=len(self._feature_dict['history_clicked_items']),
input=paddle.layer.trans_full_matrix_projection(
self._fc_1, param_attr=paddle.attr.Param(name="nce_w")),
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b"))
return self.prediction_layer, self._fc_1
if __name__ == "__main__":
# this is to test and debug the network topology defination.
# please set the hyper-parameters as needed.
item_freq_path = "./output/item_freq.pkl"
with open(item_freq_path) as f:
item_freq = cPickle.load(f)
feature_dict_path = "./output/feature_dict.pkl"
with open(feature_dict_path) as f:
feature_dict = cPickle.load(f)
a = DNNmodel(
dnn_layer_dims=[256, 31],
feature_dict=feature_dict,
item_freq=item_freq,
is_infer=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from utils import logger
from utils import TaskMode
class Reader(object):
"""
Reader
"""
def __init__(self, feature_dict=None, window_size=20):
"""
init
@window_size: window_size
"""
self._feature_dict = feature_dict
self._window_size = window_size
def train(self, path):
"""
load train set
@path: train set path
"""
logger.info("start train reader from %s" % path)
mode = TaskMode.create_train()
return self._reader(path, mode)
def test(self, path):
"""
load test set
@path: test set path
"""
logger.info("start test reader from %s" % path)
mode = TaskMode.create_test()
return self._reader(path, mode)
def infer(self, path):
"""
load infer set
@path: infer set path
"""
logger.info("start infer reader from %s" % path)
mode = TaskMode.create_infer()
return self._reader(path, mode)
def infer_user(self, user_list):
"""
load user set to infer
@user_list: user list
"""
return self._reader_user(user_list)
def _reader(self, path, mode):
"""
parse data set
"""
USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
PROVINCE_UNK = self._feature_dict['province'].get('<unk>')
CITY_UNK = self._feature_dict['city'].get('<unk>')
ITEM_UNK = self._feature_dict['history_clicked_items'].get('<unk>')
CATEGORY_UNK = self._feature_dict['history_clicked_categories'].get(
'<unk>')
TAG_UNK = self._feature_dict['history_clicked_tags'].get('<unk>')
PHONE_UNK = self._feature_dict['phone'].get('<unk>')
with open(path) as f:
for line in f:
fields = line.strip('\n').split('\t')
user_id = self._feature_dict['user_id'].get(fields[0],
USER_ID_UNK)
province = self._feature_dict['province'].get(fields[1],
PROVINCE_UNK)
city = self._feature_dict['city'].get(fields[2], CITY_UNK)
item_infos = fields[3]
phone = self._feature_dict['phone'].get(fields[4], PHONE_UNK)
history_clicked_items_all = []
history_clicked_tags_all = []
history_clicked_categories_all = []
for item_info in item_infos.split(';'):
item_info_array = item_info.split(':')
item = item_info_array[0]
item_encoded_id = self._feature_dict['history_clicked_items'].get(\
item, ITEM_UNK)
if item_encoded_id != ITEM_UNK:
history_clicked_items_all.append(item_encoded_id)
category = item_info_array[1]
history_clicked_categories_all.append(
self._feature_dict['history_clicked_categories'].get(\
category, CATEGORY_UNK))
tags = item_info_array[2]
tag_split = map(str, [self._feature_dict['history_clicked_tags'].get(\
tag, TAG_UNK) \
for tag in tags.strip().split("_")])
history_clicked_tags_all.append("_".join(tag_split))
if not mode.is_infer():
history_clicked_items_all.insert(0, 0)
history_clicked_tags_all.insert(0, "0")
history_clicked_categories_all.insert(0, 0)
for i in range(1, len(history_clicked_items_all)):
start = max(0, i - self._window_size)
history_clicked_items = history_clicked_items_all[start:
i]
history_clicked_categories = history_clicked_categories_all[
start:i]
history_clicked_tags_str = history_clicked_tags_all[
start:i]
history_clicked_tags = []
for tags_a in history_clicked_tags_str:
for tag in tags_a.split("_"):
history_clicked_tags.append(int(tag))
target_item = history_clicked_items_all[i]
yield user_id, province, city, \
history_clicked_items, history_clicked_categories, \
history_clicked_tags, phone, target_item
else:
history_clicked_items = history_clicked_items_all
history_clicked_categories = history_clicked_categories_all
history_clicked_tags_str = history_clicked_tags_all
history_clicked_tags = []
for tags_a in history_clicked_tags_str:
for tag in tags_a.split("_"):
history_clicked_tags.append(int(tag))
yield user_id, province, city, \
history_clicked_items, history_clicked_categories, \
history_clicked_tags, phone
def _reader_user(self, user_list):
"""
parse user list
"""
USER_ID_UNK = self._feature_dict['user_id'].get('<unk>')
for user in user_list:
user_id = self._feature_dict['user_id'].get(user, USER_ID_UNK)
yield user_id, 0, 0, [0], [0], [0], 0
if __name__ == "__main__":
# this is to test and debug reader function
train_data = sys.argv[1]
feature_dict = sys.argv[2]
window_size = int(sys.argv[3])
import cPickle
with open(feature_dict) as f:
feature_dict = cPickle.load(f)
r = Reader(feature_dict, window_size)
for dat in r.train(train_data):
print dat
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import paddle.v2 as paddle
import argparse
import cPickle
from reader import Reader
from network_conf import DNNmodel
from utils import logger
def parse_args():
"""
parse arguments
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--train_set_path',
type=str,
required=True,
help="path of the train set")
parser.add_argument(
'--test_set_path', type=str, required=True, help="path of the test set")
parser.add_argument(
'--model_output_dir',
type=str,
required=True,
help="directory to output")
parser.add_argument(
'--feature_dict',
type=str,
required=True,
help="path of feature_dict.pkl")
parser.add_argument(
'--item_freq', type=str, required=True, help="path of item_freq.pkl ")
parser.add_argument(
'--window_size', type=int, default=20, help="window size(default: 20)")
parser.add_argument(
'--num_passes', type=int, default=1, help="number of passes to train")
parser.add_argument(
'--batch_size',
type=int,
default=50,
help="size of mini-batch (default:50)")
return parser.parse_args()
def train():
"""
train
"""
args = parse_args()
# check argument
assert os.path.exists(
args.train_set_path), 'The train_set_path path does not exist.'
assert os.path.exists(
args.test_set_path), 'The test_set_path path does not exist.'
assert os.path.exists(
args.feature_dict), 'The feature_dict path does not exist.'
assert os.path.exists(args.item_freq), 'The item_freq path does not exist.'
assert os.path.exists(
args.model_output_dir), 'The model_output_dir path does not exist.'
paddle.init(use_gpu=False, trainer_count=1)
with open(args.feature_dict) as f:
feature_dict = cPickle.load(f)
with open(args.item_freq) as f:
item_freq = cPickle.load(f)
feeding = {
'user_id': 0,
'province': 1,
'city': 2,
'history_clicked_items': 3,
'history_clicked_categories': 4,
'history_clicked_tags': 5,
'phone': 6,
'target_item': 7
}
optimizer = paddle.optimizer.AdaGrad(
learning_rate=1e-1,
regularization=paddle.optimizer.L2Regularization(rate=1e-3))
cost = DNNmodel(
dnn_layer_dims=[256, 31],
feature_dict=feature_dict,
item_freq=item_freq,
is_infer=False).model_cost
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(cost, parameters, optimizer)
def event_handler(event):
"""
event handler
"""
if isinstance(event, paddle.event.EndIteration):
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f" %
(event.pass_id, event.batch_id, event.cost))
elif isinstance(event, paddle.event.EndPass):
save_path = os.path.join(args.model_output_dir,
"model_pass_%05d.tar.gz" % event.pass_id)
logger.info("Save model into %s ..." % save_path)
with gzip.open(save_path, "w") as f:
trainer.save_parameter_to_tar(f)
reader = Reader(feature_dict, args.window_size)
trainer.train(
paddle.batch(
paddle.reader.shuffle(
lambda: reader.train(args.train_set_path), buf_size=7000),
args.batch_size),
num_passes=args.num_passes,
feeding=feeding,
event_handler=event_handler)
if __name__ == "__main__":
train()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import gzip
import paddle.v2 as paddle
import argparse
import cPickle
from reader import Reader
from network_conf import DNNmodel
from utils import logger
import numpy as np
def parse_args():
"""
parse arguments
"""
parser = argparse.ArgumentParser(
description="PaddlePaddle Youtube Recall Model Example")
parser.add_argument(
'--infer_set_path',
type=str,
required=True,
help="path of the infer set")
parser.add_argument(
'--model_path', type=str, required=True, help="path of the model")
parser.add_argument(
'--feature_dict',
type=str,
required=True,
help="path of feature_dict.pkl")
parser.add_argument(
'--batch_size',
type=int,
default=50,
help="size of mini-batch (default:50)")
return parser.parse_args()
def user_vector():
"""
get user vectors
"""
args = parse_args()
# check argument
assert os.path.exists(
args.infer_set_path), 'The infer_set_path path does not exist.'
assert os.path.exists(
args.model_path), 'The model_path path does not exist.'
assert os.path.exists(
args.feature_dict), 'The feature_dict path does not exist.'
paddle.init(use_gpu=False, trainer_count=1)
with open(args.feature_dict) as f:
feature_dict = cPickle.load(f)
# load the trained model.
with gzip.open(args.model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# build model
prediction_layer, fc = DNNmodel(
dnn_layer_dims=[256, 31], feature_dict=feature_dict,
is_infer=True).model_cost
inferer = paddle.inference.Inference(
output_layer=[prediction_layer, fc], parameters=parameters)
reader = Reader(feature_dict)
test_batch = []
for idx, item in enumerate(reader.infer(args.infer_set_path)):
test_batch.append(item)
if len(test_batch) == args.batch_size:
get_a_batch_user_vector(inferer, test_batch)
test_batch = []
if len(test_batch):
get_a_batch_user_vector(inferer, test_batch)
def get_a_batch_user_vector(inferer, test_batch):
"""
input a batch of data and get user vectors
"""
feeding = {
'user_id': 0,
'province': 1,
'city': 2,
'history_clicked_items': 3,
'history_clicked_categories': 4,
'history_clicked_tags': 5,
'phone': 6
}
probs = inferer.infer(
input=test_batch,
feeding=feeding,
field=["value"],
flatten_result=False)
for i, res in enumerate(zip(probs[1])):
# do simple lsh conversion
user_vector = [1.000]
for i in res[0]:
user_vector.append(i)
user_vector.append(0.000)
norm = np.linalg.norm(user_vector)
user_vector_norm = [str(_ / norm) for _ in user_vector]
print ",".join(user_vector_norm)
if __name__ == "__main__":
user_vector()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
logging.basicConfig()
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
class TaskMode(object):
"""
TaskMode
"""
TRAIN_MODE = 0
TEST_MODE = 1
INFER_MODE = 2
def __init__(self, mode):
"""
:param mode:
"""
self.mode = mode
def is_train(self):
"""
:return:
"""
return self.mode == self.TRAIN_MODE
def is_test(self):
"""
:return:
"""
return self.mode == self.TEST_MODE
def is_infer(self):
"""
:return:
"""
return self.mode == self.INFER_MODE
@staticmethod
def create_train():
"""
:return:
"""
return TaskMode(TaskMode.TRAIN_MODE)
@staticmethod
def create_test():
"""
:return:
"""
return TaskMode(TaskMode.TEST_MODE)
@staticmethod
def create_infer():
"""
:return:
"""
return TaskMode(TaskMode.INFER_MODE)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册