Commit cc1167d7 authored by frankwhzhang

update ssr/word2vec/ms/tagspace APIs

Parent 1a098de6
@@ -206,6 +206,7 @@ def infer_network(vocab_size, batch_size, hid_size, dropout=0.2):
dtype="int64")
emb_all_label = fluid.embedding(
input=all_label, size=[vocab_size, hid_size], param_attr="emb")
emb_all_label = fluid.layers.squeeze(input=emb_all_label, axes=[1])
emb_all_label_drop = fluid.layers.dropout(
emb_all_label, dropout_prob=dropout, is_test=True)
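The added squeeze reflects a semantic difference between the two embedding APIs: `fluid.embedding` appends the embedding dimension to the full input shape instead of replacing a trailing `1`. A minimal sketch, assuming the PaddlePaddle 1.6-era static-graph API:

```python
import paddle.fluid as fluid

dict_size, emb_size = 1000, 64
word = fluid.data(name="word", shape=[None, 1], dtype="int64")
# fluid.embedding keeps every input axis, so the output shape is
# [-1, 1, emb_size] here, rather than the [-1, emb_size] that the old
# fluid.layers.embedding produced for a [None, 1] input.
emb = fluid.embedding(input=word, size=[dict_size, emb_size], param_attr="emb")
emb = fluid.layers.squeeze(input=emb, axes=[1])  # back to [-1, emb_size]
```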
@@ -110,7 +110,7 @@ def prepare_data(file_dir,
batch_size * 20)
else:
vocab_size = get_vocab_size(vocab_path)
reader = paddle.io.batch(
reader = fluid.io.batch(
test(
file_dir, buffer_size, data_type=DataType.SEQ), batch_size)
return vocab_size, reader
@@ -16,7 +16,7 @@ def construct_train_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size)
train_reader = paddle.batch(
train_reader = fluid.io.batch(
paddle.reader.shuffle(
y_data.train(files), buf_size=batch_size * 100),
batch_size=batch_size)
@@ -27,7 +27,7 @@ def construct_test_data(file_dir, vocab_path, batch_size):
vocab_size = get_vocab_size(vocab_path)
files = [file_dir + '/' + f for f in os.listdir(file_dir)]
y_data = reader.YoochooseDataset(vocab_size)
test_reader = paddle.batch(y_data.test(files), batch_size=batch_size)
test_reader = fluid.io.batch(y_data.test(files), batch_size=batch_size)
return test_reader, vocab_size
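Both reader helpers above now go through `fluid.io.batch`, which wraps a sample-level reader into a batch-level one exactly like the older `paddle.batch` entry point. A minimal sketch with a made-up sample generator:

```python
import paddle.fluid as fluid

def fake_samples():
    # Hypothetical stand-in for the dataset readers used above.
    for i in range(10):
        yield i, i + 1

batched = fluid.io.batch(fake_samples, batch_size=4)
for batch in batched():
    print(len(batch))  # 4, 4, 2
```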
@@ -35,7 +35,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok
```bash
mkdir data
wget https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ data/
```
@@ -44,7 +44,7 @@ mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tok
```bash
mkdir data
wget https://paddlerec.bj.bcebos.com/word2vec/text.tar
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/text.tar
tar xvf text.tar
mv text data/
```
@@ -105,9 +105,9 @@ sh cluster_train.sh
```bash
# Test set for the full dataset
wget https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
# Test set for the sample dataset
wget https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_mid_dir.tar
```
Prediction command. Note that the dictionary name must carry the suffix "_word_to_id_"; this file is generated during the preprocessing stage.
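For context, a sketch of passing the suffixed dictionary path to the `prepare_data` helper that appears at the end of this diff; the module path, directory layout, and base file name are assumptions, not the repository's actual defaults:

```python
from utils import prepare_data  # assumed location of the helper shown below

# Preprocessing writes the vocabulary as "<dict_name>_word_to_id_", so the
# path handed to inference must include that suffix.
dict_path = "data/dict/word_id_dict.txt_word_to_id_"  # assumed file name
vocab_size, test_reader, i2w = prepare_data(
    "data/test_dir", dict_path, batch_size=2000)
```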
@@ -78,13 +78,13 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
b_size = len([dat[0] for dat in data])
wa = np.array(
[dat[0] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
wb = np.array(
[dat[1] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data]
@@ -95,7 +95,7 @@ def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
"analogy_c": wc,
"all_label":
np.arange(vocab_size).reshape(
vocab_size, 1).astype("int64"),
vocab_size).astype("int64"),
},
fetch_list=[pred.name, values],
return_numpy=False)
@@ -145,13 +145,13 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
b_size = len([dat[0] for dat in data])
wa = np.array(
[dat[0] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
wb = np.array(
[dat[1] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
wc = np.array(
[dat[2] for dat in data]).astype("int64").reshape(
b_size, 1)
b_size)
label = [dat[3] for dat in data]
input_word = [dat[4] for dat in data]
@@ -162,7 +162,7 @@ def infer_step(args, vocab_size, test_reader, use_cuda, i2w):
"analogy_b": wb,
"analogy_c": wc,
"all_label":
np.arange(vocab_size).reshape(vocab_size, 1),
np.arange(vocab_size).reshape(vocab_size),
},
fetch_list=[pred.name, values],
return_numpy=False)
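The reshape changes in both inference loops follow from the new input declarations: with `fluid.data(..., shape=[None], dtype='int64')`, each analogy feed must be a rank-1 array of length `b_size`. A quick NumPy check:

```python
import numpy as np

b_size = 4
ids = [3, 17, 42, 8]  # made-up word ids for illustration
wa = np.array(ids).astype("int64").reshape(b_size)  # was reshape(b_size, 1)
assert wa.shape == (4,)  # rank-1, as the [None] data layer now expects
```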
@@ -23,10 +23,10 @@ import paddle.fluid as fluid
def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
datas = []
input_word = fluid.layers.data(name="input_word", shape=[1], dtype='int64')
true_word = fluid.layers.data(name='true_label', shape=[1], dtype='int64')
neg_word = fluid.layers.data(
name="neg_label", shape=[neg_num], dtype='int64')
input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
true_word = fluid.data(name='true_label', shape=[None, 1], dtype='int64')
neg_word = fluid.data(
name="neg_label", shape=[None, neg_num], dtype='int64')
datas.append(input_word)
datas.append(true_word)
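A minimal sketch of the data-layer migration above, assuming the PaddlePaddle 1.6 static-graph API: `fluid.data` replaces `fluid.layers.data` and takes the full shape explicitly, batch dimension included as `None`, instead of appending one implicitly:

```python
import paddle.fluid as fluid

# Old style: batch dim omitted and appended automatically.
# input_word = fluid.layers.data(name="input_word", shape=[1], dtype="int64")

# New style: batch dim written explicitly as None.
input_word = fluid.data(name="input_word", shape=[None, 1], dtype="int64")
```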
@@ -37,7 +37,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
words = fluid.layers.read_file(py_reader)
init_width = 0.5 / embedding_size
input_emb = fluid.layers.embedding(
input_emb = fluid.embedding(
input=words[0],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
@@ -45,38 +45,37 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
name='emb',
initializer=fluid.initializer.Uniform(-init_width, init_width)))
true_emb_w = fluid.layers.embedding(
true_emb_w = fluid.embedding(
input=words[1],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr(
name='emb_w', initializer=fluid.initializer.Constant(value=0.0)))
true_emb_b = fluid.layers.embedding(
true_emb_b = fluid.embedding(
input=words[1],
is_sparse=is_sparse,
size=[dict_size, 1],
param_attr=fluid.ParamAttr(
name='emb_b', initializer=fluid.initializer.Constant(value=0.0)))
neg_word_reshape = fluid.layers.reshape(words[2], shape=[-1, 1])
neg_word_reshape.stop_gradient = True
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
true_emb_b = fluid.layers.squeeze(input=true_emb_b, axes=[1])
neg_emb_w = fluid.layers.embedding(
input=neg_word_reshape,
neg_emb_w = fluid.embedding(
input=words[2],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr(
name='emb_w', learning_rate=1.0))
neg_emb_w_re = fluid.layers.reshape(
neg_emb_w, shape=[-1, neg_num, embedding_size])
neg_emb_b = fluid.layers.embedding(
input=neg_word_reshape,
neg_emb_b = fluid.embedding(
input=words[2],
is_sparse=is_sparse,
size=[dict_size, 1],
param_attr=fluid.ParamAttr(
name='emb_b', learning_rate=1.0))
neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num])
true_logits = fluid.layers.elementwise_add(
fluid.layers.reduce_sum(
@@ -87,7 +86,7 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w_re, transpose_y=True)
input_emb_re, neg_emb_w, transpose_y=True)
neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
#nce loss
@@ -111,22 +110,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
def infer_network(vocab_size, emb_size):
analogy_a = fluid.layers.data(name="analogy_a", shape=[1], dtype='int64')
analogy_b = fluid.layers.data(name="analogy_b", shape=[1], dtype='int64')
analogy_c = fluid.layers.data(name="analogy_c", shape=[1], dtype='int64')
all_label = fluid.layers.data(
analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64')
analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64')
analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64')
all_label = fluid.data(
name="all_label",
shape=[vocab_size, 1],
dtype='int64',
append_batch_size=False)
emb_all_label = fluid.layers.embedding(
shape=[vocab_size],
dtype='int64')
emb_all_label = fluid.embedding(
input=all_label, size=[vocab_size, emb_size], param_attr="emb")
emb_a = fluid.layers.embedding(
emb_a = fluid.embedding(
input=analogy_a, size=[vocab_size, emb_size], param_attr="emb")
emb_b = fluid.layers.embedding(
emb_b = fluid.embedding(
input=analogy_b, size=[vocab_size, emb_size], param_attr="emb")
emb_c = fluid.layers.embedding(
emb_c = fluid.embedding(
input=analogy_c, size=[vocab_size, emb_size], param_attr="emb")
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
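The target built just above is the classic word-analogy offset, emb_b - emb_a + emb_c; a tiny NumPy illustration with made-up vectors:

```python
import numpy as np

emb_a = np.array([0.1, 0.2], dtype="float32")
emb_b = np.array([0.4, 0.1], dtype="float32")
emb_c = np.array([0.3, 0.3], dtype="float32")
# Mirrors elementwise_add(elementwise_sub(emb_b, emb_a), emb_c) above.
target = (emb_b - emb_a) + emb_c
print(target)  # [0.6 0.2]
```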
@@ -22,7 +22,7 @@ def BuildWord_IdMap(dict_path):
def prepare_data(file_dir, dict_path, batch_size):
w2i, i2w = BuildWord_IdMap(dict_path)
vocab_size = len(i2w)
reader = paddle.batch(test(file_dir, w2i), batch_size)
reader = fluid.io.batch(test(file_dir, w2i), batch_size)
return vocab_size, reader, i2w