Unverified commit 3d0cffc3 authored by wuzhihua, committed by GitHub

Merge pull request #28 from 123malin/fasttext

add Fasttext
......@@ -228,10 +228,12 @@ class SingleTrainer(TranspileTrainer):
scope = self._model[model_name][2]
program = self._model[model_name][0]
reader = self._dataset[reader_name]
threads = model_dict.get("thread_num", 1)
with fluid.scope_guard(scope):
self._exe.train_from_dataset(
program=program,
dataset=reader,
thread=threads,
fetch_list=fetch_vars,
fetch_info=fetch_alias,
print_period=fetch_period)
......@@ -241,8 +243,23 @@ class SingleTrainer(TranspileTrainer):
model_name = model_dict["name"]
model_class = self._model[model_name][3]
program = self._model[model_name][0].clone()
_build_strategy = fluid.BuildStrategy()
_exe_strategy = fluid.ExecutionStrategy()
# 0: kCoeffNumDevice; 1: One; 2: Customized
_build_strategy.gradient_scale_strategy = model_dict.get(
"gradient_scale_strategy", 0)
if "thread_num" in model_dict and model_dict["thread_num"] > 1:
_build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
_exe_strategy.num_threads = model_dict["thread_num"]
os.environ['CPU_NUM'] = str(_exe_strategy.num_threads)
program = fluid.compiler.CompiledProgram(program).with_data_parallel(
loss_name=model_class.get_avg_cost().name)
loss_name=model_class.get_avg_cost().name,
build_strategy=_build_strategy,
exec_strategy=_exe_strategy)
fetch_vars = []
fetch_alias = []
fetch_period = int(
......
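# Standalone sketch (not part of this commit) of the Paddle 1.x fluid APIs the
# hunk above wires together: a BuildStrategy with an explicit gradient scale
# strategy plus an ExecutionStrategy thread count, fed into a data-parallel
# CompiledProgram. `train_program` and `loss` are hypothetical placeholders.
import os
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Same mapping as the diff comment: 0 -> CoeffNumDevice, 1 -> One, 2 -> Customized
build_strategy.gradient_scale_strategy = \
    fluid.BuildStrategy.GradientScaleStrategy.One
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 4
os.environ['CPU_NUM'] = str(exec_strategy.num_threads)

# compiled_prog = fluid.CompiledProgram(train_program).with_data_parallel(
#     loss_name=loss.name,
#     build_strategy=build_strategy,
#     exec_strategy=exec_strategy)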
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
workspace: "paddlerec.models.recall.fasttext"
# list of dataset
dataset:
- name: dataset_train # name of dataset to distinguish different datasets
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/train"
word_count_dict_path: "{workspace}/data/dict/word_count_dict.txt"
word_ngrams_path: "{workspace}/data/dict/word_ngrams_id.txt"
data_converter: "{workspace}/reader.py"
- name: dataset_infer # name
batch_size: 10
type: DataLoader # or QueueDataset
data_path: "{workspace}/data/test"
word_id_dict_path: "{workspace}/data/dict/word_id_dict.txt"
data_converter: "{workspace}/evaluate_reader.py"
hyper_parameters:
optimizer:
learning_rate: 1.0
decay_steps: 100000
decay_rate: 0.999
class: sgd
strategy: async
sparse_feature_number: 227915
sparse_feature_dim: 300
with_shuffle_batch: False
neg_num: 5
window_size: 5
min_n: 3
max_n: 5
# select runner by name
mode: train_runner
# config of each runner.
# runner is a kind of paddle training class, which wraps the train/infer process.
runner:
- name: train_runner
class: single_train
# num of epochs
epochs: 2
# device to run training or infer
device: cpu
save_checkpoint_interval: 1 # save model interval of epochs
save_inference_interval: 1 # save inference
save_checkpoint_path: "increment" # save checkpoint path
save_inference_path: "inference" # save inference path
save_inference_feed_varnames: [] # feed vars of save inference
save_inference_fetch_varnames: [] # fetch vars of save inference
init_model_path: "" # load model path
fetch_period: 10
- name: infer_runner
class: single_infer
# num of epochs
epochs: 1
# device to run training or infer
device: cpu
init_model_path: "increment/0" # load model path
# runner will run all the phase in each epoch
phase:
- name: phase1
model: "{workspace}/model.py" # user-defined model
dataset_name: dataset_train # select dataset by name
thread_num: 1
gradient_scale_strategy: 1
#- name: phase2
# model: "{workspace}/model.py" # user-defined model
# dataset_name: dataset_infer # select dataset by name
# thread_num: 1
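# Sketch (not part of config.yaml above): at run time the values under
# hyper_parameters are read through paddlerec's envs helper, exactly as the
# model and reader code later in this diff does; outside a paddlerec run that
# has loaded this YAML these calls will not return the configured values.
from paddlerec.core.utils import envs

neg_num = envs.get_global_env("hyper_parameters.neg_num")  # 5 in this config
min_n = envs.get_global_env("hyper_parameters.min_n")      # 3
max_n = envs.get_global_env("hyper_parameters.max_n")      # 5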
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Athens Greece Baghdad Iraq
Athens Greece Bangkok Thailand
Athens Greece Beijing China
Athens Greece Berlin Germany
Athens Greece Bern Switzerland
Athens Greece Cairo Egypt
Athens Greece Canberra Australia
Athens Greece Hanoi Vietnam
Athens Greece Havana Cuba
Athens Greece Helsinki Finland
Athens Greece Islamabad Pakistan
Athens Greece Kabul Afghanistan
Athens Greece London England
Athens Greece Madrid Spain
Athens Greece Moscow Russia
Athens Greece Oslo Norway
Athens Greece Ottawa Canada
Athens Greece Paris France
Athens Greece Rome Italy
Athens Greece Stockholm Sweden
Athens Greece Tehran Iran
Athens Greece Tokyo Japan
Baghdad Iraq Bangkok Thailand
Baghdad Iraq Beijing China
Baghdad Iraq Berlin Germany
Baghdad Iraq Bern Switzerland
Baghdad Iraq Cairo Egypt
Baghdad Iraq Canberra Australia
Baghdad Iraq Hanoi Vietnam
Baghdad Iraq Havana Cuba
Baghdad Iraq Helsinki Finland
Baghdad Iraq Islamabad Pakistan
Baghdad Iraq Kabul Afghanistan
Baghdad Iraq London England
Baghdad Iraq Madrid Spain
Baghdad Iraq Moscow Russia
Baghdad Iraq Oslo Norway
Baghdad Iraq Ottawa Canada
Baghdad Iraq Paris France
Baghdad Iraq Rome Italy
Baghdad Iraq Stockholm Sweden
Baghdad Iraq Tehran Iran
Baghdad Iraq Tokyo Japan
Baghdad Iraq Athens Greece
Bangkok Thailand Beijing China
Bangkok Thailand Berlin Germany
Bangkok Thailand Bern Switzerland
Bangkok Thailand Cairo Egypt
Bangkok Thailand Canberra Australia
Bangkok Thailand Hanoi Vietnam
Bangkok Thailand Havana Cuba
Bangkok Thailand Helsinki Finland
Bangkok Thailand Islamabad Pakistan
Bangkok Thailand Kabul Afghanistan
Bangkok Thailand London England
Bangkok Thailand Madrid Spain
Bangkok Thailand Moscow Russia
Bangkok Thailand Oslo Norway
Bangkok Thailand Ottawa Canada
Bangkok Thailand Paris France
Bangkok Thailand Rome Italy
Bangkok Thailand Stockholm Sweden
Bangkok Thailand Tehran Iran
Bangkok Thailand Tokyo Japan
Bangkok Thailand Athens Greece
Bangkok Thailand Baghdad Iraq
Beijing China Berlin Germany
Beijing China Bern Switzerland
Beijing China Cairo Egypt
Beijing China Canberra Australia
Beijing China Hanoi Vietnam
Beijing China Havana Cuba
Beijing China Helsinki Finland
Beijing China Islamabad Pakistan
Beijing China Kabul Afghanistan
Beijing China London England
Beijing China Madrid Spain
Beijing China Moscow Russia
Beijing China Oslo Norway
Beijing China Ottawa Canada
Beijing China Paris France
Beijing China Rome Italy
Beijing China Stockholm Sweden
Beijing China Tehran Iran
Beijing China Tokyo Japan
Beijing China Athens Greece
Beijing China Baghdad Iraq
Beijing China Bangkok Thailand
Berlin Germany Bern Switzerland
Berlin Germany Cairo Egypt
Berlin Germany Canberra Australia
Berlin Germany Hanoi Vietnam
Berlin Germany Havana Cuba
Berlin Germany Helsinki Finland
Berlin Germany Islamabad Pakistan
Berlin Germany Kabul Afghanistan
Berlin Germany London England
Berlin Germany Madrid Spain
Berlin Germany Moscow Russia
Berlin Germany Oslo Norway
Berlin Germany Ottawa Canada
Berlin Germany Paris France
Berlin Germany Rome Italy
Berlin Germany Stockholm Sweden
Berlin Germany Tehran Iran
Berlin Germany Tokyo Japan
Berlin Germany Athens Greece
Berlin Germany Baghdad Iraq
Berlin Germany Bangkok Thailand
Berlin Germany Beijing China
Bern Switzerland Cairo Egypt
Bern Switzerland Canberra Australia
Bern Switzerland Hanoi Vietnam
Bern Switzerland Havana Cuba
Bern Switzerland Helsinki Finland
Bern Switzerland Islamabad Pakistan
Bern Switzerland Kabul Afghanistan
Bern Switzerland London England
Bern Switzerland Madrid Spain
Bern Switzerland Moscow Russia
Bern Switzerland Oslo Norway
Bern Switzerland Ottawa Canada
Bern Switzerland Paris France
Bern Switzerland Rome Italy
Bern Switzerland Stockholm Sweden
Bern Switzerland Tehran Iran
Bern Switzerland Tokyo Japan
Bern Switzerland Athens Greece
Bern Switzerland Baghdad Iraq
Bern Switzerland Bangkok Thailand
Bern Switzerland Beijing China
Bern Switzerland Berlin Germany
Cairo Egypt Canberra Australia
Cairo Egypt Hanoi Vietnam
Cairo Egypt Havana Cuba
Cairo Egypt Helsinki Finland
Cairo Egypt Islamabad Pakistan
Cairo Egypt Kabul Afghanistan
Cairo Egypt London England
Cairo Egypt Madrid Spain
Cairo Egypt Moscow Russia
Cairo Egypt Oslo Norway
Cairo Egypt Ottawa Canada
Cairo Egypt Paris France
Cairo Egypt Rome Italy
Cairo Egypt Stockholm Sweden
Cairo Egypt Tehran Iran
Cairo Egypt Tokyo Japan
Cairo Egypt Athens Greece
Cairo Egypt Baghdad Iraq
Cairo Egypt Bangkok Thailand
Cairo Egypt Beijing China
Cairo Egypt Berlin Germany
Cairo Egypt Bern Switzerland
Canberra Australia Hanoi Vietnam
Canberra Australia Havana Cuba
Canberra Australia Helsinki Finland
Canberra Australia Islamabad Pakistan
Canberra Australia Kabul Afghanistan
Canberra Australia London England
Canberra Australia Madrid Spain
Canberra Australia Moscow Russia
Canberra Australia Oslo Norway
Canberra Australia Ottawa Canada
Canberra Australia Paris France
Canberra Australia Rome Italy
Canberra Australia Stockholm Sweden
Canberra Australia Tehran Iran
Canberra Australia Tokyo Japan
Canberra Australia Athens Greece
Canberra Australia Baghdad Iraq
Canberra Australia Bangkok Thailand
Canberra Australia Beijing China
Canberra Australia Berlin Germany
Canberra Australia Bern Switzerland
Canberra Australia Cairo Egypt
Hanoi Vietnam Havana Cuba
Hanoi Vietnam Helsinki Finland
Hanoi Vietnam Islamabad Pakistan
Hanoi Vietnam Kabul Afghanistan
Hanoi Vietnam London England
Hanoi Vietnam Madrid Spain
Hanoi Vietnam Moscow Russia
Hanoi Vietnam Oslo Norway
Hanoi Vietnam Ottawa Canada
Hanoi Vietnam Paris France
Hanoi Vietnam Rome Italy
Hanoi Vietnam Stockholm Sweden
Hanoi Vietnam Tehran Iran
Hanoi Vietnam Tokyo Japan
Hanoi Vietnam Athens Greece
Hanoi Vietnam Baghdad Iraq
Hanoi Vietnam Bangkok Thailand
Hanoi Vietnam Beijing China
Hanoi Vietnam Berlin Germany
Hanoi Vietnam Bern Switzerland
Hanoi Vietnam Cairo Egypt
Hanoi Vietnam Canberra Australia
Havana Cuba Helsinki Finland
Havana Cuba Islamabad Pakistan
183648 183648 183648 183648 65918 183648 94834 93002 71149 183648 89518 183648 68731 183648 183648 63766 183648 183648 63766 183648 183648 63690 63766 83291 183648 183648 63766 183648 183648 183648 63766 65918 183648 73068 71149 183648 183648 183648 65370 183648 183648 67665 63945 93281 71149 183648 139630 183648 183648 63766 183648 183648 183648 183648 65062 110843 175871 94491 183648 183648 183648 183648 89277 183648 78014 183648 183648 63766 69529 183648 183648 183648 102013 63766 139449 183648 183648 113280 87363 64479 183648 183648 183648 183648 183648 183648 63766 183648 93281 183648 64068 183648 63690 183648 183648 183648 183648 71353 64068 183648 183648 183648 102334 97824 139630 183648 110843 183648 183648 183648 183648 183648 93281 183648 63766 183648 183648 183648 183648 183648 63766 67946 63766 69529 183648 183648 74700 183648 64292 183648 97584 64890 112995 125717 183648 183648 183648 65927 183648 183648 183648 64366 183648 183648 183648 68450 183648 102334 64068 183648 183648 183648 183648 183648 63690 183648 183648 183648 183648 183648 183648 183648 129534 92829 183648 183648 183648 183648 183648 183648 66719 63690 78014 183648 97719 183648 183648 74700 183648 183648 183648 102334 183648 183648 183648 183648 110843 80622 183648 183648 183648 102334 118096 183648 183648 183648 183648 208852 67073 183648 183648 125996 95715 66971 183648 183648 183648 97660 65663 98873 183648 183648 70581 183648 183648 183648 71353 183648 110843 183648 95715 66971 183648 67073 183648 97660 183648 183648 95715 183648 63690 183648 183648 63690 183648 99530 93281 183648 183648 183648 70654 183648 68437 183648 183648 183648 183648 74700 183648 183648 183648 63690 183648 183648 183648 93281 183648 183648 106973 75457 183648 63690 183648 183648 183648 183648 183648 183648 183648 183648 67438 63690 66087 76115 183648 183648 183648 183648 183648 183648 183648 63766 63766 183648 183648 63690 183648 80719 183648 183648 183648 183648 102013 183648 89431 183648 71932 183648 125996 183648 99901 213612 183648 183648 183648 183648 183648 213612 183648 63766 183648 183648 63766 67946 65113 183648 63690 183648 63690 183648 63766 183648 183648 63766 75094 67438 183648 63766 183648 63766 183648 183648 63856 79145 183648 183648 183648 64068 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 183648 64649 183648 183648 183648 183648 183648 93281 63766 183648 183648 63689 67438 105276 118096 82908 93281 63766 72124 63702 68692 183648 183648 183648 88510
#! /bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# download train_data
mkdir raw_data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/1-billion-word-language-modeling-benchmark-r13output.tar
tar xvf 1-billion-word-language-modeling-benchmark-r13output.tar
mv 1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/ raw_data/
# preprocess data
python preprocess.py --build_dict --build_dict_corpus_dir raw_data/training-monolingual.tokenized.shuffled --dict_path raw_data/word_count_dict.txt --ngrams_path raw_data/word_ngrams.txt
python preprocess.py --filter_corpus --dict_path raw_data/word_count_dict.txt --word_id_path raw_data/word_id_dict.txt --input_corpus_dir raw_data/training-monolingual.tokenized.shuffled --output_corpus_dir raw_data/convert_text8 --ngrams_id_path raw_data/word_ngrams_id.txt --ngrams_path raw_data/word_ngrams.txt --min_count 5 --downsample 0.001
mv raw_data/word_count_dict.txt data/dict/
mv raw_data/word_id_dict.txt data/dict/
mv raw_data/word_ngrams.txt data/dict/
mv raw_data/word_ngrams_id.txt data/dict/
rm -rf data/train/*
rm -rf data/test/*
python preprocess.py --data_resplit --file_nums 24 --input_corpus_dir=raw_data/convert_text8 --output_corpus_dir=data/train
# download test data
wget --no-check-certificate https://paddlerec.bj.bcebos.com/word2vec/test_dir.tar
tar xzvf test_dir.tar -C raw_data
mv raw_data/data/test_dir/* data/test/
rm -rf raw_data
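# After this script finishes, data/dict/ holds word_count_dict.txt, word_id_dict.txt,
# word_ngrams.txt and word_ngrams_id.txt, data/train/ holds the 24 re-split id files
# (part_1 ... part_24) and data/test/ holds the analogy test set, matching the paths
# referenced in config.yaml.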
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import six
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_infer.word_id_dict_path")
self.min_n = envs.get_global_env("hyper_parameters.min_n")
self.max_n = envs.get_global_env("hyper_parameters.max_n")
self.word_to_id = dict()
self.id_to_word = dict()
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
self.word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
self.id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]
self.dict_size = len(self.word_to_id)
def computeSubwords(self, word):
ngrams = set()
for i in range(len(word) - self.min_n + 1):
for j in range(self.min_n, self.max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def native_to_unicode(self, s):
if self._is_unicode(s):
return s
try:
return self._to_unicode(s)
except UnicodeDecodeError:
res = self._to_unicode(s, ignore_errors=True)
return res
def _is_unicode(self, s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(self, s, ignore_errors=False):
if self._is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def strip_lines(self, line, vocab):
return self._replace_oov(vocab, self.native_to_unicode(line))
def _replace_oov(self, original_vocab, line):
"""Replace out-of-vocab words with "<UNK>".
This maintains compatibility with published results.
Args:
original_vocab: a set of strings (The standard vocabulary for the dataset)
line: a unicode string - a space-delimited sequence of words.
Returns:
a unicode string - a space-delimited sequence of words.
"""
return u" ".join([
"<" + word + ">"
if "<" + word + ">" in original_vocab else u"<UNK>"
for word in line.split()
])
def generate_sample(self, line):
def reader():
if ':' in line:
pass
features = self.strip_lines(line.lower(), self.word_to_id)
features = features.split()
inputs = []
for item in features:
if item == "<UNK>":
inputs.append([self.word_to_id[item]])
else:
ngrams = self.computeSubwords(item)
res = []
res.append(self.word_to_id[item])
for _ in ngrams:
res.append(self.word_to_id[_])
inputs.append(res)
yield [('analogy_a', inputs[0]), ('analogy_b', inputs[1]),
('analogy_c', inputs[2]), ('analogy_d', inputs[3][0:1])]
return reader
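# Standalone illustration (not part of evaluate_reader.py) of the character
# n-gram extraction that computeSubwords above performs, with min_n=3, max_n=5.
def _demo_subwords(word, min_n=3, max_n=5):
    ngrams = set()
    for i in range(len(word) - min_n + 1):
        for j in range(min_n, max_n + 1):
            ngrams.add(word[i:min(len(word), i + j)])
    return ngrams

# The bracketed token "<cat>" yields every n-gram of length 3..5 that fits:
assert _demo_subwords("<cat>") == {"<ca", "<cat", "<cat>", "cat", "cat>", "at>"}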
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import Model as ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
self.is_distributed = True if envs.get_trainer(
) == "CtrTrainer" else False
self.sparse_feature_number = envs.get_global_env(
"hyper_parameters.sparse_feature_number")
self.sparse_feature_dim = envs.get_global_env(
"hyper_parameters.sparse_feature_dim")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.learning_rate = envs.get_global_env(
"hyper_parameters.optimizer.learning_rate")
self.decay_steps = envs.get_global_env(
"hyper_parameters.optimizer.decay_steps")
self.decay_rate = envs.get_global_env(
"hyper_parameters.optimizer.decay_rate")
def input_data(self, is_infer=False, **kwargs):
if is_infer:
analogy_a = fluid.data(
name="analogy_a", shape=[None, 1], lod_level=1, dtype='int64')
analogy_b = fluid.data(
name="analogy_b", shape=[None, 1], lod_level=1, dtype='int64')
analogy_c = fluid.data(
name="analogy_c", shape=[None, 1], lod_level=1, dtype='int64')
analogy_d = fluid.data(
name="analogy_d", shape=[None, 1], dtype='int64')
return [analogy_a, analogy_b, analogy_c, analogy_d]
input_word = fluid.data(
name="input_word", shape=[None, 1], lod_level=1, dtype='int64')
true_word = fluid.data(
name='true_label', shape=[None, 1], lod_level=1, dtype='int64')
if self.with_shuffle_batch:
return [input_word, true_word]
neg_word = fluid.data(
name="neg_label", shape=[None, self.neg_num], dtype='int64')
return [input_word, true_word, neg_word]
def net(self, inputs, is_infer=False):
if is_infer:
self.infer_net(inputs)
return
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
is_sparse=True,
is_distributed=self.is_distributed,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=fluid.ParamAttr(
name=table_name, initializer=initializer_instance), )
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
init_width = 1.0 / self.sparse_feature_dim
emb_initializer = fluid.initializer.Uniform(-init_width, init_width)
emb_w_initializer = fluid.initializer.Constant(value=0.0)
input_emb = embedding_layer(inputs[0], "emb", emb_initializer, True)
input_emb = fluid.layers.squeeze(input=input_emb, axes=[1])
true_emb_w = embedding_layer(inputs[1], "emb_w", emb_w_initializer,
True)
true_emb_w = fluid.layers.squeeze(input=true_emb_w, axes=[1])
if self.with_shuffle_batch:
neg_emb_w_list = []
for i in range(self.neg_num):
neg_emb_w_list.append(
fluid.contrib.layers.shuffle_batch(
true_emb_w)) # shuffle true_word
neg_emb_w_concat = fluid.layers.concat(neg_emb_w_list, axis=0)
neg_emb_w = fluid.layers.reshape(
neg_emb_w_concat,
shape=[-1, self.neg_num, self.sparse_feature_dim])
else:
neg_emb_w = embedding_layer(inputs[2], "emb_w", emb_w_initializer)
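        # The positive logit below is the dot product of the pooled centre-word
        # embedding (the word plus its subword n-grams, averaged) with the true
        # context embedding; the matmul afterwards scores the same centre word
        # against every negative sample, and both go into a sigmoid log loss.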
true_logits = fluid.layers.reduce_sum(
fluid.layers.elementwise_mul(input_emb, true_emb_w),
dim=1,
keep_dim=True)
input_emb_re = fluid.layers.reshape(
input_emb, shape=[-1, 1, self.sparse_feature_dim])
neg_matmul = fluid.layers.matmul(
input_emb_re, neg_emb_w, transpose_y=True)
neg_logits = fluid.layers.reshape(neg_matmul, shape=[-1, 1])
logits = fluid.layers.concat([true_logits, neg_logits], axis=0)
label_ones = fluid.layers.fill_constant(
shape=[fluid.layers.shape(true_logits)[0], 1],
value=1.0,
dtype='float32')
label_zeros = fluid.layers.fill_constant(
shape=[fluid.layers.shape(neg_logits)[0], 1],
value=0.0,
dtype='float32')
label = fluid.layers.concat([label_ones, label_zeros], axis=0)
loss = fluid.layers.log_loss(fluid.layers.sigmoid(logits), label)
avg_cost = fluid.layers.reduce_sum(loss)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
self._cost = avg_cost
self._metrics["LOSS"] = avg_cost
def optimizer(self):
optimizer = fluid.optimizer.SGD(
learning_rate=fluid.layers.exponential_decay(
learning_rate=self.learning_rate,
decay_steps=self.decay_steps,
decay_rate=self.decay_rate,
staircase=True))
return optimizer
def infer_net(self, inputs):
def embedding_layer(input,
table_name,
initializer_instance=None,
sequence_pool=False):
emb = fluid.embedding(
input=input,
size=[self.sparse_feature_number, self.sparse_feature_dim],
param_attr=table_name)
if sequence_pool:
emb = fluid.layers.sequence_pool(
input=emb, pool_type='average')
return emb
all_label = np.arange(self.sparse_feature_number).reshape(
self.sparse_feature_number).astype('int32')
self.all_label = fluid.layers.cast(
x=fluid.layers.assign(all_label), dtype='int64')
emb_all_label = embedding_layer(self.all_label, "emb")
emb_a = embedding_layer(inputs[0], "emb", sequence_pool=True)
emb_b = embedding_layer(inputs[1], "emb", sequence_pool=True)
emb_c = embedding_layer(inputs[2], "emb", sequence_pool=True)
target = fluid.layers.elementwise_add(
fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
dist = fluid.layers.matmul(
x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = fluid.layers.topk(input=dist, k=4)
label = fluid.layers.expand(inputs[3], expand_times=[1, 4])
label_ones = fluid.layers.fill_constant_batch_size_like(
label, shape=[-1, 1], value=1.0, dtype='float32')
right_cnt = fluid.layers.reduce_sum(input=fluid.layers.cast(
fluid.layers.equal(pred_idx, label), dtype='float32'))
total_cnt = fluid.layers.reduce_sum(label_ones)
global_right_cnt = fluid.layers.create_global_var(
name="global_right_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_total_cnt = fluid.layers.create_global_var(
name="global_total_cnt",
persistable=True,
dtype='float32',
shape=[1],
value=0)
global_right_cnt.stop_gradient = True
global_total_cnt.stop_gradient = True
tmp1 = fluid.layers.elementwise_add(right_cnt, global_right_cnt)
fluid.layers.assign(tmp1, global_right_cnt)
tmp2 = fluid.layers.elementwise_add(total_cnt, global_total_cnt)
fluid.layers.assign(tmp2, global_total_cnt)
acc = fluid.layers.elementwise_div(
global_right_cnt, global_total_cnt, name="total_acc")
self._infer_results['acc'] = acc
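# Plain-numpy sketch (not part of model.py) of the analogy scoring infer_net
# performs above: predict d from a : b :: c : d as emb(b) - emb(a) + emb(c),
# score it against L2-normalised embeddings of the whole vocabulary and take
# the top-4 ids. The sizes below are toy values, not the real config.
import numpy as np

rng = np.random.RandomState(0)
vocab, dim = 100, 8
emb = rng.normal(size=(vocab, dim))
a, b, c = emb[1], emb[2], emb[3]
target = b - a + c
emb_norm = emb / np.linalg.norm(emb, axis=1, keepdims=True)
scores = emb_norm.dot(target)        # matmul(target, emb_all_label_l2^T)
top4 = np.argsort(-scores)[:4]       # fluid.layers.topk(dist, k=4)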
# -*- coding: utf-8 -*
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import math
import os
import random
import re
import six
import argparse
prog = re.compile("[^a-z ]", flags=0)
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Fluid word2 vector preprocess")
parser.add_argument(
'--build_dict_corpus_dir', type=str, help="The dir of corpus")
parser.add_argument(
'--input_corpus_dir', type=str, help="The dir of input corpus")
parser.add_argument(
'--output_corpus_dir', type=str, help="The dir of output corpus")
parser.add_argument(
'--dict_path',
type=str,
default='./dict',
help="The path of dictionary ")
parser.add_argument(
'--word_id_path',
type=str,
default='./word_id',
help="The path of word_id ")
parser.add_argument(
'--ngrams_path',
type=str,
default='./word_ngrams',
help="The path of word_ngrams ")
parser.add_argument(
'--ngrams_id_path',
type=str,
default='./word_ngrams_id',
help="The path of word_ngrams_id ")
parser.add_argument(
'--min_count',
type=int,
default=5,
help="If the word count is less then min_count, it will be removed from dict"
)
parser.add_argument('--min_n', type=int, default=3, help="min_n of ngrams")
parser.add_argument('--max_n', type=int, default=5, help="max_n of ngrams")
parser.add_argument(
'--file_nums',
type=int,
default=1024,
help="re-split input corpus file nums")
parser.add_argument(
'--downsample',
type=float,
default=0.001,
help="filter word by downsample")
parser.add_argument(
'--filter_corpus',
action='store_true',
default=False,
help='Filter corpus')
parser.add_argument(
'--build_dict',
action='store_true',
default=False,
help='Build dict from corpus')
parser.add_argument(
'--data_resplit',
action='store_true',
default=False,
help='re-split input corpus files')
return parser.parse_args()
def text_strip(text):
# English Preprocess Rule
return prog.sub("", text.lower())
# Shameless copy from Tensorflow https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/text_encoder.py
# Unicode utility functions that work with Python 2 and 3
def native_to_unicode(s):
if _is_unicode(s):
return s
try:
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
return res
def _is_unicode(s):
if six.PY2:
if isinstance(s, unicode):
return True
else:
if isinstance(s, str):
return True
return False
def _to_unicode(s, ignore_errors=False):
if _is_unicode(s):
return s
error_mode = "ignore" if ignore_errors else "strict"
return s.decode("utf-8", errors=error_mode)
def filter_corpus(args):
"""
filter corpus and convert id.
"""
word_count = dict()
word_to_id_ = dict()
word_all_count = 0
id_counts = []
word_id = 0
# read dict
with io.open(args.dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
word_count[word] = count
word_to_id_[word] = word_id
word_id += 1
id_counts.append(count)
word_all_count += count
word_ngrams = dict()
with io.open(args.ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
word, ngrams = line.rstrip().split(':')
ngrams = ngrams.split()
ngrams = [str(word_to_id_[_]) for _ in ngrams]
word_ngrams[word_to_id_[word]] = ' '.join(ngrams)
with io.open(args.ngrams_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_ngrams.items():
fid.write(u'{} {}\n'.format(k, v))
# write word2id file
print("write word2id file to : " + args.dict_path + "_word_to_id_")
with io.open(args.word_id_path, 'w+', encoding='utf-8') as fid:
for k, v in word_to_id_.items():
fid.write(k + " " + str(v) + '\n')
# filter corpus and convert id
if not os.path.exists(args.output_corpus_dir):
os.makedirs(args.output_corpus_dir)
for file in os.listdir(args.input_corpus_dir):
with io.open(args.output_corpus_dir + '/convert_' + file + '.csv',
"w") as wf:
with io.open(
args.input_corpus_dir + '/' + file,
encoding='utf-8') as rf:
print(args.input_corpus_dir + '/' + file)
for line in rf:
signal = False
line = text_strip(line)
words = line.split()
write_line = ""
for item in words:
if item in word_count:
idx = word_to_id_[item]
else:
idx = word_to_id_[native_to_unicode('<UNK>')]
count_w = id_counts[idx]
corpus_size = word_all_count
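                        # word2vec-style subsampling: with t = args.downsample and
                        # f = count_w / corpus_size, keep_prob = (sqrt(f/t) + 1) * t / f,
                        # so very frequent words are randomly dropped from the corpus.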
keep_prob = (
math.sqrt(count_w /
(args.downsample * corpus_size)) + 1
) * (args.downsample * corpus_size) / count_w
r_value = random.random()
if r_value > keep_prob:
continue
write_line += str(idx)
write_line += " "
signal = True
if signal:
write_line = write_line[:-1] + "\n"
wf.write(_to_unicode(write_line))
def computeSubwords(word, min_n, max_n):
ngrams = set()
for i in range(len(word) - min_n + 1):
for j in range(min_n, max_n + 1):
end = min(len(word), i + j)
ngrams.add("".join(word[i:end]))
return list(ngrams)
def build_dict(args):
"""
    preprocess the data, generate the dictionary and save it to dict_path.
:param corpus_dir: the input data dir.
:param dict_path: the generated dict path. the data in dict is "word count"
    :param min_count: words whose count is no more than min_count are merged into <UNK>.
:return:
"""
# word to count
word_count = dict()
for file in os.listdir(args.build_dict_corpus_dir):
with io.open(
args.build_dict_corpus_dir + "/" + file,
encoding='utf-8') as f:
print("build dict : ", args.build_dict_corpus_dir + "/" + file)
for line in f:
line = text_strip(line)
words = line.split()
for item in words:
item = '<' + item + '>'
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.min_count:
item_to_remove.append(item)
unk_sum = 0
for item in item_to_remove:
unk_sum += word_count[item]
del word_count[item]
# sort by count
word_count[native_to_unicode('<UNK>')] = unk_sum
word_ngrams = dict()
ngrams_count = dict()
for item in word_count:
ngrams = computeSubwords(item, args.min_n, args.max_n)
word_ngrams[item] = ngrams
for sub_word in ngrams:
if sub_word not in ngrams_count:
ngrams_count[sub_word] = 1
else:
ngrams_count[sub_word] = ngrams_count[sub_word] + 1
ngrams_count = sorted(
ngrams_count.items(), key=lambda ngrams_count: -ngrams_count[1])
word_count = sorted(
word_count.items(), key=lambda word_count: -word_count[1])
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count:
f.write(k + " " + str(v) + '\n')
for k, v in ngrams_count:
f.write(k + " " + str(v) + '\n')
with io.open(args.ngrams_path, 'w+', encoding='utf-8') as f:
for key in word_ngrams:
f.write(key + ":")
f.write(" ".join(word_ngrams[key]))
f.write(u'\n')
def data_split(args):
raw_data_dir = args.input_corpus_dir
new_data_dir = args.output_corpus_dir
if not os.path.exists(new_data_dir):
os.mkdir(new_data_dir)
files = os.listdir(raw_data_dir)
print(files)
index = 0
contents = []
for file_ in files:
with open(os.path.join(raw_data_dir, file_), 'r') as f:
contents.extend(f.readlines())
num = int(args.file_nums)
    lines_per_file = len(contents) // num  # integer division so the slice bounds below are ints (Python 3)
print("contents: ", str(len(contents)))
print("lines_per_file: ", str(lines_per_file))
for i in range(1, num + 1):
with open(os.path.join(new_data_dir, "part_" + str(i)), 'w') as fout:
data = contents[(i - 1) * lines_per_file:min(i * lines_per_file,
len(contents))]
for line in data:
fout.write(line)
if __name__ == "__main__":
args = parse_args()
if args.build_dict:
build_dict(args)
elif args.filter_corpus:
filter_corpus(args)
elif args.data_resplit:
data_split(args)
else:
print(
"error command line, please choose --build_dict or --filter_corpus")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import numpy as np
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
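# NumpyRandomInt below draws window sizes from a buffer of pre-generated random
# integers in [a, b] (inclusive), refilling the buffer only when it is exhausted,
# so the numpy RNG is not called once per training sample.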
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
class TrainReader(Reader):
def init(self):
dict_path = envs.get_global_env(
"dataset.dataset_train.word_count_dict_path")
word_ngrams_path = envs.get_global_env(
"dataset.dataset_train.word_ngrams_path")
self.window_size = envs.get_global_env("hyper_parameters.window_size")
self.neg_num = envs.get_global_env("hyper_parameters.neg_num")
self.with_shuffle_batch = envs.get_global_env(
"hyper_parameters.with_shuffle_batch")
self.random_generator = NumpyRandomInt(1, self.window_size + 1)
self.word_ngrams = dict()
with io.open(word_ngrams_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.rstrip().split()
                self.word_ngrams[str(line[0])] = list(map(int, line[1:]))
self.cs = None
if not self.with_shuffle_batch:
id_counts = []
word_all_count = 0
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
word, count = line.split()[0], int(line.split()[1])
id_counts.append(count)
word_all_count += count
id_frequencys = [
float(count) / word_all_count for count in id_counts
]
np_power = np.power(np.array(id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
self.cs = np.array(id_frequencys_pow).cumsum()
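            # self.cs is the cumulative sum of the unigram distribution raised to
            # the 0.75 power (word2vec's negative-sampling distribution); in
            # generate_sample, searchsorted over uniform samples maps it to word ids.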
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
words: the words of the current line
idx: input word index
window_size: window size
"""
target_window = self.random_generator()
start_point = idx - target_window # if (idx - target_window) > 0 else 0
if start_point < 0:
start_point = 0
end_point = idx + target_window
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return targets
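    # Example: if the sampled window width is 2 and idx = 3 in
    # ["5", "8", "2", "7", "9", "4"], get_context_words returns
    # ["8", "2", "9", "4"], i.e. up to two ids on each side of the centre word.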
def generate_sample(self, line):
def reader():
word_ids = [w for w in line.split()]
for idx, target_id in enumerate(word_ids):
input_word = [int(target_id)]
if target_id in self.word_ngrams:
input_word += self.word_ngrams[target_id]
context_word_ids = self.get_context_words(word_ids, idx)
for context_id in context_word_ids:
output = [('input_word', input_word),
('true_label', [int(context_id)])]
if not self.with_shuffle_batch:
neg_array = self.cs.searchsorted(
np.random.sample(self.neg_num))
output += [('neg_label',
[int(str(i)) for i in neg_array])]
yield output
return reader