Unverified · Commit 5bc12f64 authored by pkpk, committed by GitHub

Add word2vec dygraph model (#4357)

* Update README.md (#4267)

* test=develop (#4269)

* 3d use new api (#4275)

* PointNet++ and PointRCNN use new API

* Update Readme of Dygraph BERT (#4277)

Fix some typos.

* Update run_classifier_multi_gpu.sh (#4279)

remove the CUDA_VISIBLE_DEVICES

* Update README.md (#4280)

* 17 update api (#4294)

* update 1.7 save/load & fluid.data

* update datafeed to dataloader

* Update resnet_acnet.py (#4297)

Bias attr of the square conv should be False rather than None in training mode.

* move danet to Paddle/Contrib (#4285)

* update new api for rrpn (#4296)

update new api for rrpn

* Fix transformer save_inference_model (#4306)

* upgrade save and load interface (#4311)

* upgrade dcn and xdeepfm: migrate all old save/load APIs to fluid.save and fluid.load
* test=develop

* modify save and load to 1.7 api for rrpn (#4310)

* modify save and load to 1.7 api

* add func to load params

* Add VOT models (#4257)

* First version for VOT models.
* Include SiamFC and ATOM.
* A unified architecture for ATOM and Siamese series models.

* update vot code (#4338)

* [VOT]Remove local.py generate step, add tracking gif to README (#4344)

* update vot code
* remove local.py generate step, add tracking gif to README
* fix word usage in readme
* add got10k download website
* add pip install paddlepaddle-gpu
* fix word usage

* do not print stack frame when train process killed (#4346)

* do not print stack frame when train process killed
* add note that VOT does not support the Windows platform

* test=develop add word2vec demo
Co-authored-by: Kaipeng Deng <dengkaipeng@baidu.com>
Co-authored-by: zhang wenhui <frankwhzhang@126.com>
Co-authored-by: parap1uie-s <parap1uie-s@users.noreply.github.com>
Co-authored-by: wangguanzhong <jerrywgz@126.com>
Co-authored-by: chengjuntao <18222160892@163.com>
Co-authored-by: liu zhengxi <380185688@qq.com>
Co-authored-by: xujiaqi01 <173596896@qq.com>
Co-authored-by: Double_V <liuvv0203@163.com>
Parent f492ae4f
......@@ -28,6 +28,19 @@ import logging
logger = logging.getLogger(__name__)
def _load_state(path):
if os.path.exists(path + '.pdopt'):
# XXX another hack to ignore the optimizer state
tmp = tempfile.mkdtemp()
dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
shutil.copy(path + '.pdparams', dst + '.pdparams')
state = fluid.io.load_program_state(dst)
shutil.rmtree(tmp)
else:
state = fluid.io.load_program_state(path)
return state
def load_params(exe, prog, path):
"""
Load model from the given path.
......@@ -64,7 +77,7 @@ def save(exe, prog, path):
if os.path.isdir(path):
shutil.rmtree(path)
logger.info('Save model to {}.'.format(path))
fluid.io.save_persistables(exe, path, prog)
fluid.save(prog, path)
def load_and_fusebn(exe, prog, path):
......@@ -81,15 +94,6 @@ def load_and_fusebn(exe, prog, path):
if not os.path.exists(path):
raise ValueError("Model path {} does not exists.".format(path))
def _if_exist(var):
b = os.path.exists(os.path.join(path, var.name))
if b:
logger.debug('load weight {}'.format(var.name))
return b
all_vars = list(filter(_if_exist, prog.list_vars()))
# Since the program uses affine-channel, there are no running mean and
# variance variables in the program; append them here.
# NOTE, the params of batch norm should be like:
......@@ -101,15 +105,25 @@ def load_and_fusebn(exe, prog, path):
mean_variances = set()
bn_vars = []
bn_in_path = True
state = None
if os.path.exists(path + '.pdparams'):
state = _load_state(path)
inner_prog = fluid.Program()
inner_start_prog = fluid.Program()
inner_block = inner_prog.global_block()
with fluid.program_guard(inner_prog, inner_start_prog):
def check_mean_and_bias(prefix):
m = prefix + 'mean'
v = prefix + 'variance'
if state:
return v in state and m in state
else:
return (os.path.exists(os.path.join(path, m)) and
os.path.exists(os.path.join(path, v)))
has_mean_bias = True
with fluid.program_guard(prog, fluid.Program()):
for block in prog.blocks:
ops = list(block.ops)
if not bn_in_path:
if not has_mean_bias:
break
for op in ops:
if op.type == 'affine_channel':
......@@ -119,28 +133,22 @@ def load_and_fusebn(exe, prog, path):
prefix = scale_name[:-5]
mean_name = prefix + 'mean'
variance_name = prefix + 'variance'
if not os.path.exists(os.path.join(path, mean_name)):
bn_in_path = False
break
if not os.path.exists(os.path.join(path, variance_name)):
bn_in_path = False
if not check_mean_and_bias(prefix):
has_mean_bias = False
break
bias = block.var(bias_name)
mean_vb = inner_block.create_var(
mean_vb = block.create_var(
name=mean_name,
type=bias.type,
shape=bias.shape,
dtype=bias.dtype,
persistable=True)
variance_vb = inner_block.create_var(
dtype=bias.dtype)
variance_vb = block.create_var(
name=variance_name,
type=bias.type,
shape=bias.shape,
dtype=bias.dtype,
persistable=True)
dtype=bias.dtype)
mean_variances.add(mean_vb)
mean_variances.add(variance_vb)
......@@ -148,21 +156,16 @@ def load_and_fusebn(exe, prog, path):
bn_vars.append(
[scale_name, bias_name, mean_name, variance_name])
if not bn_in_path:
fluid.io.load_vars(exe, path, prog, vars=all_vars)
if state:
fluid.io.set_program_state(prog, state)
else:
load_params(exe, prog, path)
if not has_mean_bias:
logger.warning(
"There is no paramters of batch norm in model {}. "
"Skip to fuse batch norm. And load paramters done.".format(path))
return
# load running mean and running variance on cpu place into global scope.
place = fluid.CPUPlace()
exe_cpu = fluid.Executor(place)
fluid.io.load_vars(exe_cpu, path, vars=[v for v in mean_variances])
# load params on real place into global scope.
fluid.io.load_vars(exe, path, prog, vars=all_vars)
eps = 1e-5
for names in bn_vars:
scale_name, bias_name, mean_name, var_name = names
......
......@@ -36,7 +36,6 @@ def eval():
place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
image_shape = [3, cfg.TEST.max_size, cfg.TEST.max_size]
class_nums = cfg.class_num
model = model_builder.RRPN(
add_conv_body_func=resnet.ResNet(),
......@@ -48,19 +47,14 @@ def eval():
infer_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
model.build_model(image_shape)
model.build_model()
pred_boxes = model.eval_bbox_out()
infer_prog = infer_prog.clone(True)
exe.run(startup_prog)
# yapf: disable
def if_exist(var):
return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
if cfg.pretrained_model:
checkpoint.load_params(exe, infer_prog, cfg.pretrained_model)
# yapf: enable
fluid.load(infer_prog, cfg.pretrained_model, exe)
test_reader = reader.test(1)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
data_loader = model.data_loader
data_loader.set_sample_list_generator(test_reader, places=place)
fetch_list = [pred_boxes]
res_list = []
......@@ -68,11 +62,10 @@ def eval():
'bbox', 'gt_box', 'gt_class', 'is_crowed', 'im_info', 'im_id',
'is_difficult'
]
for i, data in enumerate(test_reader()):
im_info = [data[0][1]]
for i, data in enumerate(data_loader()):
result = exe.run(infer_prog,
fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data),
feed=data,
return_numpy=False)
pred_boxes_v = result[0]
nmsed_out = pred_boxes_v
......
......@@ -31,11 +31,11 @@ logger = logging.getLogger(__name__)
def get_key_dict(out, data, key):
res = {}
for i in range(len(key)):
if i == 0:
res[key[i]] = out
for name in key:
if name == 'bbox':
res[name] = np.array(out)
else:
res[key[i]] = data[i]
res[name] = np.array(data[name])
return res
......@@ -167,7 +167,7 @@ def calculate_ap(rec, prec):
def icdar_map(result, class_name, ovthresh):
im_ids = []
for res in result:
im_ids.append(res['im_id'])
im_ids.append(res['im_id'][0][0])
recs = {}
for i, im_id in enumerate(im_ids):
......@@ -185,11 +185,11 @@ def icdar_map(result, class_name, ovthresh):
confidence = []
bbox = []
for res in result:
im_info = res['im_info']
im_info = res['im_info'][0]
pred_boxes = res['bbox']
for box in pred_boxes:
if box[0] == class_name:
image_ids.append(res['im_id'])
image_ids.append(res['im_id'][0][0])
confidence.append(box[1])
clipd_box = clip_box(box[2:].reshape(-1, 8), im_info)
bbox.append(clipd_box[0])
......@@ -286,7 +286,7 @@ def icdar_box_eval(result, thresh):
num_global_care_gt = 0
num_global_care_det = 0
for res in result:
im_info = res['im_info']
im_info = res['im_info'][0]
h = im_info[1]
w = im_info[2]
gt_boxes = res['gt_box']
......
......@@ -32,7 +32,6 @@ from utility import print_arguments, parse_args, check_gpu
def infer():
place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
image_shape = [3, cfg.TEST.max_size, cfg.TEST.max_size]
class_nums = cfg.class_num
model = model_builder.RRPN(
add_conv_body_func=resnet.ResNet(),
......@@ -43,31 +42,25 @@ def infer():
infer_prog = fluid.Program()
with fluid.program_guard(infer_prog, startup_prog):
with fluid.unique_name.guard():
model.build_model(image_shape)
model.build_model()
pred_boxes = model.eval_bbox_out()
infer_prog = infer_prog.clone(True)
exe.run(startup_prog)
# yapf: disable
def if_exist(var):
return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
if cfg.pretrained_model:
checkpoint.load_params(exe, infer_prog, cfg.pretrained_model)
# yapf: enable
fluid.load(infer_prog, cfg.pretrained_model, exe)
infer_reader = reader.infer(cfg.image_path)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
data_loader = model.data_loader
data_loader.set_sample_list_generator(infer_reader, places=place)
fetch_list = [pred_boxes]
imgs = os.listdir(cfg.image_path)
imgs.sort()
for i, data in enumerate(infer_reader()):
for i, data in enumerate(data_loader()):
result = exe.run(infer_prog,
fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data),
feed=data,
return_numpy=False)
nmsed_out = result[0]
im_info = data[0][1]
im_info = np.array(data[0]['im_info'])[0]
im_scale = im_info[2]
outs = np.array(nmsed_out)
draw_bounding_box_on_image(cfg.image_path, imgs[i], outs, im_scale,
......
......@@ -35,8 +35,8 @@ class RRPN(object):
self.use_pyreader = use_pyreader
self.use_random = use_random
def build_model(self, image_shape):
self.build_input(image_shape)
def build_model(self):
self.build_input()
body_conv = self.add_conv_body_func(self.image)
# RPN
self.rpn_heads(body_conv)
......@@ -61,56 +61,42 @@ class RRPN(object):
def eval_bbox_out(self):
return self.pred_result
def build_input(self, image_shape):
if self.use_pyreader:
in_shapes = [[-1] + image_shape, [-1, 5], [-1, 1], [-1, 1],
[-1, 3], [-1, 1]]
lod_levels = [0, 1, 1, 1, 0, 0]
dtypes = [
'float32', 'float32', 'int32', 'int32', 'float32', 'int64'
def build_input(self):
self.image = fluid.data(
name='image', shape=[None, 3, None, None], dtype='float32')
if self.mode == 'train':
self.gt_box = fluid.data(
name='gt_box', shape=[None, 5], dtype='float32', lod_level=1)
else:
self.gt_box = fluid.data(
name='gt_box', shape=[None, 8], dtype='float32', lod_level=1)
self.gt_label = fluid.data(
name='gt_class', shape=[None, 1], dtype='int32', lod_level=1)
self.is_crowd = fluid.data(
name='is_crowed', shape=[None, 1], dtype='int32', lod_level=1)
self.im_info = fluid.data(
name='im_info', shape=[None, 3], dtype='float32')
self.im_id = fluid.data(name='im_id', shape=[None, 1], dtype='int64')
self.difficult = fluid.data(
name='is_difficult', shape=[None, -1], dtype='float32', lod_level=1)
if self.mode == 'train':
feed_data = [
self.image, self.gt_box, self.gt_label, self.is_crowd,
self.im_info, self.im_id
]
self.py_reader = fluid.layers.py_reader(
capacity=64,
shapes=in_shapes,
lod_levels=lod_levels,
dtypes=dtypes,
use_double_buffer=True)
ins = fluid.layers.read_file(self.py_reader)
self.image = ins[0]
self.gt_box = ins[1]
self.gt_label = ins[2]
self.is_crowd = ins[3]
self.im_info = ins[4]
self.im_id = ins[5]
elif self.mode == 'infer':
feed_data = [self.image, self.im_info]
else:
self.image = fluid.layers.data(
name='image', shape=image_shape, dtype='float32')
self.gt_box = fluid.layers.data(
name='gt_box', shape=[4], dtype='float32', lod_level=1)
self.gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
self.is_crowd = fluid.layers.data(
name='is_crowd', shape=[1], dtype='int32', lod_level=1)
self.im_info = fluid.layers.data(
name='im_info', shape=[3], dtype='float32')
self.im_id = fluid.layers.data(
name='im_id', shape=[1], dtype='int64')
self.difficult = fluid.layers.data(
name='difficult', shape=[1], dtype='float32', lod_level=1)
def feeds(self):
if self.mode == 'infer':
return [self.image, self.im_info]
if self.mode == 'val':
return [
feed_data = [
self.image, self.gt_box, self.gt_label, self.is_crowd,
self.im_info, self.im_id, self.difficult
]
return [
self.image, self.gt_box, self.gt_label, self.is_crowd, self.im_info,
self.im_id
]
if self.mode == 'train':
self.data_loader = fluid.io.DataLoader.from_generator(
feed_list=feed_data, capacity=64, iterable=False)
else:
self.data_loader = fluid.io.DataLoader.from_generator(
feed_list=feed_data, capacity=64, iterable=True)
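# With iterable=False the loader feeds the executor itself and is driven by
# data_loader.start()/reset() (see train.py); with iterable=True it acts as a
# Python generator whose batches are passed to exe.run via feed=data.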
def eval_bbox(self):
self.im_scale = fluid.layers.slice(
......@@ -151,23 +137,37 @@ class RRPN(object):
dimension = fluid.layers.fill_constant(
shape=[1, 1], value=2, dtype='int32')
cond = fluid.layers.less_than(dimension, res_dimension)
res = fluid.layers.create_global_var(
shape=[1, 10], value=0.0, dtype='float32', persistable=False)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(cond):
coordinate = fluid.layers.fill_constant(
shape=[9], value=0.0, dtype='float32')
pred_class = fluid.layers.fill_constant(
shape=[1], value=i + 1, dtype='float32')
add_class = fluid.layers.concat(
[pred_class, coordinate], axis=0)
normal_result = fluid.layers.elementwise_add(pred_result,
add_class)
fluid.layers.assign(normal_result, res)
with switch.default():
normal_result = fluid.layers.fill_constant(
shape=[1, 10], value=-1.0, dtype='float32')
fluid.layers.assign(normal_result, res)
def case1():
res = fluid.layers.create_global_var(
shape=[1, 10],
value=0.0,
dtype='float32',
persistable=False)
coordinate = fluid.layers.fill_constant(
shape=[9], value=0.0, dtype='float32')
pred_class = fluid.layers.fill_constant(
shape=[1], value=i + 1, dtype='float32')
add_class = fluid.layers.concat(
[pred_class, coordinate], axis=0)
normal_result = fluid.layers.elementwise_add(pred_result,
add_class)
fluid.layers.assign(normal_result, res)
return res
def case2():
res = fluid.layers.create_global_var(
shape=[1, 10],
value=0.0,
dtype='float32',
persistable=False)
normal_result = fluid.layers.fill_constant(
shape=[1, 10], value=-1.0, dtype='float32')
fluid.layers.assign(normal_result, res)
return res
res = fluid.layers.case(
pred_fn_pairs=[(cond, case1)], default=case2)
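# fluid.layers.case runs case1 when cond is True and case2 otherwise,
# replacing the deprecated Switch construct used previously.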
results.append(res)
if len(results) == 1:
self.pred_result = results[0]
......
......@@ -56,7 +56,7 @@ def get_device_num():
def train():
learning_rate = cfg.learning_rate
image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
#image_shape = [-1, 3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
devices_num = get_device_num()
total_batch_size = devices_num * cfg.TRAIN.im_per_batch
......@@ -71,7 +71,7 @@ def train():
add_roi_box_head_func=resnet.ResNetC5(),
use_pyreader=cfg.use_pyreader,
use_random=use_random)
model.build_model(image_shape)
model.build_model()
losses, keys, rpn_rois = model.loss()
loss = losses[0]
fetch_list = losses
......@@ -132,16 +132,16 @@ def train():
if num_trainers > 1:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
py_reader = model.py_reader
py_reader.decorate_paddle_reader(train_reader)
data_loader = model.data_loader
data_loader.set_sample_list_generator(train_reader, places=place)
else:
if num_trainers > 1: shuffle = False
train_reader = reader.train(
batch_size=total_batch_size, shuffle=shuffle)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
def train_loop_pyreader():
py_reader.start()
def train_loop():
data_loader.start()
train_stats = TrainingStats(cfg.log_window, keys)
try:
start_time = time.time()
......@@ -173,48 +173,9 @@ def train():
total_time = end_time - start_time
last_loss = np.array(outs[0]).mean()
except (StopIteration, fluid.core.EOFException):
py_reader.reset()
def train_loop():
start_time = time.time()
prev_start_time = start_time
start = start_time
train_stats = TrainingStats(cfg.log_window, keys)
for iter_id, data in enumerate(train_reader()):
prev_start_time = start_time
start_time = time.time()
if data[0][1].shape[0] == 0:
continue
outs = exe.run(compiled_train_prog,
fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data))
stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
train_stats.update(stats)
logs = train_stats.log()
if iter_id % 10 == 0:
strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
now_time(), iter_id,
np.mean(outs[-1]), logs, start_time - prev_start_time)
print(strs)
sys.stdout.flush()
if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0 and iter_id != 0:
save_name = "{}".format(iter_id + 1)
checkpoint.save(exe, train_prog,
os.path.join(cfg.model_save_dir, save_name))
if (iter_id + 1) == cfg.max_iter:
checkpoint.save(exe, train_prog,
os.path.join(cfg.model_save_dir, "model_final"))
break
end_time = time.time()
total_time = end_time - start_time
last_loss = np.array(outs[0]).mean()
data_loader.reset()
if cfg.use_pyreader:
train_loop_pyreader()
else:
train_loop()
train_loop()
if __name__ == '__main__':
......
......@@ -133,7 +133,6 @@ def parse_args():
add_arg('dataset', str, 'icdar2015', "icdar2015, icdar2017.")
add_arg('class_num', int, 2, "Class number.")
add_arg('data_dir', str, 'dataset/icdar2015', "The data root path.")
add_arg('use_pyreader', bool, False, "Use pyreader.")
add_arg('use_profile', bool, False, "Whether use profiler.")
add_arg('padding_minibatch',bool, False,
"If False, only resize image and not pad, image shape is different between"
......
# Word Embedding Model
## Note
This model is built with PaddlePaddle 1.6.3 for the PaddlePaddle course "Deep Learning from Scratch" and is intended for teaching.
The code below shows how to implement a word2vec model with PaddlePaddle; since it is written for teaching, the code is kept deliberately simple.
For more complex needs, please give feedback through an Issue, the QQ group, or other channels.
## Installation
1. PaddlePaddle 1.6.3 is required.
2. Python 3.7 or later is required.
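For example, a GPU build can be installed with pip (a sketch only; the exact wheel name and any CUDA `post` suffix depend on your environment, so check the official PaddlePaddle installation guide):
```shell
python3.7 -m pip install paddlepaddle-gpu==1.6.3
```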
## Run
Run the model with the following command:
```shell
CUDA_VISIBLE_DEVICES=0 python3.7 word2vec.py
```
#encoding=utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import sys
import requests
from collections import OrderedDict
import math
import random
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Embedding
# Download the corpus used to train word2vec
def download():
corpus_url = "https://dataset.bj.bcebos.com/word2vec/text8.txt"
web_request = requests.get(corpus_url)
corpus = web_request.content
with open("./text8.txt", "wb") as f:
f.write(corpus)
download()
# Load the text8 data
def load_text8():
corpus = []
with open("./text8.txt", "r") as f:
for line in f:
line = line.strip()
corpus.append(line)
return corpus
corpus = load_text8()
# Print the first 500 characters for a quick look at the corpus
print(corpus[0][:500])
# Preprocess the corpus: lower-case and split into words
def data_preprocess(corpus):
new_corpus = []
for line in corpus:
line = line.strip().lower()
line = line.split(" ")
new_corpus.append(line)
return new_corpus
corpus = data_preprocess(corpus)
# Build the vocabulary: count each word's frequency and map every word to an integer id by frequency rank
def build_dict(corpus, min_freq=3):
word_freq_dict = dict()
for line in corpus:
for word in line:
if word not in word_freq_dict:
word_freq_dict[word] = 0
word_freq_dict[word] += 1
word_freq_dict = sorted(
word_freq_dict.items(), key=lambda x: x[1], reverse=True)
word2id_dict = dict()
word2id_freq = dict()
id2word_dict = dict()
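# reserve id 0 for the out-of-vocabulary token [oov]; the counts of all
# words rarer than min_freq are folded into its frequency below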
word2id_freq[0] = 1.
word2id_dict['[oov]'] = 0
id2word_dict[0] = '[oov]'
for word, freq in word_freq_dict:
if freq < min_freq:
word2id_freq[0] += freq
continue
curr_id = len(word2id_dict)
word2id_dict[word] = curr_id
word2id_freq[word2id_dict[word]] = freq
id2word_dict[curr_id] = word
return word2id_freq, word2id_dict, id2word_dict
word2id_freq, word2id_dict, id2word_dict = build_dict(corpus)
vocab_size = len(word2id_freq)
print("there are totoally %d different words in the corpus" % vocab_size)
for _, (word, word_id) in zip(range(50), word2id_dict.items()):
print("word %s, its id %d, its word freq %d" %
(word, word_id, word2id_freq[word_id]))
# Convert the corpus into sequences of word ids
def convert_corpus_to_id(corpus, word2id_dict):
new_corpus = []
for line in corpus:
new_line = [
word2id_dict[word]
if word in word2id_dict else word2id_dict['[oov]'] for word in line
]
new_corpus.append(new_line)
return new_corpus
corpus = convert_corpus_to_id(corpus, word2id_dict)
# Apply subsampling to the corpus to improve training quality
def subsampling(corpus, word2id_freq):
def keep(word_id):
return random.uniform(0, 1) < math.sqrt(1e-4 / word2id_freq[word_id] *
len(corpus))
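# Mikolov-style subsampling: keep a word with probability sqrt(t / f(w)),
# t = 1e-4, where f(w) approximates the word's relative frequency (raw
# count divided by the number of corpus lines), so very frequent words
# are dropped more aggressively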
new_corpus = []
for line in corpus:
new_line = [word for word in line if keep(word)]
new_corpus.append(new_line)
return new_corpus
corpus = subsampling(corpus, word2id_freq)
# Build the dataset for model training
def build_data(corpus,
word2id_dict,
word2id_freq,
max_window_size=3,
negative_sample_num=10):
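# For every center word, nearby words within a random window become
# positive (center, context, 1) samples, which are complemented with
# negative_sample_num random vocabulary words labeled 0 (negative sampling).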
dataset = []
for line in corpus:
for center_word_idx in range(len(line)):
window_size = random.randint(1, max_window_size)
center_word = line[center_word_idx]
positive_word_range = (max(0, center_word_idx - window_size), min(
len(line) - 1, center_word_idx + window_size))
positive_word_candidates = [
line[idx]
for idx in range(positive_word_range[0], positive_word_range[1]
+ 1)
if idx != center_word_idx and line[idx] != line[center_word_idx]
]
if not positive_word_candidates:
continue
for positive_word in positive_word_candidates:
dataset.append((center_word, positive_word, 1))
i = 0
while i < negative_sample_num:
negative_word_candidate = random.randint(0, vocab_size - 1)
if negative_word_candidate not in positive_word_candidates:
dataset.append((center_word, negative_word_candidate, 0))
i += 1
return dataset
dataset = build_data(corpus, word2id_dict, word2id_freq)
for _, (center_word, target_word, label) in zip(range(50), dataset):
print("center_word %s, target %s, label %d" %
(id2word_dict[center_word], id2word_dict[target_word], label))
def build_batch(dataset, batch_size, epoch_num):
center_word_batch = []
target_word_batch = []
label_batch = []
eval_word_batch = []
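# eval_word_batch carries a few probe words per batch for qualitative
# checks: the first five ids come from the first 100 ids (the most
# frequent words), the next five from anywhere in the vocabulary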
for epoch in range(epoch_num):
random.shuffle(dataset)
for center_word, target_word, label in dataset:
center_word_batch.append([center_word])
target_word_batch.append([target_word])
label_batch.append(label)
if len(eval_word_batch) < 5:
eval_word_batch.append([random.randint(0, 99)])
elif len(eval_word_batch) < 10:
eval_word_batch.append([random.randint(0, vocab_size - 1)])
if len(center_word_batch) == batch_size:
yield np.array(center_word_batch).astype("int64"), np.array(
target_word_batch).astype("int64"), np.array(
label_batch).astype("float32"), np.array(
eval_word_batch).astype("int64")
center_word_batch = []
target_word_batch = []
label_batch = []
eval_word_batch = []
if len(center_word_batch) > 0:
yield np.array(center_word_batch).astype("int64"), np.array(
target_word_batch).astype("int64"), np.array(label_batch).astype(
"float32"), np.array(eval_word_batch).astype("int64")
for _, batch in zip(range(10), build_batch(dataset, 128, 3)):
print(batch)
# Define the skip-gram network
class SkipGram(fluid.dygraph.Layer):
def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
super(SkipGram, self).__init__(name_scope)
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.embedding = Embedding(
self.full_name(),
size=[self.vocab_size, self.embedding_size],
dtype='float32',
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-0.5 / self.embedding_size,
high=0.5 / self.embedding_size)))
self.embedding_out = Embedding(
self.full_name(),
size=[self.vocab_size, self.embedding_size],
dtype='float32',
param_attr=fluid.ParamAttr(
name='embedding_out_para',
initializer=fluid.initializer.UniformInitializer(
low=-0.5 / self.embedding_size,
high=0.5 / self.embedding_size)))
def forward(self, center_words, target_words, label):
center_words_emb = self.embedding(center_words)
target_words_emb = self.embedding_out(target_words)
# center_words_emb = [batch_size, embedding_size]
# target_words_emb = [batch_size, embedding_size]
word_sim = fluid.layers.elementwise_mul(center_words_emb,
target_words_emb)
word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)
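# summing the elementwise product over the embedding axis gives the dot
# product of the two embeddings, used as the logit for this pair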
pred = fluid.layers.sigmoid(word_sim)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
loss = fluid.layers.reduce_mean(loss)
return pred, loss
# Start training
batch_size = 512
epoch_num = 3
embedding_size = 200
step = 0
learning_rate = 1e-3
total_steps = len(dataset) * epoch_num // batch_size
def get_similar_tokens(query_token, k, embed):
W = embed.numpy()
x = W[word2id_dict[query_token]]
cos = np.dot(W, x) / np.sqrt(np.sum(W * W, axis=1) * np.sum(x * x) + 1e-9)
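# cosine similarity between the query vector and every embedding row;
# the 1e-9 term guards against division by zero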
flat = cos.flatten()
indices = np.argpartition(flat, -k)[-k:]
indices = indices[np.argsort(-flat[indices])]
for i in indices:  # NB: the query word itself is not filtered out and may appear among the results
print('for word %s, the similar word is %s' %
(query_token, str(id2word_dict[i])))
with fluid.dygraph.guard(fluid.CUDAPlace(0)):
skip_gram_model = SkipGram("skip_gram_model", vocab_size, embedding_size)
adam = fluid.optimizer.AdamOptimizer(learning_rate=learning_rate)
for center_words, target_words, label, eval_words in build_batch(
dataset, batch_size, epoch_num):
center_words_var = fluid.dygraph.to_variable(center_words)
target_words_var = fluid.dygraph.to_variable(target_words)
label_var = fluid.dygraph.to_variable(label)
pred, loss = skip_gram_model(center_words_var, target_words_var,
label_var)
loss.backward()
adam.minimize(loss)
skip_gram_model.clear_gradients()
step += 1
if step % 100 == 0:
print("step %d / %d, loss %.3f" %
(step, total_steps, loss.numpy()[0]))
if step % 10000 == 0:
get_similar_tokens('king', 5, skip_gram_model.embedding._w)
get_similar_tokens('one', 5, skip_gram_model.embedding._w)
get_similar_tokens('chip', 5, skip_gram_model.embedding._w)
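# Optional sketch (not part of the original script): persist the learned
# input-embedding matrix with numpy for later reuse
np.save("word_embedding.npy", skip_gram_model.embedding._w.numpy())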