From 5bc12f64a4aea66013a7b806065dc8d0f9f2299a Mon Sep 17 00:00:00 2001
From: pkpk
Date: Fri, 28 Feb 2020 11:43:37 +0800
Subject: [PATCH] Add word2vec dygraph model (#4357)

* Update README.md (#4267)

* test=develop (#4269)

* 3d use new api (#4275)

* PointNet++ and PointRCNN use new API

* Update Readme of Dygraph BERT (#4277)

Fix some typos.

* Update run_classifier_multi_gpu.sh (#4279)

remove the CUDA_VISIBLE_DEVICES

* Update README.md (#4280)

* 1.7 update api (#4294)

* update 1.7 save/load & fluid.data

* update datafeed to dataloader

* Update resnet_acnet.py (#4297)

The bias attr of the square conv should be "False" rather than None in training mode.

* move danet to Paddle/Contrib (#4285)

* update new api for rrpn (#4296)

* Fix transformer save_inference_model (#4306)

* upgrade save and load interface (#4311)

* upgrade dcn and xdeepfm, change all old save/load APIs to fluid.save and fluid.load

* test=develop

* modify save and load to 1.7 api for rrpn (#4310)

* modify save and load to 1.7 api

* add func to load params

* Add VOT models (#4257)

* First version of the VOT models.

* Include SiamFC and ATOM.

* A unified architecture for ATOM and the Siamese series models.

* update vot code (#4338)

* [VOT] Remove local.py generate step, add tracking gif to README (#4344)

* update vot code

* remove local.py generate step, add tracking gif to README

* fix word usage in readme

* add got10k download website

* add pip install paddlepaddle-gpu

* fix word usage

* do not print stack frame when the train process is killed (#4346)

* do not print stack frame when the train process is killed

* add a note that VOT does not support the Windows platform

* test=develop

add word2vec demo

Co-authored-by: Kaipeng Deng
Co-authored-by: zhang wenhui
Co-authored-by: parap1uie-s
Co-authored-by: wangguanzhong
Co-authored-by: chengjuntao <18222160892@163.com>
Co-authored-by: liu zhengxi <380185688@qq.com>
Co-authored-by: xujiaqi01 <173596896@qq.com>
Co-authored-by: Double_V
---
 PaddleCV/rrpn/checkpoint.py           |  79 ++++---
 PaddleCV/rrpn/eval.py                 |  19 +-
 PaddleCV/rrpn/eval_helper.py          |  16 +-
 PaddleCV/rrpn/infer.py                |  21 +-
 PaddleCV/rrpn/models/model_builder.py | 128 +++++-----
 PaddleCV/rrpn/train.py                |  55 +----
 PaddleCV/rrpn/utility.py              |   1 -
 dygraph/word2vec/README.md            |  17 ++
 dygraph/word2vec/word2vec.py          | 327 ++++++++++++++++++++++++++
 9 files changed, 478 insertions(+), 185 deletions(-)
 create mode 100644 dygraph/word2vec/README.md
 create mode 100644 dygraph/word2vec/word2vec.py

diff --git a/PaddleCV/rrpn/checkpoint.py b/PaddleCV/rrpn/checkpoint.py
index 7062199e..d4c37008 100644
--- a/PaddleCV/rrpn/checkpoint.py
+++ b/PaddleCV/rrpn/checkpoint.py
@@ -28,6 +28,19 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+def _load_state(path):
+    if os.path.exists(path + '.pdopt'):
+        # XXX another hack to ignore the optimizer state
+        tmp = tempfile.mkdtemp()
+        dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
+        shutil.copy(path + '.pdparams', dst + '.pdparams')
+        state = fluid.io.load_program_state(dst)
+        shutil.rmtree(tmp)
+    else:
+        state = fluid.io.load_program_state(path)
+    return state
+
+
 def load_params(exe, prog, path):
     """
     Load model from the given path.
@@ -64,7 +77,7 @@ def save(exe, prog, path):
     if os.path.isdir(path):
         shutil.rmtree(path)
     logger.info('Save model to {}.'.format(path))
-    fluid.io.save_persistables(exe, path, prog)
+    fluid.save(prog, path)
 
 
 def load_and_fusebn(exe, prog, path):
@@ -81,15 +94,6 @@
     if not os.path.exists(path):
         raise ValueError("Model path {} does not exists.".format(path))
 
-    def _if_exist(var):
-        b = os.path.exists(os.path.join(path, var.name))
-
-        if b:
-            logger.debug('load weight {}'.format(var.name))
-        return b
-
-    all_vars = list(filter(_if_exist, prog.list_vars()))
-
     # Since the program uses affine-channel, there is no running mean and var
     # in the program, here append running mean and var.
     # NOTE, the params of batch norm should be like:
@@ -101,15 +105,25 @@
     mean_variances = set()
     bn_vars = []
-    bn_in_path = True
+    state = None
+    if os.path.exists(path + '.pdparams'):
+        state = _load_state(path)
 
-    inner_prog = fluid.Program()
-    inner_start_prog = fluid.Program()
-    inner_block = inner_prog.global_block()
-    with fluid.program_guard(inner_prog, inner_start_prog):
+    def check_mean_and_bias(prefix):
+        m = prefix + 'mean'
+        v = prefix + 'variance'
+        if state:
+            return v in state and m in state
+        else:
+            return (os.path.exists(os.path.join(path, m)) and
+                    os.path.exists(os.path.join(path, v)))
+
+    has_mean_bias = True
+
+    with fluid.program_guard(prog, fluid.Program()):
         for block in prog.blocks:
             ops = list(block.ops)
-            if not bn_in_path:
+            if not has_mean_bias:
                 break
             for op in ops:
                 if op.type == 'affine_channel':
@@ -119,28 +133,22 @@
                     prefix = scale_name[:-5]
                     mean_name = prefix + 'mean'
                     variance_name = prefix + 'variance'
-
-                    if not os.path.exists(os.path.join(path, mean_name)):
-                        bn_in_path = False
-                        break
-                    if not os.path.exists(os.path.join(path, variance_name)):
-                        bn_in_path = False
+                    if not check_mean_and_bias(prefix):
+                        has_mean_bias = False
                         break
 
                     bias = block.var(bias_name)
-                    mean_vb = inner_block.create_var(
+                    mean_vb = block.create_var(
                         name=mean_name,
                         type=bias.type,
                         shape=bias.shape,
-                        dtype=bias.dtype,
-                        persistable=True)
-                    variance_vb = inner_block.create_var(
+                        dtype=bias.dtype)
+                    variance_vb = block.create_var(
                         name=variance_name,
                         type=bias.type,
                         shape=bias.shape,
-                        dtype=bias.dtype,
-                        persistable=True)
 
                     mean_variances.add(mean_vb)
                     mean_variances.add(variance_vb)
@@ -148,21 +156,16 @@
                     bn_vars.append(
                         [scale_name, bias_name, mean_name, variance_name])
 
-    if not bn_in_path:
-        fluid.io.load_vars(exe, path, prog, vars=all_vars)
+    if state:
+        fluid.io.set_program_state(prog, state)
+    else:
+        load_params(exe, prog, path)
+    if not has_mean_bias:
         logger.warning(
             "There is no paramters of batch norm in model {}. "
            "Skip to fuse batch norm. And load paramters done.".format(path))
         return
 
-    # load running mean and running variance on cpu place into global scope.
-    place = fluid.CPUPlace()
-    exe_cpu = fluid.Executor(place)
-    fluid.io.load_vars(exe_cpu, path, vars=[v for v in mean_variances])
-
-    # load params on real place into global scope.
-    fluid.io.load_vars(exe, path, prog, vars=all_vars)
-
     eps = 1e-5
     for names in bn_vars:
         scale_name, bias_name, mean_name, var_name = names
diff --git a/PaddleCV/rrpn/eval.py b/PaddleCV/rrpn/eval.py
index bf773207..5f0d94b8 100755
--- a/PaddleCV/rrpn/eval.py
+++ b/PaddleCV/rrpn/eval.py
@@ -36,7 +36,6 @@ def eval():
 
     place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
-    image_shape = [3, cfg.TEST.max_size, cfg.TEST.max_size]
     class_nums = cfg.class_num
     model = model_builder.RRPN(
         add_conv_body_func=resnet.ResNet(),
@@ -48,19 +47,14 @@ def eval():
     infer_prog = fluid.Program()
     with fluid.program_guard(infer_prog, startup_prog):
         with fluid.unique_name.guard():
-            model.build_model(image_shape)
+            model.build_model()
             pred_boxes = model.eval_bbox_out()
     infer_prog = infer_prog.clone(True)
     exe.run(startup_prog)
-
-    # yapf: disable
-    def if_exist(var):
-        return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
-    if cfg.pretrained_model:
-        checkpoint.load_params(exe, infer_prog, cfg.pretrained_model)
-    # yapf: enable
+    fluid.load(infer_prog, cfg.pretrained_model, exe)
     test_reader = reader.test(1)
-    feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
+    data_loader = model.data_loader
+    data_loader.set_sample_list_generator(test_reader, places=place)
 
     fetch_list = [pred_boxes]
     res_list = []
@@ -68,11 +62,10 @@
         'bbox', 'gt_box', 'gt_class', 'is_crowed', 'im_info', 'im_id',
         'is_difficult'
     ]
-    for i, data in enumerate(test_reader()):
-        im_info = [data[0][1]]
+    for i, data in enumerate(data_loader()):
         result = exe.run(infer_prog,
                          fetch_list=[v.name for v in fetch_list],
-                         feed=feeder.feed(data),
+                         feed=data,
                          return_numpy=False)
         pred_boxes_v = result[0]
         nmsed_out = pred_boxes_v
diff --git a/PaddleCV/rrpn/eval_helper.py b/PaddleCV/rrpn/eval_helper.py
index c9e66e67..9dbbac36 100755
--- a/PaddleCV/rrpn/eval_helper.py
+++ b/PaddleCV/rrpn/eval_helper.py
@@ -31,11 +31,11 @@ logger = logging.getLogger(__name__)
 
 def get_key_dict(out, data, key):
     res = {}
-    for i in range(len(key)):
-        if i == 0:
-            res[key[i]] = out
+    for name in key:
+        if name == 'bbox':
+            res[name] = np.array(out)
         else:
-            res[key[i]] = data[i]
+            res[name] = np.array(data[name])
     return res
 
 
@@ -167,7 +167,7 @@ def calculate_ap(rec, prec):
 def icdar_map(result, class_name, ovthresh):
     im_ids = []
     for res in result:
-        im_ids.append(res['im_id'])
+        im_ids.append(res['im_id'][0][0])
     recs = {}
 
     for i, im_id in enumerate(im_ids):
@@ -185,11 +185,11 @@ def icdar_map(result, class_name, ovthresh):
         confidence = []
         bbox = []
         for res in result:
-            im_info = res['im_info']
+            im_info = res['im_info'][0]
             pred_boxes = res['bbox']
             for box in pred_boxes:
                 if box[0] == class_name:
-                    image_ids.append(res['im_id'])
+                    image_ids.append(res['im_id'][0][0])
                     confidence.append(box[1])
                     clipd_box = clip_box(box[2:].reshape(-1, 8), im_info)
                     bbox.append(clipd_box[0])
@@ -286,7 +286,7 @@ def icdar_box_eval(result, thresh):
     num_global_care_gt = 0
     num_global_care_det = 0
     for res in result:
-        im_info = res['im_info']
+        im_info = res['im_info'][0]
         h = im_info[1]
         w = im_info[2]
         gt_boxes = res['gt_box']
diff --git a/PaddleCV/rrpn/infer.py b/PaddleCV/rrpn/infer.py
index 3af9d21c..566afaac 100755
--- a/PaddleCV/rrpn/infer.py
+++ b/PaddleCV/rrpn/infer.py
@@ -32,7 +32,6 @@ from utility import print_arguments, parse_args, check_gpu
 def infer():
 
     place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
-    image_shape = [3, cfg.TEST.max_size, cfg.TEST.max_size]
     class_nums = cfg.class_num
     model = model_builder.RRPN(
         add_conv_body_func=resnet.ResNet(),
@@ -43,31 +42,25 @@
     infer_prog = fluid.Program()
     with fluid.program_guard(infer_prog, startup_prog):
         with fluid.unique_name.guard():
-            model.build_model(image_shape)
+            model.build_model()
             pred_boxes = model.eval_bbox_out()
     infer_prog = infer_prog.clone(True)
     exe.run(startup_prog)
-
-    # yapf: disable
-    def if_exist(var):
-        return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
-    if cfg.pretrained_model:
-        checkpoint.load_params(exe, infer_prog, cfg.pretrained_model)
-    # yapf: enable
+    fluid.load(infer_prog, cfg.pretrained_model, exe)
     infer_reader = reader.infer(cfg.image_path)
-    feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
-
+    data_loader = model.data_loader
+    data_loader.set_sample_list_generator(infer_reader, places=place)
     fetch_list = [pred_boxes]
     imgs = os.listdir(cfg.image_path)
     imgs.sort()
 
-    for i, data in enumerate(infer_reader()):
+    for i, data in enumerate(data_loader()):
         result = exe.run(infer_prog,
                          fetch_list=[v.name for v in fetch_list],
-                         feed=feeder.feed(data),
+                         feed=data,
                          return_numpy=False)
         nmsed_out = result[0]
-        im_info = data[0][1]
+        im_info = np.array(data[0]['im_info'])[0]
         im_scale = im_info[2]
         outs = np.array(nmsed_out)
         draw_bounding_box_on_image(cfg.image_path, imgs[i], outs, im_scale,
diff --git a/PaddleCV/rrpn/models/model_builder.py b/PaddleCV/rrpn/models/model_builder.py
index 1f976fac..e37e940e 100755
--- a/PaddleCV/rrpn/models/model_builder.py
+++ b/PaddleCV/rrpn/models/model_builder.py
@@ -35,8 +35,8 @@ class RRPN(object):
         self.use_pyreader = use_pyreader
         self.use_random = use_random
 
-    def build_model(self, image_shape):
-        self.build_input(image_shape)
+    def build_model(self):
+        self.build_input()
         body_conv = self.add_conv_body_func(self.image)
         # RPN
         self.rpn_heads(body_conv)
@@ -61,56 +61,42 @@ class RRPN(object):
     def eval_bbox_out(self):
         return self.pred_result
 
-    def build_input(self, image_shape):
-        if self.use_pyreader:
-            in_shapes = [[-1] + image_shape, [-1, 5], [-1, 1], [-1, 1],
-                         [-1, 3], [-1, 1]]
-            lod_levels = [0, 1, 1, 1, 0, 0]
-            dtypes = [
-                'float32', 'float32', 'int32', 'int32', 'float32', 'int64'
+    def build_input(self):
+        self.image = fluid.data(
+            name='image', shape=[None, 3, None, None], dtype='float32')
+        if self.mode == 'train':
+            self.gt_box = fluid.data(
+                name='gt_box', shape=[None, 5], dtype='float32', lod_level=1)
+        else:
+            self.gt_box = fluid.data(
+                name='gt_box', shape=[None, 8], dtype='float32', lod_level=1)
+        self.gt_label = fluid.data(
+            name='gt_class', shape=[None, 1], dtype='int32', lod_level=1)
+        self.is_crowd = fluid.data(
+            name='is_crowed', shape=[None, 1], dtype='int32', lod_level=1)
+        self.im_info = fluid.data(
+            name='im_info', shape=[None, 3], dtype='float32')
+        self.im_id = fluid.data(name='im_id', shape=[None, 1], dtype='int64')
+        self.difficult = fluid.data(
+            name='is_difficult', shape=[None, -1], dtype='float32', lod_level=1)
+        if self.mode == 'train':
+            feed_data = [
+                self.image, self.gt_box, self.gt_label, self.is_crowd,
+                self.im_info, self.im_id
             ]
-            self.py_reader = fluid.layers.py_reader(
-                capacity=64,
-                shapes=in_shapes,
-                lod_levels=lod_levels,
-                dtypes=dtypes,
-                use_double_buffer=True)
-            ins = fluid.layers.read_file(self.py_reader)
-            self.image = ins[0]
-            self.gt_box = ins[1]
-            self.gt_label = ins[2]
-            self.is_crowd = ins[3]
-            self.im_info = ins[4]
-            self.im_id = ins[5]
+        elif self.mode == 'infer':
+            feed_data = [self.image, self.im_info]
         else:
-            self.image = fluid.layers.data(
-                name='image', shape=image_shape, dtype='float32')
-            self.gt_box = fluid.layers.data(
-                name='gt_box', shape=[4], dtype='float32', lod_level=1)
-            self.gt_label = fluid.layers.data(
-                name='gt_label', shape=[1], dtype='int32', lod_level=1)
-            self.is_crowd = fluid.layers.data(
-                name='is_crowd', shape=[1], dtype='int32', lod_level=1)
-            self.im_info = fluid.layers.data(
-                name='im_info', shape=[3], dtype='float32')
-            self.im_id = fluid.layers.data(
-                name='im_id', shape=[1], dtype='int64')
-
-        self.difficult = fluid.layers.data(
-            name='difficult', shape=[1], dtype='float32', lod_level=1)
-
-    def feeds(self):
-        if self.mode == 'infer':
-            return [self.image, self.im_info]
-        if self.mode == 'val':
-            return [
+            feed_data = [
                 self.image, self.gt_box, self.gt_label, self.is_crowd,
                 self.im_info, self.im_id, self.difficult
             ]
-        return [
-            self.image, self.gt_box, self.gt_label, self.is_crowd, self.im_info,
-            self.im_id
-        ]
+        if self.mode == 'train':
+            self.data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=feed_data, capacity=64, iterable=False)
+        else:
+            self.data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=feed_data, capacity=64, iterable=True)
 
     def eval_bbox(self):
         self.im_scale = fluid.layers.slice(
@@ -151,23 +137,37 @@ class RRPN(object):
             dimension = fluid.layers.fill_constant(
                 shape=[1, 1], value=2, dtype='int32')
             cond = fluid.layers.less_than(dimension, res_dimension)
-            res = fluid.layers.create_global_var(
-                shape=[1, 10], value=0.0, dtype='float32', persistable=False)
-            with fluid.layers.control_flow.Switch() as switch:
-                with switch.case(cond):
-                    coordinate = fluid.layers.fill_constant(
-                        shape=[9], value=0.0, dtype='float32')
-                    pred_class = fluid.layers.fill_constant(
-                        shape=[1], value=i + 1, dtype='float32')
-                    add_class = fluid.layers.concat(
-                        [pred_class, coordinate], axis=0)
-                    normal_result = fluid.layers.elementwise_add(pred_result,
-                                                                 add_class)
-                    fluid.layers.assign(normal_result, res)
-                with switch.default():
-                    normal_result = fluid.layers.fill_constant(
-                        shape=[1, 10], value=-1.0, dtype='float32')
-                    fluid.layers.assign(normal_result, res)
+
+            def case1():
+                res = fluid.layers.create_global_var(
+                    shape=[1, 10],
+                    value=0.0,
+                    dtype='float32',
+                    persistable=False)
+                coordinate = fluid.layers.fill_constant(
+                    shape=[9], value=0.0, dtype='float32')
+                pred_class = fluid.layers.fill_constant(
+                    shape=[1], value=i + 1, dtype='float32')
+                add_class = fluid.layers.concat(
+                    [pred_class, coordinate], axis=0)
+                normal_result = fluid.layers.elementwise_add(pred_result,
+                                                             add_class)
+                fluid.layers.assign(normal_result, res)
+                return res
+
+            def case2():
+                res = fluid.layers.create_global_var(
+                    shape=[1, 10],
+                    value=0.0,
+                    dtype='float32',
+                    persistable=False)
+                normal_result = fluid.layers.fill_constant(
+                    shape=[1, 10], value=-1.0, dtype='float32')
+                fluid.layers.assign(normal_result, res)
+                return res
+
+            res = fluid.layers.case(
+                pred_fn_pairs=[(cond, case1)], default=case2)
             results.append(res)
         if len(results) == 1:
             self.pred_result = results[0]
diff --git a/PaddleCV/rrpn/train.py b/PaddleCV/rrpn/train.py
index 11dafa99..5f451929 100755
--- a/PaddleCV/rrpn/train.py
+++ b/PaddleCV/rrpn/train.py
@@ -56,7 +56,7 @@ def get_device_num():
 
 def train():
     learning_rate = cfg.learning_rate
-    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
+    #image_shape = [-1, 3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
 
     devices_num = get_device_num()
     total_batch_size = devices_num * cfg.TRAIN.im_per_batch
@@ -71,7 +71,7 @@ def train():
         add_roi_box_head_func=resnet.ResNetC5(),
         use_pyreader=cfg.use_pyreader,
         use_random=use_random)
-    model.build_model(image_shape)
+    model.build_model()
     losses, keys, rpn_rois = model.loss()
     loss = losses[0]
     fetch_list = losses
@@ -132,16 +132,16 @@ def train():
         if num_trainers > 1:
             train_reader = fluid.contrib.reader.distributed_batch_reader(
                 train_reader)
-        py_reader = model.py_reader
-        py_reader.decorate_paddle_reader(train_reader)
+        data_loader = model.data_loader
+        data_loader.set_sample_list_generator(train_reader, places=place)
     else:
         if num_trainers > 1:
             shuffle = False
         train_reader = reader.train(
             batch_size=total_batch_size, shuffle=shuffle)
         feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
 
-    def train_loop_pyreader():
-        py_reader.start()
+    def train_loop():
+        data_loader.start()
         train_stats = TrainingStats(cfg.log_window, keys)
         try:
             start_time = time.time()
@@ -173,48 +173,9 @@ def train():
                 total_time = end_time - start_time
                 last_loss = np.array(outs[0]).mean()
         except (StopIteration, fluid.core.EOFException):
-            py_reader.reset()
-
-    def train_loop():
-        start_time = time.time()
-        prev_start_time = start_time
-        start = start_time
-        train_stats = TrainingStats(cfg.log_window, keys)
-        for iter_id, data in enumerate(train_reader()):
-            prev_start_time = start_time
-            start_time = time.time()
-            if data[0][1].shape[0] == 0:
-                continue
-
-            outs = exe.run(compiled_train_prog,
-                           fetch_list=[v.name for v in fetch_list],
-                           feed=feeder.feed(data))
-            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
-            train_stats.update(stats)
-            logs = train_stats.log()
-            if iter_id % 10 == 0:
-                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
-                    now_time(), iter_id,
-                    np.mean(outs[-1]), logs, start_time - prev_start_time)
-                print(strs)
-                sys.stdout.flush()
-            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0 and iter_id != 0:
-                save_name = "{}".format(iter_id + 1)
-                checkpoint.save(exe, train_prog,
-                                os.path.join(cfg.model_save_dir, save_name))
-            if (iter_id + 1) == cfg.max_iter:
-                checkpoint.save(exe, train_prog,
-                                os.path.join(cfg.model_save_dir, "model_final"))
-                break
-
-        end_time = time.time()
-        total_time = end_time - start_time
-        last_loss = np.array(outs[0]).mean()
+            data_loader.reset()
 
-    if cfg.use_pyreader:
-        train_loop_pyreader()
-    else:
-        train_loop()
+    train_loop()
 
 
 if __name__ == '__main__':
diff --git a/PaddleCV/rrpn/utility.py b/PaddleCV/rrpn/utility.py
index d737d3e7..226f6e37 100755
--- a/PaddleCV/rrpn/utility.py
+++ b/PaddleCV/rrpn/utility.py
@@ -133,7 +133,6 @@ def parse_args():
     add_arg('dataset',          str,   'icdar2015',        "icdar2015, icdar2017.")
     add_arg('class_num',        int,   2,                  "Class number.")
     add_arg('data_dir',         str,   'dataset/icdar2015', "The data root path.")
-    add_arg('use_pyreader',     bool,  False,              "Use pyreader.")
    add_arg('use_profile',      bool,  False,              "Whether use profiler.")
     add_arg('padding_minibatch',bool,  False,
             "If False, only resize image and not pad, image shape is different between"
diff --git a/dygraph/word2vec/README.md b/dygraph/word2vec/README.md
new file mode 100644
index 00000000..f53de9b9
--- /dev/null
+++ b/dygraph/word2vec/README.md
@@ -0,0 +1,17 @@
+# Word2vec model
+
+## Note
+This model was developed with PaddlePaddle 1.6.3 for the PaddlePaddle course 零基础深度学习 (deep learning for beginners) and is intended for teaching use.
+The code below shows how to implement the word2vec model with PaddlePaddle; since it is written for teaching, the code is kept deliberately simple.
+For more complex needs, please send feedback through an Issue, the QQ group, or other channels.
+
+
+## Installation
+1. PaddlePaddle 1.6.3 is required
+2. Python 3.7 or later is required
+
+## Run
+Run the model with the following command:
+```shell
+CUDA_VISIBLE_DEVICES=0 python3.7 word2vec.py
+```
diff --git a/dygraph/word2vec/word2vec.py b/dygraph/word2vec/word2vec.py
new file mode 100644
index 00000000..76368c6c
--- /dev/null
+++ b/dygraph/word2vec/word2vec.py
@@ -0,0 +1,327 @@
+#encoding=utf-8
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import io
+import os
+import sys
+import requests
+from collections import OrderedDict
+import math
+import random
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid.dygraph.nn import Embedding
+
+
+# Download the corpus used to train word2vec
+def download():
+
+    corpus_url = "https://dataset.bj.bcebos.com/word2vec/text8.txt"
+    web_request = requests.get(corpus_url)
+    corpus = web_request.content
+
+    with open("./text8.txt", "wb") as f:
+        f.write(corpus)
+    f.close()
+
+
+download()
+
+
+# Read the text8 data
+def load_text8():
+    corpus = []
+    with open("./text8.txt", "r") as f:
+        for line in f:
+            line = line.strip()
+            corpus.append(line)
+    f.close()
+
+    return corpus
+
+
+corpus = load_text8()
+
+print(corpus[0][:500])  # print the first 500 characters to take a quick look at the corpus
+
+
+# Preprocess (tokenize) the corpus
+def data_preprocess(corpus):
+    new_corpus = []
+    for line in corpus:
+        line = line.strip().lower()
+        line = line.split(" ")
+        new_corpus.append(line)
+
+    return new_corpus
+
+
+corpus = data_preprocess(corpus)
+
+
+# Build the vocabulary: count each word's frequency and map each word to an integer id by frequency
+def build_dict(corpus, min_freq=3):
+    word_freq_dict = dict()
+    for line in corpus:
+        for word in line:
+            if word not in word_freq_dict:
+                word_freq_dict[word] = 0
+            word_freq_dict[word] += 1
+
+    word_freq_dict = sorted(
+        word_freq_dict.items(), key=lambda x: x[1], reverse=True)
+
+    word2id_dict = dict()
+    word2id_freq = dict()
+    id2word_dict = dict()
+
+    word2id_freq[0] = 1.
+    word2id_dict['[oov]'] = 0
+    id2word_dict[0] = '[oov]'
+
+    for word, freq in word_freq_dict:
+
+        if freq < min_freq:
+            word2id_freq[0] += freq
+            continue
+
+        curr_id = len(word2id_dict)
+        word2id_dict[word] = curr_id
+        word2id_freq[word2id_dict[word]] = freq
+        id2word_dict[curr_id] = word
+
+    return word2id_freq, word2id_dict, id2word_dict
+
+
+word2id_freq, word2id_dict, id2word_dict = build_dict(corpus)
+vocab_size = len(word2id_freq)
+print("there are totally %d different words in the corpus" % vocab_size)
+for _, (word, word_id) in zip(range(50), word2id_dict.items()):
+    print("word %s, its id %d, its word freq %d" %
+          (word, word_id, word2id_freq[word_id]))
+
+
+# Convert the corpus into sequences of word ids
+def convert_corpus_to_id(corpus, word2id_dict):
+    new_corpus = []
+    for line in corpus:
+        new_line = [
+            word2id_dict[word]
+            if word in word2id_dict else word2id_dict['[oov]'] for word in line
+        ]
+        new_corpus.append(new_line)
+    return new_corpus
+
+
+corpus = convert_corpus_to_id(corpus, word2id_dict)
+
+
+# Apply subsampling to the corpus to strengthen the training effect
+def subsampling(corpus, word2id_freq):
+    def keep(word_id):
+        return random.uniform(0, 1) < math.sqrt(1e-4 / word2id_freq[word_id] *
+                                                len(corpus))
+
+    new_corpus = []
+    for line in corpus:
+        new_line = [word for word in line if keep(word)]
+        new_corpus.append(new_line)
+    return new_corpus
+
+
+corpus = subsampling(corpus, word2id_freq)
+
+
+# Build the dataset to prepare for model training
+def build_data(corpus,
+               word2id_dict,
+               word2id_freq,
+               max_window_size=3,
+               negative_sample_num=10):
+
+    dataset = []
+
+    for line in corpus:
+        for center_word_idx in range(len(line)):
+            window_size = random.randint(1, max_window_size)
+            center_word = line[center_word_idx]
+
+            positive_word_range = (max(0, center_word_idx - window_size), min(
+                len(line) - 1, center_word_idx + window_size))
+            positive_word_candidates = [
+                line[idx]
+                for idx in range(positive_word_range[0], positive_word_range[1]
+                                 + 1)
+                if idx != center_word_idx and line[idx] != line[center_word_idx]
+            ]
+
+            if not positive_word_candidates:
+                continue
+
+            for positive_word in positive_word_candidates:
+                dataset.append((center_word, positive_word, 1))
+
+            i = 0
+            while i < negative_sample_num:
+                negative_word_candidate = random.randint(0, vocab_size - 1)
+
+                if negative_word_candidate not in positive_word_candidates:
+                    dataset.append((center_word, negative_word_candidate, 0))
+                    i += 1
+
+    return dataset
+
+
+dataset = build_data(corpus, word2id_dict, word2id_freq)
+for _, (center_word, target_word, label) in zip(range(50), dataset):
+    print("center_word %s, target %s, label %d" %
+          (id2word_dict[center_word], id2word_dict[target_word], label))
+
+
+def build_batch(dataset, batch_size, epoch_num):
+
+    center_word_batch = []
+    target_word_batch = []
+    label_batch = []
+    eval_word_batch = []
+
+    for epoch in range(epoch_num):
+
+        random.shuffle(dataset)
+
+        for center_word, target_word, label in dataset:
+            center_word_batch.append([center_word])
+            target_word_batch.append([target_word])
+            label_batch.append(label)
+
+            if len(eval_word_batch) < 5:
+                eval_word_batch.append([random.randint(0, 99)])
+            elif len(eval_word_batch) < 10:
+                eval_word_batch.append([random.randint(0, vocab_size - 1)])
+
+            if len(center_word_batch) == batch_size:
+                yield np.array(center_word_batch).astype("int64"), np.array(
+                    target_word_batch).astype("int64"), np.array(
+                        label_batch).astype("float32"), np.array(
+                            eval_word_batch).astype("int64")
+                center_word_batch = []
+                target_word_batch = []
+                label_batch = []
+                eval_word_batch = []
+
+    if len(center_word_batch) > 0:
+        yield np.array(center_word_batch).astype("int64"), np.array(
+            target_word_batch).astype("int64"), np.array(label_batch).astype(
+                "float32"), np.array(eval_word_batch).astype("int64")
+
+
+for _, batch in zip(range(10), build_batch(dataset, 128, 3)):
+    print(batch)
+
+# Define the skip-gram network structure
+
+
+class SkipGram(fluid.dygraph.Layer):
+    def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
+        super(SkipGram, self).__init__(name_scope)
+        self.vocab_size = vocab_size
+        self.embedding_size = embedding_size
+
+        self.embedding = Embedding(
+            self.full_name(),
+            size=[self.vocab_size, self.embedding_size],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                name='embedding_para',
+                initializer=fluid.initializer.UniformInitializer(
+                    low=-0.5 / self.embedding_size,
+                    high=0.5 / self.embedding_size)))
+
+        self.embedding_out = Embedding(
+            self.full_name(),
+            size=[self.vocab_size, self.embedding_size],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                name='embedding_out_para',
+                initializer=fluid.initializer.UniformInitializer(
+                    low=-0.5 / self.embedding_size,
+                    high=0.5 / self.embedding_size)))
+
+    def forward(self, center_words, target_words, label):
+        center_words_emb = self.embedding(center_words)
+        target_words_emb = self.embedding_out(target_words)
+
+        # center_words_emb = [batch_size, embedding_size]
+        # target_words_emb = [batch_size, embedding_size]
+        word_sim = fluid.layers.elementwise_mul(center_words_emb,
+                                                target_words_emb)
+        word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)
+
+        pred = fluid.layers.sigmoid(word_sim)
+
+        loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
+        loss = fluid.layers.reduce_mean(loss)
+
+        return pred, loss
+
+
+# Start training
+batch_size = 512
+epoch_num = 3
+embedding_size = 200
+step = 0
+learning_rate = 1e-3
+total_steps = len(dataset) * epoch_num // batch_size
+
+
+def get_similar_tokens(query_token, k, embed):
+    W = embed.numpy()
+    x = W[word2id_dict[query_token]]
+    cos = np.dot(W, x) / np.sqrt(np.sum(W * W, axis=1) * np.sum(x * x) + 1e-9)
+    flat = cos.flatten()
+    indices = np.argpartition(flat, -k)[-k:]
+    indices = indices[np.argsort(-flat[indices])]
+    for i in indices:  # note: the query word itself may appear among the results
+        print('for word %s, the similar word is %s' %
+              (query_token, str(id2word_dict[i])))
+
+
+with fluid.dygraph.guard(fluid.CUDAPlace(0)):
+    skip_gram_model = SkipGram("skip_gram_model", vocab_size, embedding_size)
+    adam = fluid.optimizer.AdamOptimizer(learning_rate=learning_rate)
+
+    for center_words, target_words, label, eval_words in build_batch(
+            dataset, batch_size, epoch_num):
+        center_words_var = fluid.dygraph.to_variable(center_words)
+        target_words_var = fluid.dygraph.to_variable(target_words)
+        label_var = fluid.dygraph.to_variable(label)
+        pred, loss = skip_gram_model(center_words_var, target_words_var,
+                                     label_var)
+
+        loss.backward()
+        adam.minimize(loss)
+        skip_gram_model.clear_gradients()
+
+        step += 1
+        if step % 100 == 0:
+            print("step %d / %d, loss %.3f" %
+                  (step, total_steps, loss.numpy()[0]))
+
+        if step % 10000 == 0:
+            get_similar_tokens('king', 5, skip_gram_model.embedding._w)
+            get_similar_tokens('one', 5, skip_gram_model.embedding._w)
+            get_similar_tokens('chip', 5, skip_gram_model.embedding._w)
-- 
GitLab