From afa5c8a07daeb104de3837bbc467d03360a5ae24 Mon Sep 17 00:00:00 2001 From: zhangxuefei Date: Mon, 16 Sep 2019 17:53:45 +0800 Subject: [PATCH] add sentence similarity tutorial --- demo/sentence_similarity/sensim.py | 5 - tutorial/autofinetune.ipynb | 6 +- tutorial/sentence_sim.ipynb | 163 +++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 8 deletions(-) create mode 100644 tutorial/sentence_sim.ipynb diff --git a/demo/sentence_similarity/sensim.py b/demo/sentence_similarity/sensim.py index bfe9b1b3..3f6f969f 100644 --- a/demo/sentence_similarity/sensim.py +++ b/demo/sentence_similarity/sensim.py @@ -46,11 +46,6 @@ if __name__ == "__main__": place = fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(feed_list=[word_ids], place=place) - w2v, = exe.run( - program, - feed=feeder.feed([[[1123]]]), - fetch_list=[embedding.name], - return_numpy=False) data = [ [ diff --git a/tutorial/autofinetune.ipynb b/tutorial/autofinetune.ipynb index 5b3a1e05..2ea7e6cc 100644 --- a/tutorial/autofinetune.ipynb +++ b/tutorial/autofinetune.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# 利用PaddleHub Auto Fine-tune进行自动超参搜索\n", + "# PaddleHub 自动超参搜索(Auto Fine-tune)\n", "\n", "## 一、简介\n", "\n", @@ -174,7 +174,7 @@ "通过以下命令方式:\n", "```shell\n", "$ OUTPUT=result/\n", - "$ hub autofientune finetunee.py --param_file=hparam.yaml --cuda=['1','2'] --popsize=5 --round=10 \n", + "$ hub autofinetune finetunee.py --param_file=hparam.yaml --cuda=['1','2'] --popsize=5 --round=10 \n", "$ --output_dir=${OUTPUT} --evaluate_choice=fulltrail --tuning_strategy=hazero\n", "```\n", "\n", @@ -200,7 +200,7 @@ "\n", "Auto Finetune API在搜索超参过程中会自动对关键训练指标进行打点,启动程序后执行下面命令\n", "```bash\n", - "$ tensorboard --logdir $CKPT_DIR/visualization --host ${HOST_IP} --port ${PORT_NUM}\n", + "$ tensorboard --logdir $OUTPUT/tb_paddle --host ${HOST_IP} --port ${PORT_NUM}\n", "```\n", 
"其中${HOST_IP}为本机IP地址,${PORT_NUM}为可用端口号,如本机IP地址为192.168.0.1,端口号8040,用浏览器打开192.168.0.1:8040,\n", "即可看到搜素过程中各超参以及指标的变化情况" diff --git a/tutorial/sentence_sim.ipynb b/tutorial/sentence_sim.ipynb new file mode 100644 index 00000000..46653d66 --- /dev/null +++ b/tutorial/sentence_sim.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 使用Word2Vec进行文本语义相似度计算\n", + "\n", + "本示例展示利用PaddleHub“端到端地”完成文本相似度计算\n", + "\n", + "## 一、准备文本数据\n", + "\n", + "如\n", + "```\n", + "驾驶违章一次扣12分用两个驾驶证处理可以吗 一次性扣12分的违章,能用不满十二分的驾驶证扣分吗\n", + "水果放冰箱里储存好吗 中国银行纪念币网上怎么预约\n", + "电脑反应很慢怎么办 反应速度慢,电脑总是卡是怎么回事\n", + "```\n", + "\n", + "## 二、分词\n", + "利用PaddleHub Module LAC对文本数据进行分词" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# coding:utf-8\n", + "# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\"\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "\"\"\"similarity between two sentences\"\"\"\n", + "\n", + "import numpy as np\n", + "import scipy\n", + "from scipy.spatial import distance\n", + "\n", + "from paddlehub.reader.tokenization import load_vocab\n", + "import paddle.fluid as fluid\n", + "import paddlehub as hub\n", + "\n", + "raw_data = [\n", + " [\"驾驶违章一次扣12分用两个驾驶证处理可以吗\", \"一次性扣12分的违章,能用不满十二分的驾驶证扣分吗\"],\n", + " [\"水果放冰箱里储存好吗\", \"中国银行纪念币网上怎么预约\"],\n", + " [\"电脑反应很慢怎么办\", 
\"反应速度慢,电脑总是卡是怎么回事\"]\n",
+    "]\n",
+    "\n",
+    "lac = hub.Module(name=\"lac\")\n",
+    "\n",
+    "processed_data = []\n",
+    "for text_pair in raw_data:\n",
+    "    inputs = {\"text\" : text_pair}\n",
+    "    results = lac.lexical_analysis(data=inputs, use_gpu=True, batch_size=2)\n",
+    "    data = []\n",
+    "    for result in results:\n",
+    "        data.append(\" \".join(result[\"word\"]))\n",
+    "    processed_data.append(data)\n",
+    "\n",
+    "processed_data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 三、计算文本语义相似度\n",
+    "\n",
+    "将分词文本中的单词相应替换为wordid,之后输入word2vec module中计算两个文本语义相似度"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_tokens_to_ids(vocab, text):\n",
+    "    wids = []\n",
+    "    tokens = text.split(\" \")\n",
+    "    for token in tokens:\n",
+    "        wid = vocab.get(token, None)\n",
+    "        if not wid:\n",
+    "            wid = vocab[\"unknown\"]\n",
+    "        wids.append(wid)\n",
+    "    return wids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module = hub.Module(name=\"word2vec_skipgram\")\ninputs, outputs, program = module.context(trainable=False)\n",
+    "vocab = load_vocab(module.get_vocab_path())\n",
+    "\n",
+    "word_ids = inputs[\"word_ids\"]\n",
+    "embedding = outputs[\"word_embs\"]\n",
+    "\n",
+    "place = fluid.CPUPlace()\n",
+    "exe = fluid.Executor(place)\n",
+    "feeder = fluid.DataFeeder(feed_list=[word_ids], place=place)\n",
+    "\n",
+    "for item in processed_data:\n",
+    "    text_a = convert_tokens_to_ids(vocab, item[0])\n",
+    "    text_b = convert_tokens_to_ids(vocab, item[1])\n",
+    "\n",
+    "    vecs_a, = exe.run(\n",
+    "        program,\n",
+    "        feed=feeder.feed([[text_a]]),\n",
+    "        fetch_list=[embedding.name],\n",
+    "        return_numpy=False)\n",
+    "    vecs_a = np.array(vecs_a)\n",
+    "    vecs_b, = exe.run(\n",
+    "        program,\n",
+    "        feed=feeder.feed([[text_b]]),\n",
+    "        fetch_list=[embedding.name],\n",
+    "        return_numpy=False)\n",
+    "    vecs_b = np.array(vecs_b)\n",
+    "\n",
+    "    sent_emb_a = np.sum(vecs_a, axis=0)\n",
+    
" sent_emb_b = np.sum(vecs_b, axis=0)\n", + " cos_sim = 1 - distance.cosine(sent_emb_a, sent_emb_b)\n", + "\n", + " print(\"text_a: %s; text_b: %s; cosine_similarity: %.5f\" %\n", + " (item[0], item[1], cos_sim))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab