Merge pull request #746 from DesmonDay/add-TencentEmbedding

Add tencent embedding

Merge pull request #746 from DesmonDay/add-TencentEmbedding
Add tencent embedding
76c52fd1 · Steffy-zxf · GitHub · ba3de902 · 56efef06 · 76c52fd1
6 changed file
--- a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/README.md
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/README.md
+## 概述
+
+Tencent_AILab_ChineseEmbedding提供了基于海量中文语料训练学习得到的800多万个中文词语和短语的词向量表示，每一个词向量为200维。可以用于各种下游任务迁移学习。
+
+更多详情参考: https://ai.tencent.com/ailab/nlp/en/embedding.html
+
+注：该Module由第三方开发者DesmonDay贡献。
+
+## API
+
+```python
+def context(trainable=False, max_seq_len=128, num_slots=1)
+```
+
+获取该Module的预训练program以及program相应的输入输出。
+
+**参数**
+
+* trainable(bool): trainable=True表示program中的参数在Fine-tune时需要微调，否则保持不变。
+* max_seq_len(int): 模型使用的最大序列长度。
+* num_slots(int): 输入到模型所需要的文本个数，如完成单句文本分类任务，则num_slots=1；完成pointwise文本匹配任务，则num_slots=2；完成pairtwise文本匹配任务，则num_slots=3；
+
+**返回**
+
+* inputs(dict): program的输入变量
+* outputs(dict): program的输出变量
+* main_program(Program): 带有预训练参数的program
+
+### 代码示例
+
+```python
+import paddlehub as hub
+import cv2
+
+tencent_ailab_chinese_embedding = hub.Module(name="tencent_ailab_chinese_embedding")
+inputs, outputs, program = tencent_ailab_chinese_embedding.context(trainable=True, max_seq_len=128, num_slots=1)
+```
+
+## 依赖
+
+paddlepaddle >= 1.8.2
+
+paddlehub >= 1.8.0
+
+## 更新历史
+
+* 1.0.0
+
+  初始发布
--- a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/module.py
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/module.py
+# -*- coding:utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+
+import paddle.fluid as fluid
+import paddlehub as hub
+from paddlehub.common.paddle_helper import add_vars_prefix
+from paddlehub.module.module import moduleinfo
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        for line in f:
+            parts = line.split("\t")
+            vocab[parts[0]] = int(parts[1])
+
+    return vocab
+
+
+@moduleinfo(
+    name="tencent_ailab_chinese_embedding",
+    version="1.0.0",
+    summary=
+    "Tencent AI Lab Embedding Corpus for Chinese Words and Phrases and the vocab size is 8,824,331. For more information, please refer to https://ai.tencent.com/ailab/nlp/zh/embedding.html",
+    author="",
+    author_email="",
+    type="nlp/semantic_model")
+class TencentAILabChineseEmbedding(hub.Module):
+    def _initialize(self):
+        """
+        initialize with the necessary elements
+        """
+        self.pretrained_model_path = os.path.join(self.directory, "assets",
+                                                  "model")
+        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+        self.vocab = load_vocab(self.vocab_path)
+
+    def context(self, trainable=False, max_seq_len=128, num_slots=1):
+        """
+        Get the input ,output and program of the pretrained tencent_ailab_chinese_embedding
+
+        Args:
+             trainable(bool): whether fine-tune the pretrained parameters of simnet_bow or not
+             num_slots(int): It's number of slots inputted to the model, selectted as following options:
+
+                 - 1(default): There's only one data to be feeded in the model, e.g. the module is used for sentence classification task.
+                 - 2: There are two data to be feeded in the model, e.g. the module is used for text matching task (point-wise).
+                 - 3: There are three data to be feeded in the model, e.g. the module is used for text matching task (pair-wise).
+
+        Returns:
+             inputs(dict): the input variables of tencent_ailab_chinese_embedding (words)
+             outputs(dict): the output variables of input words (word embeddings)
+             main_program(Program): the main_program of tencent_ailab_chinese_embedding with pretrained prameters
+        """
+        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.unique_name.guard():
+                w_param_attrs = fluid.ParamAttr(
+                    name="embedding_0.w_0",
+                    initializer=fluid.initializer.TruncatedNormal(scale=0.02),
+                    trainable=trainable)
+
+                text_1 = fluid.data(
+                    name='text',
+                    shape=[-1, max_seq_len],
+                    dtype='int64',
+                    lod_level=0)
+                emb_1 = fluid.embedding(
+                    input=text_1,
+                    size=[len(self.vocab), 200],
+                    is_sparse=True,
+                    padding_idx=len(self.vocab) - 1,
+                    dtype='float32',
+                    param_attr=w_param_attrs)
+                emb_1_name = emb_1.name
+                data_list = [text_1]
+                emb_name_list = [emb_1_name]
+
+                if num_slots > 1:
+                    text_2 = fluid.data(
+                        name='text_2',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_2 = fluid.embedding(
+                        input=text_2,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_2_name = emb_2.name
+                    data_list.append(text_2)
+                    emb_name_list.append(emb_2_name)
+
+                if num_slots > 2:
+                    text_3 = fluid.data(
+                        name='text_3',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_3 = fluid.embedding(
+                        input=text_3,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_3_name = emb_3.name
+                    data_list.append(text_3)
+                    emb_name_list.append(emb_3_name)
+
+                variable_names = filter(
+                    lambda v: v not in ['text', 'text_2', 'text_3'],
+                    list(main_program.global_block().vars.keys()))
+
+                prefix_name = "@HUB_{}@".format(self.name)
+                add_vars_prefix(
+                    program=main_program,
+                    prefix=prefix_name,
+                    vars=variable_names)
+                for param in main_program.global_block().iter_parameters():
+                    param.trainable = trainable
+
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+
+                # load the pretrained model
+                def if_exist(var):
+                    return os.path.exists(
+                        os.path.join(self.pretrained_model_path, var.name))
+
+                fluid.io.load_vars(
+                    exe, self.pretrained_model_path, predicate=if_exist)
+
+                inputs = {}
+                outputs = {}
+                for index, data in enumerate(data_list):
+                    if index == 0:
+                        inputs['text'] = data
+                        outputs['emb'] = main_program.global_block().vars[
+                            prefix_name + emb_name_list[0]]
+                    else:
+                        inputs['text_%s' % (index + 1)] = data
+                        outputs['emb_%s' %
+                                (index + 1)] = main_program.global_block().vars[
+                                    prefix_name + emb_name_list[index]]
+
+                return inputs, outputs, main_program
+
+    def get_vocab_path(self):
+        return self.vocab_path
+
+
+if __name__ == "__main__":
+    w2v = TencentAILabChineseEmbedding()
+    inputs, outputs, program = w2v.context(num_slots=3)
+    print(inputs)
+    print(outputs)
+    print(w2v.get_vocab_path())
--- a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/README.md
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/README.md
+## 概述
+
+Tencent_AILab_ChineseEmbedding提供了基于海量中文语料训练学习得到的800多万个中文词语和短语的词向量表示，每一个词向量为200维。
+该Module截取了原来词汇表中前200万的词语，同样可以用于各种下游任务迁移学习。
+
+更多详情参考: https://ai.tencent.com/ailab/nlp/en/embedding.html
+
+注：该Module由第三方开发者DesmonDay贡献。
+
+## API
+
+```python
+def context(trainable=False, max_seq_len=128, num_slots=1)
+```
+
+获取该Module的预训练program以及program相应的输入输出。
+
+**参数**
+
+* trainable(bool): trainable=True表示program中的参数在Fine-tune时需要微调，否则保持不变。
+* max_seq_len(int): 模型使用的最大序列长度。
+* num_slots(int): 输入到模型所需要的文本个数，如完成单句文本分类任务，则num_slots=1；完成pointwise文本匹配任务，则num_slots=2；完成pairtwise文本匹配任务，则num_slots=3；
+
+**返回**
+
+* inputs(dict): program的输入变量
+* outputs(dict): program的输出变量
+* main_program(Program): 带有预训练参数的program
+
+### 代码示例
+
+```python
+import paddlehub as hub
+import cv2
+
+tencent_ailab_chinese_embedding = hub.Module(name="tencent_ailab_chinese_embedding_small")
+inputs, outputs, program = tencent_ailab_chinese_embedding.context(trainable=True, max_seq_len=128, num_slots=1)
+```
+
+## 依赖
+
+paddlepaddle >= 1.8.2
+
+paddlehub >= 1.8.0
+
+## 更新历史
+
+* 1.0.0
+
+  初始发布
--- a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/module.py
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/module.py
+# -*- coding:utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+
+import paddle.fluid as fluid
+import paddlehub as hub
+from paddlehub.common.paddle_helper import add_vars_prefix
+from paddlehub.module.module import moduleinfo
+
+
+def load_vocab(file_path):
+    """
+    load the given vocabulary
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        for line in f:
+            parts = line.split("\t")
+            vocab[parts[0]] = int(parts[1])
+
+    return vocab
+
+
+@moduleinfo(
+    name="tencent_ailab_chinese_embedding_small",
+    version="1.0.0",
+    summary=
+    "Tencent AI Lab Embedding Corpus for Chinese Words and Phrases and the vocab size is 2,000,002. For more information, please refer to https://ai.tencent.com/ailab/nlp/zh/embedding.html",
+    author="",
+    author_email="",
+    type="nlp/semantic_model")
+class TencentAILabChineseEmbeddingSmall(hub.Module):
+    def _initialize(self):
+        """
+        initialize with the necessary elements
+        """
+        self.pretrained_model_path = os.path.join(self.directory, "assets",
+                                                  "model")
+        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+        self.vocab = load_vocab(self.vocab_path)
+
+    def context(self, trainable=False, max_seq_len=128, num_slots=1):
+        """
+        Get the input ,output and program of the pretrained word2vec_skipgram
+
+        Args:
+             trainable(bool): Whether fine-tune the pretrained parameters of tencent_ailab_chinese_embedding_small or not.
+             num_slots(int): It's number of data inputted to the model, selectted as following options:
+
+                 - 1(default): There's only one data to be feeded in the model, e.g. the module is used for sentence classification task.
+                 - 2: There are two data to be feeded in the model, e.g. the module is used for text matching task (point-wise).
+                 - 3: There are three data to be feeded in the model, e.g. the module is used for text matching task (pair-wise).
+
+        Returns:
+             inputs(dict): the input variables of tencent_ailab_chinese_embedding_small (words)
+             outputs(dict): the output variables of input words (word embeddings)
+             main_program(Program): the main_program of tencent_ailab_chinese_embedding_small with pretrained prameters
+        """
+        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.unique_name.guard():
+                w_param_attrs = fluid.ParamAttr(
+                    name="embedding_0.w_0",
+                    initializer=fluid.initializer.TruncatedNormal(scale=0.02),
+                    trainable=trainable)
+
+                text_1 = fluid.data(
+                    name='text',
+                    shape=[-1, max_seq_len],
+                    dtype='int64',
+                    lod_level=0)
+                emb_1 = fluid.embedding(
+                    input=text_1,
+                    size=[len(self.vocab), 200],
+                    is_sparse=True,
+                    padding_idx=len(self.vocab) - 1,
+                    dtype='float32',
+                    param_attr=w_param_attrs)
+                emb_1_name = emb_1.name
+                data_list = [text_1]
+                emb_name_list = [emb_1_name]
+
+                if num_slots > 1:
+                    text_2 = fluid.data(
+                        name='text_2',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_2 = fluid.embedding(
+                        input=text_2,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_2_name = emb_2.name
+                    data_list.append(text_2)
+                    emb_name_list.append(emb_2_name)
+
+                if num_slots > 2:
+                    text_3 = fluid.data(
+                        name='text_3',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_3 = fluid.embedding(
+                        input=text_3,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_3_name = emb_3.name
+                    data_list.append(text_3)
+                    emb_name_list.append(emb_3_name)
+
+                variable_names = filter(
+                    lambda v: v not in ['text', 'text_2', 'text_3'],
+                    list(main_program.global_block().vars.keys()))
+
+                prefix_name = "@HUB_{}@".format(self.name)
+                add_vars_prefix(
+                    program=main_program,
+                    prefix=prefix_name,
+                    vars=variable_names)
+                for param in main_program.global_block().iter_parameters():
+                    param.trainable = trainable
+
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+
+                # load the pretrained model
+                def if_exist(var):
+                    return os.path.exists(
+                        os.path.join(self.pretrained_model_path, var.name))
+
+                fluid.io.load_vars(
+                    exe, self.pretrained_model_path, predicate=if_exist)
+
+                inputs = {}
+                outputs = {}
+                for index, data in enumerate(data_list):
+                    if index == 0:
+                        inputs['text'] = data
+                        outputs['emb'] = main_program.global_block().vars[
+                            prefix_name + emb_name_list[0]]
+                    else:
+                        inputs['text_%s' % (index + 1)] = data
+                        outputs['emb_%s' %
+                                (index + 1)] = main_program.global_block().vars[
+                                    prefix_name + emb_name_list[index]]
+
+                return inputs, outputs, main_program
+
+    def get_vocab_path(self):
+        return self.vocab_path
+
+
+if __name__ == "__main__":
+    w2v = TencentAILabChineseEmbeddingSmall()
+    inputs, outputs, program = w2v.context(num_slots=3)
+    print(inputs)
+    print(outputs)
+    print(w2v.get_vocab_path())
--- a/hub_module/scripts/configs/tencent_ailab_chinese_embedding.yml
+++ b/hub_module/scripts/configs/tencent_ailab_chinese_embedding.yml
+name: tencent_ailab_chinese_embedding
+dir: "modules/text/embedding/tencent_ailab_chinese_embedding"
+exclude:
+  - README.md
+  - test.py
+resources:
+  -
+    url: https://bj.bcebos.com/paddlehub/model/nlp/embedding/tencent_ailab_chinese_embedding_assets.tar.gz
+    dest: .
+    uncompress: True
--- a/hub_module/scripts/configs/tencent_ailab_chinese_embedding_small.yml
+++ b/hub_module/scripts/configs/tencent_ailab_chinese_embedding_small.yml
+name: tencent_ailab_chinese_embedding_small
+dir: "modules/text/embedding/tencent_ailab_chinese_embedding_small"
+exclude:
+  - README.md
+  - test.py
+resources:
+  -
+    url: https://bj.bcebos.com/paddlehub/model/nlp/embedding/tencent_ailab_chinese_embedding_small_assets.tar.gz
+    dest: .
+    uncompress: True