diff --git a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/module.py b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f00030040ce91826dfd5e2102ac602885a538a2
--- /dev/null
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding/module.py
@@ -0,0 +1,184 @@
+# -*- coding:utf-8 -*-
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+
+import paddle.fluid as fluid
+import paddlehub as hub
+from paddlehub.common.paddle_helper import add_vars_prefix
+from paddlehub.module.module import moduleinfo
+
+
+def load_vocab(file_path):
+    """
+    Load the given vocabulary.
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        for line in f:
+            parts = line.split("\t")
+            vocab[parts[0]] = int(parts[1])
+
+    return vocab
+
+
+@moduleinfo(
+    name="tencent_ailab_chinese_embedding",
+    version="1.0.0",
+    summary=
+    "Tencent AI Lab Embedding Corpus for Chinese Words and Phrases, with a vocab size of 8,824,331. For more information, please refer to https://ai.tencent.com/ailab/nlp/zh/embedding.html",
+    author="",
+    author_email="",
+    type="nlp/semantic_model")
+class TencentAILabChineseEmbedding(hub.Module):
+    def _initialize(self):
+        """
+        Initialize with the necessary elements.
+        """
+        self.pretrained_model_path = os.path.join(self.directory, "assets",
+                                                  "model")
+        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+        self.vocab = load_vocab(self.vocab_path)
+
+    def context(self, trainable=False, max_seq_len=128, num_slots=1):
+        """
+        Get the input, output and program of the pretrained tencent_ailab_chinese_embedding.
+
+        Args:
+            trainable(bool): whether to fine-tune the pretrained parameters of tencent_ailab_chinese_embedding or not
+            max_seq_len(int): the maximum sequence length of each input text slot
+            num_slots(int): the number of data slots fed to the model, selected from the following options:
+
+                - 1(default): one text is fed to the model, e.g. the module is used for a sentence classification task.
+                - 2: two texts are fed to the model, e.g. the module is used for a text matching task (point-wise).
+                - 3: three texts are fed to the model, e.g. the module is used for a text matching task (pair-wise).
+
+        Returns:
+            inputs(dict): the input variables of tencent_ailab_chinese_embedding (words)
+            outputs(dict): the output variables of the input words (word embeddings)
+            main_program(Program): the main_program of tencent_ailab_chinese_embedding with pretrained parameters
+        """
+        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.unique_name.guard():
+                w_param_attrs = fluid.ParamAttr(
+                    name="embedding_0.w_0",
+                    initializer=fluid.initializer.TruncatedNormal(scale=0.02),
+                    trainable=trainable)
+
+                text_1 = fluid.data(
+                    name='text',
+                    shape=[-1, max_seq_len],
+                    dtype='int64',
+                    lod_level=0)
+                emb_1 = fluid.embedding(
+                    input=text_1,
+                    size=[len(self.vocab), 200],
+                    is_sparse=True,
+                    padding_idx=len(self.vocab) - 1,
+                    dtype='float32',
+                    param_attr=w_param_attrs)
+                emb_1_name = emb_1.name
+                data_list = [text_1]
+                emb_name_list = [emb_1_name]
+
+                if num_slots > 1:
+                    text_2 = fluid.data(
+                        name='text_2',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_2 = fluid.embedding(
+                        input=text_2,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_2_name = emb_2.name
+                    data_list.append(text_2)
+                    emb_name_list.append(emb_2_name)
+
+                if num_slots > 2:
+                    text_3 = fluid.data(
+                        name='text_3',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_3 = fluid.embedding(
+                        input=text_3,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_3_name = emb_3.name
+                    data_list.append(text_3)
+                    emb_name_list.append(emb_3_name)
+
+        variable_names = filter(
+            lambda v: v not in ['text', 'text_2', 'text_3'],
+            list(main_program.global_block().vars.keys()))
+
+        prefix_name = "@HUB_{}@".format(self.name)
+        add_vars_prefix(
+            program=main_program,
+            prefix=prefix_name,
+            vars=variable_names)
+        for param in main_program.global_block().iter_parameters():
+            param.trainable = trainable
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        # load the pretrained model
+        def if_exist(var):
+            return os.path.exists(
+                os.path.join(self.pretrained_model_path, var.name))
+
+        fluid.io.load_vars(
+            exe, self.pretrained_model_path, predicate=if_exist)
+
+        inputs = {}
+        outputs = {}
+        for index, data in enumerate(data_list):
+            if index == 0:
+                inputs['text'] = data
+                outputs['emb'] = main_program.global_block().vars[
+                    prefix_name + emb_name_list[0]]
+            else:
+                inputs['text_%s' % (index + 1)] = data
+                outputs['emb_%s' %
+                        (index + 1)] = main_program.global_block().vars[
+                            prefix_name + emb_name_list[index]]
+
+        return inputs, outputs, main_program
+
+    def get_vocab_path(self):
+        return self.vocab_path
+
+
+if __name__ == "__main__":
+    w2v = TencentAILabChineseEmbedding()
+    inputs, outputs, program = w2v.context(num_slots=3)
+    print(inputs)
+    print(outputs)
+    print(w2v.get_vocab_path())
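Usage note (reviewer sketch, not part of the patch): the snippet below shows one way to drive the context() API defined above. It assumes the module has been installed so that hub.Module(name="tencent_ailab_chinese_embedding") can resolve it, and that the input text is already segmented into words that appear in vocab.txt; the example words are illustrative only.

    import io

    import numpy as np
    import paddle.fluid as fluid
    import paddlehub as hub

    module = hub.Module(name="tencent_ailab_chinese_embedding")
    inputs, outputs, program = module.context(
        trainable=False, max_seq_len=128, num_slots=1)

    # Rebuild the word->id mapping from the module's vocab file, mirroring
    # load_vocab() above. Out-of-vocabulary words fall back to the last id,
    # which the module also uses as padding_idx.
    vocab = {}
    with io.open(module.get_vocab_path(), 'r', encoding='utf8') as f:
        for line in f:
            word, idx = line.split("\t")
            vocab[word] = int(idx)
    pad_id = len(vocab) - 1

    words = ["腾讯", "词向量"]  # assumed to be pre-segmented
    ids = [vocab.get(w, pad_id) for w in words]
    ids += [pad_id] * (128 - len(ids))

    # context() has already loaded the pretrained parameters into the global
    # scope, so a fresh executor in the same process can run the program.
    exe = fluid.Executor(fluid.CPUPlace())
    emb, = exe.run(program,
                   feed={inputs['text'].name: np.array([ids], dtype='int64')},
                   fetch_list=[outputs['emb']])
    print(emb.shape)  # expected: (1, 128, 200), one 200-dim vector per token

Padding positions map to padding_idx and therefore come back as zero vectors, so downstream pooling can either mask them out or rely on that zeroing.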
diff --git a/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/module.py b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..34516dfadecf763c3c021a0cbf114db9fd49de9a
--- /dev/null
+++ b/hub_module/modules/text/embedding/tencent_ailab_chinese_embedding_small/module.py
@@ -0,0 +1,184 @@
+# -*- coding:utf-8 -*-
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+
+import paddle.fluid as fluid
+import paddlehub as hub
+from paddlehub.common.paddle_helper import add_vars_prefix
+from paddlehub.module.module import moduleinfo
+
+
+def load_vocab(file_path):
+    """
+    Load the given vocabulary.
+    """
+    vocab = {}
+    with io.open(file_path, 'r', encoding='utf8') as f:
+        for line in f:
+            parts = line.split("\t")
+            vocab[parts[0]] = int(parts[1])
+
+    return vocab
+
+
+@moduleinfo(
+    name="tencent_ailab_chinese_embedding_small",
+    version="1.0.0",
+    summary=
+    "Tencent AI Lab Embedding Corpus for Chinese Words and Phrases, with a vocab size of 2,000,002. For more information, please refer to https://ai.tencent.com/ailab/nlp/zh/embedding.html",
+    author="",
+    author_email="",
+    type="nlp/semantic_model")
+class TencentAILabChineseEmbeddingSmall(hub.Module):
+    def _initialize(self):
+        """
+        Initialize with the necessary elements.
+        """
+        self.pretrained_model_path = os.path.join(self.directory, "assets",
+                                                  "model")
+        self.vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
+        self.vocab = load_vocab(self.vocab_path)
+
+    def context(self, trainable=False, max_seq_len=128, num_slots=1):
+        """
+        Get the input, output and program of the pretrained tencent_ailab_chinese_embedding_small.
+
+        Args:
+            trainable(bool): whether to fine-tune the pretrained parameters of tencent_ailab_chinese_embedding_small or not
+            max_seq_len(int): the maximum sequence length of each input text slot
+            num_slots(int): the number of data slots fed to the model, selected from the following options:
+
+                - 1(default): one text is fed to the model, e.g. the module is used for a sentence classification task.
+                - 2: two texts are fed to the model, e.g. the module is used for a text matching task (point-wise).
+                - 3: three texts are fed to the model, e.g. the module is used for a text matching task (pair-wise).
+
+        Returns:
+            inputs(dict): the input variables of tencent_ailab_chinese_embedding_small (words)
+            outputs(dict): the output variables of the input words (word embeddings)
+            main_program(Program): the main_program of tencent_ailab_chinese_embedding_small with pretrained parameters
+        """
+        assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.unique_name.guard():
+                w_param_attrs = fluid.ParamAttr(
+                    name="embedding_0.w_0",
+                    initializer=fluid.initializer.TruncatedNormal(scale=0.02),
+                    trainable=trainable)
+
+                text_1 = fluid.data(
+                    name='text',
+                    shape=[-1, max_seq_len],
+                    dtype='int64',
+                    lod_level=0)
+                emb_1 = fluid.embedding(
+                    input=text_1,
+                    size=[len(self.vocab), 200],
+                    is_sparse=True,
+                    padding_idx=len(self.vocab) - 1,
+                    dtype='float32',
+                    param_attr=w_param_attrs)
+                emb_1_name = emb_1.name
+                data_list = [text_1]
+                emb_name_list = [emb_1_name]
+
+                if num_slots > 1:
+                    text_2 = fluid.data(
+                        name='text_2',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_2 = fluid.embedding(
+                        input=text_2,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_2_name = emb_2.name
+                    data_list.append(text_2)
+                    emb_name_list.append(emb_2_name)
+
+                if num_slots > 2:
+                    text_3 = fluid.data(
+                        name='text_3',
+                        shape=[-1, max_seq_len],
+                        dtype='int64',
+                        lod_level=0)
+                    emb_3 = fluid.embedding(
+                        input=text_3,
+                        size=[len(self.vocab), 200],
+                        is_sparse=True,
+                        padding_idx=len(self.vocab) - 1,
+                        dtype='float32',
+                        param_attr=w_param_attrs)
+                    emb_3_name = emb_3.name
+                    data_list.append(text_3)
+                    emb_name_list.append(emb_3_name)
+
+        variable_names = filter(
+            lambda v: v not in ['text', 'text_2', 'text_3'],
+            list(main_program.global_block().vars.keys()))
+
+        prefix_name = "@HUB_{}@".format(self.name)
+        add_vars_prefix(
+            program=main_program,
+            prefix=prefix_name,
+            vars=variable_names)
+        for param in main_program.global_block().iter_parameters():
+            param.trainable = trainable
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        # load the pretrained model
+        def if_exist(var):
+            return os.path.exists(
+                os.path.join(self.pretrained_model_path, var.name))
+
+        fluid.io.load_vars(
+            exe, self.pretrained_model_path, predicate=if_exist)
+
+        inputs = {}
+        outputs = {}
+        for index, data in enumerate(data_list):
+            if index == 0:
+                inputs['text'] = data
+                outputs['emb'] = main_program.global_block().vars[
+                    prefix_name + emb_name_list[0]]
+            else:
+                inputs['text_%s' % (index + 1)] = data
+                outputs['emb_%s' %
+                        (index + 1)] = main_program.global_block().vars[
+                            prefix_name + emb_name_list[index]]
+
+        return inputs, outputs, main_program
+
+    def get_vocab_path(self):
+        return self.vocab_path
+
+
+if __name__ == "__main__":
+    w2v = TencentAILabChineseEmbeddingSmall()
+    inputs, outputs, program = w2v.context(num_slots=3)
+    print(inputs)
+    print(outputs)
+    print(w2v.get_vocab_path())
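As above, a hedged sketch of the num_slots=2 (point-wise matching) path for the small module. The pooling and cosine similarity at the end are added purely for illustration and are not part of the module; the example sentences are assumed to be pre-segmented.

    import io

    import numpy as np
    import paddle.fluid as fluid
    import paddlehub as hub

    module = hub.Module(name="tencent_ailab_chinese_embedding_small")
    inputs, outputs, program = module.context(max_seq_len=8, num_slots=2)

    # Same vocab reconstruction as in the first sketch.
    vocab = {}
    with io.open(module.get_vocab_path(), 'r', encoding='utf8') as f:
        for line in f:
            word, idx = line.split("\t")
            vocab[word] = int(idx)
    pad_id = len(vocab) - 1

    def to_ids(words, max_seq_len=8):
        ids = [vocab.get(w, pad_id) for w in words]
        return ids + [pad_id] * (max_seq_len - len(ids))

    exe = fluid.Executor(fluid.CPUPlace())
    emb_1, emb_2 = exe.run(
        program,
        feed={
            inputs['text'].name:
            np.array([to_ids(["今天", "天气", "很好"])], dtype='int64'),
            inputs['text_2'].name:
            np.array([to_ids(["明天", "天气", "不错"])], dtype='int64'),
        },
        fetch_list=[outputs['emb'], outputs['emb_2']])

    # Padding ids embed to zero vectors, so sum over tokens and divide by the
    # real token count, then compare the two 200-dim sentence vectors.
    v1 = emb_1[0].sum(axis=0) / 3
    v2 = emb_2[0].sum(axis=0) / 3
    print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

Because both slots reuse w_param_attrs, the two texts are looked up in the same embedding table, which is what a point-wise matching setup expects.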