# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo, serving


@moduleinfo(
    name="w2v_wiki_target_bigram-char_dim300",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """
    Embedding model
    """

    def __init__(self, *args, **kwargs):
        super(Embedding, self).__init__(embedding_name="w2v.wiki.target.bigram-char.dim300", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """
        Calculate cosine similarities of given word pairs.
        """
        results = []
        for word_pair in data:
            # Each item must be a pair of strings whose words exist in the vocabulary.
            if len(word_pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(word_pair)}. Please check your inputs.')
            if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.')
            for word in word_pair:
                # Words that map to the unknown token are not in the vocabulary.
                if self.get_idx_from_word(word) == \
                        self.get_idx_from_word(self.vocab.unk_token):
                    raise RuntimeError(
                        f'Word "{word}" is not in vocab. Please check your inputs.')
            results.append(str(self.cosine_sim(*word_pair)))
        return results
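

# The block below is a minimal usage sketch, not part of the module itself. It
# assumes the module has been installed and registered with PaddleHub (e.g. via
# `hub install`), and the example word pair is hypothetical: both words must be
# present in the embedding's vocabulary for calc_similarity to succeed.
if __name__ == "__main__":
    import paddlehub as hub

    # Load the module by the name declared in @moduleinfo above.
    module = hub.Module(name="w2v_wiki_target_bigram-char_dim300")
    # Each inner list is one word pair; similarities are returned as strings.
    print(module.calc_similarity([["苹果", "香蕉"]]))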