未验证 提交 c915936f 编写于 作者: K KP 提交者: GitHub

Upgrade embedding modules (#1230)

* do finetune text cls & ner via using embedding module
上级 af009a04
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="fasttext_crawl_target_word-word_dim300_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained fastText crawl English word
    embedding (300-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "fasttext.crawl.target.word-word.dim300.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="fasttext_wiki-news_target_word-word_dim300_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained fastText wiki-news English word
    embedding (300-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "fasttext.wiki-news.target.word-word.dim300.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_twitter_target_word-word_dim100_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Twitter English word
    embedding (100-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.twitter.target.word-word.dim100.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_twitter_target_word-word_dim200_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Twitter English word
    embedding (200-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.twitter.target.word-word.dim200.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_twitter_target_word-word_dim25_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Twitter English word
    embedding (25-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.twitter.target.word-word.dim25.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_twitter_target_word-word_dim50_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Twitter English word
    embedding (50-dim). All embedding behavior is inherited from
    paddlenlp's TokenEmbedding; this class only selects which pretrained
    embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.twitter.target.word-word.dim50.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim100_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Wikipedia2014+Gigaword
    English word embedding (100-dim). All embedding behavior is inherited
    from paddlenlp's TokenEmbedding; this class only selects which
    pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.wiki2014-gigaword.target.word-word.dim100.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim200_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Wikipedia2014+Gigaword
    English word embedding (200-dim). All embedding behavior is inherited
    from paddlenlp's TokenEmbedding; this class only selects which
    pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.wiki2014-gigaword.target.word-word.dim200.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim300_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Wikipedia2014+Gigaword
    English word embedding (300-dim). All embedding behavior is inherited
    from paddlenlp's TokenEmbedding; this class only selects which
    pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.wiki2014-gigaword.target.word-word.dim300.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim50_en",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained GloVe Wikipedia2014+Gigaword
    English word embedding (50-dim). All embedding behavior is inherited
    from paddlenlp's TokenEmbedding; this class only selects which
    pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "glove.wiki2014-gigaword.target.word-word.dim50.en"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-1_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained word2vec Baidu-Encyclopedia
    word-character (char1-1) Chinese embedding (300-dim). All embedding
    behavior is inherited from paddlenlp's TokenEmbedding; this class only
    selects which pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.context.word-character.char1-1.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained word2vec Baidu-Encyclopedia
    word-character (char1-2) Chinese embedding (300-dim). All embedding
    behavior is inherited from paddlenlp's TokenEmbedding; this class only
    selects which pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.context.word-character.char1-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-4_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies the shared serving/tokenizer/finetune machinery,
    # replacing the per-module calc_similarity implementation removed in 1.0.1.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained word2vec Baidu-Encyclopedia
    word-character (char1-4) Chinese embedding (300-dim). All embedding
    behavior is inherited from paddlenlp's TokenEmbedding; this class only
    selects which pretrained embedding file to load.
    """
    # Pretrained-embedding identifier resolved by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.context.word-character.char1-4.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; any extra args/kwargs are forwarded unchanged
        # to TokenEmbedding.__init__.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-wordLR_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-wordLR.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-wordLR.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-wordPosition_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-wordPosition.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-wordPosition.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.context.word-word.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.context.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.bigram-char.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-character_char1-1_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-character.char1-1.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-character.char1-1.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-character_char1-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-character.char1-2.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-character.char1-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-character_char1-4_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-character.char1-4.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-character.char1-4.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # Shared serving / finetune APIs (e.g. calc_similarity) live on the
    # EmbeddingModule meta class instead of being duplicated per module.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper for the pretrained PaddleNLP TokenEmbedding
    "w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300".
    """
    # Pretrained-embedding identifier read by __init__ and by the
    # EmbeddingModule machinery.
    embedding_name = "w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300"

    def __init__(self, *args, **kwargs):
        # Pin embedding_name; forward any extra TokenEmbedding options.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-wordLR_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.target.word-wordLR.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-wordPosition_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.target.word-wordPosition.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_baidu_encyclopedia_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    # NOTE(review): the decorator tail was elided by the diff context;
    # reconstructed to match every sibling module — confirm against the repo.
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.baidu_encyclopedia.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_financial_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.financial.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_financial_target_word-bigram_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.financial.target.word-bigram.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_financial_target_word-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.financial.target.word-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_financial_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.financial.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_literature_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.literature.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_literature_target_word-bigram_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.literature.target.word-bigram.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_literature_target_word-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.literature.target.word-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_literature_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.literature.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_mixed-large_target_word-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.mixed-large.target.word-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_mixed-large_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.mixed-large.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlenlp.embeddings import TokenEmbedding

from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule


@moduleinfo(
    name="w2v_people_daily_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    # EmbeddingModule supplies shared serving/finetune behavior
    # (calc_similarity, get_vocab_path, get_tokenizer) for all
    # embedding modules, replacing the per-module copies.
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    Static word-embedding module backed by the PaddleNLP pretrained
    embedding named by :attr:`embedding_name`.
    """
    # Identifier of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.people_daily.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Forward extra positional/keyword arguments unchanged; the
        # embedding name is fixed per module.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_people_daily_target_word-bigram_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.people_daily.target.word-bigram.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.people_daily.target.word-bigram.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_people_daily_target_word-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.people_daily.target.word-char.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.people_daily.target.word-char.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_people_daily_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.people_daily.target.word-word.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.people_daily.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sikuquanshu_target_word-bigram_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sikuquanshu.target.word-bigram.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sikuquanshu.target.word-bigram.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sikuquanshu_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sikuquanshu.target.word-word.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sikuquanshu.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sogou_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sogou.target.bigram-char.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sogou.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sogou_target_word-bigram_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sogou.target.word-bigram.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sogou.target.word-bigram.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sogou_target_word-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sogou.target.word-char.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sogou.target.word-char.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_sogou_target_word-word_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.sogou.target.word-word.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.sogou.target.word-word.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
......@@ -56,6 +56,25 @@ def dot(
* `word_a`: 需要计算内积的单词a。
* `word_b`: 需要计算内积的单词b。
```python
def get_vocab_path()
```
获取本地词表文件的路径信息。
```python
def get_tokenizer(*args, **kwargs)
```
获取当前模型的tokenizer,返回一个JiebaTokenizer的实例,当前只支持中文embedding模型。
**参数**
* `*args`: 额外传递的列表形式的参数。
* `**kwargs`: 额外传递的字典形式的参数。
关于额外参数的详情,可查看[paddlenlp.data.tokenizer.JiebaTokenizer](https://github.com/PaddlePaddle/models/blob/develop/PaddleNLP/paddlenlp/data/tokenizer.py)
更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings)
## 代码示例
......@@ -125,3 +144,6 @@ paddlehub >= 2.0.0
初始发布
* 1.0.1
支持基于embedding的文本分类和序列标注finetune任务
......@@ -12,44 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving
from paddlehub.module.module import moduleinfo
from paddlehub.module.nlp_module import EmbeddingModule
@moduleinfo(
    name="w2v_weibo_target_bigram-char_dim300",
    version="1.0.1",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model",
    meta=EmbeddingModule)
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around the pretrained PaddleNLP TokenEmbedding
    "w2v.weibo.target.bigram-char.dim300".

    NOTE(review): as of v1.0.1 the serving API (calc_similarity) and the
    finetune support live in the EmbeddingModule meta declared above, not
    on this class.
    """
    # Registry key of the pretrained embedding loaded by TokenEmbedding.
    embedding_name = "w2v.weibo.target.bigram-char.dim300"

    def __init__(self, *args, **kwargs):
        # Pin the embedding to this module's pretrained vectors and forward
        # any extra arguments to TokenEmbedding.
        super(Embedding, self).__init__(embedding_name=self.embedding_name, *args, **kwargs)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册