From 08cb2a7376edab19e7be894568471b3d52de2d62 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 13 Jan 2021 10:16:13 +0800 Subject: [PATCH] Add embedding modules (#1179) * Add embedding modules --- .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ 
.../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 
++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ .../README.md | 127 ++++++++++++++++++ .../__init__.py | 0 .../module.py | 55 ++++++++ 183 files changed, 11102 insertions(+) create mode 100644 modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/README.md create mode 100644 modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/__init__.py create mode 100644 modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/module.py create mode 100644 modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/README.md create mode 100644 
modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/__init__.py create mode 100644 modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/module.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim100_en/README.md create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim100_en/__init__.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim100_en/module.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim200_en/README.md create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim200_en/__init__.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim200_en/module.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim25_en/README.md create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim25_en/__init__.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim25_en/module.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim50_en/README.md create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim50_en/__init__.py create mode 100644 modules/text/embedding/glove_twitter_target_word-word_dim50_en/module.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/README.md create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/__init__.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/module.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/README.md create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/__init__.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/module.py create mode 100644 
modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/README.md create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/__init__.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/module.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/README.md create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/__init__.py create mode 100644 modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/module.py create mode 100644 
modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/README.md create mode 100644 
modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/README.md create mode 100644 
modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/module.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_financial_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_financial_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_financial_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_financial_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_financial_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_financial_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_financial_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_financial_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_financial_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_financial_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_financial_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_financial_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_literature_target_bigram-char_dim300/README.md 
create mode 100644 modules/text/embedding/w2v_literature_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_literature_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_literature_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_literature_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_literature_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_literature_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_literature_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_literature_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_literature_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_literature_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_literature_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_mixed-large_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/README.md create mode 100644 
modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_people_daily_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_people_daily_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_people_daily_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_sogou_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_sogou_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sogou_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_sogou_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-char_dim300/README.md create mode 100644 
modules/text/embedding/w2v_sogou_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_sogou_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_sogou_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_weibo_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_weibo_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_weibo_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_weibo_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_weibo_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_weibo_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_weibo_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_wiki_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_wiki_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_wiki_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_wiki_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_wiki_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_wiki_target_word-bigram_dim300/module.py create 
mode 100644 modules/text/embedding/w2v_wiki_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_wiki_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_wiki_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_wiki_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_wiki_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_wiki_target_word-word_dim300/module.py create mode 100644 modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/README.md create mode 100644 modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/module.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-char_dim300/README.md create mode 100644 modules/text/embedding/w2v_zhihu_target_word-char_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-char_dim300/module.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-word_dim300/README.md create mode 100644 modules/text/embedding/w2v_zhihu_target_word-word_dim300/__init__.py create mode 100644 modules/text/embedding/w2v_zhihu_target_word-word_dim300/module.py diff --git a/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/README.md b/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/README.md new file mode 100644 index 00000000..5e2ca5d3 --- /dev/null +++ b/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/README.md @@ -0,0 +1,127 @@ +## 概述 
+PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='fasttext_crawl_target_word-word_dim300_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m fasttext_crawl_target_word-word_dim300_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], 
... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/fasttext_crawl_target_word-word_dim300_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/__init__.py b/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/module.py b/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/module.py new file mode 100644 index 00000000..2974ae7d --- /dev/null +++ b/modules/text/embedding/fasttext_crawl_target_word-word_dim300_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import List

from paddlenlp.embeddings import TokenEmbedding
from paddlehub.module.module import moduleinfo, serving


@moduleinfo(
    name="fasttext_crawl_target_word-word_dim300_en",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """
    PaddleHub wrapper around paddlenlp's TokenEmbedding, preloaded with the
    ``fasttext.crawl.target.word-word.dim300.en`` pretrained embedding.
    All TokenEmbedding APIs (``search``, ``cosine_sim``, ``dot``, ...) are
    inherited unchanged.
    """

    def __init__(self, *args, **kwargs):
        """Create the module; extra positional/keyword args are forwarded to
        TokenEmbedding (e.g. ``unknown_token``, ``trainable``)."""
        super(Embedding, self).__init__(
            embedding_name="fasttext.crawl.target.word-word.dim300.en", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """
        Calculate cosine similarities of the given word pairs.

        Args:
            data: list of ``[word_a, word_b]`` pairs; both words must be
                strings present in the embedding vocabulary.

        Returns:
            List[str]: one stringified cosine similarity per input pair.

        Raises:
            RuntimeError: if a pair does not have exactly two elements, an
                element is not a str, or a word is out-of-vocabulary.
        """
        # The unknown-token index is constant for the lifetime of the vocab;
        # hoist it out of the loops instead of recomputing it per word.
        unk_idx = self.get_idx_from_word(self.vocab.unk_token)
        results = []
        for word_pair in data:
            if len(word_pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(word_pair)}. Please check your inputs.')
            if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.')

            for word in word_pair:
                # A word that maps to the unknown-token index is OOV.
                if self.get_idx_from_word(word) == unk_idx:
                    raise RuntimeError(f'Word "{word}" is not in vocab. Please check your inputs.')
            results.append(str(self.cosine_sim(*word_pair)))
        return results
+通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m fasttext_wiki-news_target_word-word_dim300_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/fasttext_wiki-news_target_word-word_dim300_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/__init__.py b/modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/module.py b/modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/module.py new file mode 100644 index 00000000..4e3c08db --- /dev/null +++ b/modules/text/embedding/fasttext_wiki-news_target_word-word_dim300_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="fasttext_wiki-news_target_word-word_dim300_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="fasttext.wiki-news.target.word-word.dim300.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim100_en/README.md b/modules/text/embedding/glove_twitter_target_word-word_dim100_en/README.md new file mode 100644 index 00000000..982adee9 --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim100_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_twitter_target_word-word_dim100_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_twitter_target_word-word_dim100_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_twitter_target_word-word_dim100_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim100_en/__init__.py b/modules/text/embedding/glove_twitter_target_word-word_dim100_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim100_en/module.py b/modules/text/embedding/glove_twitter_target_word-word_dim100_en/module.py new file mode 100644 index 00000000..df7484ea --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim100_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="glove_twitter_target_word-word_dim100_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="glove.twitter.target.word-word.dim100.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim200_en/README.md b/modules/text/embedding/glove_twitter_target_word-word_dim200_en/README.md new file mode 100644 index 00000000..3bad58e9 --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim200_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_twitter_target_word-word_dim200_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_twitter_target_word-word_dim200_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_twitter_target_word-word_dim200_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim200_en/__init__.py b/modules/text/embedding/glove_twitter_target_word-word_dim200_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim200_en/module.py b/modules/text/embedding/glove_twitter_target_word-word_dim200_en/module.py new file mode 100644 index 00000000..e98dab55 --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim200_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="glove_twitter_target_word-word_dim200_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="glove.twitter.target.word-word.dim200.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim25_en/README.md b/modules/text/embedding/glove_twitter_target_word-word_dim25_en/README.md new file mode 100644 index 00000000..a7d781dd --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim25_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_twitter_target_word-word_dim25_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_twitter_target_word-word_dim25_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_twitter_target_word-word_dim25_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim25_en/__init__.py b/modules/text/embedding/glove_twitter_target_word-word_dim25_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim25_en/module.py b/modules/text/embedding/glove_twitter_target_word-word_dim25_en/module.py new file mode 100644 index 00000000..90ffad8f --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim25_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="glove_twitter_target_word-word_dim25_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="glove.twitter.target.word-word.dim25.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim50_en/README.md b/modules/text/embedding/glove_twitter_target_word-word_dim50_en/README.md new file mode 100644 index 00000000..3a11c3aa --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim50_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_twitter_target_word-word_dim50_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_twitter_target_word-word_dim50_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_twitter_target_word-word_dim50_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim50_en/__init__.py b/modules/text/embedding/glove_twitter_target_word-word_dim50_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_twitter_target_word-word_dim50_en/module.py b/modules/text/embedding/glove_twitter_target_word-word_dim50_en/module.py new file mode 100644 index 00000000..e492ac62 --- /dev/null +++ b/modules/text/embedding/glove_twitter_target_word-word_dim50_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="glove_twitter_target_word-word_dim50_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="glove.twitter.target.word-word.dim50.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/README.md b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/README.md new file mode 100644 index 00000000..c12a8079 --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_wiki2014-gigaword_target_word-word_dim100_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") 
+``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_wiki2014-gigaword_target_word-word_dim100_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_wiki2014-gigaword_target_word-word_dim100_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/__init__.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/module.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/module.py new file mode 100644 index 00000000..118ca98e --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim100_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="glove_wiki2014-gigaword_target_word-word_dim100_en", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="glove.wiki2014-gigaword.target.word-word.dim100.en", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/README.md b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/README.md new file mode 100644 index 00000000..a657a641 --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_wiki2014-gigaword_target_word-word_dim200_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") 
+``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_wiki2014-gigaword_target_word-word_dim200_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_wiki2014-gigaword_target_word-word_dim200_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/__init__.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/module.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/module.py new file mode 100644 index 00000000..2f271db7 --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim200_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim200_en",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained GloVe embedding
    ``glove.wiki2014-gigaword.target.word-word.dim200.en`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="glove.wiki2014-gigaword.target.word-word.dim200.en", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/README.md b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/README.md new file mode 100644 index 00000000..a9ab65c3 --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_wiki2014-gigaword_target_word-word_dim300_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") 
+``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_wiki2014-gigaword_target_word-word_dim300_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_wiki2014-gigaword_target_word-word_dim300_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/__init__.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/module.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/module.py new file mode 100644 index 00000000..8e4baa5c --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim300_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim300_en",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained GloVe embedding
    ``glove.wiki2014-gigaword.target.word-word.dim300.en`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="glove.wiki2014-gigaword.target.word-word.dim300.en", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/README.md b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/README.md new file mode 100644 index 00000000..9c9cc68a --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='glove_wiki2014-gigaword_target_word-word_dim50_en') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") 
+``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m glove_wiki2014-gigaword_target_word-word_dim50_en +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/glove_wiki2014-gigaword_target_word-word_dim50_en" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/__init__.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/module.py b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/module.py new file mode 100644 index 00000000..96b6eae4 --- /dev/null +++ b/modules/text/embedding/glove_wiki2014-gigaword_target_word-word_dim50_en/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@moduleinfo(
    name="glove_wiki2014-gigaword_target_word-word_dim50_en",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained GloVe embedding
    ``glove.wiki2014-gigaword.target.word-word.dim50.en`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="glove.wiki2014-gigaword.target.word-word.dim50.en", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/README.md new file mode 100644 index 00000000..a4058dac --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-character_char1-1_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") 
+# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-character_char1-1_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/module.py new file mode 100644 index 00000000..73a7ec54 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-1_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-1_dim300",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained word2vec embedding
    ``w2v.baidu_encyclopedia.context.word-character.char1-1.dim300`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="w2v.baidu_encyclopedia.context.word-character.char1-1.dim300", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/README.md new file mode 100644 index 00000000..e97dcbcb --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-character_char1-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") 
+# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-character_char1-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/module.py new file mode 100644 index 00000000..8683810b --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-2_dim300",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained word2vec embedding
    ``w2v.baidu_encyclopedia.context.word-character.char1-2.dim300`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="w2v.baidu_encyclopedia.context.word-character.char1-2.dim300", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/README.md new file mode 100644 index 00000000..3d46f6a2 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-character_char1-4_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") 
+# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-character_char1-4_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/module.py new file mode 100644 index 00000000..f82bf7fa --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-character_char1-4_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
@moduleinfo(
    name="w2v_baidu_encyclopedia_context_word-character_char1-4_dim300",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="nlp/semantic_model")
class Embedding(TokenEmbedding):
    """PaddleHub module wrapping the pretrained word2vec embedding
    ``w2v.baidu_encyclopedia.context.word-character.char1-4.dim300`` from
    ``paddlenlp.embeddings.TokenEmbedding``.
    """

    def __init__(self, *args, **kwargs):
        """Instantiate the embedding; extra arguments are forwarded verbatim
        to ``TokenEmbedding.__init__``.
        """
        super().__init__(embedding_name="w2v.baidu_encyclopedia.context.word-character.char1-4.dim300", *args, **kwargs)

    @serving
    def calc_similarity(self, data: List[List[str]]):
        """Return the cosine similarity (as a string) for each word pair.

        Args:
            data: a list of ``[word_a, word_b]`` pairs.

        Raises:
            RuntimeError: if a pair does not hold exactly two strings, or if
                either word is out of vocabulary (maps to the unknown token).
        """
        similarities = []
        for pair in data:
            if len(pair) != 2:
                raise RuntimeError(
                    f'The input must have two words, but got {len(pair)}. Please check your inputs.')
            first, second = pair
            if not (isinstance(first, str) and isinstance(second, str)):
                raise RuntimeError(
                    f'The types of text pair must be (str, str), but got'
                    f' ({type(first).__name__}, {type(second).__name__}). Please check your inputs.')

            # An OOV word resolves to the unknown token's index; reject it
            # instead of silently comparing against the <unk> vector.
            unk_idx = self.get_idx_from_word(self.vocab.unk_token)
            for token in (first, second):
                if self.get_idx_from_word(token) == unk_idx:
                    raise RuntimeError(
                        f'Word "{token}" is not in vocab. Please check your inputs.')
            similarities.append(str(self.cosine_sim(first, second)))
        return similarities
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/README.md new file mode 100644 index 00000000..8895eeb6 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 
+embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/module.py new file mode 100644 index 00000000..fad914ee --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-ngram_1-2_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/README.md new file mode 100644 index 00000000..a0177dd8 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 
+embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/module.py new file mode 100644 index 00000000..1917fe2f --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-ngram_1-3_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/README.md new file mode 100644 index 00000000..cef67685 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 
+embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/module.py new file mode 100644 index 00000000..65eea6a7 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-ngram_2-2_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/README.md new file mode 100644 index 00000000..c13497cd --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-wordLR_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") 
+``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-wordLR_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-wordLR_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/module.py new file mode 100644 index 00000000..5f36eabc --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordLR_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-wordLR_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-wordLR.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/README.md new file mode 100644 index 00000000..f94fa50e --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-wordPosition_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 
+embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-wordPosition_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-wordPosition_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/module.py new file mode 100644 index 00000000..411623e9 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-wordPosition_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-wordPosition_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-wordPosition.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/README.md new file mode 100644 index 00000000..681f7937 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_context_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 
部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_context_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_context_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/module.py new file mode 100644 index 00000000..3b7fa87d --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_context_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_context_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.context.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..ec1e8e06 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + 
+## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..6606db71 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/README.md new file mode 100644 index 00000000..be277781 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-character_char1-1_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 
计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-character_char1-1_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/module.py new file mode 100644 index 00000000..89c5c4c6 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-1_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-character_char1-1_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-character.char1-1.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/README.md new file mode 100644 index 00000000..a098b2bf --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-character_char1-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 
计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-character_char1-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/module.py new file mode 100644 index 00000000..391595c4 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-character_char1-2_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-character.char1-2.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/README.md new file mode 100644 index 00000000..f3edfa49 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-character_char1-4_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 
计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-character_char1-4_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/module.py new file mode 100644 index 00000000..edf2a64f --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-character_char1-4_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-character_char1-4_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-character.char1-4.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/README.md new file mode 100644 index 00000000..b1dc0fff --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", 
"美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/module.py new file mode 100644 index 00000000..5a423485 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-ngram_1-2_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/README.md new file mode 100644 index 00000000..3b05771e --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", 
"美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/module.py new file mode 100644 index 00000000..245a2f11 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-ngram_1-3_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/README.md new file mode 100644 index 00000000..41066f53 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", 
"美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/module.py new file mode 100644 index 00000000..fd0a3afb --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-ngram_2-2_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/README.md new file mode 100644 index 00000000..fbdc48d1 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-wordLR_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + 
+## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-wordLR_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-wordLR_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/module.py new file mode 100644 index 00000000..687b5ac2 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordLR_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-wordLR_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-wordLR.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/README.md new file mode 100644 index 00000000..626a7325 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-wordPosition_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 
+embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-wordPosition_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-wordPosition_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/module.py new file mode 100644 index 00000000..478c1771 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-wordPosition_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-wordPosition_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-wordPosition.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/README.md b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/README.md new file mode 100644 index 00000000..88d46d57 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_baidu_encyclopedia_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 
+ +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_baidu_encyclopedia_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_baidu_encyclopedia_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/module.py b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/module.py new file mode 100644 index 00000000..03e3d582 --- /dev/null +++ b/modules/text/embedding/w2v_baidu_encyclopedia_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_baidu_encyclopedia_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_financial_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_financial_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..45a110ab --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_financial_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_financial_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_financial_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_financial_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_financial_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_financial_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_financial_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..13c80a0d --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_financial_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.financial.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_financial_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_financial_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..e8d0bac0 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_financial_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_financial_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_financial_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_financial_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_financial_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_financial_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_financial_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..254f78a1 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_financial_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.financial.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_financial_target_word-char_dim300/README.md b/modules/text/embedding/w2v_financial_target_word-char_dim300/README.md new file mode 100644 index 00000000..03547a64 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_financial_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_financial_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_financial_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_financial_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_financial_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_financial_target_word-char_dim300/module.py b/modules/text/embedding/w2v_financial_target_word-char_dim300/module.py new file mode 100644 index 00000000..b7b4ff88 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_financial_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.financial.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_financial_target_word-word_dim300/README.md b/modules/text/embedding/w2v_financial_target_word-word_dim300/README.md new file mode 100644 index 00000000..5df79b28 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_financial_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_financial_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_financial_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_financial_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_financial_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_financial_target_word-word_dim300/module.py b/modules/text/embedding/w2v_financial_target_word-word_dim300/module.py new file mode 100644 index 00000000..a7937c05 --- /dev/null +++ b/modules/text/embedding/w2v_financial_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_financial_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.financial.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_literature_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_literature_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..b216ddca --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_literature_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_literature_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_literature_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_literature_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_literature_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_literature_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_literature_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..34e1b7a1 --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_literature_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.literature.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_literature_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_literature_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..16a31a13 --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_literature_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_literature_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_literature_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_literature_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_literature_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_literature_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_literature_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..01a07ccd --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_literature_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.literature.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_literature_target_word-char_dim300/README.md b/modules/text/embedding/w2v_literature_target_word-char_dim300/README.md new file mode 100644 index 00000000..5635a68b --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_literature_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_literature_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_literature_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_literature_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_literature_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_literature_target_word-char_dim300/module.py b/modules/text/embedding/w2v_literature_target_word-char_dim300/module.py new file mode 100644 index 00000000..966ae9c7 --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_literature_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.literature.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_literature_target_word-word_dim300/README.md b/modules/text/embedding/w2v_literature_target_word-word_dim300/README.md new file mode 100644 index 00000000..4b5eda30 --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_literature_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_literature_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_literature_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_literature_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_literature_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_literature_target_word-word_dim300/module.py b/modules/text/embedding/w2v_literature_target_word-word_dim300/module.py new file mode 100644 index 00000000..a40b2a5a --- /dev/null +++ b/modules/text/embedding/w2v_literature_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_literature_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.literature.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/README.md b/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/README.md new file mode 100644 index 00000000..7c4132a6 --- /dev/null +++ b/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_mixed-large_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_mixed-large_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_mixed-large_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/module.py b/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/module.py new file mode 100644 index 00000000..3ef93dd4 --- /dev/null +++ b/modules/text/embedding/w2v_mixed-large_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_mixed-large_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.mixed-large.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/README.md b/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/README.md new file mode 100644 index 00000000..09b9d58f --- /dev/null +++ b/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_mixed-large_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_mixed-large_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_mixed-large_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/module.py b/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/module.py new file mode 100644 index 00000000..b70a0a42 --- /dev/null +++ b/modules/text/embedding/w2v_mixed-large_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_mixed-large_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.mixed-large.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..b96c6b68 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_people_daily_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_people_daily_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_people_daily_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..2626de97 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_people_daily_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.people_daily.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..05cd6342 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_people_daily_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_people_daily_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_people_daily_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..8fd75b80 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_people_daily_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.people_daily.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_people_daily_target_word-char_dim300/README.md b/modules/text/embedding/w2v_people_daily_target_word-char_dim300/README.md new file mode 100644 index 00000000..891a89be --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_people_daily_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_people_daily_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_people_daily_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_people_daily_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_people_daily_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_people_daily_target_word-char_dim300/module.py b/modules/text/embedding/w2v_people_daily_target_word-char_dim300/module.py new file mode 100644 index 00000000..4adef19e --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_people_daily_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.people_daily.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_people_daily_target_word-word_dim300/README.md b/modules/text/embedding/w2v_people_daily_target_word-word_dim300/README.md new file mode 100644 index 00000000..31626338 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_people_daily_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_people_daily_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_people_daily_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_people_daily_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_people_daily_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_people_daily_target_word-word_dim300/module.py b/modules/text/embedding/w2v_people_daily_target_word-word_dim300/module.py new file mode 100644 index 00000000..a30b51c0 --- /dev/null +++ b/modules/text/embedding/w2v_people_daily_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_people_daily_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.people_daily.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..8f329e93 --- /dev/null +++ b/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sikuquanshu_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sikuquanshu_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sikuquanshu_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..43030c21 --- /dev/null +++ b/modules/text/embedding/w2v_sikuquanshu_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sikuquanshu_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sikuquanshu.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/README.md b/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/README.md new file mode 100644 index 00000000..e6116a9f --- /dev/null +++ b/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sikuquanshu_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sikuquanshu_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sikuquanshu_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/module.py b/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/module.py new file mode 100644 index 00000000..e16fac97 --- /dev/null +++ b/modules/text/embedding/w2v_sikuquanshu_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sikuquanshu_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sikuquanshu.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..c3fb7a2d --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sogou_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sogou_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sogou_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..a0cf623f --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sogou_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sogou.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..909f51a0 --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sogou_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sogou_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sogou_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..472fd1ae --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sogou_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sogou.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sogou_target_word-char_dim300/README.md b/modules/text/embedding/w2v_sogou_target_word-char_dim300/README.md new file mode 100644 index 00000000..a319c5e1 --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sogou_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sogou_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sogou_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sogou_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_sogou_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sogou_target_word-char_dim300/module.py b/modules/text/embedding/w2v_sogou_target_word-char_dim300/module.py new file mode 100644 index 00000000..cc6ac716 --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sogou_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sogou.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_sogou_target_word-word_dim300/README.md b/modules/text/embedding/w2v_sogou_target_word-word_dim300/README.md new file mode 100644 index 00000000..192315a8 --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_sogou_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_sogou_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_sogou_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_sogou_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_sogou_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_sogou_target_word-word_dim300/module.py b/modules/text/embedding/w2v_sogou_target_word-word_dim300/module.py new file mode 100644 index 00000000..0c057c21 --- /dev/null +++ b/modules/text/embedding/w2v_sogou_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_sogou_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.sogou.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..76b525c9 --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_weibo_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_weibo_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_weibo_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..34caab57 --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_weibo_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.weibo.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..38f62f7f --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_weibo_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_weibo_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_weibo_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..09ee9de3 --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_weibo_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.weibo.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_weibo_target_word-char_dim300/README.md b/modules/text/embedding/w2v_weibo_target_word-char_dim300/README.md new file mode 100644 index 00000000..dcb7409c --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_weibo_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_weibo_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_weibo_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_weibo_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_weibo_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_weibo_target_word-char_dim300/module.py b/modules/text/embedding/w2v_weibo_target_word-char_dim300/module.py new file mode 100644 index 00000000..af00a323 --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_weibo_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.weibo.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_weibo_target_word-word_dim300/README.md b/modules/text/embedding/w2v_weibo_target_word-word_dim300/README.md new file mode 100644 index 00000000..24ffa2bc --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_weibo_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_weibo_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_weibo_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_weibo_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_weibo_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_weibo_target_word-word_dim300/module.py b/modules/text/embedding/w2v_weibo_target_word-word_dim300/module.py new file mode 100644 index 00000000..bf54b019 --- /dev/null +++ b/modules/text/embedding/w2v_weibo_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_weibo_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.weibo.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..52e893ed --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_wiki_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_wiki_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_wiki_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..02099397 --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_wiki_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.wiki.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..49561ce2 --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_wiki_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_wiki_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_wiki_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..86eefc43 --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_wiki_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.wiki.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_wiki_target_word-char_dim300/README.md b/modules/text/embedding/w2v_wiki_target_word-char_dim300/README.md new file mode 100644 index 00000000..da0788fe --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_wiki_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### 
Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_wiki_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_wiki_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_wiki_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_wiki_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_wiki_target_word-char_dim300/module.py b/modules/text/embedding/w2v_wiki_target_word-char_dim300/module.py new file mode 100644 index 00000000..68626b10 --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_wiki_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.wiki.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_wiki_target_word-word_dim300/README.md b/modules/text/embedding/w2v_wiki_target_word-word_dim300/README.md new file mode 100644 index 00000000..1e7cd83e --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_wiki_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### 
Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_wiki_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_wiki_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_wiki_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_wiki_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_wiki_target_word-word_dim300/module.py b/modules/text/embedding/w2v_wiki_target_word-word_dim300/module.py new file mode 100644 index 00000000..b217b0b3 --- /dev/null +++ b/modules/text/embedding/w2v_wiki_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_wiki_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.wiki.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/README.md b/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/README.md new file mode 100644 index 00000000..6c6a8112 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_zhihu_target_bigram-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_zhihu_target_bigram-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_zhihu_target_bigram-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/__init__.py b/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/module.py b/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/module.py new file mode 100644 index 00000000..ffad06f6 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_bigram-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_zhihu_target_bigram-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.zhihu.target.bigram-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/README.md b/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/README.md new file mode 100644 index 00000000..9321a414 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_zhihu_target_word-bigram_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub 
Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_zhihu_target_word-bigram_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_zhihu_target_word-bigram_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/__init__.py b/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/module.py b/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/module.py new file mode 100644 index 00000000..cb9840c7 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-bigram_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_zhihu_target_word-bigram_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.zhihu.target.word-bigram.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_zhihu_target_word-char_dim300/README.md b/modules/text/embedding/w2v_zhihu_target_word-char_dim300/README.md new file mode 100644 index 00000000..c7310263 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-char_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_zhihu_target_word-char_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_zhihu_target_word-char_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_zhihu_target_word-char_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_zhihu_target_word-char_dim300/__init__.py b/modules/text/embedding/w2v_zhihu_target_word-char_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_zhihu_target_word-char_dim300/module.py b/modules/text/embedding/w2v_zhihu_target_word-char_dim300/module.py new file mode 100644 index 00000000..84cc17d9 --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-char_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_zhihu_target_word-char_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.zhihu.target.word-char.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. 
Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results diff --git a/modules/text/embedding/w2v_zhihu_target_word-word_dim300/README.md b/modules/text/embedding/w2v_zhihu_target_word-word_dim300/README.md new file mode 100644 index 00000000..053532bb --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-word_dim300/README.md @@ -0,0 +1,127 @@ +## 概述 +PaddleHub提供多个开源的预训练Embedding模型。这些Embedding模型可根据不同语料、不同训练方式和不同的维度进行区分,关于模型的具体信息可参考PaddleNLP的文档:[Embedding模型汇总](https://github.com/PaddlePaddle/models/blob/release/2.0-beta/PaddleNLP/docs/embeddings.md) + +## API + +```python +def __init__( + *args, + **kwargs +) +``` + +创建一个Embedding Module对象,默认无需参数。 + +**参数** +* `*args`: 用户额外指定的列表类型的参数。 +* `**kwargs`:用户额外指定的关键字字典类型的参数。 + +关于额外参数的详情可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + + +```python +def search( + words: Union[List[str], str, int], +) +``` + +获取一个或多个词的embedding。输入可以是`str`、`List[str]`和`int`类型,分别代表获取一个词,多个词和指定词编号的embedding,词的编号和模型的词典相关,词典可通过模型实例的`vocab`属性获取。 + +**参数** +* `words`: 需要获取的词向量的词、词列表或者词编号。 + + +```python +def cosine_sim( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的余弦相似度。需要注意的是`word_a`和`word_b`都需要是词典里的单词,否则将会被认为是OOV(Out-Of-Vocabulary),同时被替换为`unknown_token`。 + +**参数** +* `word_a`: 需要计算余弦相似度的单词a。 +* `word_b`: 需要计算余弦相似度的单词b。 + + +```python +def dot( + word_a: str, + word_b: str, +) +``` +计算两个词embedding的内积。对于输入单词同样需要注意OOV问题。 + +**参数** +* `word_a`: 需要计算内积的单词a。 +* `word_b`: 需要计算内积的单词b。 + +更多api详情和用法可参考[paddlenlp.embeddings](https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings) + +## 代码示例 + +```python +import paddlehub as hub +embedding = hub.Module(name='w2v_zhihu_target_word-word_dim300') + +# 获取单词的embedding +embedding.search("中国") +# 计算两个词向量的余弦相似度 +embedding.cosine_sim("中国", "美国") +# 计算两个词向量的内积 +embedding.dot("中国", "美国") +``` + +## 部署服务 + +通过PaddleHub Serving,可以部署一个在线获取两个词向量的余弦相似度的服务。 + 
+### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m w2v_zhihu_target_word-word_dim300 +``` + +这样就完成了一个获取词向量的余弦相似度服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前,请设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果 + +```python +import requests +import json + +# 指定用于计算余弦相似度的单词对[[word_a, word_b], [word_a, word_b], ... ]] +word_pairs = [["中国", "美国"], ["今天", "明天"]] +# 以key的方式指定word_pairs传入预测方法的时的参数,此例中为"data",对于每一对单词,调用cosine_sim进行余弦相似度的计算 +data = {"data": word_pairs} +# 发送post请求,content-type类型应指定json方式,url中的ip地址需改为对应机器的ip +url = "http://10.12.121.132:8866/predict/w2v_zhihu_target_word-word_dim300" +# 指定post请求的headers为application/json方式 +headers = {"Content-Type": "application/json"} + +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +print(r.json()) +``` + +## 查看代码 + +https://github.com/PaddlePaddle/models/tree/release/2.0-beta/PaddleNLP/paddlenlp/embeddings + +## 依赖 + +paddlepaddle >= 2.0.0 + +paddlehub >= 2.0.0 + +## 更新历史 + +* 1.0.0 + + 初始发布 + diff --git a/modules/text/embedding/w2v_zhihu_target_word-word_dim300/__init__.py b/modules/text/embedding/w2v_zhihu_target_word-word_dim300/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/text/embedding/w2v_zhihu_target_word-word_dim300/module.py b/modules/text/embedding/w2v_zhihu_target_word-word_dim300/module.py new file mode 100644 index 00000000..a1bc8a5c --- /dev/null +++ b/modules/text/embedding/w2v_zhihu_target_word-word_dim300/module.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from paddlenlp.embeddings import TokenEmbedding +from paddlehub.module.module import moduleinfo, serving + + +@moduleinfo( + name="w2v_zhihu_target_word-word_dim300", + version="1.0.0", + summary="", + author="paddlepaddle", + author_email="", + type="nlp/semantic_model") +class Embedding(TokenEmbedding): + """ + Embedding model + """ + def __init__(self, *args, **kwargs): + super(Embedding, self).__init__(embedding_name="w2v.zhihu.target.word-word.dim300", *args, **kwargs) + + @serving + def calc_similarity(self, data: List[List[str]]): + """ + Calculate similarities of giving word pairs. + """ + results = [] + for word_pair in data: + if len(word_pair) != 2: + raise RuntimeError( + f'The input must have two words, but got {len(word_pair)}. Please check your inputs.') + if not isinstance(word_pair[0], str) or not isinstance(word_pair[1], str): + raise RuntimeError( + f'The types of text pair must be (str, str), but got' + f' ({type(word_pair[0]).__name__}, {type(word_pair[1]).__name__}). Please check your inputs.') + + for word in word_pair: + if self.get_idx_from_word(word) == \ + self.get_idx_from_word(self.vocab.unk_token): + raise RuntimeError( + f'Word "{word}" is not in vocab. Please check your inputs.') + results.append(str(self.cosine_sim(*word_pair))) + return results -- GitLab