diff --git a/PaddleNLP/docs/embeddings.md b/PaddleNLP/docs/embeddings.md index 1b9488bf26d6de72d9a741df4f977dbc673db43a..dc375828b4fa649d268458e50f1ef7dee6ccb574 100644 --- a/PaddleNLP/docs/embeddings.md +++ b/PaddleNLP/docs/embeddings.md @@ -31,14 +31,14 @@ PaddleNLP提供多个开源的预训练Embedding模型,用户仅需在使用`p | Co-occurrence 类型 | 目标词向量 | 上下文词向量 | | --------------------------- | ------ | ---- | | Word → Word | w2v.baidu_encyclopedia.target.word-word.dim300 | w2v.baidu_encyclopedia.context.word-word.dim300 | -| Word → Ngram (1-2) | w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300 | 暂无 | -| Word → Ngram (1-3) | 暂无 | 暂无 | -| Ngram (1-2) → Ngram (1-2)| 暂无 | 暂无 | +| Word → Ngram (1-2) | w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300 | w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300 | +| Word → Ngram (1-3) | w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300 | w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300 | +| Ngram (1-2) → Ngram (1-2)| w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300 | w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300 | | Word → Character (1) | w2v.baidu_encyclopedia.target.word-character.char1-1.dim300 | w2v.baidu_encyclopedia.context.word-character.char1-1.dim300 | | Word → Character (1-2) | w2v.baidu_encyclopedia.target.word-character.char1-2.dim300 | w2v.baidu_encyclopedia.context.word-character.char1-2.dim300 | | Word → Character (1-4) | w2v.baidu_encyclopedia.target.word-character.char1-4.dim300 | w2v.baidu_encyclopedia.context.word-character.char1-4.dim300 | -| Word → Word (left/right) | 暂无 | 暂无 | -| Word → Word (distance) | 暂无 | 暂无 | +| Word → Word (left/right) | w2v.baidu_encyclopedia.target.word-wordLR.dim300 | w2v.baidu_encyclopedia.context.word-wordLR.dim300 | +| Word → Word (distance) | w2v.baidu_encyclopedia.target.word-wordPosition.dim300 | w2v.baidu_encyclopedia.context.word-wordPosition.dim300 | ## 英文词向量 diff --git a/PaddleNLP/examples/text_generation/vae-seq2seq/README.md 
b/PaddleNLP/examples/text_generation/vae-seq2seq/README.md index b403a32aefd51cdc9a3089fe4f9cb3a5c1371157..52e17a2e6be2924b00619833bd744d4aa9c838cb 100644 --- a/PaddleNLP/examples/text_generation/vae-seq2seq/README.md +++ b/PaddleNLP/examples/text_generation/vae-seq2seq/README.md @@ -117,6 +117,7 @@ python predict.py \ --max_grad_norm 5.0 \ --dataset yahoo \ --use_gpu True \ + --infer_output_file infer_output.txt \ --init_from_ckpt yahoo_model/49 \ ``` diff --git a/PaddleNLP/examples/word_embedding/README.md b/PaddleNLP/examples/word_embedding/README.md index d046885332b559f8c7f12cea99b46acd7fb7fa74..898db75ebd982d9c4e407c319d191544f4676258 100644 --- a/PaddleNLP/examples/word_embedding/README.md +++ b/PaddleNLP/examples/word_embedding/README.md @@ -58,7 +58,7 @@ nohup python train.py --vocab_path='./dict.txt' --use_gpu=True --lr=1e-4 --batch 以上参数表示: * `vocab_path`: 词汇表文件路径。 -* `use_gpu`: 是否使用GPU进行训练, 默认为`False`。 +* `use_gpu`: 是否使用GPU进行训练, 默认为`True`。 * `lr`: 学习率, 默认为5e-4。 * `batch_size`: 运行一个batch大小,默认为64。 * `epochs`: 训练轮次,默认为5。 @@ -96,3 +96,7 @@ Eval Acc: ## 致谢 - 感谢 [Chinese-Word-Vectors](https://github.com/Embedding/Chinese-Word-Vectors)提供Word2Vec中文Embedding来源。 + +## 参考论文 +- Li, Shen, et al. "Analogical reasoning on chinese morphological and semantic relations." arXiv preprint arXiv:1805.06504 (2018). +- Qiu, Yuanyuan, et al. "Revisiting correlations between intrinsic and extrinsic evaluations of word embeddings." Chinese Computational Linguistics and Natural Language Processing Based on Naturally Annotated Big Data. Springer, Cham, 2018. 209-221. 
diff --git a/PaddleNLP/paddlenlp/embeddings/README.md b/PaddleNLP/paddlenlp/embeddings/README.md index 771d8772e0c5378d3e2adb8446147f9754915e77..41855f114a3149f8bd7d66cbcb52f5fd45ee1348 100644 --- a/PaddleNLP/paddlenlp/embeddings/README.md +++ b/PaddleNLP/paddlenlp/embeddings/README.md @@ -25,20 +25,21 @@ token_embedding = TokenEmbedding(embedding_name="w2v.baidu_encyclopedia.target.w # 查看token_embedding详情 print(token_embedding) -Object type: -Unknown index: 1 +Object type: +Unknown index: 635963 Unknown token: [UNK] -Padding index: 0 +Padding index: 635964 Padding token: [PAD] Parameter containing: -Tensor(shape=[636015, 300], dtype=float32, place=CPUPlace, stop_gradient=False, - [[ 0. , 0. , 0. , ..., 0. , 0. , 0. ], - [ 0.00372404, 0.01534354, 0.01341010, ..., -0.00605236, -0.02150303, 0.02372430], - [-0.24200200, 0.13931701, 0.07378800, ..., 0.14103900, 0.05592300, -0.08004800], +Tensor(shape=[635965, 300], dtype=float32, place=CPUPlace, stop_gradient=False, + [[-0.24200200, 0.13931701, 0.07378800, ..., 0.14103900, 0.05592300, -0.08004800], + [-0.08671700, 0.07770800, 0.09515300, ..., 0.11196400, 0.03082200, -0.12893000], + [-0.11436500, 0.12201900, 0.02833000, ..., 0.11068700, 0.03607300, -0.13763499], ..., - [ 0.01615800, -0.00266300, -0.00628300, ..., 0.01484100, 0.00196600, -0.01032000], - [ 0.01705700, 0.00040400, -0.01222000, ..., 0.02837200, 0.02402500, -0.00814800], - [ 0.02628800, -0.00008300, -0.00393500, ..., 0.00654000, 0.00024600, -0.00662600]]) + [ 0.02628800, -0.00008300, -0.00393500, ..., 0.00654000, 0.00024600, -0.00662600], + [-0.00924490, 0.00652097, 0.01049327, ..., -0.01796000, 0.03498908, -0.02209341], + [ 0. , 0. , 0. , ..., 0. , 0. , 0. 
]]) + ``` ## 查询embedding结果 @@ -93,5 +94,5 @@ words = tokenizer.cut("中国人民") print(words) # ['中国人', '民'] tokens = tokenizer.encode("中国人民") -print(tokens) # [12532, 1336] +print(tokens) # [12530, 1334] ``` diff --git a/PaddleNLP/paddlenlp/embeddings/constant.py b/PaddleNLP/paddlenlp/embeddings/constant.py index 2498b6295995f8f1fac728dc2a6320b3b634da64..454b4c2265248edc30b6e7dd134e68fffb11a005 100644 --- a/PaddleNLP/paddlenlp/embeddings/constant.py +++ b/PaddleNLP/paddlenlp/embeddings/constant.py @@ -22,17 +22,27 @@ PAD_TOKEN = '[PAD]' UNK_TOKEN = '[UNK]' EMBEDDING_NAME_LIST = [ + # Word2Vec # baidu_encyclopedia "w2v.baidu_encyclopedia.target.word-word.dim300", "w2v.baidu_encyclopedia.target.word-character.char1-1.dim300", "w2v.baidu_encyclopedia.target.word-character.char1-2.dim300", "w2v.baidu_encyclopedia.target.word-character.char1-4.dim300", "w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300", + "w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300", + "w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300", + "w2v.baidu_encyclopedia.target.word-wordLR.dim300", + "w2v.baidu_encyclopedia.target.word-wordPosition.dim300", "w2v.baidu_encyclopedia.target.bigram-char.dim300", "w2v.baidu_encyclopedia.context.word-word.dim300", "w2v.baidu_encyclopedia.context.word-character.char1-1.dim300", "w2v.baidu_encyclopedia.context.word-character.char1-2.dim300", "w2v.baidu_encyclopedia.context.word-character.char1-4.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300", + "w2v.baidu_encyclopedia.context.word-wordLR.dim300", + "w2v.baidu_encyclopedia.context.word-wordPosition.dim300", # wikipedia "w2v.wiki.target.bigram-char.dim300", "w2v.wiki.target.word-char.dim300",