Unverified commit e59f15a1 authored by Jack Zhou, committed by GitHub

Add TokenEmbedding (#4983)

* Add TokenEmbedding

* download corpus embedding data
* load embedding data by specifying corpus name
* extend the tokenizer's vocab from corpus embedding data

* add unk token setting

* modify tokenizer

* add extended vocab

* move jieba tokenizer and rename corpus_name->embedding_name

* use bos url instead of localhost

* add log when loading data

* add token dot computation; add __repr__ of TokenEmbedding

* add color logging

* use paddlenlp.utils.log

* adjust repr

* update pretrained embedding table

* fix padding idx
Parent 26368f23
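
For context, a minimal usage sketch of the API added in this commit, assuming the new modules are importable as paddlenlp.embeddings and paddlenlp.data (names follow the code in the diff below):

    from paddlenlp.embeddings import TokenEmbedding, list_embedding_name

    print(list_embedding_name())
    # ['w2v.baidu_encyclopedia.target.word-word.dim300']

    # First use downloads and caches the pretrained table, then loads it.
    embedding = TokenEmbedding(
        embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
    vectors = embedding.search(["中国", "北京"])  # numpy array, shape (2, 300) for this dim300 table
    print(embedding.dot("中国", "北京"))
    print(embedding.cosine_sim("中国", "北京"))
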
......@@ -27,10 +27,7 @@ from paddlenlp.datasets import GlueQNLI, GlueSST2
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from paddlenlp.utils.log import logger
TASK_CLASSES = {
"qnli": (GlueQNLI, paddle.metric.Accuracy), # (dataset, metric)
......
......@@ -32,10 +32,7 @@ from paddle.io import DataLoader, Dataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion
from paddlenlp.transformers import BertTokenizer
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from paddlenlp.utils.log import logger
MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer), }
......
......@@ -31,12 +31,9 @@ from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP,
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.utils.log import logger
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
TASK_CLASSES = {
"cola": (GlueCoLA, Mcc),
"sst-2": (GlueSST2, Accuracy),
......
......@@ -12,10 +12,7 @@ import paddle.distributed as dist
import reader
from paddlenlp.transformers import TransformerModel, CrossEntropyCriterion, position_encoding_init
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from paddlenlp.utils.log import logger
def parse_args():
......
......@@ -27,10 +27,7 @@ from pgl.contrib.imperative.graph_tensor import GraphTensor
from models import ErnieSageForLinkPrediction
from data import GraphDataset, TrainData, PredictData, GraphDataLoader
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from paddlenlp.utils.log import logger
def set_seed(config):
......
......@@ -14,4 +14,5 @@
from .collate import *
from .vocab import *
from .sampler import *
\ No newline at end of file
from .sampler import *
from .tokenizer import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import jieba
from .vocab import Vocab
def get_idx_from_word(word, word_to_idx, unk_word):
if word in word_to_idx:
return word_to_idx[word]
return word_to_idx[unk_word]
class BaseTokenizer(object):
def __init__(self, vocab):
self.vocab = vocab
def get_tokenizer(self):
return self.tokenizer
def cut(self, sentence):
pass
def encode(self, sentence):
pass
class JiebaTokenizer(BaseTokenizer):
def __init__(self, vocab):
super(JiebaTokenizer, self).__init__(vocab)
self.tokenizer = jieba.Tokenizer()
# initialize tokenizer
self.tokenizer.FREQ = {key: 1 for key in self.vocab.token_to_idx.keys()}
self.tokenizer.total = len(self.tokenizer.FREQ)
self.tokenizer.initialized = True
def cut(self, sentence, cut_all=False, use_hmm=True):
return self.tokenizer.lcut(sentence, cut_all, use_hmm)
def encode(self, sentence, cut_all=False, use_hmm=True):
words = self.cut(sentence, cut_all, use_hmm)
return [
get_idx_from_word(word, self.vocab.token_to_idx,
self.vocab.unk_token) for word in words
]
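
A small example of the two JiebaTokenizer entry points, assuming the vocab comes from a TokenEmbedding (see token_embedding.py further down in this diff):

    from paddlenlp.data import JiebaTokenizer
    from paddlenlp.embeddings import TokenEmbedding

    embedding = TokenEmbedding()  # default embedding name
    tokenizer = JiebaTokenizer(vocab=embedding.vocab)
    words = tokenizer.cut("自然语言处理")   # list of word strings
    ids = tokenizer.encode("自然语言处理")  # list of ids; out-of-vocab words map to the unk token id
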
......@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .token_embedding import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
import os.path as osp
URL_ROOT = "https://bj.bcebos.com/paddlenlp"
EMBEDDING_URL_ROOT = osp.join(URL_ROOT, "models/embeddings")
PAD_TOKEN = '[PAD]'
UNK_TOKEN = '[UNK]'
PAD_IDX = 0
UNK_IDX = 1
EMBEDDING_NAME_LIST = ["w2v.baidu_encyclopedia.target.word-word.dim300"]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
import os
import os.path as osp
import numpy as np
import logging
import paddle
import paddle.nn as nn
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import _get_sub_home, MODEL_HOME
from paddlenlp.utils.log import logger
from paddlenlp.data import Vocab, get_idx_from_word
from .constant import EMBEDDING_URL_ROOT, PAD_TOKEN, UNK_TOKEN, PAD_IDX, \
UNK_IDX, EMBEDDING_NAME_LIST
EMBEDDING_HOME = _get_sub_home('embeddings', parent_home=MODEL_HOME)
__all__ = ['list_embedding_name', 'TokenEmbedding']
def list_embedding_name():
return list(EMBEDDING_NAME_LIST)
class TokenEmbedding(nn.Embedding):
def __init__(self,
embedding_name=EMBEDDING_NAME_LIST[0],
unknown_token=UNK_TOKEN,
unknown_token_vector=None,
extended_vocab_path=None,
trainable=True):
embedding_name = embedding_name.lower()
vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
if not osp.exists(vector_path):
# download
url = osp.join(EMBEDDING_URL_ROOT, embedding_name + ".tar.gz")
get_path_from_url(url, EMBEDDING_HOME)
logger.info("Loading embedding vector...")
vector_np = np.load(vector_path)
self.embedding_dim = vector_np['embedding'].shape[1]
self.unknown_token = unknown_token
if unknown_token_vector is not None:
unk_vector = np.array(unknown_token_vector).astype(
paddle.get_default_dtype())
else:
unk_vector = np.random.normal(
scale=0.02,
size=self.embedding_dim).astype(paddle.get_default_dtype())
pad_vector = np.array(
[0] * self.embedding_dim).astype(paddle.get_default_dtype())
if extended_vocab_path is not None:
embedding_table = self._extend_vocab(extended_vocab_path, vector_np,
pad_vector, unk_vector)
trainable = True
else:
embedding_table = self._init_without_extend_vocab(
vector_np, pad_vector, unk_vector)
self.vocab = Vocab.from_dict(
self._word_to_idx, unk_token=unknown_token, pad_token=PAD_TOKEN)
self.num_embeddings = embedding_table.shape[0]
# import embedding
super(TokenEmbedding, self).__init__(
self.num_embeddings,
self.embedding_dim,
padding_idx=self._word_to_idx[PAD_TOKEN])
self.weight.set_value(embedding_table)
self.set_trainable(trainable)
logger.info("Finish loading embedding vector.")
def _init_without_extend_vocab(self, vector_np, pad_vector, unk_vector):
self._idx_to_word = list(vector_np['vocab'])
self._idx_to_word.insert(PAD_IDX, PAD_TOKEN)
self._idx_to_word.insert(UNK_IDX, self.unknown_token)
self._word_to_idx = self._construct_word_to_idx(self._idx_to_word)
# insert unk, pad embedding
embedding_table = np.insert(
vector_np['embedding'], [0], [pad_vector, unk_vector],
axis=0).astype(paddle.get_default_dtype())
return embedding_table
def _read_vocab_list_from_file(self, extended_vocab_path):
# load new vocab table from file
vocab_list = []
with open(extended_vocab_path, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip()
if line == "":
break
vocab = line.split()[0]
vocab_list.append(vocab)
return vocab_list
def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
unk_vector):
logger.info("Start extending vocab.")
extend_vocab_list = self._read_vocab_list_from_file(extended_vocab_path)
extend_vocab_set = set(extend_vocab_list)
# update idx_to_word
self._idx_to_word = extend_vocab_list
embedding_table = np.random.normal(
scale=0.02,
size=(len(self._idx_to_word),
self.embedding_dim)).astype(paddle.get_default_dtype())
self._idx_to_word.append(PAD_TOKEN)
embedding_table = np.append(embedding_table, [pad_vector], axis=0)
if self.unknown_token not in extend_vocab_set:
self._idx_to_word.append(self.unknown_token)
embedding_table = np.append(embedding_table, [unk_vector], axis=0)
self._word_to_idx = self._construct_word_to_idx(self._idx_to_word)
else:
self._word_to_idx = self._construct_word_to_idx(self._idx_to_word)
unk_idx = self._word_to_idx[self.unknown_token]
embedding_table[unk_idx] = unk_vector
pretrained_idx_to_word = list(vector_np['vocab'])
pretrained_word_to_idx = self._construct_word_to_idx(
pretrained_idx_to_word)
pretrained_embedding_table = np.array(vector_np['embedding'])
pretrained_vocab_set = set(pretrained_idx_to_word)
extend_vocab_set = set(self._idx_to_word)
vocab_intersection = pretrained_vocab_set & extend_vocab_set
vocab_subtraction = pretrained_vocab_set - extend_vocab_set
# assignment from pretrained_vocab_embedding to extend_vocab_embedding
pretrained_vocab_intersect_index = [
pretrained_word_to_idx[word] for word in vocab_intersection
]
pretrained_vocab_subtract_index = [
pretrained_word_to_idx[word] for word in vocab_subtraction
]
extend_vocab_intersect_index = [
self._word_to_idx[word] for word in vocab_intersection
]
embedding_table[
extend_vocab_intersect_index] = pretrained_embedding_table[
pretrained_vocab_intersect_index]
for idx in pretrained_vocab_subtract_index:
word = pretrained_idx_to_word[idx]
self._idx_to_word.append(word)
self._word_to_idx[word] = len(self._idx_to_word) - 1
embedding_table = np.append(
embedding_table,
pretrained_embedding_table[pretrained_vocab_subtract_index],
axis=0)
logger.info("Finish extending vocab.")
return embedding_table
def set_trainable(self, trainable):
self.weight.stop_gradient = not trainable
def search(self, words):
idx_list = self.get_idx_list_from_words(words)
idx_tensor = paddle.to_tensor(idx_list)
return self(idx_tensor).numpy()
def get_idx_from_word(self, word):
return get_idx_from_word(word, self.vocab.token_to_idx,
self.unknown_token)
def get_idx_list_from_words(self, words):
if isinstance(words, str):
idx_list = [self.get_idx_from_word(words)]
elif isinstance(words, int):
idx_list = [words]
elif isinstance(words, list) or isinstance(words, tuple):
idx_list = [
self.get_idx_from_word(word) if isinstance(word, str) else word
for word in words
]
else:
raise TypeError
return idx_list
def _dot_np(self, array_a, array_b):
return np.sum(array_a * array_b)
def _calc_word(self, word_a, word_b, calc_kernel):
embeddings = self.search([word_a, word_b])
embedding_a = embeddings[0]
embedding_b = embeddings[1]
return calc_kernel(embedding_a, embedding_b)
def dot(self, word_a, word_b):
dot = self._dot_np
return self._calc_word(word_a, word_b, lambda x, y: dot(x, y))
def cosine_sim(self, word_a, word_b):
dot = self._dot_np
return self._calc_word(
word_a, word_b,
lambda x, y: dot(x, y) / (np.sqrt(dot(x, x)) * np.sqrt(dot(y, y))))
def _construct_word_to_idx(self, idx_to_word):
word_to_idx = {}
for i, word in enumerate(idx_to_word):
word_to_idx[word] = i
return word_to_idx
def __repr__(self):
s = "Object type: {}\
\nPadding index: {}\
\nPadding token: {}\
\nUnknown index: {}\
\nUnknown token: {}\
\n{}".format(
super(TokenEmbedding, self).__repr__(),
self._word_to_idx[PAD_TOKEN], PAD_TOKEN,
self._word_to_idx[self.unknown_token], self.unknown_token,
self.weight)
return s
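
The extended_vocab_path argument expects a plain-text file with one token per line (only the first whitespace-separated field of each line is read, and reading stops at the first blank line); when it is given, the embedding table is always trainable. A hedged sketch, with my_vocab.txt as a hypothetical file:

    from paddlenlp.embeddings import TokenEmbedding

    embedding = TokenEmbedding(
        embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300",
        extended_vocab_path="my_vocab.txt")  # hypothetical vocab file
    # Tokens shared with the pretrained table keep their pretrained vectors,
    # tokens only in the file are drawn from N(0, 0.02), and pretrained tokens
    # missing from the file are appended at the end of the table.
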
......@@ -24,18 +24,10 @@ from paddle.nn import Layer
# TODO(fangzeyang) Temporary fix and replace by paddle framework downloader later
from paddlenlp.utils.downloader import get_path_from_url
from paddlenlp.utils.env import MODEL_HOME
from paddlenlp.utils.log import logger
from .utils import InitTrackerMeta, fn_args_to_dict
### FIXME(zhangxuefei): remove logging setting after logging format is clear
log = logging.getLogger(__name__)
formatter = logging.Formatter(
fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s')
console = logging.StreamHandler()
console.setFormatter(formatter)
log.addHandler(console)
log.setLevel(logging.DEBUG)
__all__ = [
'PretrainedModel',
'register_base_model',
......@@ -163,11 +155,11 @@ class PretrainedModel(Layer):
if file_path is None or os.path.isfile(file_path):
resolved_resource_files[file_id] = file_path
elif os.path.exists(path):
log.info("Already cached %s" % path)
logger.info("Already cached %s" % path)
resolved_resource_files[file_id] = path
else:
log.info("Downloading %s and saved to %s" %
(file_path, default_root))
logger.info("Downloading %s and saved to %s" %
(file_path, default_root))
resolved_resource_files[file_id] = get_path_from_url(
file_path, default_root)
......
......@@ -52,8 +52,7 @@ except:
sys.stderr.write('\n')
import logging
logger = logging.getLogger(__name__)
from .log import logger
__all__ = ['get_weights_path_from_url']
......
......@@ -40,8 +40,8 @@ def _get_ppnlp_home():
return os.path.join(_get_user_home(), '.paddlenlp')
def _get_sub_home(directory):
home = os.path.join(_get_ppnlp_home(), directory)
def _get_sub_home(directory, parent_home=_get_ppnlp_home()):
home = os.path.join(parent_home, directory)
if not os.path.exists(home):
os.makedirs(home)
return home
......
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import copy
import functools
import logging
import os
import sys
import time
import threading
from typing import List
import colorlog
from colorama import Fore
loggers = {}
log_config = {
'DEBUG': {
'level': 10,
'color': 'purple'
},
'INFO': {
'level': 20,
'color': 'green'
},
'TRAIN': {
'level': 21,
'color': 'cyan'
},
'EVAL': {
'level': 22,
'color': 'blue'
},
'WARNING': {
'level': 30,
'color': 'yellow'
},
'ERROR': {
'level': 40,
'color': 'red'
},
'CRITICAL': {
'level': 50,
'color': 'bold_red'
}
}
class Logger(object):
'''
Default logger in PaddleNLP
Args:
name(str) : Logger name, default is 'PaddleNLP'
'''
def __init__(self, name: str=None):
name = 'PaddleNLP' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(self.__call__,
conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={
key: conf['color']
for key, conf in log_config.items()
})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float=0.1):
'''
Continuously print a rotating progress indicator while the wrapped block runs.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
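
The logger exposes the standard levels plus the custom TRAIN (21) and EVAL (22) levels, and a processing() context manager that prints a rotating indicator while a block runs. A brief example:

    import time
    from paddlenlp.utils.log import logger

    logger.info("plain info message")
    logger.train("step 100, loss 2.31")  # custom TRAIN level
    logger.eval("dev accuracy 0.91")     # custom EVAL level

    with logger.processing("Downloading embedding"):
        time.sleep(1)  # the indicator spins on one line until the block exits
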
visualdl
jieba
h5py
\ No newline at end of file
h5py
colorlog
colorama