提交 545d975d 编写于 作者: S Steffy-zxf

1. Update nlp_reader: remove redundant returns

2. Fix "pre-set net" wording: pre-set net -> pre-defined net

3. remove version_compare usage
上级 99cda5db
......@@ -170,8 +170,8 @@ class TextClassifierTask(ClassifierTask):
num_classes,
feed_list,
data_reader,
token_feature=None,
feature=None,
token_feature=None,
network=None,
startup_program=None,
config=None,
......@@ -182,13 +182,13 @@ class TextClassifierTask(ClassifierTask):
num_classes: total labels of the text classification task.
feed_list(list): the variable name that will be feeded to the main program
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader.
token_feature(Variable): the feature will be used to connect the preset net. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None.
feature(Variable): the feature will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. Token_feature and feature couldn't be setted as the same time. One of them must be setted as not None. Default None.
network(str): the preset network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then token_feature must be seted and feature must be None.
main_program (object): the customized main_program, default None.
startup_program (object): the customized startup_program, default None.
feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `token_feature` and `feature` cannot be set at the same time; one of them must be set to a non-None value. Default None.
token_feature(Variable): the `token_feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None.
network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If `network` is set, then `token_feature` must be set and `feature` must be None.
main_program (object): the customized main program, default None.
startup_program (object): the customized startup program, default None.
config (RunConfig): run config for the task, such as batch_size, epoch, learning_rate setting and so on. Default None.
hidden_units(list): the element of hidden_units list is the full-connect layer size. It will add the full-connect layers to the program. Default None.
hidden_units(list): each element of the `hidden_units` list is the size of a fully-connected layer; these fully-connected layers will be added to the program. Default None.
metrics_choices(list): metrics used to the task, default ["acc"].
"""
if (not feature) and (not token_feature):
......@@ -247,7 +247,7 @@ class TextClassifierTask(ClassifierTask):
self.feature, length=self.seq_len_used)
if self.network:
# add preset net
# add pre-defined net
net_func = getattr(net.classification, self.network)
if self.network == 'dpcnn':
# deepcnn network is no need to unpad
......@@ -258,7 +258,7 @@ class TextClassifierTask(ClassifierTask):
logger.info(
"%s has been added in the TextClassifierTask!" % self.network)
else:
# not use preset net but to use fc net
# not use pre-defined net but to use fc net
cls_feats = fluid.layers.dropout(
x=self.feature,
dropout_prob=0.1,
......
......@@ -16,7 +16,6 @@
This module provide nets for text classification
"""
from paddlehub.common.utils import version_compare
import paddle
import paddle.fluid as fluid
......
......@@ -65,7 +65,6 @@ class BaseNLPReader(BaseReader):
logger.warning(
"use_task_id has been de discarded since PaddleHub v1.4.0, it's no necessary to feed task_ids now."
)
self.task_id = 0
self.Record_With_Label_Id = namedtuple(
'Record',
......@@ -287,36 +286,16 @@ class ClassifyReader(BaseNLPReader):
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if phase != "predict":
batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype("int64").reshape(
[-1, 1])
return_list += [batch_labels]
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens, batch_labels
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens
]
return return_list
......@@ -370,40 +349,20 @@ class SequenceLabelReader(BaseNLPReader):
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict":
batch_label_ids = [record.label_id for record in batch_records]
padded_label_ids = pad_batch_data(
batch_label_ids,
max_seq_len=self.max_seq_len,
pad_idx=len(self.label_map) - 1)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_label_ids, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, padded_label_ids,
batch_seq_lens
]
return_list += [padded_label_ids, batch_seq_lens]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens
]
return_list += [batch_seq_lens]
return return_list
......@@ -515,37 +474,18 @@ class MultiLabelClassifyReader(BaseNLPReader):
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict":
batch_labels_ids = [record.label_id for record in batch_records]
num_label = len(self.dataset.get_labels())
batch_labels = np.array(batch_labels_ids).astype("int64").reshape(
[-1, num_label])
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
return_list += [batch_labels]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids
]
return return_list
def _convert_example_to_record(self,
......@@ -635,37 +575,17 @@ class RegressionReader(BaseNLPReader):
max_seq_len=self.max_seq_len,
pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict":
batch_labels = [record.label_id for record in batch_records]
# the only diff with ClassifyReader: astype("float32")
batch_labels = np.array(batch_labels).astype("float32").reshape(
[-1, 1])
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids
]
return_list += [batch_labels]
return return_list
......@@ -832,6 +752,10 @@ class ReadingComprehensionReader(BaseNLPReader):
pad_idx=self.pad_id,
max_seq_len=self.max_seq_len)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids
]
if phase != "predict":
batch_start_position = [
record.start_position for record in batch_records
......@@ -844,33 +768,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_end_position = np.array(batch_end_position).astype(
"int64").reshape([-1, 1])
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids, batch_start_position,
batch_end_position
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_unique_ids,
batch_start_position, batch_end_position
]
return_list += [batch_start_position, batch_end_position]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_unique_ids
]
return return_list
def _prepare_batch_data(self, records, batch_size, phase=None):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册