提交 545d975d 编写于 作者: S Steffy-zxf

1. update nlp_reader, remove redundant returns

2. fix pre-set net spelling, pre-set net -> pre-defined net

3. remove version_compare usage
上级 99cda5db
...@@ -170,8 +170,8 @@ class TextClassifierTask(ClassifierTask): ...@@ -170,8 +170,8 @@ class TextClassifierTask(ClassifierTask):
num_classes, num_classes,
feed_list, feed_list,
data_reader, data_reader,
token_feature=None,
feature=None, feature=None,
token_feature=None,
network=None, network=None,
startup_program=None, startup_program=None,
config=None, config=None,
...@@ -182,13 +182,13 @@ class TextClassifierTask(ClassifierTask): ...@@ -182,13 +182,13 @@ class TextClassifierTask(ClassifierTask):
num_classes: total labels of the text classification task. num_classes: total labels of the text classification task.
feed_list(list): the variable name that will be feeded to the main program feed_list(list): the variable name that will be feeded to the main program
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader.
token_feature(Variable): the feature will be used to connect the preset net. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None. feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None.
feature(Variable): the feature will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. Token_feature and feature couldn't be setted as the same time. One of them must be setted as not None. Default None. token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None.
network(str): the preset network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then token_feature must be seted and feature must be None. network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None.
main_program (object): the customized main_program, default None. main_program (object): the customized main program, default None.
startup_program (object): the customized startup_program, default None. startup_program (object): the customized startup program, default None.
config (RunConfig): run config for the task, such as batch_size, epoch, learning_rate setting and so on. Default None. config (RunConfig): run config for the task, such as batch_size, epoch, learning_rate setting and so on. Default None.
hidden_units(list): the element of hidden_units list is the full-connect layer size. It will add the full-connect layers to the program. Default None. hidden_units(list): the element of `hidden_units` list is the full-connect layer size. It will add the full-connect layers to the program. Default None.
metrics_choices(list): metrics used to the task, default ["acc"]. metrics_choices(list): metrics used to the task, default ["acc"].
""" """
if (not feature) and (not token_feature): if (not feature) and (not token_feature):
...@@ -247,7 +247,7 @@ class TextClassifierTask(ClassifierTask): ...@@ -247,7 +247,7 @@ class TextClassifierTask(ClassifierTask):
self.feature, length=self.seq_len_used) self.feature, length=self.seq_len_used)
if self.network: if self.network:
# add preset net # add pre-defined net
net_func = getattr(net.classification, self.network) net_func = getattr(net.classification, self.network)
if self.network == 'dpcnn': if self.network == 'dpcnn':
# deepcnn network is no need to unpad # deepcnn network is no need to unpad
...@@ -258,7 +258,7 @@ class TextClassifierTask(ClassifierTask): ...@@ -258,7 +258,7 @@ class TextClassifierTask(ClassifierTask):
logger.info( logger.info(
"%s has been added in the TextClassifierTask!" % self.network) "%s has been added in the TextClassifierTask!" % self.network)
else: else:
# not use preset net but to use fc net # not use pre-defined net but to use fc net
cls_feats = fluid.layers.dropout( cls_feats = fluid.layers.dropout(
x=self.feature, x=self.feature,
dropout_prob=0.1, dropout_prob=0.1,
......
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
This module provide nets for text classification This module provide nets for text classification
""" """
from paddlehub.common.utils import version_compare
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
......
...@@ -65,7 +65,6 @@ class BaseNLPReader(BaseReader): ...@@ -65,7 +65,6 @@ class BaseNLPReader(BaseReader):
logger.warning( logger.warning(
"use_task_id has been de discarded since PaddleHub v1.4.0, it's no necessary to feed task_ids now." "use_task_id has been de discarded since PaddleHub v1.4.0, it's no necessary to feed task_ids now."
) )
self.task_id = 0
self.Record_With_Label_Id = namedtuple( self.Record_With_Label_Id = namedtuple(
'Record', 'Record',
...@@ -287,36 +286,16 @@ class ClassifyReader(BaseNLPReader): ...@@ -287,36 +286,16 @@ class ClassifyReader(BaseNLPReader):
max_seq_len=self.max_seq_len, max_seq_len=self.max_seq_len,
pad_idx=self.pad_id) pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if phase != "predict": if phase != "predict":
batch_labels = [record.label_id for record in batch_records] batch_labels = [record.label_id for record in batch_records]
batch_labels = np.array(batch_labels).astype("int64").reshape( batch_labels = np.array(batch_labels).astype("int64").reshape(
[-1, 1]) [-1, 1])
return_list += [batch_labels]
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens, batch_labels
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens
]
return return_list return return_list
...@@ -370,40 +349,20 @@ class SequenceLabelReader(BaseNLPReader): ...@@ -370,40 +349,20 @@ class SequenceLabelReader(BaseNLPReader):
max_seq_len=self.max_seq_len, max_seq_len=self.max_seq_len,
pad_idx=self.pad_id) pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict": if phase != "predict":
batch_label_ids = [record.label_id for record in batch_records] batch_label_ids = [record.label_id for record in batch_records]
padded_label_ids = pad_batch_data( padded_label_ids = pad_batch_data(
batch_label_ids, batch_label_ids,
max_seq_len=self.max_seq_len, max_seq_len=self.max_seq_len,
pad_idx=len(self.label_map) - 1) pad_idx=len(self.label_map) - 1)
return_list += [padded_label_ids, batch_seq_lens]
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_label_ids, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, padded_label_ids,
batch_seq_lens
]
else: else:
return_list = [ return_list += [batch_seq_lens]
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_seq_lens
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_seq_lens
]
return return_list return return_list
...@@ -515,37 +474,18 @@ class MultiLabelClassifyReader(BaseNLPReader): ...@@ -515,37 +474,18 @@ class MultiLabelClassifyReader(BaseNLPReader):
max_seq_len=self.max_seq_len, max_seq_len=self.max_seq_len,
pad_idx=self.pad_id) pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict": if phase != "predict":
batch_labels_ids = [record.label_id for record in batch_records] batch_labels_ids = [record.label_id for record in batch_records]
num_label = len(self.dataset.get_labels()) num_label = len(self.dataset.get_labels())
batch_labels = np.array(batch_labels_ids).astype("int64").reshape( batch_labels = np.array(batch_labels_ids).astype("int64").reshape(
[-1, num_label]) [-1, num_label])
return_list = [ return_list += [batch_labels]
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids
]
return return_list return return_list
def _convert_example_to_record(self, def _convert_example_to_record(self,
...@@ -635,37 +575,17 @@ class RegressionReader(BaseNLPReader): ...@@ -635,37 +575,17 @@ class RegressionReader(BaseNLPReader):
max_seq_len=self.max_seq_len, max_seq_len=self.max_seq_len,
pad_idx=self.pad_id) pad_idx=self.pad_id)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if phase != "predict": if phase != "predict":
batch_labels = [record.label_id for record in batch_records] batch_labels = [record.label_id for record in batch_records]
# the only diff with ClassifyReader: astype("float32") # the only diff with ClassifyReader: astype("float32")
batch_labels = np.array(batch_labels).astype("float32").reshape( batch_labels = np.array(batch_labels).astype("float32").reshape(
[-1, 1]) [-1, 1])
return_list = [ return_list += [batch_labels]
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_labels
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_labels
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids
]
return return_list return return_list
...@@ -832,6 +752,10 @@ class ReadingComprehensionReader(BaseNLPReader): ...@@ -832,6 +752,10 @@ class ReadingComprehensionReader(BaseNLPReader):
pad_idx=self.pad_id, pad_idx=self.pad_id,
max_seq_len=self.max_seq_len) max_seq_len=self.max_seq_len)
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids
]
if phase != "predict": if phase != "predict":
batch_start_position = [ batch_start_position = [
record.start_position for record in batch_records record.start_position for record in batch_records
...@@ -844,33 +768,8 @@ class ReadingComprehensionReader(BaseNLPReader): ...@@ -844,33 +768,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_end_position = np.array(batch_end_position).astype( batch_end_position = np.array(batch_end_position).astype(
"int64").reshape([-1, 1]) "int64").reshape([-1, 1])
return_list = [ return_list += [batch_start_position, batch_end_position]
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids, batch_start_position,
batch_end_position
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_unique_ids,
batch_start_position, batch_end_position
]
else:
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, batch_unique_ids
]
if self.use_task_id:
padded_task_ids = np.ones_like(
padded_token_ids, dtype="int64") * self.task_id
return_list = [
padded_token_ids, padded_position_ids, padded_text_type_ids,
input_mask, padded_task_ids, batch_unique_ids
]
return return_list return return_list
def _prepare_batch_data(self, records, batch_size, phase=None): def _prepare_batch_data(self, records, batch_size, phase=None):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册