#coding:utf-8
#  Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import OrderedDict
import numpy as np
import paddle
import paddle.fluid as fluid
import time

from paddlehub.common.logger import logger
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from paddlehub.reader.nlp_reader import ClassifyReader
import paddlehub.network as net

from .base_task import BaseTask


class ClassifierTask(BaseTask):
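    """
    Create a generic classification task.
    The given feature is passed through optional fully-connected hidden
    layers and a final fully-connected layer with softmax activation to
    predict the label.
    """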
    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        if metrics_choices == "default":
            metrics_choices = ["acc"]

        main_program = feature.block.program
        super(ClassifierTask, self).__init__(
            data_reader=data_reader,
            main_program=main_program,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            metrics_choices=metrics_choices)

        self.feature = feature
        self.num_classes = num_classes
        self.hidden_units = hidden_units

    def _build_net(self):
        cls_feats = self.feature
        if self.hidden_units is not None:
            for n_hidden in self.hidden_units:
                cls_feats = fluid.layers.fc(
                    input=cls_feats, size=n_hidden, act="relu")

        logits = fluid.layers.fc(
            input=cls_feats,
            size=self.num_classes,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)),
            act="softmax")

        self.ret_infers = fluid.layers.reshape(
            x=fluid.layers.argmax(logits, axis=1), shape=[-1, 1])

        return [logits]

    def _add_label(self):
        return [fluid.layers.data(name="label", dtype="int64", shape=[1])]

    def _add_loss(self):
        ce_loss = fluid.layers.cross_entropy(
            input=self.outputs[0], label=self.labels[0])
        return fluid.layers.mean(x=ce_loss)

    def _add_metrics(self):
        acc = fluid.layers.accuracy(input=self.outputs[0], label=self.labels[0])
        return [acc]

    @property
    def fetch_list(self):
        if self.is_train_phase or self.is_test_phase:
            return [self.labels[0].name, self.ret_infers.name
                    ] + [metric.name
                         for metric in self.metrics] + [self.loss.name]
        return [output.name for output in self.outputs]

    def _calculate_metrics(self, run_states):
        loss_sum = acc_sum = run_examples = 0
        run_step = run_time_used = 0
        all_labels = np.array([])
        all_infers = np.array([])

        for run_state in run_states:
            run_examples += run_state.run_examples
            run_step += run_state.run_step
            loss_sum += np.mean(
                run_state.run_results[-2]) * run_state.run_examples
            acc_sum += np.mean(
                run_state.run_results[2]) * run_state.run_examples
            np_labels = run_state.run_results[0]
            np_infers = run_state.run_results[1]
            all_labels = np.hstack((all_labels, np_labels.reshape([-1])))
            all_infers = np.hstack((all_infers, np_infers.reshape([-1])))

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / run_examples
        run_speed = run_step / run_time_used

        # The first key will be used as the main metric to update the best model
        scores = OrderedDict()

        for metric in self.metrics_choices:
            if metric == "acc":
                avg_acc = acc_sum / run_examples
                scores["acc"] = avg_acc
            elif metric == "f1":
                f1 = calculate_f1_np(all_infers, all_labels)
                scores["f1"] = f1
            elif metric == "matthews":
                matthews = matthews_corrcoef(all_infers, all_labels)
                scores["matthews"] = matthews
            else:
                raise ValueError("Not Support Metric: \"%s\"" % metric)

        return scores, avg_loss, run_speed

    def _postprocessing(self, run_states):
        try:
            id2label = {
                val: key
                for key, val in self._base_data_reader.label_map.items()
            }
        except Exception:
            raise Exception(
                "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
            )
        results = []
        for batch_state in run_states:
            batch_result = batch_state.run_results
            batch_infer = np.argmax(batch_result[0], axis=1)
            results += [id2label[sample_infer] for sample_infer in batch_infer]
        return results


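# Image classification reuses the generic ClassifierTask unchanged.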
ImageClassifierTask = ClassifierTask


class TextClassifierTask(ClassifierTask):
    """
    Create a text classification task.
    It uses a fully-connected layer with softmax activation to classify texts.
    """

    def __init__(self,
                 num_classes,
                 feed_list,
                 data_reader,
                 token_feature=None,
                 feature=None,
                 network=None,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        """
        Args:
            num_classes: the number of labels in the text classification task.
            feed_list(list): the variable names that will be fed to the main program.
            data_reader(object): data reader for the task. It must be a ClassifyReader or a LACClassifyReader.
            token_feature(Variable): the feature used to connect to the preset network. It must be a token-level feature with shape [-1, seq_len, emb_size]. Default None.
            feature(Variable): the feature used to classify texts. It must be a sentence-level feature with shape [-1, emb_size]. token_feature and feature cannot both be set at the same time; exactly one of them must be set (not None). Default None.
            network(str): the preset network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is set, token_feature must be set and feature must be None.
            startup_program (object): the customized startup_program, default None.
            config (RunConfig): run config for the task, such as batch_size, epoch and learning_rate settings. Default None.
            hidden_units(list): each element is the size of a fully-connected layer that will be appended to the network. Default None.
            metrics_choices(list): metrics used for the task, default ["acc"].
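
        Example:
            A minimal fine-tuning sketch (a hedged illustration: the "ernie"
            module, the ChnSentiCorp dataset and max_seq_len=128 are
            illustrative choices, not requirements of this class):

            .. code-block:: python

                import paddlehub as hub

                # load a pretrained module and get its inputs/outputs
                module = hub.Module(name="ernie")
                inputs, outputs, program = module.context(
                    trainable=True, max_seq_len=128)

                # a ClassifyReader feeds tokenized text to the task
                reader = hub.reader.ClassifyReader(
                    dataset=hub.dataset.ChnSentiCorp(),
                    vocab_path=module.get_vocab_path(),
                    max_seq_len=128)

                # classify with the sentence-level feature (no preset network)
                cls_task = hub.TextClassifierTask(
                    data_reader=reader,
                    feature=outputs["pooled_output"],
                    feed_list=[
                        inputs["input_ids"].name, inputs["position_ids"].name,
                        inputs["segment_ids"].name, inputs["input_mask"].name
                    ],
                    num_classes=reader.dataset.num_labels,
                    config=hub.RunConfig(num_epoch=3, batch_size=32))
                cls_task.finetune_and_eval()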
        """
        if (not feature) and (not token_feature):
            logger.error(
                'Both token_feature and feature are None; one of them must be set.'
            )
            exit(1)
        elif feature and token_feature:
            logger.error(
                'Both token_feature and feature are set. Only one of them should be set; the other must be None.'
            )
            exit(1)

        if network:
            assert network in [
                'bilstm', 'bow', 'cnn', 'dpcnn', 'gru', 'lstm'
            ], 'network choice must be one of bilstm, bow, cnn, dpcnn, gru, lstm!'
            assert token_feature and (
                not feature
            ), 'If you want to use a preset network, you must set token_feature rather than feature for TextClassifierTask!'
            assert len(
                token_feature.shape
            ) == 3, 'When you use a preset network, the parameter token_feature must be a token-level feature, such as the sequence_output of an ERNIE, BERT, RoBERTa or ELECTRA module.'
        else:
            assert feature and (
                not token_feature
            ), 'If you do not use a preset network, you must set feature rather than token_feature for TextClassifierTask!'
            assert len(
                feature.shape
            ) == 2, 'When you do not use a preset network, the parameter feature must be a sentence-level feature, such as the pooled_output of an ERNIE, BERT, RoBERTa or ELECTRA module.'

        self.network = network

        if metrics_choices == "default":
            metrics_choices = ["acc"]

        super(TextClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature if feature else token_feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units,
            metrics_choices=metrics_choices)

    def _build_net(self):
        if isinstance(self._base_data_reader, ClassifyReader):
            # ClassifyReader will return the sequence length of an input text
            self.seq_len = fluid.layers.data(
                name="seq_len", shape=[1], dtype='int64', lod_level=0)
            self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])

            # unpad the token_feature
            unpad_feature = fluid.layers.sequence_unpad(
                self.feature, length=self.seq_len_used)

        if self.network:
            # add the preset network
            net_func = getattr(net.classification, self.network)
            if self.network == 'dpcnn':
                # the dpcnn network does not need the unpadded feature
                cls_feats = net_func(
                    self.feature, emb_dim=self.feature.shape[-1])
            else:
                cls_feats = net_func(unpad_feature)
            logger.info(
                "%s has been added to the TextClassifierTask!" % self.network)
        else:
            # no preset network; apply dropout to the sentence-level feature
            cls_feats = fluid.layers.dropout(
                x=self.feature,
                dropout_prob=0.1,
                dropout_implementation="upscale_in_train")

        if self.hidden_units is not None:
            for n_hidden in self.hidden_units:
                cls_feats = fluid.layers.fc(
                    input=cls_feats, size=n_hidden, act="relu")

        logits = fluid.layers.fc(
            input=cls_feats,
            size=self.num_classes,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)),
            act="softmax")

        self.ret_infers = fluid.layers.reshape(
            x=fluid.layers.argmax(logits, axis=1), shape=[-1, 1])

        return [logits]

    @property
    def feed_list(self):
        feed_list = [varname for varname in self._base_feed_list]
        if isinstance(self._base_data_reader, ClassifyReader):
            # ClassifyReader will return the sequence length of an input text
            feed_list += [self.seq_len.name]
        if self.is_train_phase or self.is_test_phase:
            feed_list += [self.labels[0].name]
        return feed_list

    @property
    def fetch_list(self):
        if self.is_train_phase or self.is_test_phase:
            fetch_list = [
                self.labels[0].name, self.ret_infers.name, self.metrics[0].name,
                self.loss.name
            ]
        else:
            # predict phase
            fetch_list = [self.outputs[0].name]

        if isinstance(self._base_data_reader, ClassifyReader):
            # prevent save_inference_model from pruning the seq_len variable
            fetch_list += [self.seq_len.name]

        return fetch_list


class MultiLabelClassifierTask(ClassifierTask):
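    """
    Create a multi-label classification task.
    It builds one fully-connected softmax head of size 2 for every label,
    expects an int64 label vector of shape [num_classes] with 0/1 entries,
    and reports the AUC of every label as well as their mean.
    """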
    def __init__(self,
                 feature,
                 num_classes,
                 feed_list,
                 data_reader,
                 startup_program=None,
                 config=None,
                 hidden_units=None,
                 metrics_choices="default"):
        if metrics_choices == "default":
            metrics_choices = ["auc"]

        main_program = feature.block.program
        super(MultiLabelClassifierTask, self).__init__(
            data_reader=data_reader,
            feature=feature,
            num_classes=num_classes,
            feed_list=feed_list,
            startup_program=startup_program,
            config=config,
            hidden_units=hidden_units,
            metrics_choices=metrics_choices)
        self.class_name = list(data_reader.label_map.keys())

    def _build_net(self):
        cls_feats = fluid.layers.dropout(
            x=self.feature,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

        if self.hidden_units is not None:
            for n_hidden in self.hidden_units:
                cls_feats = fluid.layers.fc(
                    input=cls_feats, size=n_hidden, act="relu")

        probs = []
        for i in range(self.num_classes):
            probs.append(
                fluid.layers.fc(
                    input=cls_feats,
                    size=2,
                    param_attr=fluid.ParamAttr(
                        name="cls_out_w_%d" % i,
                        initializer=fluid.initializer.TruncatedNormal(
                            scale=0.02)),
                    bias_attr=fluid.ParamAttr(
                        name="cls_out_b_%d" % i,
                        initializer=fluid.initializer.Constant(0.)),
                    act="softmax"))

        return probs

    def _add_label(self):
        label = fluid.layers.data(
            name="label", shape=[self.num_classes], dtype='int64')
        return [label]

    def _add_loss(self):
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        total_loss = fluid.layers.fill_constant(
            shape=[1], value=0.0, dtype='float64')
        for index, probs in enumerate(self.outputs):
            ce_loss = fluid.layers.cross_entropy(
                input=probs, label=label_split[index])
            total_loss += fluid.layers.reduce_sum(ce_loss)
        loss = fluid.layers.mean(x=total_loss)
        return loss

    def _add_metrics(self):
        label_split = fluid.layers.split(
            self.labels[0], self.num_classes, dim=-1)
        # the metric is the AUC of every class
        eval_list = []
        for index, probs in enumerate(self.outputs):
            current_auc, _, _ = fluid.layers.auc(
                input=probs, label=label_split[index])
            eval_list.append(current_auc)
        return eval_list

    def _calculate_metrics(self, run_states):
        loss_sum = acc_sum = run_examples = 0
        run_step = run_time_used = 0
        for run_state in run_states:
            run_examples += run_state.run_examples
            run_step += run_state.run_step
            loss_sum += np.mean(
                run_state.run_results[-1]) * run_state.run_examples
        auc_list = run_states[-1].run_results[:-1]

        run_time_used = time.time() - run_states[0].run_time_begin
        avg_loss = loss_sum / (run_examples * self.num_classes)
        run_speed = run_step / run_time_used

        # The first key will be used as the main metric to update the best model
        scores = OrderedDict()
        for metric in self.metrics_choices:
            if metric == "auc":
                scores["auc"] = np.mean(auc_list)
                # NOTE: for MultiLabelClassifierTask, the metric is computed for every label,
                #       and the mean value is also reported.
                for index, auc in enumerate(auc_list):
                    scores["auc_" + self.class_name[index]] = auc_list[index][0]
            else:
                raise ValueError("Not Support Metric: \"%s\"" % metric)
        return scores, avg_loss, run_speed

    @property
    def fetch_list(self):
        if self.is_train_phase or self.is_test_phase:
            return [metric.name for metric in self.metrics] + [self.loss.name]
        return [output.name for output in self.outputs]

    def _postprocessing(self, run_states):
        results = []
        label_list = list(self._base_data_reader.label_map.keys())
        for batch_state in run_states:
            batch_result = batch_state.run_results
            for sample_id in range(len(batch_result[0])):
                sample_result = []
                for category_id in range(
                        self._base_data_reader.dataset.num_labels):
                    sample_category_prob = batch_result[category_id][sample_id]
                    sample_category_value = np.argmax(sample_category_prob)
                    sample_result.append(
                        {label_list[category_id]: sample_category_value})
                results.append(sample_result)
        return results