module.py

# -*- coding:utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification task """
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import numpy as np
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, serving
import paddlehub as hub


@moduleinfo(
    name="ernie_tiny_finetuned",
    version="1.0.0",
    summary="ERNIE tiny which was fine-tuned on the chnsenticorp dataset.",
    author="anonymous",
    author_email="",
    type="nlp/semantic_model")
class ERNIETinyFinetuned(hub.Module):
    """PaddleHub module exposing a text classifier built on ERNIE Tiny.

    ``_initialize`` assembles a ``hub.TextClassifierTask`` on top of the
    ``ernie_tiny`` module and stores it as ``self.cls_task``; ``predict``
    forwards to that task.
    """

    def _initialize(self,
                    ckpt_dir="ckpt_chnsenticorp",
                    num_class=2,
                    max_seq_len=128,
                    use_gpu=False,
                    batch_size=1):
        """Build the classification task used by ``predict``.

        Args:
            ckpt_dir: Checkpoint directory, joined onto ``self.directory``
                and handed to ``hub.RunConfig`` as ``checkpoint_dir``.
            num_class: Number of target classes given to the task.
            max_seq_len: Maximum sequence length used for both the
                backbone context and the reader.
            use_gpu: Forwarded to ``hub.RunConfig`` as ``use_cuda``.
            batch_size: Batch size forwarded to ``hub.RunConfig``.
        """
        self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
        self.num_class = num_class
        self.MAX_SEQ_LEN = max_seq_len

        # Backbone: the ERNIE Tiny module published on PaddleHub.
        self.module = hub.Module(name="ernie_tiny")
        inputs, outputs, program = self.module.context(
            trainable=True, max_seq_len=max_seq_len)

        self.vocab_path = self.module.get_vocab_path()

        # ERNIE Tiny tokenizes with a sub-word model, so the reader is given
        # the sentence-piece model and word dictionary shipped with it.
        cls_reader = hub.reader.ClassifyReader(
            vocab_path=self.module.get_vocab_path(),
            max_seq_len=max_seq_len,
            sp_model_path=self.module.get_spm_path(),
            word_dict_path=self.module.get_word_dict_path())

        # "pooled_output" is the sentence-level feature; "sequence_output"
        # would be the token-level one.
        sentence_feature = outputs["pooled_output"]

        # Every input tensor of the backbone has to be fed, in this order.
        input_keys = ("input_ids", "position_ids", "segment_ids",
                      "input_mask")
        feed_names = [inputs[key].name for key in input_keys]

        # Runtime configuration for the PaddleHub task API.
        strategy = hub.AdamWeightDecayStrategy()
        run_config = hub.RunConfig(
            use_data_parallel=False,
            use_cuda=use_gpu,
            batch_size=batch_size,
            checkpoint_dir=self.ckpt_dir,
            strategy=strategy)

        # Accuracy is the only metric requested from the task.
        self.cls_task = hub.TextClassifierTask(
            data_reader=cls_reader,
            feature=sentence_feature,
            feed_list=feed_names,
            num_classes=self.num_class,
            config=run_config,
            metrics_choices=["acc"])

    def predict(self, data, return_result=False, accelerate_mode=True):
        """Run the classification task on ``data``.

        Args:
            data: Samples to classify; the demo in this file passes a list
                of single-element lists of text.
            return_result: Forwarded unchanged to the task's ``predict``.
            accelerate_mode: Forwarded unchanged to the task's ``predict``.

        Returns:
            Whatever ``self.cls_task.predict`` returns.
        """
        return self.cls_task.predict(
            data=data,
            return_result=return_result,
            accelerate_mode=accelerate_mode)


if __name__ == "__main__":
    # Demo: classify a few review sentences and print the predicted label.
    classifier = ERNIETinyFinetuned(
        ckpt_dir="../ckpt_chnsenticorp", num_class=2)

    # Data to be predicted: one single-element list per sample.
    data = [
        ["这个宾馆比较陈旧了，特价的房间也很一般。总体来说一般"],
        ["交通方便；环境很好；服务态度很好 房间较小"],
        ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"],
    ]

    # Running position in `data`; it advances across batches so each
    # prediction is printed next to the sample it belongs to.
    sample_pos = 0
    for state in classifier.predict(data=data):
        # Take the index of the highest score for each sample in the batch.
        predicted_labels = np.argmax(state.run_results, axis=2)[0]
        for label in predicted_labels:
            print("%s\tpredict=%s" % (data[sample_pos][0], label))
            sample_pos += 1