# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Union

from paddlehub.env import DATA_HOME
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
from paddlehub.tokenizer.tokenizer import CustomTokenizer

from .base_nlp_dataset import TextClassificationDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz"


class ChnSentiCorp(TextClassificationDataset):
    """
    ChnSentiCorp is a dataset for Chinese sentiment classification, which was published by
    Tan Songbo at ICT of the Chinese Academy of Sciences.
    """

    def __init__(self, tokenizer: Union[BertTokenizer, CustomTokenizer], max_seq_len: int = 128, mode: str = 'train'):
        """
        Args:
            tokenizer (:obj:`BertTokenizer` or :obj:`CustomTokenizer`):
                It tokenizes the text and encodes the data as the model needs.
            max_seq_len (:obj:`int`, `optional`, defaults to :obj:`128`):
                The maximum length (in number of tokens) for the inputs to the selected module,
                such as ernie, bert and so on.
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train, test or dev).

        Examples:
            .. code-block:: python

                from paddlehub.datasets.chnsenticorp import ChnSentiCorp
                from paddlehub.tokenizer.bert_tokenizer import BertTokenizer

                tokenizer = BertTokenizer(vocab_file='./vocab.txt')

                train_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=120, mode='train')
                dev_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=120, mode='dev')
                test_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=120, mode='test')
        """
        # Download and extract the dataset into DATA_HOME if it is not already there.
        base_path = os.path.join(DATA_HOME, "chnsenticorp")
        self._download_and_uncompress_dataset(base_path, url=_DATA_URL)

        # Each dataset mode maps to its own TSV split file.
        if mode == 'train':
            data_file = 'train.tsv'
        elif mode == 'test':
            data_file = 'test.tsv'
        else:
            data_file = 'dev.tsv'

        super(ChnSentiCorp, self).__init__(
            base_path=base_path,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            mode=mode,
            data_file=data_file,
            label_list=["0", "1"],  # binary sentiment labels
            is_file_with_header=True)


if __name__ == "__main__":
    tokenizer = BertTokenizer(vocab_file='./vocab.txt')

    train_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=60, mode='train')
    dev_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=60, mode='dev')
    test_dataset = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=60, mode='test')

    # Print the first three encoded records from each split.
    for index in range(3):
        print("train record: ", train_dataset[index])
        print("dev record: ", dev_dataset[index])
        print("test record: ", test_dataset[index])