#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import string
import tarfile

import nets
from test_dist_base import TestDistRunnerBase, runtime_main

import paddle
from paddle import fluid

DTYPE = "float32"
VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab'
VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2'
DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz'
DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a'


# Load the vocabulary file into a {word: id} dictionary (one token per line).
def load_vocab(filename):
    vocab = {}
    with open(filename, 'r', encoding="utf-8") as f:
        for idx, line in enumerate(f):
            vocab[line.strip()] = idx
    return vocab


def get_worddict(dict_path):
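    """Load the vocabulary, add an '<unk>' entry, and return (word_dict, dict_dim)."""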
    word_dict = load_vocab(dict_path)
    word_dict["<unk>"] = len(word_dict)
    dict_dim = len(word_dict)
    return word_dict, dict_dim


def conv_net(
    input,
    dict_dim,
    emb_dim=128,
    window_size=3,
    num_filters=128,
    fc0_dim=96,
    class_dim=2,
):
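    """Text classification network: embedding -> sequence conv + max pooling
    -> fully connected layer -> softmax classifier. All weights use a
    constant initializer (0.01)."""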
    emb = fluid.layers.embedding(
        input=input,
        size=[dict_dim, emb_dim],
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            initializer=paddle.nn.initializer.Constant(value=0.01)
        ),
    )

    conv_3 = nets.sequence_conv_pool(
        input=emb,
        num_filters=num_filters,
        filter_size=window_size,
        act="tanh",
        pool_type="max",
        param_attr=fluid.ParamAttr(
            initializer=paddle.nn.initializer.Constant(value=0.01)
        ),
    )

    fc_0 = paddle.static.nn.fc(
        x=[conv_3],
        size=fc0_dim,
        weight_attr=fluid.ParamAttr(
            initializer=paddle.nn.initializer.Constant(value=0.01)
        ),
    )

    prediction = paddle.static.nn.fc(
        x=[fc_0],
        size=class_dim,
        activation="softmax",
        weight_attr=fluid.ParamAttr(
            initializer=paddle.nn.initializer.Constant(value=0.01)
        ),
    )

    return prediction


def inference_network(dict_dim):
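    """Declare the 'words' input and return the conv_net prediction."""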
    data = paddle.static.data(
        name="words", shape=[-1, 1], dtype="int64", lod_level=1
    )
    out = conv_net(data, dict_dim)
    return out


def get_reader(word_dict, batch_size):
    # The training data set.
    train_reader = paddle.batch(train(word_dict), batch_size=batch_size)

    # The testing data set.
    test_reader = paddle.batch(test(word_dict), batch_size=batch_size)

    return train_reader, test_reader


def get_optimizer(learning_rate):
    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
    return optimizer


class TestDistTextClassification2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2):
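        """Build the conv_net model, cross-entropy cost, SGD optimizer and
        IMDB readers required by the distributed test harness."""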
        vocab = os.path.join(
            paddle.dataset.common.DATA_HOME, "text_classification", "imdb.vocab"
        )
        word_dict, dict_dim = get_worddict(vocab)

        # Input data
        data = paddle.static.data(
            name="words", shape=[-1, 1], dtype="int64", lod_level=1
        )
        label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64')

        # Train program
        predict = conv_net(data, dict_dim)
        cost = paddle.nn.functional.cross_entropy(
            input=predict, label=label, reduction='none', use_softmax=False
        )
        avg_cost = paddle.mean(x=cost)
        acc = paddle.static.accuracy(input=predict, label=label)
        inference_program = fluid.default_main_program().clone()

        # Optimization
        opt = get_optimizer(learning_rate=0.001)
        opt.minimize(avg_cost)

        # Reader
        train_reader, test_reader = get_reader(word_dict, batch_size)

        return (
            inference_program,
            avg_cost,
            train_reader,
            test_reader,
            acc,
            predict,
        )


def tokenize(pattern):
    """
    Read files that match the given pattern, tokenize them, and yield one
    token list per file.
    """

    with tarfile.open(
        paddle.dataset.common.download(
            DATA_URL, 'text_classification', DATA_MD5
        )
    ) as tarf:
        # Use tarfile.next(), which reads member files sequentially,
        # rather than tarfile.extractfile, which does random access
        # and can thrash the disk.
        tf = tarf.next()
        while tf is not None:
            if bool(pattern.match(tf.name)):
                # Strip trailing newlines, remove punctuation, lowercase,
                # and split on whitespace.
                yield tarf.extractfile(tf).read().rstrip(b'\n\r').translate(
                    None, string.punctuation.encode('latin-1')
                ).lower().split()
            tf = tarf.next()


def reader_creator(pos_pattern, neg_pattern, word_idx):
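    """Load every document matching pos_pattern (label 0) and neg_pattern
    (label 1) into memory, then return a reader over (word-id list, label)
    samples; unknown words map to the '<unk>' id."""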
    UNK = word_idx['<unk>']
    INS = []

    def load(pattern, out, label):
        for doc in tokenize(pattern):
            out.append(([word_idx.get(w, UNK) for w in doc], label))

    load(pos_pattern, INS, 0)
    load(neg_pattern, INS, 1)

    def reader():
        yield from INS

    return reader


def train(word_idx):
    """
    IMDB training set creator.

    It returns a reader creator; each sample in the reader is a zero-based ID
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile(r"train/pos/.*\.txt$"),
        re.compile(r"train/neg/.*\.txt$"),
        word_idx,
    )


def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator; each sample in the reader is a zero-based ID
    sequence and a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator(
        re.compile(r"test/pos/.*\.txt$"),
        re.compile(r"test/neg/.*\.txt$"),
        word_idx,
    )


if __name__ == "__main__":
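    # Download the vocabulary and dataset, then run the distributed test.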
    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
    runtime_main(TestDistTextClassification2x2)