hooks.py 9.9 KB
Newer Older
C
chenxuyi 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
C
chenxuyi 已提交
14
"""train hooks"""
C
chenxuyi 已提交
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import sys
import six
import os
import itertools

import numpy as np
import logging
import paddle.fluid as F
import paddle.fluid.layers as L

from propeller import util
from propeller.paddle.train import distribution
from propeller.paddle.train.metrics import Metrics

__all__ = [
    'RunHook', 'TqdmProgressBarHook', 'TqdmNotebookProgressBarHook',
    'CheckpointSaverHook', 'LoggingHook', 'StopAtStepHook', 'EvalHook'
]

log = logging.getLogger(__name__)


class RunHook(object):
C
chenxuyi 已提交
42 43
    """RunHook Base class"""

C
chenxuyi 已提交
44
    def __init__(self):
C
chenxuyi 已提交
45
        """doc"""
C
chenxuyi 已提交
46 47
        pass

C
chenxuyi 已提交
48 49
    def before_train(self, program):
        """doc"""
C
chenxuyi 已提交
50 51 52
        pass

    def before_run(self, state):
C
chenxuyi 已提交
53
        """doc"""
C
chenxuyi 已提交
54 55 56
        return []

    def after_run(self, res_list, state):
C
chenxuyi 已提交
57
        """doc"""
C
chenxuyi 已提交
58 59 60
        pass

    def should_stop(self, state):
C
chenxuyi 已提交
61
        """doc"""
C
chenxuyi 已提交
62 63 64
        return False

    def after_train(self):
C
chenxuyi 已提交
65
        """doc"""
C
chenxuyi 已提交
66 67 68 69
        pass


class TqdmProgressBarHook(RunHook):
C
chenxuyi 已提交
70 71
    """show a progress bar when training"""

C
chenxuyi 已提交
72
    def __init__(self, max_steps, desc=None):
C
chenxuyi 已提交
73
        """doc"""
C
chenxuyi 已提交
74 75 76 77 78
        self.tqdm = None
        import tqdm
        from propeller import log as main_log
        hdl = main_log.handlers[0]

C
chenxuyi 已提交
79
        class _TqdmLogginHandler(logging.Handler):
C
chenxuyi 已提交
80
            def emit(self, record):
C
chenxuyi 已提交
81
                """doc"""
C
chenxuyi 已提交
82 83 84 85
                try:
                    msg = self.format(record)
                    tqdm.tqdm.write(msg, file=sys.stderr)
                    self.flush()
C
chenxuyi 已提交
86 87
                except (KeyboardInterrupt, SystemExit) as e:
                    raise e
C
chenxuyi 已提交
88 89 90
                except:
                    self.handleError(record)

C
chenxuyi 已提交
91
        tqdm_hdl = _TqdmLogginHandler()
C
chenxuyi 已提交
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
        tqdm_hdl.setFormatter(hdl.formatter)
        main_log.removeHandler(hdl)
        main_log.addHandler(tqdm_hdl)
        self.tqdm = tqdm.tqdm(total=max_steps, desc=None)

    def before_run(self, state):
        self.tqdm.n = state.gstep
        return []

    def __del__(self):
        if self.tqdm:
            self.tqdm.close()


class TqdmNotebookProgressBarHook(RunHook):
C
chenxuyi 已提交
107 108
    """show a progress bar when training"""

C
chenxuyi 已提交
109
    def __init__(self, max_steps, desc=None):
C
chenxuyi 已提交
110
        """doc"""
C
chenxuyi 已提交
111 112 113 114 115
        self.tqdm = None
        import tqdm
        from propeller import log as main_log
        hdl = main_log.handlers[0]

C
chenxuyi 已提交
116
        class _TqdmLogginHandler(logging.Handler):
C
chenxuyi 已提交
117
            def emit(self, record):
C
chenxuyi 已提交
118
                """doc"""
C
chenxuyi 已提交
119 120 121 122
                try:
                    msg = self.format(record)
                    tqdm.tqdm.write(msg, file=sys.stderr)
                    self.flush()
C
chenxuyi 已提交
123 124
                except (KeyboardInterrupt, SystemExit) as e:
                    raise e
C
chenxuyi 已提交
125 126 127
                except:
                    self.handleError(record)

C
chenxuyi 已提交
128
        tqdm_hdl = _TqdmLogginHandler()
C
chenxuyi 已提交
129 130 131 132 133 134
        tqdm_hdl.setFormatter(hdl.formatter)
        main_log.removeHandler(hdl)
        main_log.addHandler(tqdm_hdl)
        self.tqdm = tqdm.tqdm_notebook(total=max_steps, desc=None)

    def before_run(self, state):
C
chenxuyi 已提交
135
        """doc"""
C
chenxuyi 已提交
136 137 138 139 140
        self.tqdm.n = state.gstep
        self.tqdm.refresh()
        return []

    def __del__(self):
C
chenxuyi 已提交
141
        """doc"""
C
chenxuyi 已提交
142 143 144 145 146
        if self.tqdm:
            self.tqdm.close()


class LoggingHook(RunHook):
147
    """log tensor in to screan and VisualDL"""
C
chenxuyi 已提交
148

C
chenxuyi 已提交
149 150 151 152 153 154
    def __init__(self,
                 loss,
                 per_step=10,
                 skip_step=100,
                 summary_writer=None,
                 summary_record=None):
C
chenxuyi 已提交
155
        """doc"""
C
chenxuyi 已提交
156 157 158 159 160 161 162 163 164 165
        if per_step is None or skip_step is None:
            raise ValueError('wrong step argument, per step: %d skip_step %d' %
                             (per_step, skip_step))
        self.loss = loss
        self.per_step = per_step
        self.skip_step = skip_step
        self.summary_record = summary_record
        self.writer = summary_writer
        self.last_state = None

C
chenxuyi 已提交
166 167
    def before_train(self, program):
        """doc"""
C
chenxuyi 已提交
168 169 170 171 172 173 174 175 176 177 178 179
        if self.summary_record:
            if self.summary_record.scalar:
                self.s_name, self.s_tolog = zip(*self.summary_record.scalar)
            else:
                self.s_name, self.s_tolog = [], []

            if self.summary_record.histogram:
                self.h_name, self.h_tolog = zip(*self.summary_record.histogram)
            else:
                self.h_name, self.h_tolog = [], []

    def before_run(self, state):
C
chenxuyi 已提交
180
        """doc"""
C
chenxuyi 已提交
181 182 183 184 185 186 187 188 189 190
        if state.gstep % self.per_step == 0 and state.step > self.skip_step:
            ret = [self.loss]
            if self.summary_record:
                ret += self.s_tolog
                ret += self.h_tolog
            return ret
        else:
            return []

    def after_run(self, res_list, state):
C
chenxuyi 已提交
191
        """doc"""
C
chenxuyi 已提交
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
        if state.gstep % self.per_step == 0 and state.step > self.skip_step:
            if not self.summary_record:
                return

            loss = float(res_list[0])
            s_np = res_list[1:1 + len(self.s_name)]
            h_np = res_list[1 + len(self.s_name):1 + len(self.s_name) + len(
                self.h_name)]

            if self.last_state is not None:
                speed = (state.gstep - self.last_state.gstep) / (
                    state.time - self.last_state.time)
            else:
                speed = -1.
            self.last_state = state

208
            # log to VisualDL
C
chenxuyi 已提交
209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
            if self.writer is not None:
                self.writer.add_scalar('loss', loss, state.gstep)
                for name, t in zip(self.s_name, s_np):
                    if np.isnan(t).any():
                        log.warning('Nan summary: %s, skip' % name)
                    else:
                        self.writer.add_scalar(name, t, state.gstep)

                for name, t in zip(self.h_name, h_np):
                    if np.isnan(t).any():
                        log.warning('Nan summary: %s, skip' % name)
                    else:
                        self.writer.add_histogram(name, t, state.gstep)

                if speed > 0.:
                    self.writer.add_scalar('global_step', speed, state.gstep)

            # log to stdout
            log.debug('\t'.join([
                'step: %d' % state.gstep,
                'steps/sec: %.5f' % speed,
                'loss: %.5f' % loss,
                '' if self.summary_record is None else ' '.join(
                    map(lambda t: '%s:%s' % t, zip(self.s_name, s_np))),
            ]))


class StopAtStepHook(RunHook):
C
chenxuyi 已提交
237 238
    """stop training at some step"""

C
chenxuyi 已提交
239
    def __init__(self, stop_global_step, stop_step):
C
chenxuyi 已提交
240
        """doc"""
C
chenxuyi 已提交
241 242 243 244
        self._stop_gstep = stop_global_step
        self._stop_step = stop_step

    def should_stop(self, state):
C
chenxuyi 已提交
245
        """doc"""
C
chenxuyi 已提交
246 247 248 249 250 251 252 253 254 255 256 257
        if (self._stop_gstep and state.gstep >= self._stop_gstep) or \
           (self._stop_step and state.step >= self._stop_step):
            log.info('StopAtStepHook called stop')
            return True
        else:
            return False


class EvalHook(RunHook):
    """hook this on a eval Executor"""

    def __init__(self, metrics, summary_writer=None):
C
chenxuyi 已提交
258
        """doc"""
C
chenxuyi 已提交
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
        self.writer = summary_writer
        self._result = None

        if not isinstance(metrics, dict):
            raise ValueError('metrics should be dict, got %s' % repr(metrics))

        for k, m in six.iteritems(metrics):
            if not isinstance(m, Metrics):
                raise ValueError(
                    'metrics %s should be instance of propeller.Metrics, got %s'
                    % (k, repr(m)))

        if len(metrics):
            self.names = list(metrics.keys())
            self.metrics = list(metrics.values())
        else:
            self.names, self.metrics = [], []

C
chenxuyi 已提交
277 278
    def before_train(self, program):
        """doc"""
C
chenxuyi 已提交
279 280 281 282
        for m in self.metrics:
            m.reset()

    def before_run(self, state):
C
chenxuyi 已提交
283
        """doc"""
C
chenxuyi 已提交
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
        ls = [m.tensor for m in self.metrics]
        for i in ls:
            if not (isinstance(i, list) or isinstance(i, tuple)):
                raise ValueError(
                    'metrics should return tuple or list of tensors, got %s' %
                    repr(i))
            for ii in i:
                if not isinstance(ii, F.framework.Variable):
                    raise ValueError(
                        'metrics tensor be propeller.train.Metrics, got %s of type %s'
                        % (repr(ii), type(ii)))
        ls_flt, self.schema = util.flatten(ls)
        #log.debug(ls_flt)
        return ls_flt

    def after_run(self, res_list, state):
C
chenxuyi 已提交
300
        """doc"""
C
chenxuyi 已提交
301 302 303 304 305 306
        res = util.unflatten(res_list, self.schema)
        for r, m in zip(res, self.metrics):
            m.update(r)

    @property
    def result(self):
C
chenxuyi 已提交
307
        """doc"""
C
chenxuyi 已提交
308 309 310
        return self._result

    def after_train(self):
C
chenxuyi 已提交
311
        """doc"""
C
chenxuyi 已提交
312 313 314 315 316 317 318 319 320 321
        printable = []
        self._result = {}
        for n, m in zip(self.names, self.metrics):
            val = m.eval()
            self._result[n] = val

        return self.result


class CheckpointSaverHook(RunHook):
C
chenxuyi 已提交
322 323
    """Save checkpoint every n step"""

C
chenxuyi 已提交
324
    def __init__(self, saver, per_step=10, skip_step=100):
C
chenxuyi 已提交
325
        """doc"""
C
chenxuyi 已提交
326 327 328 329 330
        self.saver = saver
        self.per_step = per_step
        self.skip_step = skip_step

    def after_run(self, res_list, state):
C
chenxuyi 已提交
331
        """doc"""
C
chenxuyi 已提交
332 333 334
        if state.gstep % self.per_step == 0 and \
                state.step > self.skip_step:
            self.saver.save(state)