controller.py 24.5 KB
Newer Older
X
xixiaoyao 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# -*- coding: UTF-8 -*-
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

X
xixiaoyao 已提交
16 17
from __future__ import print_function

X
xixiaoyao 已提交
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
import os
import sys
import importlib
import multiprocessing
from paddle import fluid
from paddle.fluid import layers
import yaml
import json
import logging
import time
import numpy as np

from paddlepalm.utils.saver import init_pretraining_params, init_checkpoint
from paddlepalm.utils.config_helper import PDConfig
from paddlepalm.utils.print_helper import print_dict
from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn, create_joint_iterator_fn, merge_input_attrs 

from paddlepalm.default_settings import *
from paddlepalm.task_instance import TaskInstance, check_instances

DEBUG=False
VERBOSE=0

def _get_basename(f):
    return os.path.splitext(f)[0]


def _get_suffix(f):
    return os.path.splitext(f)[-1]


def _parse_yaml(f, asdict=True, support_cmd_line=False):
    assert os.path.exists(f), "file {} not found.".format(f)
    if support_cmd_line:
        args = PDConfig(yaml_file=f, fuse_args=True)
        args.build()
        return args.asdict() if asdict else args
    else:
        if asdict:
            with open(f, "r") as fin: 
                yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
            return yaml_config
        else:
            raise NotImplementedError()


def _parse_json(f, asdict=True, support_cmd_line=False):
    assert os.path.exists(f), "file {} not found.".format(f)
    if support_cmd_line:
        args = PDConfig(json_file=f, fuse_args=support_cmd_line)
        args.build()
        return args.asdict() if asdict else args
    else:
        if asdict:
            with open(f, "r") as fin: 
                config = json.load(fin)
            return config
        else:
            raise NotImplementedError()
            

def _parse_list(string, astype=str):
    assert isinstance(string, str), "{} is not a string.".format(string)
    if ',' not in string:
        return [astype(string)]
    string = string.replace(',', ' ')
    return [astype(i) for i in string.split()]


def _try_float(s):
    try:
        float(s)
        return(float(s))
    except:
        return s


def _check_conf(conf, checklist=None):
    assert isinstance(conf, dict), "{} is not a dict.".format(conf)
    ret = {}
    for k,v in conf.items():
        if isinstance(v, str):
            v = _try_float(v)
        ret[k] = v
    if checklist is not None:
        for k, t in checklist:
            assert k in ret, "required argument {} is NOT exist in config file.".format(k)
            assert isintance(ret[k], t), "value type of argument {} should be {}".format(k, t)
    return ret


# TODO: 增加None机制,允许hidden size、batch size和seqlen设置为None
def _check_io(in_attr, out_attr, strict=False, in_name="left", out_name="right"):
    for name, attr in in_attr.items():
        assert name in out_attr, in_name+': '+name+' not found in '+out_name
        if attr != out_attr[name]:
            if strict:
                raise ValueError(name+': shape or dtype not consistent!')
            else:
                logging.warning('{}: shape or dtype not consistent!\n{}:\n{}\n{}:\n{}'.format(name, in_name, attr, out_name, out_attr[name]))


def _merge_conf(conf1, conf2, conf1_first=True, strict=False):
    assert isinstance(conf1, dict), "{} is not a dict.".format(conf1)
    assert isinstance(conf2, dict), "{} is not a dict.".format(conf2)
    base_conf = conf2 if conf1_first else conf1
    base_conf = base_conf.copy()
    new_conf = conf1 if conf1_first else conf2

    for k, v in new_conf.items():
        if k in base_conf:
            if base_conf[k] != v:
                raise Warning("value of argument {} has been updated to {}.".format(k, v))
        else:
            if strict:
                continue
            
        base_conf[k] = v
    return base_conf


def _encode_inputs(inputs, scope_name, sep='/', cand_set=None):
    outputs = {}
    for k, v in inputs.items():
        if cand_set is not None:
            if k in cand_set:
                outputs[k] = v
            if scope_name+sep+k in cand_set:
                outputs[scope_name+sep+k] = v
        else:
            outputs[scope_name+sep+k] = v
    return outputs


def _decode_inputs(inputs, scope_name, sep='/', keep_unk_keys=True):
    outputs = {}
    for name, value in inputs.items():
        # var for backbone are also available to tasks
        if keep_unk_keys and sep not in name:
            outputs[name] = value
        # var for this inst
        if name.startswith(scope_name+'/'):
            outputs[name[len(scope_name+'/'):]] = value
    return outputs


def _init_env(use_gpu):
    if use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    return fluid.Executor(place), dev_count


def _fit_attr(conf, fit_attr, strict=False):
    for i, attr in fit_attr.items():
        if i not in conf:
            if strict:
                raise Exception('Argument {} is required to create a controller.'.format(i))
            else:
                continue
        conf[i] = attr(conf[i])
    return conf


class Controller(object):

X
xixiaoyao 已提交
187
    def __init__(self, tasks, mix_ratios=None, task_reuse_tag=None, use_gpu=True):
X
xixiaoyao 已提交
188 189 190 191
        """
        Args:
        """

X
xixiaoyao 已提交
192
        exe, dev_count = _init_env(use_gpu=use_gpu)
X
xixiaoyao 已提交
193 194 195 196
        self.exe = exe
        self.dev_count = dev_count

        # parse task instances and target tags
X
xixiaoyao 已提交
197 198
        for id in len(tasks):
            tasks[id]._set_id(id)
X
xixiaoyao 已提交
199 200

        # parse mix ratios
X
xixiaoyao 已提交
201 202 203 204 205 206
        if mix_ratios is not None:
            if isinstance(mix_ratios, str):
                mix_ratios = _parse_list(mix_ratios, astype=float)
            else:
                assert isinstance(mix_ratios, list)
                assert len(mix_ratios) == len(tasks), "number of mix_ratios is NOT consistent with num_instances."
X
xixiaoyao 已提交
207

X
xixiaoyao 已提交
208 209
            for mr, t in zip(mix_ratios, tasks):
                t.mix_ratio = mr
X
xixiaoyao 已提交
210 211 212

        # parse task layer reuse tags
        instname_to_reusehost = {i:i for i in instnames}
X
xixiaoyao 已提交
213 214 215 216 217 218 219 220
        if task_reuse_tag is not None:
            if isinstance(task_reuse_tag, str):
                tags = _parse_list(task_reuse_tag, astype=int)
            else:
                assert isinstance(task_reuse_tag, list)
                assert len(task_reuse_tag) == len(tasks), "number of task_reuse_tag is NOT consistent with num_tasks."
                tags = task_reuse_tag

X
xixiaoyao 已提交
221 222 223
        else:
            tags = []
            mapper = {}
X
xixiaoyao 已提交
224
            for inst in tasks:
X
xixiaoyao 已提交
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
                history = set()
                history.add(inst.name)
                cur_inst = inst
                while True:
                    if cur_inst.task_reuse_scope in history:
                        mapper[inst.name] = len(tags)
                        break
                    elif cur_inst.task_reuse_scope in mapper:
                        mapper[inst.name] = mapper[cur_inst.task_reuse_scope]
                        break
                    else:
                        cur_inst = name_to_instance[cur_inst.task_reuse_scope]
                        history.add(cur_inst.name)

                tags.append(mapper[inst.name])

X
xixiaoyao 已提交
241
        for i in range(1, len(tasks)):
X
xixiaoyao 已提交
242 243
            for j in range(i):
                if tags[i] == tags[j]:
X
xixiaoyao 已提交
244 245 246 247
                    # assert tasks[i].tasktype == \
                    #         instances[j].tasktype, \
                    #         "paradigm of reuse tasks should be consistent"
                    tasks[i]._task_reuse_scope = task[j].name
X
xixiaoyao 已提交
248 249
                    break

X
xixiaoyao 已提交
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
        # self.instances = instances
        # self.mrs = mrs
        # self.Backbone = Backbone
        # self.bb_conf = bb_conf
        # self.bb_name = bb_name

        # self.has_init_train = False
        # self.has_init_pred = False

        # if self._for_train:
        #     print("initialing for training...")
        #     self._init_train()
        #     self.has_init_train = True
        #     
    def build_forward(self, backbone, mask_task=[]):
X
xixiaoyao 已提交
265
        
X
xixiaoyao 已提交
266
        task_instances = self._tasks
X
xixiaoyao 已提交
267 268 269 270 271 272 273 274 275
        Backbone = self.Backbone
        bb_conf = self.bb_conf
        bb_name = self.bb_name
        dev_count = self.dev_count
        num_instances = len(instances)
        mrs = self.mrs

        # set first_target/main task instance
        main_inst = None
X
xixiaoyao 已提交
276
        for inst in task_instances:
X
xixiaoyao 已提交
277 278
            if inst.is_target:
                main_inst = inst
X
xixiaoyao 已提交
279
                inst._as_main = True
X
xixiaoyao 已提交
280 281
                break
        
X
xixiaoyao 已提交
282 283 284
        if save_path is not None and not os.path.exists(save_path):
            os.makedirs(save_path)
        
X
xixiaoyao 已提交
285 286 287 288
        # create reader, task
        # then check i/o across reader, backbone and task_layer
        task_attrs = []
        pred_task_attrs = []
X
xixiaoyao 已提交
289 290
        for inst in task_instances:
            task_attr_from_reader = _encode_inputs(inst._taskblock['train'].inputs_attrs['reader'], inst.name)
X
xixiaoyao 已提交
291 292
            task_attrs.append(task_attr_from_reader)

X
xixiaoyao 已提交
293 294 295
            _check_io(backbone.inputs_attr, inst._reader['train'].outputs_attr, in_name=bb_name+'_backbone', out_name='reader.train')
            _check_io(inst.taskblock['train'].inputs_attrs['reader'], inst._reader['train'].outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train')
            _check_io(inst._taskblock['train'].inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name+'_backbone')
X
xixiaoyao 已提交
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310

            if inst.is_target:
                if 'pred_file' not in inst.config:
                    inst.config['pred_file'] = ''
                pred_reader = inst.Reader(inst.config, phase='pred')
                pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf)
                inst.task_layer['pred'] = pred_parad
                task_attr_from_reader = _encode_inputs(pred_parad.inputs_attrs['reader'], inst.name)
                pred_task_attrs.append(task_attr_from_reader)
                _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')

        # merge reader input attrs from backbone and task_instances
        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(train_backbone.inputs_attr, task_attrs)
X
xixiaoyao 已提交
311
        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
X
xixiaoyao 已提交
312 313 314 315 316 317 318 319 320 321 322
        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]

        if DEBUG:
            print('----- for debug -----')
            print('joint input names:')
            print(joint_input_names)
            print('joint input shape and dtypes:')
            print(joint_shape_and_dtypes)

        # load data
        for inst in instances:
X
xixiaoyao 已提交
323
            print(inst.name+": preparing data...", end='')
X
xixiaoyao 已提交
324
            inst.reader['train'].load_data()
X
xixiaoyao 已提交
325
            print('ok!')
X
xixiaoyao 已提交
326 327 328 329 330 331 332 333 334 335

        # merge dataset iterators and create net input vars
        iterators = []
        prefixes = []
        mrs = []
        for inst in instances:
            iterators.append(inst.reader['train'].iterator())
            prefixes.append(inst.name)
            mrs.append(inst.mix_ratio)

X
xixiaoyao 已提交
336
        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE)
X
xixiaoyao 已提交
337 338 339 340 341 342

        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
        net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)

        # build backbone and task layers
X
xixiaoyao 已提交
343 344
        train_prog = fluid.default_main_program()
        train_init_prog = fluid.default_startup_program()
X
xixiaoyao 已提交
345
        bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_')
X
xixiaoyao 已提交
346
        assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())
X
xixiaoyao 已提交
347
        
X
xixiaoyao 已提交
348 349 350 351 352
        pred_prog = fluid.Program()
        pred_init_prog = fluid.Program()

        with fluid.program_guard(main_program = pred_prog, startup_program = pred_init_prog):
            pred_net_inputs = create_net_inputs(pred_input_attrs)
X
xixiaoyao 已提交
353
            pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
X
xixiaoyao 已提交
354 355 356 357 358 359 360 361 362 363 364 365

        fluid.framework.switch_main_program(train_prog)
        fluid.framework.switch_startup_program(train_init_prog)

        task_output_vars = {}
        for inst in instances:
            task_inputs = {'backbone': bb_output_vars}
            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
            task_inputs['reader'] = task_inputs_from_reader

            scope = inst.task_reuse_scope + '/'
            with fluid.unique_name.guard(scope):
X
xixiaoyao 已提交
366
                output_vars = inst.build_task_layer(task_inputs, phase='train', scope=scope)
X
xixiaoyao 已提交
367 368 369 370 371
                output_vars = {inst.name+'/'+key: val for key, val in output_vars.items()}
                old = len(task_output_vars) # for debug
                task_output_vars.update(output_vars)
                assert len(task_output_vars) - old == len(output_vars) # for debug

X
xixiaoyao 已提交
372
            # prepare predict vars for saving inference model
X
xixiaoyao 已提交
373 374 375 376 377 378 379 380
            if inst.is_target:

                with fluid.program_guard(pred_prog, pred_init_prog):
                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
                    inst.pred_input = cur_inputs
                    pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
                    scope = inst.task_reuse_scope + '/'
                    with fluid.unique_name.guard(scope):
X
xixiaoyao 已提交
381
                        inst.build_task_layer(pred_task_inputs, phase='pred', scope=scope)
X
xixiaoyao 已提交
382 383 384 385


        bb_fetches = {k: v.name for k,v in bb_output_vars.items()}
        task_fetches = {k: v.name for k,v in task_output_vars.items()}
X
xixiaoyao 已提交
386
        fetches = task_fetches
X
xixiaoyao 已提交
387 388 389 390 391 392 393 394
        fetches['__task_id'] = net_inputs['__task_id'].name

        # compute loss
        task_id_var = net_inputs['__task_id']
        task_id_vec = layers.one_hot(task_id_var, num_instances)
        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
        loss = layers.reduce_sum(task_id_vec * losses)

X
xixiaoyao 已提交
395
    def init_train(self, basetask, num_epochs, ):
X
xixiaoyao 已提交
396 397 398 399
        main_reader = main_inst.reader['train']

        num_examples = main_reader.num_examples
        for inst in instances:
X
xixiaoyao 已提交
400
            max_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * (num_examples // main_conf['batch_size']  // dev_count))
X
xixiaoyao 已提交
401 402
            if inst.is_target:
                print('{}: expected train steps {}.'.format(inst.name, max_train_steps))
X
xixiaoyao 已提交
403 404 405
            inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size']  // dev_count
            inst.expected_train_steps = max_train_steps

X
xixiaoyao 已提交
406
        global_max_train_steps = int(main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size']  // dev_count))
X
xixiaoyao 已提交
407 408
        print('Estimated overall train steps {}.'.format(global_max_train_steps))

X
xixiaoyao 已提交
409 410 411 412 413 414 415 416 417 418
        # if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
        #     warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion'])
        #     print('Warmup steps: '+str(warmup_steps))
        # else:
        #     warmup_steps = 0

        return loss, max_train_steps


    def build_backward(self, optimizer, use_ema=False, ema_decay=0.9999):
X
xixiaoyao 已提交
419
        # build optimizer
X
xixiaoyao 已提交
420 421 422 423 424 425 426 427 428 429 430 431 432
        optimizer.optimize(fluid.default_main_program())

        # loss.persistable = True
        if use_ema:
            ema = fluid.optimizer.ExponentialMovingAverage(ema_decay)
            ema.update()

    def random_init_params(self):
        if not self._init_finish:
            # prepare for train
            self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
            self.saver_program = fluid.default_main_program()
            self._init_finish = True
X
xixiaoyao 已提交
433 434

        print("\nRandomly initialize parameters...\n")
X
xixiaoyao 已提交
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
        self.exe.run(fluid.default_startup_program())

    def load_pretrain_params(self, pretrain_model_path=None):
        # load pretrain model (or ckpt)
        if pretrain_model_path is None:
            assert 'pretrain_model_path' in self.main_conf, "pretrain_model_path NOT set."
            pretrain_model_path = self.main_conf['pretrain_model_path']

        init_pretraining_params(
            self.exe,
            pretrain_model_path,
            main_program=fluid.default_startup_program())

        if not self._init_finish:
            self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
            self.saver_program = fluid.default_main_program()
            self._init_finish = True
X
xixiaoyao 已提交
452

X
xixiaoyao 已提交
453
    def load_infermodel(self, instance, infer_model_path):
X
xixiaoyao 已提交
454
        inst = instance
X
xixiaoyao 已提交
455 456 457 458 459
        if 'pred_output_path' not in inst.config:
            inst.config['pred_output_path'] = os.path.join(inst.config.get('save_path', '.'), inst.name)

        if not os.path.exists(inst.config['pred_output_path']):
            os.makedirs(inst.config['pred_output_path'])
X
xixiaoyao 已提交
460 461 462 463 464 465

        pred_backbone = self.Backbone(self.bb_conf, phase='pred')
        pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=self.bb_conf)
        inst.task_layer['pred'] = pred_parad
        pred_joint_input_names, pred_joint_shape_and_dtypes, name_to_position = merge_input_attrs(
            pred_backbone.inputs_attr, inst.task_layer['pred'].inputs_attrs['reader'], 
X
xixiaoyao 已提交
466
            insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
X
xixiaoyao 已提交
467 468 469 470 471 472 473

        pred_prog = inst.load(infer_model_path)
        if inst.reader['pred'] is None:
            pred_reader = inst.Reader(inst.config, phase='pred')
            inst.reader['pred'] = pred_reader
        return pred_prog

X
xixiaoyao 已提交
474
    def train(self, num_epochs):
X
xixiaoyao 已提交
475

X
xixiaoyao 已提交
476 477
        if not self._init_finish:
            raise Exception('params has not been initialized! Please init params with random_init_params or load_pretrain_params.')
X
xixiaoyao 已提交
478 479 480 481 482 483 484 485 486 487 488 489 490 491

        instances = self.instances
        num_instances = self.num_instances
        main_inst = self.main_inst
        main_conf = main_inst.config

        backbone = self.train_backbone
        train_program = self.train_program
        saver_program = self.saver_program
        fetches = self.fetches

        finish = []
        for inst in instances:
            if inst.is_target:
X
xixiaoyao 已提交
492 493 494 495 496 497
                if inst.expected_train_steps > 0:
                    finish.append(False)
                else:
                    finish.append(True)
                    print(inst.name+': train finished!')
                    inst.save()
X
xixiaoyao 已提交
498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
        
        def train_finish():
            for inst in instances:
                if inst.is_target:
                    if not inst.train_finish:
                        return False
            return True

        # do training
        fetch_names, fetch_list = zip(*fetches.items())

        main_step = 0 # only count for main task
        global_step = 0 # count for all tasks
        epoch = 0
        time_begin = time.time()
        backbone_buffer = []
        while not train_finish():
            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
            cur_task = instances[rt_task_id]

            backbone_rt_outputs = {k:v for k,v in rt_outputs.items() if '/' not in k}
            backbone_buffer.append(backbone.postprocess(backbone_rt_outputs))
            
            task_rt_outputs = {k[len(cur_task.name+'/'):]: v for k,v in rt_outputs.items() if k.startswith(cur_task.name+'/')}
X
xixiaoyao 已提交
525
            instances[rt_task_id].task_layer['train'].postprocess(task_rt_outputs)
X
xixiaoyao 已提交
526 527 528 529

            global_step += 1
            cur_task.cur_train_step += 1

X
xixiaoyao 已提交
530
            if cur_task.save_infermodel_every_n_steps > 0 and cur_task.cur_train_step % cur_task.save_infermodel_every_n_steps == 0:
X
xixiaoyao 已提交
531
                cur_task.save(suffix='.step'+str(cur_task.cur_train_step))
X
xixiaoyao 已提交
532

X
xixiaoyao 已提交
533 534 535 536 537 538 539 540 541 542 543 544
            if global_step % main_conf.get('print_every_n_steps', 5) == 0:
                loss = rt_outputs[cur_task.name+'/loss']
                loss = np.mean(np.squeeze(loss)).tolist()

                time_end = time.time()
                time_cost = time_end - time_begin

                print("Global step: {}. Task: {}, step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
                       global_step, cur_task.name, cur_task.cur_train_step, cur_task.steps_pur_epoch, cur_task.cur_train_epoch,
                       loss, main_conf.get('print_every_n_steps', 5) / time_cost))
                time_begin = time.time()

X
xixiaoyao 已提交
545 546 547 548
            if cur_task.train_finish and cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch == cur_task.expected_train_steps:
                print(cur_task.name+': train finished!')
                cur_task.save()

X
xixiaoyao 已提交
549 550 551 552 553
            if 'save_every_n_steps' in main_conf and global_step % main_conf['save_every_n_steps'] == 0:
                save_path = os.path.join(main_conf['save_path'],
                                         "step_" + str(global_step))
                fluid.io.save_persistables(self.exe, save_path, saver_program)

X
xixiaoyao 已提交
554
        print("ALL tasks train finished, exiting...")
X
xixiaoyao 已提交
555 556 557 558 559 560 561 562
            
    def pred(self, task_instance, inference_model_dir=None):
        if self._for_train:
            raise Exception('This controller is a trainer. Please build a new controller with for_train=False for predicting.')

        assert isinstance(task_instance, str)
        if isinstance(inference_model_dir, str):
            assert os.path.exists(inference_model_dir), inference_model_dir+" not found."
X
xixiaoyao 已提交
563 564 565 566 567
        # if not self.has_init_pred and inference_model_dir is None:
        #     raise ValueError('infer_model_path is required for prediction.')
        if inference_model_dir is None:
            assert 'save_path' in self.mtl_conf, "one of the `inference_model_dir` and 'save_path' should be set to load inference model."
            inference_model_dir = os.path.join(self.mtl_conf['save_path'], task_instance, 'infer_model')
X
xixiaoyao 已提交
568 569 570 571 572 573 574 575 576 577 578 579 580

        instance = None
        for inst in self.instances:
            if inst.name == task_instance:
                instance = inst
                break

        if instance is None:
            raise ValueError(task_instance + ' is not a valid task_instance.')

        pred_prog = self._init_pred(instance, inference_model_dir)
                
        inst = instance
X
xixiaoyao 已提交
581
        print(inst.name+": loading data...")
X
xixiaoyao 已提交
582 583 584
        inst.reader['pred'].load_data()
        fetch_names, fetch_vars = inst.pred_fetch_list

X
xixiaoyao 已提交
585
        print('predicting...')
X
xixiaoyao 已提交
586 587 588 589 590 591 592 593 594
        mapper = {k:v for k,v in inst.pred_input}
        buf = []
        for feed in inst.reader['pred'].iterator():
            feed = _encode_inputs(feed, inst.name, cand_set=mapper)
            feed = {mapper[k]: v for k,v in feed.items()}

            rt_outputs = self.exe.run(pred_prog, feed, fetch_vars)
            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
            inst.postprocess(rt_outputs, phase='pred')
X
xixiaoyao 已提交
595 596 597 598
        if inst.task_layer['pred'].epoch_inputs_attrs:
            reader_outputs = inst.reader['pred'].get_epoch_outputs()
        else:
            reader_outputs = None
X
xixiaoyao 已提交
599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
        inst.epoch_postprocess({'reader':reader_outputs}, phase='pred')


if __name__ == '__main__':
    assert len(sys.argv) == 2, "Usage: python mtl_controller.py <mtl_conf_path>"
    conf_path = sys.argv[1]
    del sys.argv[1]
    controller = Controller(conf_path)
    if controller.main_conf['do_train']:
        controller.train()