# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
T
tangwei 已提交
16
Training use fluid with DistributeTranspiler
T
tangwei 已提交
17 18
"""
import os

import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

from fleetrec.core.trainer import Trainer
from fleetrec.core.utils import envs
from fleetrec.core.utils import dataloader_instance


class TranspileTrainer(Trainer):
    """Common base for trainers that run fluid programs with DistributeTranspiler.

    The class provides reader/dataset construction, periodic model saving and
    an inference pass over the saved checkpoints.  The actual training passes
    (`init`, `dataloader_train`, `dataset_train`) are placeholders here and are
    meant to be overridden.

    NOTE(review): `self._exe`, `self._config_yaml` and
    `regist_context_processor` are not defined in this class; presumably they
    come from the `Trainer` base class -- confirm against it.
    """

    def __init__(self, config=None):
        """Initialise the base trainer, register processors and reset state.

        Args:
            config: forwarded unchanged to `Trainer.__init__`.
        """
        Trainer.__init__(self, config)
        self.processor_register()
        # Model instance; created later by `instance()`.
        self.model = None
        # (epoch_id, dirname) pairs recorded by `save()` for each kind of dump.
        self.inference_models = []
        self.increment_models = []

    def processor_register(self):
        """Placeholder hook; this base version only prints a reminder."""
        print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")

    def _get_dataloader(self, state="TRAIN"):
        """Attach the configured reader to the model's data loader.

        Args:
            state: "TRAIN" selects the training loader and the `train.reader`
                config namespace; any other value selects the inference loader
                and the `evaluate.reader` namespace.

        Returns:
            The model's data loader with its sample generator set.
        """
        if state == "TRAIN":
            dataloader = self.model._data_loader
            namespace = "train.reader"
            class_name = "TrainReader"
        else:
            dataloader = self.model._infer_data_loader
            namespace = "evaluate.reader"
            class_name = "EvaluateReader"

        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_path = envs.get_global_env("class", None, namespace)
        print("batch_size: {}".format(batch_size))
        reader = dataloader_instance.dataloader(
            reader_path, state, self._config_yaml)

        # Instantiate the reader class only to find out which kind of
        # generator it offers: readers exposing
        # `generate_batch_from_trainfiles` are registered as sample-list
        # generators, all others as per-sample generators batched by
        # `batch_size`.
        reader_cls = envs.lazy_instance_by_fliename(reader_path, class_name)
        reader_ins = reader_cls(self._config_yaml)
        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        else:
            dataloader.set_sample_generator(reader, batch_size)
        return dataloader

    def _get_dataset(self, state="TRAIN"):
        """Build a fluid dataset that pipes every file of the data directory.

        Args:
            state: "TRAIN" reads `train_data_path` from the `train.reader`
                namespace and uses the model's training inputs; any other
                value reads `test_data_path` from `evaluate.reader` and uses
                the inference inputs.

        Returns:
            The configured dataset created by `fluid.DatasetFactory`.

        Raises:
            ValueError: if the data path is not configured, or if it uses the
                `fleetrec::` prefix while `PACKAGE_BASE` is not set.
        """
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            path_key = "train_data_path"
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            path_key = "test_data_path"
        data_path = envs.get_global_env(path_key, None, namespace)
        if data_path is None:
            raise ValueError(
                "`{}` is not configured under `{}`".format(path_key, namespace))

        # The thread count is fixed at 2; the lookup of
        # "train.trainer.threads" from the runtime environment was disabled
        # in the original code.
        threads = 2
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        # Command handed to the dataset as its pipe command.
        pipe_cmd = "python {} {} {} {}".format(
            reader, reader_class, state, self._config_yaml)

        # Paths of the form "fleetrec::<relative>" are resolved against the
        # PACKAGE_BASE runtime environment value.
        if data_path.startswith("fleetrec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            if package_base is None:
                raise ValueError(
                    "PACKAGE_BASE must be set to resolve `{}`".format(data_path))
            data_path = os.path.join(
                package_base, data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        # os.listdir returns entries in arbitrary order; sort them so that the
        # file list is the same on every run.
        file_list = [
            os.path.join(data_path, name)
            for name in sorted(os.listdir(data_path))
        ]

        dataset.set_filelist(file_list)
        return dataset

    def save(self, epoch_id, namespace, is_fleet=False):
        """Save persistables and the inference model for `epoch_id` if due.

        Persistables are handled first, then the inference model.  Each dump
        goes to `<dirname>/<epoch_id>` and is recorded in
        `self.increment_models` / `self.inference_models`.

        Args:
            epoch_id: current epoch; -1 never triggers a save.
            namespace: config namespace holding the `save.*` settings.
            is_fleet: use the `fleet` save functions instead of `fluid.io`.

        Raises:
            ValueError: if a save is due but the matching `dirname` setting
                is missing.
        """

        def need_save(epoch_id, epoch_interval, is_last=False):
            """Tell whether `epoch_id` falls on a multiple of the interval."""
            if is_last:
                return True

            if epoch_id == -1:
                return False

            # A zero interval would raise ZeroDivisionError below; treat it
            # as "never save".
            if epoch_interval == 0:
                return False

            # Note: with the default interval of -1 this is true for every
            # epoch, because `x % -1 == 0` for any integer x.
            return epoch_id % epoch_interval == 0

        def save_inference_model():
            save_interval = envs.get_global_env(
                "save.inference.epoch_interval", -1, namespace)

            if not need_save(epoch_id, save_interval, False):
                return

            feed_varnames = envs.get_global_env(
                "save.inference.feed_varnames", None, namespace)
            fetch_varnames = envs.get_global_env(
                "save.inference.fetch_varnames", None, namespace)
            # Without both variable lists nothing is saved.
            if feed_varnames is None or fetch_varnames is None:
                return

            block_vars = fluid.default_main_program().global_block().vars
            fetch_vars = [block_vars[varname] for varname in fetch_varnames]
            dirname = envs.get_global_env(
                "save.inference.dirname", None, namespace)

            if dirname is None:
                raise ValueError(
                    "`save.inference.dirname` is not configured under "
                    "`{}`".format(namespace))
            dirname = os.path.join(dirname, str(epoch_id))

            if is_fleet:
                fleet.save_inference_model(
                    self._exe, dirname, feed_varnames, fetch_vars)
            else:
                fluid.io.save_inference_model(
                    dirname, feed_varnames, fetch_vars, self._exe)
            self.inference_models.append((epoch_id, dirname))

        def save_persistables():
            save_interval = envs.get_global_env(
                "save.increment.epoch_interval", -1, namespace)

            if not need_save(epoch_id, save_interval, False):
                return

            dirname = envs.get_global_env(
                "save.increment.dirname", None, namespace)

            if dirname is None:
                raise ValueError(
                    "`save.increment.dirname` is not configured under "
                    "`{}`".format(namespace))
            dirname = os.path.join(dirname, str(epoch_id))

            if is_fleet:
                fleet.save_persistables(self._exe, dirname)
            else:
                fluid.io.save_persistables(self._exe, dirname)
            self.increment_models.append((epoch_id, dirname))

        save_persistables()
        save_inference_model()

    def instance(self, context):
        """Create the model named by `train.model.models` and advance the status.

        Args:
            context: mapping shared between passes; its 'status' entry is set
                to 'init_pass'.
        """
        models = envs.get_global_env("train.model.models")
        model_class = envs.lazy_instance_by_fliename(models, "Model")
        self.model = model_class(None)
        context['status'] = 'init_pass'

    def init(self, context):
        """Placeholder pass; prints a notice and requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def dataloader_train(self, context):
        """Placeholder pass; prints a notice and requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def dataset_train(self, context):
        """Placeholder pass; prints a notice and requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def infer(self, context):
        """Run the inference network over every saved increment model.

        The inference program is built from `self.model.infer_net()`.  If the
        model has no inference data loader the pass is skipped.  Otherwise,
        for each `(epoch, model_dir)` in `self.increment_models`, the
        persistables are loaded into a clone of the program and the reader is
        drained, printing the metrics on every even, non-zero batch index.

        Args:
            context: mapping shared between passes; its 'status' entry is set
                to 'terminal_pass' in all cases.
        """
        infer_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(infer_program, startup_program):
                self.model.infer_net()

        if self.model._infer_data_loader is None:
            context['status'] = 'terminal_pass'
            return

        # Any state other than "TRAIN" selects the evaluate reader.
        reader = self._get_dataloader("Evaluate")

        metrics_varnames = []
        # One "<name>: {}" slot per printed value, epoch and batch first.
        format_parts = ["epoch: {}", "batch: {}"]
        for name, var in self.model.get_infer_results().items():
            metrics_varnames.append(var.name)
            format_parts.append("{}: {{}}".format(name))
        metrics_format = ", ".join(format_parts)

        self._exe.run(startup_program)

        for (epoch, model_dir) in self.increment_models:
            print("Begin to infer epoch {}, model_dir: {}".format(epoch, model_dir))
            program = infer_program.clone()
            fluid.io.load_persistables(self._exe, model_dir, program)
            reader.start()
            batch_id = 0
            try:
                # The loop ends when the reader raises EOFException.
                while True:
                    metrics_rets = self._exe.run(
                        program=program,
                        fetch_list=metrics_varnames)

                    metrics = [epoch, batch_id]
                    metrics.extend(metrics_rets)

                    if batch_id % 2 == 0 and batch_id != 0:
                        print(metrics_format.format(*metrics))
                    batch_id += 1
            except fluid.core.EOFException:
                reader.reset()

        context['status'] = 'terminal_pass'

    def terminal(self, context):
        """Final pass; prints a notice and requests exit."""
        print("clean up and exit")
        context['is_exit'] = True