transpiler_trainer.py 9.3 KB
Newer Older
T
tangwei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
T
tangwei 已提交
16
Training use fluid with DistributeTranspiler
T
tangwei 已提交
17 18
"""
import os
T
tangwei 已提交
19

T
tangwei 已提交
20
import paddle.fluid as fluid
T
tangwei 已提交
21 22
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

23 24 25
from paddlerec.core.trainer import Trainer
from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance
T
tangwei 已提交
26 27


T
tangwei 已提交
28
class TranspileTrainer(Trainer):
T
tangwei 已提交
29 30
    def __init__(self, config=None):
        """Initialise the base trainer and the model bookkeeping attributes.

        Args:
            config: forwarded unchanged to ``Trainer.__init__``.
        """
        Trainer.__init__(self, config)
        # Registration runs before the attributes below exist, so an
        # override of ``processor_register`` must not depend on them.
        self.processor_register()
        # Assigned later by ``instance``.
        self.model = None
        # ``(epoch_id, dirname)`` records appended by ``save``.
        self.inference_models, self.increment_models = [], []

T
tangwei 已提交
36 37
    def processor_register(self):
        """Placeholder hook that only prints a reminder to override it."""
        reminder = (
            "Need implement by trainer, "
            "`self.regist_context_processor('uninit', self.instance)` "
            "must be the first"
        )
        print(reminder)
T
tangwei 已提交
38

39
    def _get_dataloader(self, state="TRAIN"):
        """Attach the configured reader to the model's data loader.

        Args:
            state: ``"TRAIN"`` selects ``self.model._data_loader`` and the
                ``train.reader`` config namespace; any other value selects
                ``self.model._infer_data_loader`` and ``evaluate.reader``.

        Returns:
            The selected data loader, with the reader installed as either a
            sample-list generator or a per-sample generator.

        Note:
            When ``reader_debug_mode`` is truthy in the selected namespace the
            first ten items produced by the reader are printed and the process
            exits with status 0 instead of returning.
        """
        if state == "TRAIN":
            dataloader = self.model._data_loader
            namespace = "train.reader"
            class_name = "TrainReader"
        else:
            # Both branches must bind the same name: everything below uses
            # ``dataloader``, so a differently named variable here made the
            # evaluate path fail with UnboundLocalError.
            dataloader = self.model._infer_data_loader
            namespace = "evaluate.reader"
            class_name = "EvaluateReader"

        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        print("batch_size: {}".format(batch_size))
        reader = dataloader_instance.dataloader(
            reader_class, state, self._config_yaml)

        # The reader is instantiated only to inspect which feeding style it
        # offers; the generator handed to the loader is ``reader`` above.
        reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
        reader_ins = reader_class(self._config_yaml)
        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        else:
            dataloader.set_sample_generator(reader, batch_size)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- DataLoader Debug Mode Begin , show pre 10 data ---")
            for idx, line in enumerate(reader()):
                print(line)
                if idx >= 9:
                    break
            print("--- DataLoader Debug Mode End , show pre 10 data ---")
            exit(0)
        return dataloader

73
    def _get_dataset(self, state="TRAIN"):
        """Build a fluid dataset fed by the configured reader script.

        Args:
            state: ``"TRAIN"`` uses ``self.model.get_inputs()`` and reads
                ``train_data_path`` from the ``train.reader`` namespace; any
                other value uses ``self.model.get_infer_inputs()`` and reads
                ``test_data_path`` from ``evaluate.reader``.

        Returns:
            The dataset, configured with input variables, pipe command, batch
            size, thread count and the list of files found in the data
            directory.

        Note:
            When ``reader_debug_mode`` is truthy in the selected namespace the
            first file is piped through the reader command, ten lines are
            shown, and the process exits with status 0 instead of returning.
        """
        is_train = state == "TRAIN"
        if is_train:
            inputs = self.model.get_inputs()
        else:
            inputs = self.model.get_infer_inputs()
        namespace = "train.reader" if is_train else "evaluate.reader"
        path_key = "train_data_path" if is_train else "test_data_path"
        data_dir = envs.get_global_env(path_key, None, namespace)

        # The thread count is fixed at 2 here; it is not taken from the
        # "train.trainer.threads" runtime setting.
        threads = 2
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)

        # Every data file is piped through dataset_instance.py, located in
        # ../utils relative to this module.
        here = os.path.dirname(os.path.abspath(__file__))
        script = os.path.join(here, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(
            script, reader_class, state, self._config_yaml)

        # A "paddlerec::" prefix marks a path relative to PACKAGE_BASE.
        if data_dir.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            data_dir = os.path.join(package_base, data_dir.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)

        file_list = []
        for entry in os.listdir(data_dir):
            file_list.append(os.path.join(data_dir, entry))
        dataset.set_filelist(file_list)

        if envs.get_global_env("reader_debug_mode", False, namespace):
            first_file = file_list[0]
            print(
                "--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(first_file))
            os.system("cat {} | {} | head -10".format(first_file, pipe_cmd))
            print(
                "--- Dataset Debug Mode End , show pre 10 data of {}---".format(first_file))
            exit(0)

        return dataset

T
tangwei 已提交
123
    def save(self, epoch_id, namespace, is_fleet=False):
        """Save persistables, then the inference model, for ``epoch_id``.

        Each artefact is written only when ``epoch_id`` is a multiple of its
        configured ``epoch_interval``; an ``epoch_id`` of -1 never saves.
        Output goes to ``<dirname>/<epoch_id>`` and the ``(epoch_id, dirname)``
        pair is recorded in ``self.increment_models`` or
        ``self.inference_models``.

        Args:
            epoch_id: epoch number, also used as the output sub-directory name.
            namespace: config namespace holding the ``save.*`` settings.
            is_fleet: when true the ``fleet`` save functions are used,
                otherwise the ``fluid.io`` ones.

        Raises:
            AssertionError: if a save is due but its ``dirname`` is unset.
        """
        def interval_hit(interval):
            # NOTE(review): with the default interval of -1 the modulo is
            # always 0, so every epoch qualifies; an interval of 0 raises
            # ZeroDivisionError. Confirm this is the intended contract.
            if epoch_id == -1:
                return False
            return epoch_id % interval == 0

        def export_persistables():
            interval = envs.get_global_env(
                "save.increment.epoch_interval", -1, namespace)
            if not interval_hit(interval):
                return

            target = envs.get_global_env(
                "save.increment.dirname", None, namespace)
            assert target is not None
            target = os.path.join(target, str(epoch_id))

            # Both savers are called as (executor, dirname).
            saver = fleet.save_persistables if is_fleet \
                else fluid.io.save_persistables
            saver(self._exe, target)
            self.increment_models.append((epoch_id, target))

        def export_inference_model():
            interval = envs.get_global_env(
                "save.inference.epoch_interval", -1, namespace)
            if not interval_hit(interval):
                return

            feed_varnames = envs.get_global_env(
                "save.inference.feed_varnames", None, namespace)
            fetch_varnames = envs.get_global_env(
                "save.inference.fetch_varnames", None, namespace)
            # Without both name lists there is nothing to export.
            if not (feed_varnames is not None and fetch_varnames is not None):
                return

            block_vars = fluid.default_main_program().global_block().vars
            fetch_vars = [block_vars[name] for name in fetch_varnames]

            target = envs.get_global_env(
                "save.inference.dirname", None, namespace)
            assert target is not None
            target = os.path.join(target, str(epoch_id))

            # The two APIs take the executor in different positions.
            if is_fleet:
                fleet.save_inference_model(
                    self._exe, target, feed_varnames, fetch_vars)
            else:
                fluid.io.save_inference_model(
                    target, feed_varnames, fetch_vars, self._exe)
            self.inference_models.append((epoch_id, target))

        export_persistables()
        export_inference_model()

T
tangwei 已提交
188 189
    def instance(self, context):
        """Create the configured model and move the context to ``init_pass``.

        Args:
            context: mutable mapping whose ``'status'`` entry is set to
                ``'init_pass'``.
        """
        model_path = envs.get_global_env("train.model.models")
        # The loaded "Model" class is called with a single ``None`` argument.
        self.model = envs.lazy_instance_by_fliename(model_path, "Model")(None)
        context['status'] = 'init_pass'
T
tangwei 已提交
193

T
tangwei 已提交
194 195 196
    def init(self, context):
        """Placeholder stage: print a notice and request exit.

        Args:
            context: mutable mapping whose ``'is_exit'`` entry is set to True.
        """
        notice = "Need to be implement"
        print(notice)
        context['is_exit'] = True
T
tangwei 已提交
197

T
tangwei 已提交
198 199 200 201 202
    def dataloader_train(self, context):
        """Placeholder stage: print a notice and request exit.

        Args:
            context: mutable mapping whose ``'is_exit'`` entry is set to True.
        """
        notice = "Need to be implement"
        print(notice)
        context['is_exit'] = True

    def dataset_train(self, context):
        """Placeholder stage: print a notice and request exit.

        Args:
            context: mutable mapping whose ``'is_exit'`` entry is set to True.
        """
        notice = "Need to be implement"
        print(notice)
        context['is_exit'] = True
T
tangwei 已提交
205

T
tangwei12 已提交
206
    def infer(self, context):
        """Evaluate every checkpoint recorded in ``self.increment_models``.

        The inference network is built into fresh programs. If the model has
        no inference data loader the stage ends immediately. Otherwise, for
        each ``(epoch, model_dir)`` pair the persistables are loaded into a
        clone of the inference program and batches are run until the reader
        raises ``EOFException``. Metrics are printed on every second batch,
        skipping batch 0.

        Args:
            context: mutable mapping whose ``'status'`` entry is set to
                ``'terminal_pass'`` when the stage finishes.
        """
        infer_program, startup_program = fluid.Program(), fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(infer_program, startup_program):
                self.model.infer_net()

        if self.model._infer_data_loader is None:
            context['status'] = 'terminal_pass'
            return

        reader = self._get_dataloader("Evaluate")

        # Column labels for the report line; "epoch" and "batch" come first,
        # followed by one label per inference result.
        labels = ["epoch", "batch"]
        fetch_names = []
        for label, var in self.model.get_infer_results().items():
            fetch_names.append(var.name)
            labels.append(label)
        report = ", ".join("{}: {{}}".format(label) for label in labels)

        self._exe.run(startup_program)

        for epoch, model_dir in self.increment_models:
            print("Begin to infer epoch {}, model_dir: {}".format(epoch, model_dir))
            program = infer_program.clone()
            fluid.io.load_persistables(self._exe, model_dir, program)
            reader.start()
            batch_id = 0
            try:
                while True:
                    fetched = self._exe.run(
                        program=program,
                        fetch_list=fetch_names)
                    row = [epoch, batch_id] + list(fetched)

                    if batch_id != 0 and batch_id % 2 == 0:
                        print(report.format(*row))
                    batch_id += 1
            except fluid.core.EOFException:
                # End of data for this checkpoint: reset the reader so the
                # next checkpoint can start it again.
                reader.reset()

        context['status'] = 'terminal_pass'
T
tangwei12 已提交
254 255

    def terminal(self, context):
        """Final stage: print a closing message and request exit.

        Args:
            context: mutable mapping whose ``'is_exit'`` entry is set to True.
        """
        farewell = "clean up and exit"
        print(farewell)
        context['is_exit'] = True