# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
T
tangwei 已提交
16
Training use fluid with DistributeTranspiler
T
tangwei 已提交
17 18
"""
import os
import sys

import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet

from paddlerec.core.trainer import Trainer
from paddlerec.core.utils import envs
from paddlerec.core.utils import dataloader_instance


class TranspileTrainer(Trainer):
    """Trainer base class for the DistributeTranspiler-based fleet API.

    Supplies reader/dataset construction, checkpoint saving and a generic
    inference pass. The passes ``processor_register``, ``init``,
    ``dataloader_train`` and ``dataset_train`` are stubs here and only
    print a reminder; concrete trainers are expected to override them.
    """

    def __init__(self, config=None):
        """Set up executor (GPU only), processors and bookkeeping lists.

        Args:
            config: forwarded unchanged to ``Trainer.__init__``.
        """
        Trainer.__init__(self, config)
        device = envs.get_global_env("train.device")
        if device == 'gpu':
            # Only the GPU case is overridden here; for any other device the
            # place/executor are whatever Trainer.__init__ left in place
            # (presumably a CPU executor -- TODO confirm in the base class).
            self._place = fluid.CUDAPlace(0)
            self._exe = fluid.Executor(self._place)
        self.processor_register()
        self.model = None
        # Both lists hold (epoch_id, dirname) tuples appended by save().
        self.inference_models = []
        self.increment_models = []

    def processor_register(self):
        """Stub: subclasses register their context processors here."""
        print("Need implement by trainer, `self.regist_context_processor('uninit', self.instance)` must be the first")

    def _get_dataloader(self, state="TRAIN"):
        """Bind the configured reader to the model's data loader.

        Args:
            state: ``"TRAIN"`` selects ``self.model._data_loader`` and the
                ``train.reader`` namespace; any other value selects
                ``self.model._infer_data_loader`` and ``evaluate.reader``.

        Returns:
            The selected data loader, with its sample generator set.

        Note:
            If ``reader_debug_mode`` is truthy in the reader namespace, the
            first 10 items yielded by the reader are printed and the process
            exits with status 0.
        """
        if state == "TRAIN":
            dataloader = self.model._data_loader
            namespace = "train.reader"
            class_name = "TrainReader"
        else:
            # Must bind `dataloader` (not another name): it is used below for
            # both the TRAIN and the evaluate path.
            dataloader = self.model._infer_data_loader
            namespace = "evaluate.reader"
            class_name = "EvaluateReader"

        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        print("batch_size: {}".format(batch_size))
        reader = dataloader_instance.dataloader(
            reader_class, state, self._config_yaml)

        # Instantiate the user's reader class only to find out which kind of
        # generator it provides (batch-level vs sample-level).
        reader_class = envs.lazy_instance_by_fliename(reader_class, class_name)
        reader_ins = reader_class(self._config_yaml)
        if hasattr(reader_ins, 'generate_batch_from_trainfiles'):
            dataloader.set_sample_list_generator(reader)
        else:
            dataloader.set_sample_generator(reader, batch_size)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print("--- DataLoader Debug Mode Begin , show pre 10 data ---")
            for idx, line in enumerate(reader()):
                print(line)
                if idx >= 9:
                    break
            print("--- DataLoader Debug Mode End , show pre 10 data ---")
            # Debug mode is a dry run: stop the whole process afterwards.
            sys.exit(0)
        return dataloader

    def _get_dataset(self, state="TRAIN"):
        """Build a fluid dataset fed through the dataset_instance.py pipe.

        Args:
            state: ``"TRAIN"`` uses the model's training inputs and
                ``train.reader.train_data_path``; any other value uses the
                inference inputs and ``evaluate.reader.test_data_path``.

        Returns:
            The configured dataset, with its file list set to every entry
            of the data directory.

        Note:
            If ``reader_debug_mode`` is truthy in the reader namespace, the
            first file is piped through the reader command, the first 10
            output lines are shown, and the process exits with status 0.
        """
        if state == "TRAIN":
            inputs = self.model.get_inputs()
            namespace = "train.reader"
            data_path = envs.get_global_env(
                "train_data_path", None, namespace)
        else:
            inputs = self.model.get_infer_inputs()
            namespace = "evaluate.reader"
            data_path = envs.get_global_env(
                "test_data_path", None, namespace)

        # TODO: thread count is hard-coded; it was previously meant to come
        # from the "train.trainer.threads" runtime setting.
        threads = 2
        batch_size = envs.get_global_env("batch_size", None, namespace)
        reader_class = envs.get_global_env("class", None, namespace)
        abs_dir = os.path.dirname(os.path.abspath(__file__))
        reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py')
        pipe_cmd = "python {} {} {} {}".format(
            reader, reader_class, state, self._config_yaml)

        # "paddlerec::<relative path>" is resolved against PACKAGE_BASE.
        if data_path.startswith("paddlerec::"):
            package_base = envs.get_runtime_environ("PACKAGE_BASE")
            assert package_base is not None
            data_path = os.path.join(
                package_base, data_path.split("::")[1])

        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(inputs)
        dataset.set_pipe_command(pipe_cmd)
        dataset.set_batch_size(batch_size)
        dataset.set_thread(threads)
        file_list = [
            os.path.join(data_path, name)
            for name in os.listdir(data_path)
        ]

        dataset.set_filelist(file_list)

        debug_mode = envs.get_global_env("reader_debug_mode", False, namespace)
        if debug_mode:
            print(
                "--- Dataset Debug Mode Begin , show pre 10 data of {}---".format(file_list[0]))
            os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd))
            print(
                "--- Dataset Debug Mode End , show pre 10 data of {}---".format(file_list[0]))
            # Debug mode is a dry run: stop the whole process afterwards.
            sys.exit(0)

        return dataset

    def save(self, epoch_id, namespace, is_fleet=False):
        """Save persistables and, if configured, an inference model.

        Args:
            epoch_id: epoch number; used for the interval check and as the
                name of the sub-directory the model is written to.
            namespace: config namespace holding the ``save.*`` settings.
            is_fleet: when true, save through ``fleet``; otherwise through
                ``fluid.io``.

        Side effects:
            Appends ``(epoch_id, dirname)`` to ``self.increment_models`` and
            ``self.inference_models`` for each model actually written.
        """

        def need_save(epoch_id, epoch_interval, is_last=False):
            """Return whether a save is due for this epoch."""
            if is_last:
                return True

            if epoch_id == -1:
                return False

            # NOTE(review): with epoch_interval == -1 (the fallback value
            # passed to get_global_env below) the modulo is always 0, so
            # every epoch is saved -- confirm this is intended.
            return epoch_id % epoch_interval == 0

        def save_inference_model():
            """Write the inference model if an interval and var names are set."""
            save_interval = envs.get_global_env(
                "save.inference.epoch_interval", -1, namespace)

            if not need_save(epoch_id, save_interval, False):
                return

            feed_varnames = envs.get_global_env(
                "save.inference.feed_varnames", None, namespace)
            fetch_varnames = envs.get_global_env(
                "save.inference.fetch_varnames", None, namespace)
            # Without both name lists there is nothing to export.
            if feed_varnames is None or fetch_varnames is None:
                return

            block_vars = fluid.default_main_program().global_block().vars
            fetch_vars = [block_vars[varname] for varname in fetch_varnames]
            dirname = envs.get_global_env(
                "save.inference.dirname", None, namespace)

            assert dirname is not None
            dirname = os.path.join(dirname, str(epoch_id))

            if is_fleet:
                fleet.save_inference_model(
                    self._exe, dirname, feed_varnames, fetch_vars)
            else:
                fluid.io.save_inference_model(
                    dirname, feed_varnames, fetch_vars, self._exe)
            self.inference_models.append((epoch_id, dirname))

        def save_persistables():
            """Write the persistable variables for incremental training."""
            save_interval = envs.get_global_env(
                "save.increment.epoch_interval", -1, namespace)

            if not need_save(epoch_id, save_interval, False):
                return

            dirname = envs.get_global_env(
                "save.increment.dirname", None, namespace)

            assert dirname is not None
            dirname = os.path.join(dirname, str(epoch_id))

            if is_fleet:
                fleet.save_persistables(self._exe, dirname)
            else:
                fluid.io.save_persistables(self._exe, dirname)
            self.increment_models.append((epoch_id, dirname))

        save_persistables()
        save_inference_model()

    def instance(self, context):
        """Instantiate the configured model class and move to 'init_pass'."""
        models = envs.get_global_env("train.model.models")
        model_class = envs.lazy_instance_by_fliename(models, "Model")
        self.model = model_class(None)
        context['status'] = 'init_pass'

    def init(self, context):
        """Stub: subclasses must override; requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def dataloader_train(self, context):
        """Stub: subclasses must override; requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def dataset_train(self, context):
        """Stub: subclasses must override; requests exit."""
        print("Need to be implement")
        context['is_exit'] = True

    def infer(self, context):
        """Run the inference net over every checkpoint saved by save().

        Builds the inference program, then for each ``(epoch, model_dir)``
        in ``self.increment_models`` loads the persistables and iterates the
        evaluate data loader until it is exhausted, printing the metrics on
        every even, non-zero batch. Always ends by setting
        ``context['status']`` to ``'terminal_pass'``.
        """
        infer_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(infer_program, startup_program):
                self.model.infer_net()

        # No inference loader on the model: nothing to evaluate.
        if self.model._infer_data_loader is None:
            context['status'] = 'terminal_pass'
            return

        reader = self._get_dataloader("Evaluate")

        metrics_varnames = []
        metrics_format = []

        metrics_format.append("{}: {{}}".format("epoch"))
        metrics_format.append("{}: {{}}".format("batch"))

        for name, var in self.model.get_infer_results().items():
            metrics_varnames.append(var.name)
            metrics_format.append("{}: {{}}".format(name))

        metrics_format = ", ".join(metrics_format)
        self._exe.run(startup_program)

        for (epoch, model_dir) in self.increment_models:
            print("Begin to infer epoch {}, model_dir: {}".format(epoch, model_dir))
            program = infer_program.clone()
            fluid.io.load_persistables(self._exe, model_dir, program)
            reader.start()
            batch_id = 0
            try:
                while True:
                    metrics_rets = self._exe.run(
                        program=program,
                        fetch_list=metrics_varnames)

                    metrics = [epoch, batch_id]
                    metrics.extend(metrics_rets)

                    if batch_id % 2 == 0 and batch_id != 0:
                        print(metrics_format.format(*metrics))
                    batch_id += 1
            except fluid.core.EOFException:
                # Loader exhausted: rewind it for the next checkpoint.
                reader.reset()

        context['status'] = 'terminal_pass'

    def terminal(self, context):
        """Final pass: announce shutdown and request exit."""
        print("clean up and exit")
        context['is_exit'] = True