# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import ReLU
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig
from paddleslim.nas.ofa import supernet


class Model(nn.Layer):
    def __init__(self):
        super(Model, self).__init__()
        # Wrap the layers in a supernet so that kernel size and channel
        # expand ratio become searchable dimensions after convert().
        with supernet(
                kernel_size=(3, 5, 7), expand_ratio=[1, 2, 4]) as ofa_super:
            models = []
            models += [nn.Conv2D(1, 6, 3)]
            models += [ReLU()]
            models += [nn.MaxPool2D(kernel_size=2, stride=2)]
            models += [nn.Conv2D(6, 16, 5, padding=0)]
            models += [ReLU()]
            models += [nn.MaxPool2D(kernel_size=2, stride=2)]
            models += [
                nn.Linear(784, 120), nn.Linear(120, 84), nn.Linear(84, 10)
            ]
            models = ofa_super.convert(models)
        self.models = paddle.nn.Sequential(*models)

    def forward(self, inputs, label, depth=None):
        if depth is not None:
            assert isinstance(depth, int)
            assert depth < len(self.models)
            models = self.models[:depth]
        else:
            depth = len(self.models)
            models = self.models[:]

        for idx, layer in enumerate(models):
            # Flatten the feature map before the first Linear layer.
            if idx == 6:
                inputs = paddle.flatten(inputs, 1)
            inputs = layer(inputs)

        inputs = F.softmax(inputs)
        return inputs


def test_ofa():
    model = Model()
    teacher_model = Model()

    # One sub-list per progressive training phase: epochs and learning
    # rates for each phase, and how many sub-networks to sample per batch.
    default_run_config = {
        'train_batch_size': 256,
        'n_epochs': [[1], [2, 3], [4, 5]],
        'init_learning_rate': [[0.001], [0.003, 0.001], [0.003, 0.001]],
        'dynamic_batch_size': [1, 1, 1],
        'total_images': 50000,  # 1281167 for ImageNet
        'elastic_depth': (2, 5, 8)
    }
    run_config = RunConfig(**default_run_config)

    # Distill intermediate features of the listed layers from the teacher.
    default_distill_config = {
        'lambda_distill': 0.01,
        'teacher_model': teacher_model,
        'mapping_layers': ['models.0.fn']
    }
    distill_config = DistillConfig(**default_distill_config)

    ofa_model = OFA(model, run_config, distill_config=distill_config)

    train_dataset = paddle.vision.datasets.MNIST(mode='train', backend='cv2')
    train_loader = paddle.io.DataLoader(
        train_dataset, drop_last=True, batch_size=64)

    start_epoch = 0
    for idx in range(len(run_config.n_epochs)):
        cur_idx = run_config.n_epochs[idx]
        for ph_idx in range(len(cur_idx)):
            cur_lr = run_config.init_learning_rate[idx][ph_idx]
            adam = paddle.optimizer.Adam(
                learning_rate=cur_lr,
                parameters=(ofa_model.parameters() + ofa_model.netAs_param))
            for epoch_id in range(start_epoch,
                                  run_config.n_epochs[idx][ph_idx]):
                for batch_id, data in enumerate(train_loader):
                    # The DataLoader's default collation yields a stacked
                    # image tensor and a label tensor per batch.
                    dy_x_data = data[0].numpy().reshape(
                        -1, 1, 28, 28).astype('float32')
                    y_data = data[1].numpy().astype('int64').reshape(-1, 1)

                    img = paddle.to_tensor(dy_x_data)
                    label = paddle.to_tensor(y_data)
                    label.stop_gradient = True

                    for model_no in range(run_config.dynamic_batch_size[idx]):
                        output, _ = ofa_model(img, label)
                        loss = paddle.mean(output)
                        dis_loss = ofa_model.calc_distill_loss()
                        loss += dis_loss
                        loss.backward()

                        if batch_id % 10 == 0:
                            print(
                                'epoch: {}, batch: {}, loss: {}, distill loss: {}'.
                                format(epoch_id, batch_id,
                                       float(loss), float(dis_loss)))
                    ### accumulate gradients of dynamic_batch_size sub-networks for the same batch of data
                    ### NOTE: need to fix gradient accumulation in PaddlePaddle
                    adam.minimize(loss)
                    adam.clear_grad()
            start_epoch = run_config.n_epochs[idx][ph_idx]


if __name__ == '__main__':
    test_ofa()