未验证 提交 3616d593 编写于 作者: W whs 提交者: GitHub

Remove fluid API (#1578)

上级 c728e779
...@@ -17,11 +17,9 @@ from __future__ import division ...@@ -17,11 +17,9 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle
from paddle.nn.initializer import Constant, KaimingUniform from paddle.nn.initializer import Constant, KaimingUniform
from paddle.nn import Conv2D from paddle.nn import Conv2D
from paddle.fluid.dygraph.nn import Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from genotypes import PRIMITIVES from genotypes import PRIMITIVES
from genotypes import Genotype from genotypes import Genotype
from operations import * from operations import *
...@@ -40,7 +38,7 @@ class ConvBN(paddle.nn.Layer): ...@@ -40,7 +38,7 @@ class ConvBN(paddle.nn.Layer):
name=name + "_conv" if name is not None else None, name=name + "_conv" if name is not None else None,
initializer=KaimingUniform()), initializer=KaimingUniform()),
bias_attr=False) bias_attr=False)
self.bn = BatchNorm( self.bn = paddle.nn.BatchNorm(
num_channels=c_out, num_channels=c_out,
param_attr=paddle.ParamAttr( param_attr=paddle.ParamAttr(
name=name + "_bn_scale" if name is not None else None, name=name + "_bn_scale" if name is not None else None,
...@@ -61,11 +59,11 @@ class ConvBN(paddle.nn.Layer): ...@@ -61,11 +59,11 @@ class ConvBN(paddle.nn.Layer):
class Classifier(paddle.nn.Layer): class Classifier(paddle.nn.Layer):
def __init__(self, input_dim, num_classes, name=None): def __init__(self, input_dim, num_classes, name=None):
super(Classifier, self).__init__() super(Classifier, self).__init__()
self.pool2d = Pool2D(pool_type='avg', global_pooling=True) self.pool2d = paddle.nn.AdaptiveAvgPool2D(output_size=1)
self.fc = Linear( self.fc = paddle.nn.Linear(
input_dim=input_dim, input_dim,
output_dim=num_classes, num_classes,
param_attr=paddle.ParamAttr( weight_attr=paddle.ParamAttr(
name=name + "_fc_weights" if name is not None else None, name=name + "_fc_weights" if name is not None else None,
initializer=KaimingUniform()), initializer=KaimingUniform()),
bias_attr=paddle.ParamAttr( bias_attr=paddle.ParamAttr(
...@@ -84,7 +82,7 @@ def drop_path(x, drop_prob): ...@@ -84,7 +82,7 @@ def drop_path(x, drop_prob):
keep_prob = 1. - drop_prob keep_prob = 1. - drop_prob
mask = 1 - np.random.binomial( mask = 1 - np.random.binomial(
1, drop_prob, size=[x.shape[0]]).astype(np.float32) 1, drop_prob, size=[x.shape[0]]).astype(np.float32)
mask = to_variable(mask) mask = paddle.to_tensor(mask)
x = paddle.multiply(x / keep_prob, mask) x = paddle.multiply(x / keep_prob, mask)
return x return x
...@@ -150,8 +148,7 @@ class Cell(paddle.nn.Layer): ...@@ -150,8 +148,7 @@ class Cell(paddle.nn.Layer):
class AuxiliaryHeadCIFAR(paddle.nn.Layer): class AuxiliaryHeadCIFAR(paddle.nn.Layer):
def __init__(self, C, num_classes): def __init__(self, C, num_classes):
super(AuxiliaryHeadCIFAR, self).__init__() super(AuxiliaryHeadCIFAR, self).__init__()
self.avgpool = Pool2D( self.avgpool = paddle.nn.AvgPool2D(5, stride=3, padding=0)
pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
self.conv_bn1 = ConvBN( self.conv_bn1 = ConvBN(
c_curr=C, c_curr=C,
c_out=128, c_out=128,
...@@ -228,8 +225,7 @@ class NetworkCIFAR(paddle.nn.Layer): ...@@ -228,8 +225,7 @@ class NetworkCIFAR(paddle.nn.Layer):
class AuxiliaryHeadImageNet(paddle.nn.Layer): class AuxiliaryHeadImageNet(paddle.nn.Layer):
def __init__(self, C, num_classes): def __init__(self, C, num_classes):
super(AuxiliaryHeadImageNet, self).__init__() super(AuxiliaryHeadImageNet, self).__init__()
self.avgpool = Pool2D( self.avgpool = paddle.nn.AvgPool2D(5, stride=2, padding=0)
pool_size=5, pool_stride=2, pool_padding=0, pool_type='avg')
self.conv_bn1 = ConvBN( self.conv_bn1 = ConvBN(
c_curr=C, c_curr=C,
c_out=128, c_out=128,
......
...@@ -17,10 +17,8 @@ from __future__ import division ...@@ -17,10 +17,8 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid
from paddle.nn.initializer import Normal, KaimingUniform, Constant from paddle.nn.initializer import Normal, KaimingUniform, Constant
from paddle.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from genotypes import PRIMITIVES from genotypes import PRIMITIVES
from operations import * from operations import *
import paddleslim import paddleslim
...@@ -159,9 +157,9 @@ class Network(paddle.nn.Layer): ...@@ -159,9 +157,9 @@ class Network(paddle.nn.Layer):
self.cells = paddle.nn.LayerList(cells) self.cells = paddle.nn.LayerList(cells)
self.global_pooling = Pool2D(pool_type='avg', global_pooling=True) self.global_pooling = Pool2D(pool_type='avg', global_pooling=True)
self.classifier = Linear( self.classifier = Linear(
input_dim=c_prev, c_prev,
output_dim=num_classes, num_classes,
param_attr=paddle.ParamAttr(initializer=KaimingUniform()), weight_attr=paddle.ParamAttr(initializer=KaimingUniform()),
bias_attr=paddle.ParamAttr(initializer=KaimingUniform())) bias_attr=paddle.ParamAttr(initializer=KaimingUniform()))
self._initialize_alphas() self._initialize_alphas()
......
...@@ -12,9 +12,9 @@ ...@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.fluid as fluid import paddle
from paddle.nn import Conv2D from paddle.nn import Conv2D
from paddle.fluid.dygraph.nn import Pool2D, BatchNorm from paddle.nn import BatchNorm
from paddle.nn.initializer import Constant, KaimingUniform from paddle.nn.initializer import Constant, KaimingUniform
...@@ -22,17 +22,15 @@ OPS = { ...@@ -22,17 +22,15 @@ OPS = {
'none': 'none':
lambda C, stride, affine: Zero(stride), lambda C, stride, affine: Zero(stride),
'avg_pool_3x3': 'avg_pool_3x3':
lambda C, stride, affine: Pool2D( lambda C, stride, affine: paddle.nn.AvgPool2D(
pool_size=3, 3,
pool_type="avg", stride=stride,
pool_stride=stride, padding=1),
pool_padding=1),
'max_pool_3x3': 'max_pool_3x3':
lambda C, stride, affine: Pool2D( lambda C, stride, affine: paddle.nn.MaxPool2D(
pool_size=3, 3,
pool_type="max", stride=stride,
pool_stride=stride, padding=1),
pool_padding=1),
'skip_connect': 'skip_connect':
lambda C, stride, affine: Identity() lambda C, stride, affine: Identity()
if stride == 1 else FactorizedReduce(C, C, affine), if stride == 1 else FactorizedReduce(C, C, affine),
...@@ -67,7 +65,7 @@ class Zero(paddle.nn.Layer): ...@@ -67,7 +65,7 @@ class Zero(paddle.nn.Layer):
def __init__(self, stride): def __init__(self, stride):
super(Zero, self).__init__() super(Zero, self).__init__()
self.stride = stride self.stride = stride
self.pool = Pool2D(pool_size=1, pool_stride=2) self.pool = paddle.nn.MaxPool2D(1, stride=2)
def forward(self, x): def forward(self, x):
pooled = self.pool(x) pooled = self.pool(x)
......
...@@ -22,8 +22,6 @@ import ast ...@@ -22,8 +22,6 @@ import ast
import argparse import argparse
import functools import functools
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import reader import reader
from model_search import Network from model_search import Network
from paddleslim.nas.darts import DARTSearch from paddleslim.nas.darts import DARTSearch
...@@ -72,26 +70,25 @@ def main(args): ...@@ -72,26 +70,25 @@ def main(args):
is_shuffle=True, is_shuffle=True,
args=args) args=args)
with fluid.dygraph.guard(place): model = Network(args.init_channels, args.class_num, args.layers,
model = Network(args.init_channels, args.class_num, args.layers, args.method)
args.method) searcher = DARTSearch(
searcher = DARTSearch( model,
model, train_reader,
train_reader, valid_reader,
valid_reader, place,
place, learning_rate=args.learning_rate,
learning_rate=args.learning_rate, batchsize=args.batch_size,
batchsize=args.batch_size, num_imgs=args.trainset_num,
num_imgs=args.trainset_num, arch_learning_rate=args.arch_learning_rate,
arch_learning_rate=args.arch_learning_rate, unrolled=args.unrolled,
unrolled=args.unrolled, num_epochs=args.epochs,
num_epochs=args.epochs, epochs_no_archopt=args.epochs_no_archopt,
epochs_no_archopt=args.epochs_no_archopt, use_multiprocess=args.use_multiprocess,
use_multiprocess=args.use_multiprocess, use_data_parallel=args.use_data_parallel,
use_data_parallel=args.use_data_parallel, save_dir=args.model_save_dir,
save_dir=args.model_save_dir, log_freq=args.log_freq)
log_freq=args.log_freq) searcher.train()
searcher.train()
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -23,8 +23,8 @@ import logging ...@@ -23,8 +23,8 @@ import logging
import argparse import argparse
import functools import functools
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddleslim.common import AvgrageMeter, get_logger from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.nas.darts import count_parameters_in_MB from paddleslim.nas.darts import count_parameters_in_MB
...@@ -72,8 +72,8 @@ def train(model, train_reader, optimizer, epoch, drop_path_prob, args): ...@@ -72,8 +72,8 @@ def train(model, train_reader, optimizer, epoch, drop_path_prob, args):
for step_id, data in enumerate(train_reader()): for step_id, data in enumerate(train_reader()):
image_np, label_np = data image_np, label_np = data
image = to_variable(image_np) image = paddle.to_tensor(image_np)
label = to_variable(label_np) label = paddle.to_tensor(label_np)
label.stop_gradient = True label.stop_gradient = True
logits, logits_aux = model(image, drop_path_prob, True) logits, logits_aux = model(image, drop_path_prob, True)
...@@ -117,8 +117,8 @@ def valid(model, valid_reader, epoch, args): ...@@ -117,8 +117,8 @@ def valid(model, valid_reader, epoch, args):
for step_id, data in enumerate(valid_reader()): for step_id, data in enumerate(valid_reader()):
image_np, label_np = data image_np, label_np = data
image = to_variable(image_np) image = paddle.to_tensor(image_np)
label = to_variable(label_np) label = paddle.to_tensor(label_np)
logits, _ = model(image, 0, False) logits, _ = model(image, 0, False)
prec1 = paddle.static.accuracy(input=logits, label=label, k=1) prec1 = paddle.static.accuracy(input=logits, label=label, k=1)
prec5 = paddle.static.accuracy(input=logits, label=label, k=5) prec5 = paddle.static.accuracy(input=logits, label=label, k=5)
...@@ -140,83 +140,75 @@ def main(args): ...@@ -140,83 +140,75 @@ def main(args):
place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \ place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \
if args.use_data_parallel else paddle.CUDAPlace(0) if args.use_data_parallel else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place): genotype = eval("genotypes.%s" % args.arch)
genotype = eval("genotypes.%s" % args.arch) model = Network(
model = Network( C=args.init_channels,
C=args.init_channels, num_classes=args.class_num,
num_classes=args.class_num, layers=args.layers,
layers=args.layers, auxiliary=args.auxiliary,
auxiliary=args.auxiliary, genotype=genotype)
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
logger.info("param size = {:.6f}MB".format( count_parameters_in_MB(model.parameters())))
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
device_num = paddle.distributed.parallel.ParallelEnv().nranks learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(args.learning_rate,
step_per_epoch = int(args.trainset_num / (args.batch_size * device_num)) args.epochs / 2)
learning_rate = fluid.dygraph.CosineDecay(args.learning_rate, clip = paddle.nn.ClipGradByGlobalNorm(args.grad_clip)
step_per_epoch, args.epochs) optimizer = paddle.optimizer.Momentum(
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip) learning_rate,
optimizer = paddle.optimizer.Momentum( momentum=args.momentum,
learning_rate, regularization=paddle.regularizer.L2Decay(args.weight_decay),
momentum=args.momentum, parameter_list=model.parameters(),
regularization=fluid.regularizer.L2Decay(args.weight_decay), grad_clip=clip)
parameter_list=model.parameters(),
grad_clip=clip) if args.use_data_parallel:
strategy = paddle.distributed.init_parallel_env()
if args.use_data_parallel: model = paddle.DataParallel(model, strategy)
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy) train_loader = paddle.io.DataLoader.from_generator(
capacity=64,
train_loader = fluid.io.DataLoader.from_generator( use_double_buffer=True,
capacity=64, iterable=True,
use_double_buffer=True, return_list=True,
iterable=True, use_multiprocess=args.use_multiprocess)
return_list=True, valid_loader = paddle.io.DataLoader.from_generator(
use_multiprocess=args.use_multiprocess) capacity=64,
valid_loader = fluid.io.DataLoader.from_generator( use_double_buffer=True,
capacity=64, iterable=True,
use_double_buffer=True, return_list=True,
iterable=True, use_multiprocess=args.use_multiprocess)
return_list=True,
use_multiprocess=args.use_multiprocess) train_reader = reader.train_valid(
batch_size=args.batch_size, is_train=True, is_shuffle=True, args=args)
train_reader = reader.train_valid( valid_reader = reader.train_valid(
batch_size=args.batch_size, batch_size=args.batch_size, is_train=False, is_shuffle=False, args=args)
is_train=True, if args.use_data_parallel:
is_shuffle=True, train_reader = fluid.contrib.reader.distributed_batch_reader(
args=args) train_reader)
valid_reader = reader.train_valid(
batch_size=args.batch_size, train_loader.set_batch_generator(train_reader, places=place)
is_train=False, valid_loader.set_batch_generator(valid_reader, places=place)
is_shuffle=False,
args=args) save_parameters = (not args.use_data_parallel) or (
if args.use_data_parallel: args.use_data_parallel and
train_reader = fluid.contrib.reader.distributed_batch_reader( paddle.distributed.parallel.ParallelEnv().local_rank == 0)
train_reader) best_acc = 0
for epoch in range(args.epochs):
train_loader.set_batch_generator(train_reader, places=place) drop_path_prob = args.drop_path_prob * epoch / args.epochs
valid_loader.set_batch_generator(valid_reader, places=place) logger.info('Epoch {}, lr {:.6f}'.format(epoch,
optimizer.current_step_lr()))
save_parameters = (not args.use_data_parallel) or ( train_top1 = train(model, train_loader, optimizer, epoch,
args.use_data_parallel and drop_path_prob, args)
paddle.distributed.parallel.ParallelEnv().local_rank == 0) logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
best_acc = 0 valid_top1 = valid(model, valid_loader, epoch, args)
for epoch in range(args.epochs): if valid_top1 > best_acc:
drop_path_prob = args.drop_path_prob * epoch / args.epochs best_acc = valid_top1
logger.info('Epoch {}, lr {:.6f}'.format( if save_parameters:
epoch, optimizer.current_step_lr())) paddle.save(model.state_dict(),
train_top1 = train(model, train_loader, optimizer, epoch, args.model_save_dir + "/best_model")
drop_path_prob, args) logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".format(
logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1)) epoch, valid_top1, best_acc))
valid_top1 = valid(model, valid_loader, epoch, args)
if valid_top1 > best_acc:
best_acc = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:.6f}".
format(epoch, valid_top1, best_acc))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -23,8 +23,8 @@ import logging ...@@ -23,8 +23,8 @@ import logging
import argparse import argparse
import functools import functools
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddleslim.common import AvgrageMeter, get_logger from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.nas.darts import count_parameters_in_MB from paddleslim.nas.darts import count_parameters_in_MB
...@@ -68,7 +68,7 @@ add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whet ...@@ -68,7 +68,7 @@ add_arg('use_data_parallel', ast.literal_eval, False, "The flag indicating whet
def cross_entropy_label_smooth(preds, targets, epsilon): def cross_entropy_label_smooth(preds, targets, epsilon):
preds = paddle.nn.functional.softmax(preds) preds = paddle.nn.functional.softmax(preds)
targets_one_hot = fluid.one_hot(input=targets, depth=args.class_num) targets_one_hot = paddle.nn.functional.one_hot(targets, args.class_num)
targets_smooth = paddle.nn.functional.label_smooth( targets_smooth = paddle.nn.functional.label_smooth(
targets_one_hot, epsilon=epsilon, dtype="float32") targets_one_hot, epsilon=epsilon, dtype="float32")
loss = paddle.nn.functional.cross_entropy( loss = paddle.nn.functional.cross_entropy(
...@@ -84,8 +84,8 @@ def train(model, train_reader, optimizer, epoch, args): ...@@ -84,8 +84,8 @@ def train(model, train_reader, optimizer, epoch, args):
for step_id, data in enumerate(train_reader()): for step_id, data in enumerate(train_reader()):
image_np, label_np = data image_np, label_np = data
image = to_variable(image_np) image = paddle.to_tensor(image_np)
label = to_variable(label_np) label = paddle.to_tensor(label_np)
label.stop_gradient = True label.stop_gradient = True
logits, logits_aux = model(image, True) logits, logits_aux = model(image, True)
...@@ -130,8 +130,8 @@ def valid(model, valid_reader, epoch, args): ...@@ -130,8 +130,8 @@ def valid(model, valid_reader, epoch, args):
for step_id, data in enumerate(valid_reader()): for step_id, data in enumerate(valid_reader()):
image_np, label_np = data image_np, label_np = data
image = to_variable(image_np) image = paddle.to_tensor(image_np)
label = to_variable(label_np) label = paddle.to_tensor(label_np)
logits, _ = model(image, False) logits, _ = model(image, False)
prec1 = paddle.static.accuracy(input=logits, label=label, k=1) prec1 = paddle.static.accuracy(input=logits, label=label, k=1)
prec5 = paddle.static.accuracy(input=logits, label=label, k=5) prec5 = paddle.static.accuracy(input=logits, label=label, k=5)
...@@ -153,79 +153,72 @@ def main(args): ...@@ -153,79 +153,72 @@ def main(args):
place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \ place = paddle.CUDAPlace(paddle.distributed.parallel.ParallelEnv().dev_id) \
if args.use_data_parallel else paddle.CUDAPlace(0) if args.use_data_parallel else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place): genotype = eval("genotypes.%s" % args.arch)
genotype = eval("genotypes.%s" % args.arch) model = Network(
model = Network( C=args.init_channels,
C=args.init_channels, num_classes=args.class_num,
num_classes=args.class_num, layers=args.layers,
layers=args.layers, auxiliary=args.auxiliary,
auxiliary=args.auxiliary, genotype=genotype)
genotype=genotype)
logger.info("param size = {:.6f}MB".format(
logger.info("param size = {:.6f}MB".format( count_parameters_in_MB(model.parameters())))
count_parameters_in_MB(model.parameters())))
device_num = paddle.distributed.parallel.ParallelEnv().nranks
device_num = paddle.distributed.parallel.ParallelEnv().nranks step_per_epoch = int(args.trainset_num / (args.batch_size * device_num))
step_per_epoch = int(args.trainset_num / (args.batch_size * device_num)) learning_rate = paddle.optimizer.lr.ExponentialDecay(args.learning_rate,
learning_rate = fluid.dygraph.ExponentialDecay( args.decay_rate)
args.learning_rate, step_per_epoch, args.decay_rate, staircase=True)
clip = paddle.nn.ClipGradByGlobalNorm(args.grad_clip)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip) optimizer = paddle.optimizer.Momentum(
optimizer = paddle.optimizer.Momentum( learning_rate,
learning_rate, momentum=args.momentum,
momentum=args.momentum, regularization=paddle.regularizer.L2Decay(args.weight_decay),
regularization=fluid.regularizer.L2Decay(args.weight_decay), parameter_list=model.parameters(),
parameter_list=model.parameters(), grad_clip=clip)
grad_clip=clip)
if args.use_data_parallel:
if args.use_data_parallel: strategy = paddle.distributed.init_parallel_env()
strategy = fluid.dygraph.parallel.prepare_context() model = paddle.DataParallel(model, strategy)
model = fluid.dygraph.parallel.DataParallel(model, strategy)
train_loader = paddle.io.DataLoader.from_generator(
train_loader = fluid.io.DataLoader.from_generator( capacity=64, use_double_buffer=True, iterable=True, return_list=True)
capacity=64, valid_loader = paddle.io.DataLoader.from_generator(
use_double_buffer=True, capacity=64, use_double_buffer=True, iterable=True, return_list=True)
iterable=True,
return_list=True) train_reader = paddle.batch(
valid_loader = fluid.io.DataLoader.from_generator( reader.imagenet_reader(args.data_dir, 'train'),
capacity=64, batch_size=args.batch_size,
use_double_buffer=True, drop_last=True)
iterable=True, valid_reader = paddle.batch(
return_list=True) reader.imagenet_reader(args.data_dir, 'val'),
batch_size=args.batch_size)
train_reader = fluid.io.batch( if args.use_data_parallel:
reader.imagenet_reader(args.data_dir, 'train'), train_reader = fluid.contrib.reader.distributed_batch_reader(
batch_size=args.batch_size, train_reader)
drop_last=True)
valid_reader = fluid.io.batch( train_loader.set_sample_list_generator(train_reader, places=place)
reader.imagenet_reader(args.data_dir, 'val'), valid_loader.set_sample_list_generator(valid_reader, places=place)
batch_size=args.batch_size)
if args.use_data_parallel: save_parameters = (not args.use_data_parallel) or (
train_reader = fluid.contrib.reader.distributed_batch_reader( args.use_data_parallel and
train_reader) paddle.distributed.parallel.ParallelEnv().local_rank == 0)
best_top1 = 0
train_loader.set_sample_list_generator(train_reader, places=place) for epoch in range(args.epochs):
valid_loader.set_sample_list_generator(valid_reader, places=place) logger.info('Epoch {}, lr {:.6f}'.format(epoch, optimizer.get_lr()))
train_top1, train_top5 = train(model, train_loader, optimizer, epoch,
save_parameters = (not args.use_data_parallel) or ( args)
args.use_data_parallel and logger.info("Epoch {}, train_top1 {:.6f}, train_top5 {:.6f}".format(
paddle.distributed.parallel.ParallelEnv().local_rank == 0) epoch, train_top1, train_top5))
best_top1 = 0 valid_top1, valid_top5 = valid(model, valid_loader, epoch, args)
for epoch in range(args.epochs): if valid_top1 > best_top1:
logger.info('Epoch {}, lr {:.6f}'.format(epoch, optimizer.get_lr())) best_top1 = valid_top1
train_top1, train_top5 = train(model, train_loader, optimizer, if save_parameters:
epoch, args) paddle.save(model.state_dict(),
logger.info("Epoch {}, train_top1 {:.6f}, train_top5 {:.6f}".format( args.model_save_dir + "/best_model")
epoch, train_top1, train_top5)) logger.info(
valid_top1, valid_top5 = valid(model, valid_loader, epoch, args) "Epoch {}, valid_top1 {:.6f}, valid_top5 {:.6f}, best_valid_top1 {:6f}".
if valid_top1 > best_top1: format(epoch, valid_top1, valid_top5, best_top1))
best_top1 = valid_top1
if save_parameters:
paddle.save(model.state_dict(),
args.model_save_dir + "/best_model")
logger.info(
"Epoch {}, valid_top1 {:.6f}, valid_top5 {:.6f}, best_valid_top1 {:6f}".
format(epoch, valid_top1, valid_top5, best_top1))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -22,7 +22,6 @@ import six ...@@ -22,7 +22,6 @@ import six
import numpy as np import numpy as np
import time import time
import paddle import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import IrGraph from paddle.fluid.framework import IrGraph
from paddle.framework import core from paddle.framework import core
...@@ -244,7 +243,7 @@ class SampleTester(unittest.TestCase): ...@@ -244,7 +243,7 @@ class SampleTester(unittest.TestCase):
return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
def test_graph_transformation(self): def test_graph_transformation(self):
if not paddle.fluid.core.is_compiled_with_mkldnn(): if not paddle.framework.core.is_compiled_with_mkldnn():
return return
infer_model_path = test_case_args.infer_model infer_model_path = test_case_args.infer_model
......
...@@ -22,13 +22,8 @@ class PVANet(): ...@@ -22,13 +22,8 @@ class PVANet():
def net(self, input, include_last_bn_relu=True, class_dim=1000): def net(self, input, include_last_bn_relu=True, class_dim=1000):
conv1 = self._conv_bn_crelu(input, 16, 7, stride=2, name="conv1_1") conv1 = self._conv_bn_crelu(input, 16, 7, stride=2, name="conv1_1")
pool1 = fluid.layers.pool2d( pool1 = paddle.nn.functional.max_pool2d(
input=conv1, conv1, 3, stride=2, padding=1, name='pool1')
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name='pool1')
end_points = {} end_points = {}
conv2 = self._conv_stage( conv2 = self._conv_stage(
...@@ -182,13 +177,8 @@ class PVANet(): ...@@ -182,13 +177,8 @@ class PVANet():
paths.append(path_net) paths.append(path_net)
if stride > 1: if stride > 1:
path_net = fluid.layers.pool2d( path_net = paddle.nn.functional.max_pool2d(
input, input, 3, stride=2, padding=1, name=name + '_pool')
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max',
name=name + '_pool')
path_net = self._conv_bn_relu(path_net, pool_path_outputs, 1, path_net = self._conv_bn_relu(path_net, pool_path_outputs, 1,
name + '_poolproj') name + '_poolproj')
paths.append(path_net) paths.append(path_net)
......
...@@ -2,7 +2,6 @@ from __future__ import absolute_import ...@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle import paddle
import paddle.fluid as fluid
import math import math
__all__ = ["ResNet", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] __all__ = ["ResNet", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]
...@@ -50,12 +49,7 @@ class ResNet(): ...@@ -50,12 +49,7 @@ class ResNet():
stride=2, stride=2,
act='relu', act='relu',
name=prefix_name + conv1_name) name=prefix_name + conv1_name)
conv = fluid.layers.pool2d( conv = paddle.nn.functional.max_pool2d(conv, 3, stride=2, padding=1)
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50: if layers >= 50:
for block in range(len(depth)): for block in range(len(depth)):
...@@ -74,8 +68,7 @@ class ResNet(): ...@@ -74,8 +68,7 @@ class ResNet():
stride=2 if i == 0 and block != 0 else 1, stride=2 if i == 0 and block != 0 else 1,
name=conv_name) name=conv_name)
pool = fluid.layers.pool2d( pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
fc_name = fc_name if fc_name is None else prefix_name + fc_name fc_name = fc_name if fc_name is None else prefix_name + fc_name
out = paddle.static.nn.fc( out = paddle.static.nn.fc(
...@@ -97,8 +90,7 @@ class ResNet(): ...@@ -97,8 +90,7 @@ class ResNet():
is_first=block == i == 0, is_first=block == i == 0,
name=conv_name) name=conv_name)
pool = fluid.layers.pool2d( pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
fc_name = fc_name if fc_name is None else prefix_name + fc_name fc_name = fc_name if fc_name is None else prefix_name + fc_name
out = paddle.static.nn.fc( out = paddle.static.nn.fc(
......
...@@ -19,7 +19,6 @@ from __future__ import print_function ...@@ -19,7 +19,6 @@ from __future__ import print_function
import math import math
import paddle import paddle
import paddle.fluid as fluid
__all__ = [ __all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd", "ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
...@@ -80,12 +79,7 @@ class ResNet(): ...@@ -80,12 +79,7 @@ class ResNet():
act='relu', act='relu',
name='conv1_3') name='conv1_3')
conv = fluid.layers.pool2d( conv = paddle.nn.functional.max_pool2d(conv, 3, stride=2, padding=1)
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
if layers >= 50: if layers >= 50:
for block in range(len(depth)): for block in range(len(depth)):
...@@ -114,8 +108,7 @@ class ResNet(): ...@@ -114,8 +108,7 @@ class ResNet():
if_first=block == i == 0, if_first=block == i == 0,
name=conv_name) name=conv_name)
pool = fluid.layers.pool2d( pool = paddle.nn.functional.adaptive_avg_pool2d(conv, 1)
input=conv, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = paddle.static.nn.fc( out = paddle.static.nn.fc(
...@@ -164,13 +157,8 @@ class ResNet(): ...@@ -164,13 +157,8 @@ class ResNet():
groups=1, groups=1,
act=None, act=None,
name=None): name=None):
pool = fluid.layers.pool2d( pool = paddle.nn.functional.avg_pool2d(
input=input, input, 2, stride=2, padding=0, ceil_mode=True)
pool_size=2,
pool_stride=2,
pool_padding=0,
pool_type='avg',
ceil_mode=True)
conv = paddle.static.nn.conv2d( conv = paddle.static.nn.conv2d(
input=pool, input=pool,
......
...@@ -17,7 +17,6 @@ import datetime ...@@ -17,7 +17,6 @@ import datetime
import numpy as np import numpy as np
import paddle import paddle
import paddle.fluid as fluid
from paddle.nn.initializer import KaimingUniform from paddle.nn.initializer import KaimingUniform
...@@ -154,7 +153,7 @@ class SlimFaceNet(): ...@@ -154,7 +153,7 @@ class SlimFaceNet():
param_attr=paddle.ParamAttr( param_attr=paddle.ParamAttr(
name='linear_conv1x1_weights', name='linear_conv1x1_weights',
initializer=KaimingUniform(), initializer=KaimingUniform(),
regularizer=fluid.regularizer.L2Decay(4e-4)), regularizer=paddle.regularizer.L2Decay(4e-4)),
bias_attr=False) bias_attr=False)
bn_name = 'linear_conv1x1_bn' bn_name = 'linear_conv1x1_bn'
x = paddle.static.nn.batch_norm( x = paddle.static.nn.batch_norm(
...@@ -233,8 +232,7 @@ class SlimFaceNet(): ...@@ -233,8 +232,7 @@ class SlimFaceNet():
def se_block(self, input, num_out_filter, ratio=4, name=None): def se_block(self, input, num_out_filter, ratio=4, name=None):
num_mid_filter = int(num_out_filter // ratio) num_mid_filter = int(num_out_filter // ratio)
pool = fluid.layers.pool2d( paddle.nn.functional.adaptive_avg_pool2d(input, 1)
input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
conv1 = paddle.static.nn.conv2d( conv1 = paddle.static.nn.conv2d(
input=pool, input=pool,
filter_size=1, filter_size=1,
...@@ -247,7 +245,7 @@ class SlimFaceNet(): ...@@ -247,7 +245,7 @@ class SlimFaceNet():
mode='channel', mode='channel',
param_attr=paddle.ParamAttr( param_attr=paddle.ParamAttr(
name=name + '_prelu', name=name + '_prelu',
regularizer=fluid.regularizer.L2Decay(0.0))) regularizer=paddle.regularizer.L2Decay(0.0)))
conv2 = paddle.static.nn.conv2d( conv2 = paddle.static.nn.conv2d(
input=conv1, input=conv1,
filter_size=1, filter_size=1,
...@@ -293,7 +291,7 @@ class SlimFaceNet(): ...@@ -293,7 +291,7 @@ class SlimFaceNet():
mode='channel', mode='channel',
param_attr=paddle.ParamAttr( param_attr=paddle.ParamAttr(
name=name + '_prelu', name=name + '_prelu',
regularizer=fluid.regularizer.L2Decay(0.0))) regularizer=paddle.regularizer.L2Decay(0.0)))
else: else:
return bn return bn
...@@ -307,12 +305,12 @@ class SlimFaceNet(): ...@@ -307,12 +305,12 @@ class SlimFaceNet():
name='weight_norm', name='weight_norm',
attr=paddle.ParamAttr( attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Xavier(), initializer=paddle.nn.initializer.Xavier(),
regularizer=fluid.regularizer.L2Decay(4e-4))) regularizer=paddle.regularizer.L2Decay(4e-4)))
weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), dim=1)) weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), dim=1))
weight = paddle.divide(weight, weight_norm, axis=0) weight = paddle.divide(weight, weight_norm, axis=0)
weight = paddle.transpose(weight, perm=[1, 0]) weight = paddle.transpose(weight, perm=[1, 0])
cosine = fluid.layers.mul(input, weight) cosine = paddle.matmul(input, weight)
sine = paddle.sqrt(1.0 - paddle.square(cosine)) sine = paddle.sqrt(1.0 - paddle.square(cosine))
cos_m = math.cos(m) cos_m = math.cos(m)
...@@ -329,7 +327,7 @@ class SlimFaceNet(): ...@@ -329,7 +327,7 @@ class SlimFaceNet():
else: else:
pass pass
one_hot = fluid.layers.one_hot(input=label, depth=out_dim) one_hot = paddle.nn.functional.one_hot(label, out_dim)
output = paddle.multiply(one_hot, phi) + paddle.multiply( output = paddle.multiply(one_hot, phi) + paddle.multiply(
(1.0 - one_hot), cosine) (1.0 - one_hot), cosine)
output = output * s output = output * s
......
...@@ -15,16 +15,13 @@ ...@@ -15,16 +15,13 @@
import os import os
import numpy as np import numpy as np
import paddle import paddle
import paddle.fluid as F
import paddle.fluid.dygraph as FD
import paddle.fluid.layers as L
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg): def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[ n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
'num_attention_heads'] 'num_attention_heads']
head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32') head_importance = paddle.zeros(shape=[n_layers, n_heads], dtype='float32')
head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32') head_mask = paddle.ones(shape=[n_layers, n_heads], dtype='float32')
head_mask.stop_gradient = False head_mask.stop_gradient = False
intermediate_weight = [] intermediate_weight = []
...@@ -60,7 +57,8 @@ def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg): ...@@ -60,7 +57,8 @@ def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
num_layers=model_cfg['num_hidden_layers']) num_layers=model_cfg['num_hidden_layers'])
loss = out[0] loss = out[0]
loss.backward() loss.backward()
head_importance += L.abs(FD.to_variable(head_mask.gradient())) head_importance += paddle.abs(
paddle.to_tensor(head_mask.gradient()))
for w1, b1, w2, current_importance in zip( for w1, b1, w2, current_importance in zip(
intermediate_weight, intermediate_bias, output_weight, intermediate_weight, intermediate_bias, output_weight,
...@@ -78,34 +76,36 @@ def reorder_neuron_head(model, head_importance, neuron_importance): ...@@ -78,34 +76,36 @@ def reorder_neuron_head(model, head_importance, neuron_importance):
# reorder heads and ffn neurons # reorder heads and ffn neurons
for layer, current_importance in enumerate(neuron_importance): for layer, current_importance in enumerate(neuron_importance):
# reorder heads # reorder heads
idx = L.argsort(head_importance[layer], descending=True)[-1] idx = paddle.argsort(head_importance[layer], descending=True)[-1]
#model.encoder_stack.block[layer].attn.reorder_heads(idx) #model.encoder_stack.block[layer].attn.reorder_heads(idx)
reorder_head(model.encoder_stack.block[layer].attn, idx) reorder_head(model.encoder_stack.block[layer].attn, idx)
# reorder neurons # reorder neurons
idx = L.argsort(FD.to_variable(current_importance), descending=True)[-1] idx = paddle.argsort(
paddle.to_tensor(current_importance), descending=True)[-1]
#model.encoder_stack.block[layer].ffn.reorder_neurons(idx) #model.encoder_stack.block[layer].ffn.reorder_neurons(idx)
reorder_neuron(model.encoder_stack.block[layer].ffn, idx) reorder_neuron(model.encoder_stack.block[layer].ffn, idx)
def reorder_head(layer, idx): def reorder_head(layer, idx):
n, a = layer.n_head, layer.d_key n, a = layer.n_head, layer.d_key
index = L.reshape( index = paddle.reshape(
L.index_select( paddle.index_select(
L.reshape( paddle.reshape(
L.arange( paddle.arange(
0, n * a, dtype='int64'), shape=[n, a]), 0, n * a, dtype='int64'), shape=[n, a]),
idx, idx,
dim=0), axis=0),
shape=[-1]) shape=[-1])
def reorder_head_matrix(linearLayer, index, dim=1): def reorder_head_matrix(linearLayer, index, dim=1):
W = L.index_select(linearLayer.weight, index, dim=dim).detach() W = paddle.index_select(linearLayer.weight, index, axis=dim).detach()
if linearLayer.bias is not None: if linearLayer.bias is not None:
if dim == 0: if dim == 0:
b = L.assign(linearLayer.bias).detach() b = paddle.assign(linearLayer.bias).detach()
else: else:
b = L.assign(L.index_select( b = paddle.assign(
linearLayer.bias, index, dim=0)).detach() L.index_select(
linearLayer.bias, index, dim=0)).detach()
linearLayer.weight.stop_gradient = True linearLayer.weight.stop_gradient = True
linearLayer.weight.set_value(W) linearLayer.weight.set_value(W)
...@@ -127,13 +127,14 @@ def reorder_head(layer, idx): ...@@ -127,13 +127,14 @@ def reorder_head(layer, idx):
def reorder_neuron(layer, index, dim=0): def reorder_neuron(layer, index, dim=0):
def reorder_neurons_matrix(linearLayer, index, dim): def reorder_neurons_matrix(linearLayer, index, dim):
W = L.index_select(linearLayer.weight, index, dim=dim).detach() W = paddle.index_select(linearLayer.weight, index, axis=dim).detach()
if linearLayer.bias is not None: if linearLayer.bias is not None:
if dim == 0: if dim == 0:
b = L.assign(linearLayer.bias).detach() b = paddle.assign(linearLayer.bias).detach()
else: else:
b = L.assign(L.index_select( b = paddle.assign(
linearLayer.bias, index, dim=0)).detach() L.index_select(
linearLayer.bias, index, dim=0)).detach()
linearLayer.weight.stop_gradient = True linearLayer.weight.stop_gradient = True
linearLayer.weight.set_value(W) linearLayer.weight.set_value(W)
linearLayer.weight.stop_gradient = False linearLayer.weight.stop_gradient = False
......
...@@ -32,9 +32,6 @@ else: ...@@ -32,9 +32,6 @@ else:
from pathlib import Path from pathlib import Path
import paddle import paddle
import paddle.fluid.dygraph as D
import paddle.fluid as F
import paddle.fluid.layers as L
from ernie.file_utils import _fetch_from_remote from ernie.file_utils import _fetch_from_remote
from ernie.modeling_ernie import AttentionLayer, ErnieBlock, ErnieModel, ErnieEncoderStack, ErnieModelForSequenceClassification from ernie.modeling_ernie import AttentionLayer, ErnieBlock, ErnieModel, ErnieEncoderStack, ErnieModelForSequenceClassification
...@@ -66,8 +63,8 @@ def _attn_forward(self, ...@@ -66,8 +63,8 @@ def _attn_forward(self,
cache = (k, v) cache = (k, v)
if past_cache is not None: if past_cache is not None:
cached_k, cached_v = past_cache cached_k, cached_v = past_cache
k = L.concat([cached_k, k], 1) k = paddle.concat([cached_k, k], 1)
v = L.concat([cached_v, v], 1) v = paddle.concat([cached_v, v], 1)
if hasattr(self.q, 'fn') and self.q.fn.cur_config['expand_ratio'] != None: if hasattr(self.q, 'fn') and self.q.fn.cur_config['expand_ratio'] != None:
n_head = int(self.n_head * self.q.fn.cur_config['expand_ratio']) n_head = int(self.n_head * self.q.fn.cur_config['expand_ratio'])
...@@ -84,19 +81,19 @@ def _attn_forward(self, ...@@ -84,19 +81,19 @@ def _attn_forward(self,
paddle.reshape(v, [0, 0, n_head, v.shape[-1] // n_head]), paddle.reshape(v, [0, 0, n_head, v.shape[-1] // n_head]),
[0, 2, 1, 3]) #[batch, head, seq, dim] [0, 2, 1, 3]) #[batch, head, seq, dim]
q = L.scale(q, scale=self.d_key**-0.5) q = paddle.scale(q, scale=self.d_key**-0.5)
score = L.matmul(q, k, transpose_y=True) score = paddle.matmul(q, k, transpose_y=True)
if attn_bias is not None: if attn_bias is not None:
score += attn_bias score += attn_bias
score = L.softmax(score, use_cudnn=True) score = paddle.nn.functional.softmax(score, use_cudnn=True)
score = self.dropout(score) score = self.dropout(score)
if head_mask is not None: if head_mask is not None:
score = score * head_mask score = score * head_mask
out = L.matmul(score, v) out = paddle.matmul(score, v)
out = L.transpose(out, [0, 2, 1, 3]) out = paddle.transpose(out, [0, 2, 1, 3])
out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]]) out = paddle.reshape(out, [0, 0, out.shape[2] * out.shape[3]])
out = self.o(out) out = self.o(out)
return out, cache return out, cache
...@@ -188,23 +185,25 @@ def _ernie_model_forward(self, ...@@ -188,23 +185,25 @@ def _ernie_model_forward(self,
) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % ( ) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (
repr(src_ids.shape)) repr(src_ids.shape))
assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None' assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None'
d_batch = L.shape(src_ids)[0] d_batch = paddle.shape(src_ids)[0]
d_seqlen = L.shape(src_ids)[1] d_seqlen = paddle.shape(src_ids)[1]
if pos_ids is None: if pos_ids is None:
pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1]) pos_ids = paddle.reshape(
pos_ids = L.cast(pos_ids, 'int64') L.range(
0, d_seqlen, 1, dtype='int32'), [1, -1])
pos_ids = paddle.cast(pos_ids, 'int64')
if attn_bias is None: if attn_bias is None:
if input_mask is None: if input_mask is None:
input_mask = L.cast(src_ids != 0, 'float32') input_mask = paddle.cast(src_ids != 0, 'float32')
assert len(input_mask.shape) == 2 assert len(input_mask.shape) == 2
input_mask = L.unsqueeze(input_mask, axes=[-1]) input_mask = paddle.unsqueeze(input_mask, axis=[-1])
attn_bias = L.matmul(input_mask, input_mask, transpose_y=True) attn_bias = paddle.matmul(input_mask, input_mask, transpose_y=True)
if use_causal_mask: if use_causal_mask:
sequence = L.reshape( sequence = paddle.reshape(
L.range( paddle.arange(
0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1]) 0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
causal_mask = L.cast( causal_mask = paddle.cast(
(L.matmul( (paddle.matmul(
sequence, 1. / sequence, transpose_y=True) >= 1.), sequence, 1. / sequence, transpose_y=True) >= 1.),
'float32') 'float32')
attn_bias *= causal_mask attn_bias *= causal_mask
...@@ -213,21 +212,23 @@ def _ernie_model_forward(self, ...@@ -213,21 +212,23 @@ def _ernie_model_forward(self,
attn_bias.shape attn_bias.shape
) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape ) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape
attn_bias = (1. - attn_bias) * -10000.0 attn_bias = (1. - attn_bias) * -10000.0
attn_bias = L.unsqueeze(attn_bias, [1]) attn_bias = paddle.unsqueeze(attn_bias, [1])
attn_bias.stop_gradient = True attn_bias.stop_gradient = True
if sent_ids is None: if sent_ids is None:
sent_ids = L.zeros_like(src_ids) sent_ids = paddle.zeros_like(src_ids)
if head_mask is not None: if head_mask is not None:
if len(head_mask.shape) == 1: if len(head_mask.shape) == 1:
head_mask = L.unsqueeze( head_mask = paddle.unsqueeze(
L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 0), 0), -1), -1) paddle.unsqueeze(
head_mask = L.expand( paddle.unsqueeze(paddle.unsqueeze(head_mask, 0), 0), -1),
head_mask, expand_times=[num_layers, 1, 1, 1, 1]) -1)
head_mask = paddle.expand(
head_mask, shape=[head_mask.shape[0] * num_layers, 1, 1, 1, 1])
elif len(head_mask.shape) == 2: elif len(head_mask.shape) == 2:
head_mask = L.unsqueeze( head_mask = paddle.unsqueeze(
L.unsqueeze(L.unsqueeze(head_mask, 1), -1), -1) paddle.unsqueeze(paddle.unsqueeze(head_mask, 1), -1), -1)
else: else:
head_mask = [None] * num_layers head_mask = [None] * num_layers
...@@ -274,8 +275,8 @@ def _seqence_forward(self, *args, **kwargs): ...@@ -274,8 +275,8 @@ def _seqence_forward(self, *args, **kwargs):
if labels is not None: if labels is not None:
if len(labels.shape) == 1: if len(labels.shape) == 1:
labels = L.reshape(labels, [-1, 1]) labels = paddle.reshape(labels, [-1, 1])
loss = L.softmax_with_cross_entropy(logits, labels) loss = paddle.nn.functional.softmax_with_cross_entropy(logits, labels)
loss = paddle.mean(loss) loss = paddle.mean(loss)
else: else:
loss = None loss = None
......
...@@ -19,12 +19,10 @@ from __future__ import unicode_literals ...@@ -19,12 +19,10 @@ from __future__ import unicode_literals
from __future__ import absolute_import from __future__ import absolute_import
import re import re
import paddle.fluid as F import paddle
import paddle.fluid.layers as L
import paddle.fluid.dygraph as D
class AdamW(F.optimizer.AdamOptimizer): class AdamW(padlde.optimizer.Adam):
"""AdamW object for dygraph""" """AdamW object for dygraph"""
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
...@@ -39,5 +37,6 @@ class AdamW(F.optimizer.AdamOptimizer): ...@@ -39,5 +37,6 @@ class AdamW(F.optimizer.AdamOptimizer):
super(AdamW, self).apply_optimize(loss, startup_program, params_grads) super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
for p, g in params_grads: for p, g in params_grads:
if not self.pat.match(p.name): if not self.pat.match(p.name):
with D.no_grad(): with paddle.no_grad():
L.assign(p * (1. - self.wd * self.current_step_lr()), p) paddle.assign(p * (1. - self.wd * self.current_step_lr()),
p)
...@@ -26,9 +26,6 @@ import logging ...@@ -26,9 +26,6 @@ import logging
import argparse import argparse
import paddle import paddle
import paddle.fluid as F
import paddle.fluid.dygraph as FD
import paddle.fluid.layers as L
from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig, utils from paddleslim.nas.ofa import OFA, RunConfig, DistillConfig, utils
from propeller import log from propeller import log
...@@ -44,9 +41,9 @@ from paddleslim.nas.ofa.convert_super import Convert, supernet ...@@ -44,9 +41,9 @@ from paddleslim.nas.ofa.convert_super import Convert, supernet
def soft_cross_entropy(inp, target): def soft_cross_entropy(inp, target):
inp_likelihood = L.log_softmax(inp, axis=-1) inp_likelihood = paddle.nn.functional.log_softmax(inp, axis=-1)
target_prob = L.softmax(target, axis=-1) target_prob = paddle.nn.functional.softmax(target, axis=-1)
return -1. * L.mean(paddle.sum(inp_likelihood * target_prob, dim=-1)) return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, dim=-1))
if __name__ == '__main__': if __name__ == '__main__':
...@@ -194,200 +191,193 @@ if __name__ == '__main__': ...@@ -194,200 +191,193 @@ if __name__ == '__main__':
dev_ds.data_shapes = shapes dev_ds.data_shapes = shapes
dev_ds.data_types = types dev_ds.data_types = types
place = F.CUDAPlace(0) place = paddle.CUDAPlace(0)
with FD.guard(place): model = ErnieModelForSequenceClassification.from_pretrained(
model = ErnieModelForSequenceClassification.from_pretrained( args.from_pretrained, num_labels=3, name='')
args.from_pretrained, num_labels=3, name='') setattr(model, 'return_additional_info', True)
setattr(model, 'return_additional_info', True)
origin_weights = {}
origin_weights = {} for name, param in model.named_parameters():
for name, param in model.named_parameters(): origin_weights[name] = param
origin_weights[name] = param
sp_config = supernet(expand_ratio=args.width_mult_list)
sp_config = supernet(expand_ratio=args.width_mult_list) model = Convert(sp_config).convert(model)
model = Convert(sp_config).convert(model) utils.set_state_dict(model, origin_weights)
utils.set_state_dict(model, origin_weights) del origin_weights
del origin_weights
teacher_model = ErnieModelForSequenceClassification.from_pretrained(
teacher_model = ErnieModelForSequenceClassification.from_pretrained( args.from_pretrained, num_labels=3, name='teacher')
args.from_pretrained, num_labels=3, name='teacher') setattr(teacher_model, 'return_additional_info', True)
setattr(teacher_model, 'return_additional_info', True)
default_run_config = {
default_run_config = { 'n_epochs': [[4 * args.epoch], [6 * args.epoch]],
'n_epochs': [[4 * args.epoch], [6 * args.epoch]], 'init_learning_rate': [[args.lr], [args.lr]],
'init_learning_rate': [[args.lr], [args.lr]], 'elastic_depth': args.depth_mult_list,
'elastic_depth': args.depth_mult_list, 'dynamic_batch_size': [[1, 1], [1, 1]]
'dynamic_batch_size': [[1, 1], [1, 1]] }
} run_config = RunConfig(**default_run_config)
run_config = RunConfig(**default_run_config)
model_cfg = get_config(args.from_pretrained)
model_cfg = get_config(args.from_pretrained)
default_distill_config = {'teacher_model': teacher_model}
default_distill_config = {'teacher_model': teacher_model} distill_config = DistillConfig(**default_distill_config)
distill_config = DistillConfig(**default_distill_config)
ofa_model = OFA(model,
ofa_model = OFA(model, run_config,
run_config, distill_config=distill_config,
distill_config=distill_config, elastic_order=['width', 'depth'])
elastic_order=['width', 'depth'])
### suppose elastic width first
### suppose elastic width first if args.reorder_weight:
if args.reorder_weight: head_importance, neuron_importance = compute_neuron_head_importance(
head_importance, neuron_importance = compute_neuron_head_importance( args, ofa_model.model, dev_ds, place, model_cfg)
args, ofa_model.model, dev_ds, place, model_cfg) reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
reorder_neuron_head(ofa_model.model, head_importance, #################
neuron_importance)
################# if args.init_checkpoint is not None:
log.info('loading checkpoint from %s' % args.init_checkpoint)
if args.init_checkpoint is not None: sd, _ = paddle.load(args.init_checkpoint)
log.info('loading checkpoint from %s' % args.init_checkpoint) ofa_model.model.set_dict(sd)
sd, _ = FD.load_dygraph(args.init_checkpoint)
ofa_model.model.set_dict(sd) g_clip = paddle.nn.ClipGradByGlobalNorm(1.0) #experimental
if args.use_lr_decay:
g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental opt = AdamW(
if args.use_lr_decay: learning_rate=LinearDecay(args.lr,
opt = AdamW( int(args.warmup_proportion *
learning_rate=LinearDecay(args.lr, args.max_steps), args.max_steps),
int(args.warmup_proportion * parameter_list=ofa_model.model.parameters(),
args.max_steps), args.max_steps), weight_decay=args.wd,
parameter_list=ofa_model.model.parameters(), grad_clip=g_clip)
weight_decay=args.wd, else:
grad_clip=g_clip) opt = AdamW(
args.lr,
parameter_list=ofa_model.model.parameters(),
weight_decay=args.wd,
grad_clip=g_clip)
for epoch in range(max(run_config.n_epochs[-1])):
ofa_model.set_epoch(epoch)
if epoch <= int(max(run_config.n_epochs[0])):
ofa_model.set_task('width')
depth_mult_list = [1.0]
else: else:
opt = AdamW( ofa_model.set_task('depth')
args.lr, depth_mult_list = run_config.elastic_depth
parameter_list=ofa_model.model.parameters(), for step, d in enumerate(tqdm(train_ds.start(place), desc='training')):
weight_decay=args.wd, ids, sids, label = d
grad_clip=g_clip)
accumulate_gradients = dict()
for epoch in range(max(run_config.n_epochs[-1])): for param in opt._parameter_list:
ofa_model.set_epoch(epoch) accumulate_gradients[param.name] = 0.0
if epoch <= int(max(run_config.n_epochs[0])):
ofa_model.set_task('width') for depth_mult in depth_mult_list:
depth_mult_list = [1.0] for width_mult in args.width_mult_list:
else: net_config = utils.dynabert_config(
ofa_model.set_task('depth') ofa_model, width_mult, depth_mult=depth_mult)
depth_mult_list = run_config.elastic_depth ofa_model.set_net_config(net_config)
for step, d in enumerate(
tqdm( student_output, teacher_output = ofa_model(
train_ds.start(place), desc='training')): ids,
ids, sids, label = d sids,
labels=label,
accumulate_gradients = dict() num_layers=model_cfg['num_hidden_layers'])
for param in opt._parameter_list: loss, student_logit, student_reps = student_output[
accumulate_gradients[param.name] = 0.0 0], student_output[1], student_output[2]['hiddens']
teacher_logit, teacher_reps = teacher_output[
1], teacher_output[2]['hiddens']
if ofa_model.task == 'depth':
depth_mult = ofa_model.current_config['depth']
depth = round(model_cfg['num_hidden_layers'] *
depth_mult)
kept_layers_index = []
for i in range(1, depth + 1):
kept_layers_index.append(
math.floor(i / depth_mult) - 1)
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(
student_reps,
list(teacher_reps[i]
for i in kept_layers_index)):
tmp_loss = paddle.nn.functional.mse_loss(
stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
else:
### logit distillation loss
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(student_reps, teacher_reps):
tmp_loss = paddle.nn.functional.mse_loss(
stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
if step % 10 == 0:
print('train loss %.5f lr %.3e' %
(loss.numpy(), opt.current_step_lr()))
loss.backward()
param_grads = opt.backward(loss)
for param in opt._parameter_list:
accumulate_gradients[param.name] += param.gradient()
for k, v in param_grads:
assert k.name in accumulate_gradients.keys(
), "{} not in accumulate_gradients".format(k.name)
v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
if step % 100 == 0:
for depth_mult in depth_mult_list: for depth_mult in depth_mult_list:
for width_mult in args.width_mult_list: for width_mult in args.width_mult_list:
net_config = utils.dynabert_config( net_config = utils.dynabert_config(
ofa_model, width_mult, depth_mult=depth_mult) ofa_model, width_mult, depth_mult=depth_mult)
ofa_model.set_net_config(net_config) ofa_model.set_net_config(net_config)
student_output, teacher_output = ofa_model( acc = []
ids, tea_acc = []
sids, ofa_model.model.eval()
labels=label, for step, d in enumerate(
num_layers=model_cfg['num_hidden_layers']) tqdm(
loss, student_logit, student_reps = student_output[ dev_ds.start(place),
0], student_output[1], student_output[2]['hiddens'] desc='evaluating %d' % epoch)):
teacher_logit, teacher_reps = teacher_output[ ids, sids, label = d
1], teacher_output[2]['hiddens'] [loss, logits, _], [_, tea_logits, _] = ofa_model(
ids,
if ofa_model.task == 'depth': sids,
depth_mult = ofa_model.current_config['depth'] labels=label,
depth = round(model_cfg['num_hidden_layers'] * num_layers=model_cfg['num_hidden_layers'])
depth_mult) a = paddle.argmax(logits, -1) == label
kept_layers_index = [] acc.append(a.numpy())
for i in range(1, depth + 1):
kept_layers_index.append( ta = paddle.argmax(tea_logits, -1) == label
math.floor(i / depth_mult) - 1) tea_acc.append(ta.numpy())
ofa_model.model.train()
if mode == 'classification': print(
logit_loss = soft_cross_entropy( 'width_mult: %f, depth_mult: %f: acc %.5f, teacher acc %.5f'
student_logit, teacher_logit.detach()) % (width_mult, depth_mult,
else: np.concatenate(acc).mean(),
logit_loss = 0.0 np.concatenate(tea_acc).mean()))
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(
student_reps,
list(teacher_reps[i]
for i in kept_layers_index)):
tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
else:
### logit distillation loss
if mode == 'classification':
logit_loss = soft_cross_entropy(
student_logit, teacher_logit.detach())
else:
logit_loss = 0.0
### hidden_states distillation loss
rep_loss = 0.0
for stu_rep, tea_rep in zip(student_reps,
teacher_reps):
tmp_loss = L.mse_loss(stu_rep, tea_rep.detach())
rep_loss += tmp_loss
loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss
if step % 10 == 0:
print('train loss %.5f lr %.3e' %
(loss.numpy(), opt.current_step_lr()))
loss.backward()
param_grads = opt.backward(loss)
for param in opt._parameter_list:
accumulate_gradients[param.name] += param.gradient()
for k, v in param_grads:
assert k.name in accumulate_gradients.keys(
), "{} not in accumulate_gradients".format(k.name)
v.set_value(accumulate_gradients[k.name])
opt.apply_optimize(
loss, startup_program=None, params_grads=param_grads)
ofa_model.model.clear_gradients()
if step % 100 == 0:
for depth_mult in depth_mult_list:
for width_mult in args.width_mult_list:
net_config = utils.dynabert_config(
ofa_model, width_mult, depth_mult=depth_mult)
ofa_model.set_net_config(net_config)
acc = []
tea_acc = []
with FD.base._switch_tracer_mode_guard_(
is_train=False):
ofa_model.model.eval()
for step, d in enumerate(
tqdm(
dev_ds.start(place),
desc='evaluating %d' % epoch)):
ids, sids, label = d
[loss, logits,
_], [_, tea_logits, _] = ofa_model(
ids,
sids,
labels=label,
num_layers=model_cfg[
'num_hidden_layers'])
a = L.argmax(logits, -1) == label
acc.append(a.numpy())
ta = L.argmax(tea_logits, -1) == label
tea_acc.append(ta.numpy())
ofa_model.model.train()
print(
'width_mult: %f, depth_mult: %f: acc %.5f, teacher acc %.5f'
% (width_mult, depth_mult,
np.concatenate(acc).mean(),
np.concatenate(tea_acc).mean()))
if args.save_dir is not None: if args.save_dir is not None:
if not os.path.exists(args.save_dir): if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir) os.makedirs(args.save_dir)
F.save_dygraph(ofa_model.model.state_dict(), args.save_dir) paddle.save(ofa_model.model.state_dict(), args.save_dir)
...@@ -107,8 +107,8 @@ def test_ofa(): ...@@ -107,8 +107,8 @@ def test_ofa():
y_data = np.array( y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1) [x[1] for x in data]).astype('int64').reshape(-1, 1)
img = paddle.dygraph.to_variable(dy_x_data) img = paddle.to_tensor(dy_x_data)
label = paddle.dygraph.to_variable(y_data) label = paddle.to_tensor(y_data)
label.stop_gradient = True label.stop_gradient = True
for model_no in range(run_config.dynamic_batch_size[idx]): for model_no in range(run_config.dynamic_batch_size[idx]):
......
...@@ -4,11 +4,11 @@ from webbrowser import get ...@@ -4,11 +4,11 @@ from webbrowser import get
import paddle import paddle
from paddle import tensor from paddle import tensor
from paddle.autograd import PyLayer from paddle.autograd import PyLayer
from paddle.fluid import layers
from paddle.nn import functional as F from paddle.nn import functional as F
from paddle.nn.layer.common import Linear, Embedding from paddle.nn.layer.common import Linear, Embedding
from paddle.nn.layer.transformer import MultiHeadAttention, _convert_attention_mask from paddle.nn.layer.transformer import MultiHeadAttention, _convert_attention_mask
class BinaryQuantizer(PyLayer): class BinaryQuantizer(PyLayer):
@staticmethod @staticmethod
def forward(ctx, input): def forward(ctx, input):
...@@ -24,6 +24,7 @@ class BinaryQuantizer(PyLayer): ...@@ -24,6 +24,7 @@ class BinaryQuantizer(PyLayer):
grad_input[input <= -1] = 0 grad_input[input <= -1] = 0
return grad_input.clone() return grad_input.clone()
class ZMeanBinaryQuantizer(PyLayer): class ZMeanBinaryQuantizer(PyLayer):
@staticmethod @staticmethod
def forward(ctx, input): def forward(ctx, input):
...@@ -39,43 +40,86 @@ class ZMeanBinaryQuantizer(PyLayer): ...@@ -39,43 +40,86 @@ class ZMeanBinaryQuantizer(PyLayer):
grad_input[input <= -1] = 0 grad_input[input <= -1] = 0
return grad_input.clone() return grad_input.clone()
class BiLinear(Linear): class BiLinear(Linear):
def __init__(self, in_features, out_features, weight_attr=None, bias_attr=None, name=None): def __init__(self,
super(BiLinear, self).__init__(in_features, out_features, weight_attr=weight_attr, bias_attr=bias_attr, name=name) in_features,
out_features,
weight_attr=None,
bias_attr=None,
name=None):
super(BiLinear, self).__init__(
in_features,
out_features,
weight_attr=weight_attr,
bias_attr=bias_attr,
name=name)
def forward(self, input): def forward(self, input):
scaling_factor = paddle.mean(self.weight.abs(), axis=1).unsqueeze(1).detach() scaling_factor = paddle.mean(
real_weights = self.weight - paddle.mean(self.weight, axis=-1).unsqueeze(-1) self.weight.abs(), axis=1).unsqueeze(1).detach()
real_weights = self.weight - paddle.mean(
self.weight, axis=-1).unsqueeze(-1)
binary_weights_no_grad = scaling_factor * paddle.sign(real_weights) binary_weights_no_grad = scaling_factor * paddle.sign(real_weights)
cliped_weights = paddle.clip(real_weights, -1.0, 1.0) cliped_weights = paddle.clip(real_weights, -1.0, 1.0)
weight = binary_weights_no_grad.detach() - cliped_weights.detach() + cliped_weights weight = binary_weights_no_grad.detach() - cliped_weights.detach(
) + cliped_weights
binary_input_no_grad = paddle.sign(input) binary_input_no_grad = paddle.sign(input)
cliped_input = paddle.clip(input, -1.0, 1.0) cliped_input = paddle.clip(input, -1.0, 1.0)
ba = binary_input_no_grad.detach() - cliped_input.detach() + cliped_input ba = binary_input_no_grad.detach() - cliped_input.detach(
) + cliped_input
out = F.linear(x=ba, weight=weight, bias=self.bias, name=self.name) out = F.linear(x=ba, weight=weight, bias=self.bias, name=self.name)
return out return out
class BiEmbedding(Embedding): class BiEmbedding(Embedding):
def __init__(self, num_embeddings, embedding_dim, padding_idx=None, sparse=False, weight_attr=None, name=None): def __init__(self,
super(BiEmbedding, self).__init__(num_embeddings, embedding_dim, padding_idx, sparse, weight_attr, name) num_embeddings,
embedding_dim,
padding_idx=None,
sparse=False,
weight_attr=None,
name=None):
super(BiEmbedding,
self).__init__(num_embeddings, embedding_dim, padding_idx, sparse,
weight_attr, name)
def forward(self, x): def forward(self, x):
scaling_factor = paddle.mean(self.weight.abs(), axis=1, keepdim=True) scaling_factor = paddle.mean(self.weight.abs(), axis=1, keepdim=True)
scaling_factor = scaling_factor.detach() scaling_factor = scaling_factor.detach()
real_weights = self.weight - paddle.mean(self.weight, axis=-1, keepdim=True) real_weights = self.weight - paddle.mean(
self.weight, axis=-1, keepdim=True)
binary_weights_no_grad = scaling_factor * paddle.sign(real_weights) binary_weights_no_grad = scaling_factor * paddle.sign(real_weights)
cliped_weights = paddle.clip(real_weights, -1.0, 1.0) cliped_weights = paddle.clip(real_weights, -1.0, 1.0)
weight = binary_weights_no_grad.detach() - cliped_weights.detach() + cliped_weights weight = binary_weights_no_grad.detach() - cliped_weights.detach(
return F.embedding(x, weight=weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name) ) + cliped_weights
return F.embedding(
x,
weight=weight,
padding_idx=self._padding_idx,
sparse=self._sparse,
name=self._name)
class BiMultiHeadAttention(MultiHeadAttention): class BiMultiHeadAttention(MultiHeadAttention):
# fork from paddle.nn.layer.transformer.MultiHeadAttention # fork from paddle.nn.layer.transformer.MultiHeadAttention
Cache = collections.namedtuple("Cache", ["k", "v"]) Cache = collections.namedtuple("Cache", ["k", "v"])
StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) StaticCache = collections.namedtuple("StaticCache", ["k", "v"])
def __init__(self, embed_dim, num_heads, dropout=0., kdim=None, vdim=None, need_weights=False, weight_attr=None, bias_attr=None): def __init__(self,
super(BiMultiHeadAttention, self).__init__(embed_dim, num_heads, dropout, kdim, vdim, need_weights, weight_attr, bias_attr) embed_dim,
num_heads,
dropout=0.,
kdim=None,
vdim=None,
need_weights=False,
weight_attr=None,
bias_attr=None):
super(BiMultiHeadAttention,
self).__init__(embed_dim, num_heads, dropout, kdim, vdim,
need_weights, weight_attr, bias_attr)
def forward(self, query, key=None, value=None, attn_mask=None, cache=None): def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
key = query if key is None else key key = query if key is None else key
...@@ -85,14 +129,12 @@ class BiMultiHeadAttention(MultiHeadAttention): ...@@ -85,14 +129,12 @@ class BiMultiHeadAttention(MultiHeadAttention):
q, k, v = self._prepare_qkv(query, key, value, cache) q, k, v = self._prepare_qkv(query, key, value, cache)
else: else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache) q, k, v, cache = self._prepare_qkv(query, key, value, cache)
q = BinaryQuantizer.apply(q) q = BinaryQuantizer.apply(q)
k = BinaryQuantizer.apply(k) k = BinaryQuantizer.apply(k)
# scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True)
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha` product = paddle.scale(product, scale=self.head_dim**-0.5)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
if attn_mask is not None: if attn_mask is not None:
# Support bool or int mask # Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype) attn_mask = _convert_attention_mask(attn_mask, product.dtype)
...@@ -123,17 +165,14 @@ class BiMultiHeadAttention(MultiHeadAttention): ...@@ -123,17 +165,14 @@ class BiMultiHeadAttention(MultiHeadAttention):
outs.append(cache) outs.append(cache)
return out if len(outs) == 1 else tuple(outs) return out if len(outs) == 1 else tuple(outs)
def _to_bi_function(model): def _to_bi_function(model):
for name, layer in model.named_children(): for name, layer in model.named_children():
if isinstance(layer, MultiHeadAttention): if isinstance(layer, MultiHeadAttention):
new_layer = BiMultiHeadAttention(layer.embed_dim, new_layer = BiMultiHeadAttention(
layer.num_heads, layer.embed_dim, layer.num_heads, layer.dropout, layer.kdim,
layer.dropout, layer.vdim, layer.need_weights, layer.q_proj._weight_attr,
layer.kdim, layer.q_proj._bias_attr)
layer.vdim,
layer.need_weights,
layer.q_proj._weight_attr,
layer.q_proj._bias_attr)
new_layer.q_proj = layer.q_proj new_layer.q_proj = layer.q_proj
new_layer.k_proj = layer.k_proj new_layer.k_proj = layer.k_proj
new_layer.v_proj = layer.v_proj new_layer.v_proj = layer.v_proj
...@@ -141,27 +180,30 @@ def _to_bi_function(model): ...@@ -141,27 +180,30 @@ def _to_bi_function(model):
model._sub_layers[name] = new_layer model._sub_layers[name] = new_layer
elif isinstance(layer, Embedding): elif isinstance(layer, Embedding):
if name != "word_embeddings": continue if name != "word_embeddings": continue
new_layer = BiEmbedding(layer._num_embeddings, new_layer = BiEmbedding(layer._num_embeddings, layer._embedding_dim,
layer._embedding_dim, layer._padding_idx, layer._sparse,
layer._padding_idx, layer._weight_attr, layer._name)
layer._sparse,
layer._weight_attr,
layer._name)
new_layer.weight = layer.weight new_layer.weight = layer.weight
model._sub_layers[name] = new_layer model._sub_layers[name] = new_layer
elif isinstance(layer, Linear): elif isinstance(layer, Linear):
if name == "classifier": continue if name == "classifier": continue
new_layer = BiLinear(layer.weight.shape[0], new_layer = BiLinear(layer.weight.shape[0], layer.weight.shape[1],
layer.weight.shape[1], layer._weight_attr, layer._bias_attr,
layer._weight_attr, layer.name)
layer._bias_attr,
layer.name)
new_layer.weight = layer.weight new_layer.weight = layer.weight
new_layer.bias = layer.bias new_layer.bias = layer.bias
model._sub_layers[name] = new_layer model._sub_layers[name] = new_layer
import math import math
def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=None, cache=None):
def _MultiHeadAttention_forward(self,
query,
key=None,
value=None,
attn_mask=None,
cache=None):
key = query if key is None else key key = query if key is None else key
value = query if value is None else value value = query if value is None else value
# compute q ,k ,v # compute q ,k ,v
...@@ -169,18 +211,16 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non ...@@ -169,18 +211,16 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
q, k, v = self._prepare_qkv(query, key, value, cache) q, k, v = self._prepare_qkv(query, key, value, cache)
else: else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache) q, k, v, cache = self._prepare_qkv(query, key, value, cache)
# distill qxq # distill qxq
query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2])) query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2]))
query_scores = query_scores / math.sqrt(self.head_dim) query_scores = query_scores / math.sqrt(self.head_dim)
# distill kxk # distill kxk
key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2])) key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2]))
key_scores = key_scores / math.sqrt(self.head_dim) key_scores = key_scores / math.sqrt(self.head_dim)
# scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True)
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha` product = paddle.scale(product, scale=self.head_dim**-0.5)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
if attn_mask is not None: if attn_mask is not None:
# Support bool or int mask # Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype) attn_mask = _convert_attention_mask(attn_mask, product.dtype)
...@@ -192,7 +232,7 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non ...@@ -192,7 +232,7 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
self.dropout, self.dropout,
training=self.training, training=self.training,
mode="upscale_in_train") mode="upscale_in_train")
# distil vxv # distil vxv
value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2])) value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2]))
value_scores = value_scores / math.sqrt(self.head_dim) value_scores = value_scores / math.sqrt(self.head_dim)
...@@ -210,13 +250,19 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non ...@@ -210,13 +250,19 @@ def _MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=Non
outs.append(weights) outs.append(weights)
if cache is not None: if cache is not None:
outs.append(cache) outs.append(cache)
self.query_scores = query_scores self.query_scores = query_scores
self.key_scores = key_scores self.key_scores = key_scores
self.value_scores = value_scores self.value_scores = value_scores
return out if len(outs) == 1 else tuple(outs) return out if len(outs) == 1 else tuple(outs)
def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=None, cache=None):
def _Bi_MultiHeadAttention_forward(self,
query,
key=None,
value=None,
attn_mask=None,
cache=None):
key = query if key is None else key key = query if key is None else key
value = query if value is None else value value = query if value is None else value
# compute q ,k ,v # compute q ,k ,v
...@@ -224,25 +270,24 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask= ...@@ -224,25 +270,24 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
q, k, v = self._prepare_qkv(query, key, value, cache) q, k, v = self._prepare_qkv(query, key, value, cache)
else: else:
q, k, v, cache = self._prepare_qkv(query, key, value, cache) q, k, v, cache = self._prepare_qkv(query, key, value, cache)
# distill qxq # distill qxq
query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2])) query_scores = paddle.matmul(q, tensor.transpose(x=q, perm=[0, 1, 3, 2]))
query_scores = query_scores / math.sqrt(self.head_dim) query_scores = query_scores / math.sqrt(self.head_dim)
# distill kxk # distill kxk
key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2])) key_scores = paddle.matmul(k, tensor.transpose(x=k, perm=[0, 1, 3, 2]))
key_scores = key_scores / math.sqrt(self.head_dim) key_scores = key_scores / math.sqrt(self.head_dim)
q = BinaryQuantizer.apply(q) q = BinaryQuantizer.apply(q)
k = BinaryQuantizer.apply(k) k = BinaryQuantizer.apply(k)
# scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True)
# TODO(guosheng): use tensor.matmul, however it doesn't support `alpha` product = paddle.scale(product, scale=self.head_dim**-0.5)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)
if attn_mask is not None: if attn_mask is not None:
# Support bool or int mask # Support bool or int mask
attn_mask = _convert_attention_mask(attn_mask, product.dtype) attn_mask = _convert_attention_mask(attn_mask, product.dtype)
product = product + attn_mask product = product + attn_mask
# weights = F.softmax(product) # weights = F.softmax(product)
weights = product weights = product
if self.dropout: if self.dropout:
...@@ -251,7 +296,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask= ...@@ -251,7 +296,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
self.dropout, self.dropout,
training=self.training, training=self.training,
mode="upscale_in_train") mode="upscale_in_train")
# distil vxv # distil vxv
value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2])) value_scores = paddle.matmul(v, tensor.transpose(x=v, perm=[0, 1, 3, 2]))
value_scores = value_scores / math.sqrt(self.head_dim) value_scores = value_scores / math.sqrt(self.head_dim)
...@@ -279,6 +324,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask= ...@@ -279,6 +324,7 @@ def _Bi_MultiHeadAttention_forward(self, query, key=None, value=None, attn_mask=
self.value_scores = value_scores self.value_scores = value_scores
return out if len(outs) == 1 else tuple(outs) return out if len(outs) == 1 else tuple(outs)
def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None): def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
src_mask = _convert_attention_mask(src_mask, src.dtype) src_mask = _convert_attention_mask(src_mask, src.dtype)
...@@ -289,8 +335,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None): ...@@ -289,8 +335,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
if cache is None: if cache is None:
src = self.self_attn(src, src, src, src_mask) src = self.self_attn(src, src, src, src_mask)
else: else:
src, incremental_cache = self.self_attn(src, src, src, src_mask, src, incremental_cache = self.self_attn(src, src, src, src_mask, cache)
cache)
src = residual + self.dropout1(src) src = residual + self.dropout1(src)
if not self.normalize_before: if not self.normalize_before:
...@@ -306,6 +351,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None): ...@@ -306,6 +351,7 @@ def _TransformerEncoderLayer_forward(self, src, src_mask=None, cache=None):
self.rep = src self.rep = src
return src if cache is None else (src, incremental_cache) return src if cache is None else (src, incremental_cache)
def _get_attr(model, attr): def _get_attr(model, attr):
res = [] res = []
if hasattr(model, attr): if hasattr(model, attr):
...@@ -314,6 +360,7 @@ def _get_attr(model, attr): ...@@ -314,6 +360,7 @@ def _get_attr(model, attr):
res.extend(_get_attr(layer, attr)) res.extend(_get_attr(layer, attr))
return res return res
def _to_distill_function(model): def _to_distill_function(model):
from types import MethodType from types import MethodType
for layer in model.children(): for layer in model.children():
...@@ -321,6 +368,6 @@ def _to_distill_function(model): ...@@ -321,6 +368,6 @@ def _to_distill_function(model):
layer.forward = MethodType(_Bi_MultiHeadAttention_forward, layer) layer.forward = MethodType(_Bi_MultiHeadAttention_forward, layer)
elif isinstance(layer, MultiHeadAttention): elif isinstance(layer, MultiHeadAttention):
layer.forward = MethodType(_MultiHeadAttention_forward, layer) layer.forward = MethodType(_MultiHeadAttention_forward, layer)
elif isinstance(layer, paddle.nn.layer.transformer.TransformerEncoderLayer): elif isinstance(layer,
paddle.nn.layer.transformer.TransformerEncoderLayer):
layer.forward = MethodType(_TransformerEncoderLayer_forward, layer) layer.forward = MethodType(_TransformerEncoderLayer_forward, layer)
...@@ -8,7 +8,6 @@ import math ...@@ -8,7 +8,6 @@ import math
import time import time
import random import random
import numpy as np import numpy as np
import paddle.fluid as fluid
sys.path[0] = os.path.join( sys.path[0] = os.path.join(
os.path.dirname("__file__"), os.path.pardir, os.path.pardir) os.path.dirname("__file__"), os.path.pardir, os.path.pardir)
from paddleslim.common import get_logger from paddleslim.common import get_logger
......
...@@ -29,8 +29,6 @@ ...@@ -29,8 +29,6 @@
```text ```text
. .
├── cluster_train.py # 分布式训练函数
├── cluster_train.sh # 本地模拟多机脚本
├── train.py # 训练函数 ├── train.py # 训练函数
├── infer.py # 预测脚本 ├── infer.py # 预测脚本
├── net.py # 网络结构 ├── net.py # 网络结构
...@@ -119,12 +117,6 @@ python train.py -h ...@@ -119,12 +117,6 @@ python train.py -h
OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
``` ```
本地单机模拟多机训练
```bash
sh cluster_train.sh
```
本示例中按照单机多线程训练的命令进行训练,训练完毕后,可看到在当前文件夹下保存模型的路径为: ``v1_cpu5_b100_lr1dir``, 运行 ``ls v1_cpu5_b100_lr1dir``可看到该文件夹下保存了训练的10个epoch的模型文件。 本示例中按照单机多线程训练的命令进行训练,训练完毕后,可看到在当前文件夹下保存模型的路径为: ``v1_cpu5_b100_lr1dir``, 运行 ``ls v1_cpu5_b100_lr1dir``可看到该文件夹下保存了训练的10个epoch的模型文件。
``` ```
pass-0 pass-1 pass-2 pass-3 pass-4 pass-5 pass-6 pass-7 pass-8 pass-9 pass-0 pass-1 pass-2 pass-3 pass-4 pass-5 pass-6 pass-7 pass-8 pass-9
......
from __future__ import print_function
import argparse
import logging
import os
import time
import math
import random
import numpy as np
import paddle
import six
import reader
from net import skip_gram_word2vec
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def parse_args():
parser = argparse.ArgumentParser(
description="PaddlePaddle Word2vec example")
parser.add_argument(
'--train_data_dir',
type=str,
default='./data/text',
help="The path of taining dataset")
parser.add_argument(
'--base_lr',
type=float,
default=0.01,
help="The number of learing rate (default: 0.01)")
parser.add_argument(
'--save_step',
type=int,
default=500000,
help="The number of step to save (default: 500000)")
parser.add_argument(
'--print_batch',
type=int,
default=100,
help="The number of print_batch (default: 10)")
parser.add_argument(
'--dict_path',
type=str,
default='./data/1-billion_dict',
help="The path of data dict")
parser.add_argument(
'--batch_size',
type=int,
default=500,
help="The size of mini-batch (default:500)")
parser.add_argument(
'--num_passes',
type=int,
default=10,
help="The number of passes to train (default: 10)")
parser.add_argument(
'--model_output_dir',
type=str,
default='models',
help='The path for model to store (default: models)')
parser.add_argument('--nce_num', type=int, default=5, help='nce_num')
parser.add_argument(
'--embedding_size',
type=int,
default=64,
help='sparse feature hashing space for index processing')
parser.add_argument(
'--is_sparse',
action='store_true',
required=False,
default=False,
help='embedding and nce will use sparse or not, (default: False)')
parser.add_argument(
'--with_speed',
action='store_true',
required=False,
default=False,
help='print speed or not , (default: False)')
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id ,only trainer_id=0 save model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The num of trianers, (default: 1)')
return parser.parse_args()
def convert_python_to_tensor(weight, batch_size, sample_reader):
def __reader__():
cs = np.array(weight).cumsum()
result = [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
if len(result[0]) == batch_size:
tensor_result = []
for tensor in result:
t = paddle.fluid.Tensor()
dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
elif len(dat.shape) == 1:
dat = dat.reshape((-1, 1))
t.set(dat, paddle.CPUPlace())
tensor_result.append(t)
tt = paddle.fluid.Tensor()
neg_array = cs.searchsorted(np.random.sample(args.nce_num))
neg_array = np.tile(neg_array, batch_size)
tt.set(
neg_array.reshape((batch_size, args.nce_num)),
paddle.CPUPlace())
tensor_result.append(tt)
yield tensor_result
result = [[], []]
return __reader__
def train_loop(args, train_program, reader, py_reader, loss, trainer_id, weight,
lr):
py_reader.decorate_tensor_provider(
convert_python_to_tensor(weight, args.batch_size, reader.train()))
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
train_exe = exe
for pass_id in range(args.num_passes):
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.time()
try:
while True:
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if batch_id % args.print_batch == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 500 == 0 and batch_id != 0:
elapsed = (time.time() - start)
start = time.time()
samples = 1001 * args.batch_size * int(
os.getenv("CPU_NUM"))
logger.info("Time used: {}, Samples/Sec: {}".format(
elapsed, samples / elapsed))
lr.step()
if batch_id % args.save_step == 0 and batch_id != 0:
model_dir = args.model_output_dir + '/pass-' + str(
pass_id) + ('/batch-' + str(batch_id))
if trainer_id == 0:
paddle.static.save(exe, model_dir, train_program)
print("model saved in %s" % model_dir)
batch_id += 1
except paddle.framework.core.EOFException:
py_reader.reset()
epoch_end = time.time()
logger.info("Epoch: {0}, Train total expend: {1} ".format(
pass_id, epoch_end - epoch_start))
model_dir = args.model_output_dir + '/pass-' + str(pass_id)
if trainer_id == 0:
paddle.static.save(exe, model_dir, train_program)
print("model saved in %s" % model_dir)
def GetFileList(data_path):
return os.listdir(data_path)
def train(args):
if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
os.mkdir(args.model_output_dir)
filelist = GetFileList(args.train_data_dir)
word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
filelist, 0, 1)
logger.info("dict_size: {}".format(word2vec_reader.dict_size))
np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
id_frequencys_pow = np_power / np_power.sum()
loss, py_reader = skip_gram_word2vec(
word2vec_reader.dict_size,
args.embedding_size,
is_sparse=args.is_sparse,
neg_num=args.nce_num)
learning_rate = paddle.optimizer.lr.ExponentialDecay(
args.base_lr, gama=0.999)
optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)
optimizer.minimize(loss)
logger.info("run dist training")
t = paddle.fluid.DistributeTranspiler()
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog = t.get_pserver_program(args.current_endpoint)
pserver_startup = t.get_startup_program(args.current_endpoint,
pserver_prog)
exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(pserver_startup)
exe.run(pserver_prog)
elif args.role == "trainer":
print("run trainer")
train_loop(args,
t.get_trainer_program(), word2vec_reader, py_reader, loss,
args.trainer_id, id_frequencys_pow, learning_rate)
if __name__ == '__main__':
args = parse_args()
train(args)
#!/bin/bash
#export GLOG_v=30
#export GLOG_logtostderr=1
# start pserver0
export CPU_NUM=5
export FLAGS_rpc_deadline=3000000
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1 \
--is_sparse \
--with_speed \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6000 \
--trainers 2 \
> pserver0.log 2>&1 &
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1 \
--is_sparse \
--with_speed \
--role pserver \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--current_endpoint 127.0.0.1:6001 \
--trainers 2 \
> pserver1.log 2>&1 &
# start trainer0
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1000 \
--is_sparse \
--with_speed \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 0 \
> trainer0.log 2>&1 &
# start trainer1
python cluster_train.py \
--train_data_dir data/convert_text8 \
--dict_path data/test_build_dict \
--batch_size 100 \
--model_output_dir dis_model \
--base_lr 1.0 \
--print_batch 1000 \
--is_sparse \
--with_speed \
--role trainer \
--endpoints 127.0.0.1:6000,127.0.0.1:6001 \
--trainers 2 \
--trainer_id 1 \
> trainer1.log 2>&1 &
...@@ -89,21 +89,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5): ...@@ -89,21 +89,21 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
paddle.multiply(input_emb, true_emb_w), keepdim=True), paddle.multiply(input_emb, true_emb_w), keepdim=True),
true_emb_b) true_emb_b)
input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size]) input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size])
neg_matmul = fluid.layers.matmul( neg_matmul = paddle.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
input_emb_re, neg_emb_w_re, transpose_y=True)
neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num]) neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num])
neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec) neg_logits = paddle.add(neg_matmul_re, neg_emb_b_vec)
#nce loss #nce loss
# TODO: replaced by paddle.tensor.creation.fill_constant_batch_size_like
label_ones = fluid.layers.fill_constant_batch_size_like( label_ones = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, 1], value=1.0, dtype='float32') true_logits, shape=[-1, 1], value=1.0, dtype='float32')
label_zeros = fluid.layers.fill_constant_batch_size_like( label_zeros = fluid.layers.fill_constant_batch_size_like(
true_logits, shape=[-1, neg_num], value=0.0, dtype='float32') true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits, true_xent = paddle.nn.functional.binary_cross_entropy(true_logits,
label_ones) label_ones)
neg_xent = fluid.layers.sigmoid_cross_entropy_with_logits(neg_logits, neg_xent = paddle.nn.functional.binary_cross_entropy(neg_logits,
label_zeros) label_zeros)
cost = paddle.add(paddle.sum(true_xent, axis=1), cost = paddle.add(paddle.sum(true_xent, axis=1),
paddle.sum(neg_xent, axis=1)) paddle.sum(neg_xent, axis=1))
avg_cost = paddle.mean(cost) avg_cost = paddle.mean(cost)
...@@ -133,7 +133,7 @@ def infer_network(vocab_size, emb_size): ...@@ -133,7 +133,7 @@ def infer_network(vocab_size, emb_size):
emb_c = paddle.static.nn.embedding( emb_c = paddle.static.nn.embedding(
input=analogy_c, size=[vocab_size, emb_size], param_attr="emb") input=analogy_c, size=[vocab_size, emb_size], param_attr="emb")
target = paddle.add(paddle.add(emb_b, -emb_a), emb_c) target = paddle.add(paddle.add(emb_b, -emb_a), emb_c)
emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1) emb_all_label_l2 = paddle.linalg.norm(emb_all_label, p=2, axis=1)
dist = fluid.layers.matmul(x=target, y=emb_all_label_l2, transpose_y=True) dist = paddle.matmul(x=target, y=emb_all_label_l2, transpose_y=True)
values, pred_idx = paddle.topk(x=dist, k=4) values, pred_idx = paddle.topk(x=dist, k=4)
return values, pred_idx return values, pred_idx
...@@ -97,7 +97,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader): ...@@ -97,7 +97,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader):
if len(result[0]) == batch_size: if len(result[0]) == batch_size:
tensor_result = [] tensor_result = []
for tensor in result: for tensor in result:
t = paddle.fluid.Tensor() t = paddle.Tensor()
dat = np.array(tensor, dtype='int64') dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2: if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2])) dat = dat.reshape((dat.shape[0], dat.shape[2]))
...@@ -105,7 +105,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader): ...@@ -105,7 +105,7 @@ def convert_python_to_tensor(weight, batch_size, sample_reader):
dat = dat.reshape((-1, 1)) dat = dat.reshape((-1, 1))
t.set(dat, paddle.CPUPlace()) t.set(dat, paddle.CPUPlace())
tensor_result.append(t) tensor_result.append(t)
tt = paddle.fluid.Tensor() tt = paddle.Tensor()
neg_array = cs.searchsorted(np.random.sample(args.nce_num)) neg_array = cs.searchsorted(np.random.sample(args.nce_num))
neg_array = np.tile(neg_array, batch_size) neg_array = np.tile(neg_array, batch_size)
tt.set( tt.set(
......
...@@ -66,8 +66,7 @@ def compress(args): ...@@ -66,8 +66,7 @@ def compress(args):
def if_exist(var): def if_exist(var):
return os.path.exists(os.path.join(args.pretrained_model, var.name)) return os.path.exists(os.path.join(args.pretrained_model, var.name))
paddle.fluid.io.load_vars( paddle.static.load_vars(exe, args.pretrained_model, predicate=if_exist)
exe, args.pretrained_model, predicate=if_exist)
valid_loader = paddle.io.DataLoader( valid_loader = paddle.io.DataLoader(
val_dataset, val_dataset,
......
...@@ -20,7 +20,6 @@ else: ...@@ -20,7 +20,6 @@ else:
import imageio as imgreader import imageio as imgreader
import os import os
import paddle import paddle
from paddle import fluid
class CASIA_Face(object): class CASIA_Face(object):
...@@ -79,19 +78,17 @@ if __name__ == '__main__': ...@@ -79,19 +78,17 @@ if __name__ == '__main__':
data_dir = 'PATH to CASIA dataset' data_dir = 'PATH to CASIA dataset'
place = paddle.CPUPlace() place = paddle.CPUPlace()
with fluid.dygraph.guard(place): dataset = CASIA_Face(root=data_dir)
dataset = CASIA_Face(root=data_dir) print(len(dataset))
print(len(dataset)) print(dataset.class_nums)
print(dataset.class_nums) trainloader = paddle.batch(dataset.reader, batch_size=1, drop_last=False)
trainloader = paddle.fluid.io.batch( for i in range(10):
dataset.reader, batch_size=1, drop_last=False) for data in trainloader():
for i in range(10): img = np.array([x[0] for x in data]).astype('float32')
for data in trainloader(): img = paddle.to_tensor(img)
img = np.array([x[0] for x in data]).astype('float32') print(img.shape)
img = fluid.dygraph.to_variable(img) label = np.array([x[1] for x in data]).astype('int64').reshape(-1,
print(img.shape) 1)
label = np.array([x[1] for x in data]).astype('int64').reshape( label = paddle.to_tensor(label)
-1, 1) print(label.shape)
label = fluid.dygraph.to_variable(label) print(len(dataset))
print(label.shape)
print(len(dataset))
...@@ -18,8 +18,6 @@ if six.PY2: ...@@ -18,8 +18,6 @@ if six.PY2:
import scipy.misc as imgreader import scipy.misc as imgreader
else: else:
import imageio as imgreader import imageio as imgreader
import paddle
from paddle import fluid
class LFW(object): class LFW(object):
......
...@@ -19,8 +19,6 @@ import scipy.io ...@@ -19,8 +19,6 @@ import scipy.io
import numpy as np import numpy as np
import paddle import paddle
from paddle import fluid
from dataloader.casia import CASIA_Face from dataloader.casia import CASIA_Face
from dataloader.lfw import LFW from dataloader.lfw import LFW
from paddleslim import models from paddleslim import models
...@@ -116,10 +114,7 @@ def test(test_reader, flods, flags, net, args): ...@@ -116,10 +114,7 @@ def test(test_reader, flods, flags, net, args):
data_list[1].append(data[_][1]) data_list[1].append(data[_][1])
data_list[2].append(data[_][2]) data_list[2].append(data[_][2])
data_list[3].append(data[_][3]) data_list[3].append(data[_][3])
res = [ res = [net(paddle.to_tensor(np.array(d))).numpy() for d in data_list]
net(fluid.dygraph.to_variable(np.array(d))).numpy()
for d in data_list
]
featureL = np.concatenate((res[0], res[1]), 1) featureL = np.concatenate((res[0], res[1]), 1)
featureR = np.concatenate((res[2], res[3]), 1) featureR = np.concatenate((res[2], res[3]), 1)
if featureLs is None: if featureLs is None:
...@@ -154,21 +149,18 @@ if __name__ == "__main__": ...@@ -154,21 +149,18 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
place = paddle.CPUPlace() if args.use_gpu == 0 else paddle.CUDAPlace(0) place = paddle.CPUPlace() if args.use_gpu == 0 else paddle.CUDAPlace(0)
with fluid.dygraph.guard(place): train_dataset = CASIA_Face(root=args.train_data_dir)
train_dataset = CASIA_Face(root=args.train_data_dir) nl, nr, flods, flags = parse_filelist(args.test_data_dir)
nl, nr, flods, flags = parse_filelist(args.test_data_dir) test_dataset = LFW(nl, nr)
test_dataset = LFW(nl, nr) test_reader = paddle.batch(
test_reader = paddle.fluid.io.batch( test_dataset.reader, batch_size=args.test_batchsize, drop_last=False)
test_dataset.reader,
batch_size=args.test_batchsize, net = models.__dict__[args.model](class_dim=train_dataset.class_nums)
drop_last=False) if args.resume:
assert os.path.exists(
net = models.__dict__[args.model](class_dim=train_dataset.class_nums) args.resume +
if args.resume: ".pdparams"), "Given dir {}.pdparams not exist.".format(args.resume)
assert os.path.exists(args.resume + ".pdparams" para_dict, opti_dict = paddle.load(args.resume)
), "Given dir {}.pdparams not exist.".format( net.set_dict(para_dict)
args.resume)
para_dict, opti_dict = fluid.dygraph.load_dygraph(args.resume) test(test_reader, flods, flags, net, args)
net.set_dict(para_dict)
test(test_reader, flods, flags, net, args)
...@@ -21,7 +21,6 @@ import numpy as np ...@@ -21,7 +21,6 @@ import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
from dataloader.casia import CASIA_Face from dataloader.casia import CASIA_Face
from dataloader.lfw import LFW from dataloader.lfw import LFW
...@@ -46,19 +45,19 @@ def creat_optimizer(args, trainset_scale): ...@@ -46,19 +45,19 @@ def creat_optimizer(args, trainset_scale):
] ]
lr = [float(e) for e in args.lr_list.strip().split(',')] lr = [float(e) for e in args.lr_list.strip().split(',')]
assert len(bd) == len(lr) - 1 assert len(bd) == len(lr) - 1
optimizer = fluid.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay( learning_rate=paddle.optimizer.lr.PiecewiseDecay(
boundaries=bd, values=lr), boundaries=bd, values=lr),
momentum=0.9, momentum=0.9,
regularization=fluid.regularizer.L2Decay(args.l2_decay)) weight_decay=args.l2_decay)
elif args.lr_strategy == 'cosine_decay': elif args.lr_strategy == 'cosine_decay':
lr = args.lr lr = args.lr
step_each_epoch = trainset_scale // args.train_batchsize step_each_epoch = trainset_scale // args.train_batchsize
optimizer = fluid.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
learning_rate=fluid.layers.cosine_decay(lr, step_each_epoch, learning_rate=paddle.optimizer.lr.CosineAnnealingDecay(
args.total_epoch), lr, args.total_epoch / 2),
momentum=0.9, momentum=0.9,
regularization=fluid.regularizer.L2Decay(args.l2_decay)) weight_decay=args.l2_decay)
else: else:
print('Wrong learning rate strategy') print('Wrong learning rate strategy')
exit() exit()
...@@ -117,9 +116,9 @@ def test(test_exe, test_program, test_out, args): ...@@ -117,9 +116,9 @@ def test(test_exe, test_program, test_out, args):
def train(exe, train_program, train_out, test_program, test_out, args): def train(exe, train_program, train_out, test_program, test_out, args):
loss, acc, global_lr, train_reader = train_out loss, acc, global_lr, train_reader = train_out
fetch_list_train = [loss.name, acc.name, global_lr.name] fetch_list_train = [loss.name, acc.name, global_lr.name]
build_strategy = fluid.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True build_strategy.fuse_all_optimizer_ops = True
compiled_prog = compiler.CompiledProgram( compiled_prog = paddle.static.CompiledProgram(
train_program, build_strategy=build_strategy).with_data_parallel( train_program, build_strategy=build_strategy).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy) loss_name=loss.name, build_strategy=build_strategy)
best_ave = 0 best_ave = 0
...@@ -136,8 +135,7 @@ def train(exe, train_program, train_out, test_program, test_out, args): ...@@ -136,8 +135,7 @@ def train(exe, train_program, train_out, test_program, test_out, args):
float(np.mean(np.array(global_lr))))) float(np.mean(np.array(global_lr)))))
if batch_id % args.save_frequency == 0: if batch_id % args.save_frequency == 0:
model_path = os.path.join(args.save_ckpt, str(epoch_id)) model_path = os.path.join(args.save_ckpt, str(epoch_id))
fluid.io.save_persistables( paddle.static.save(train_program, model_path)
executor=exe, dirname=model_path, main_program=train_program)
temp_ave = test(exe, test_program, test_out, args) temp_ave = test(exe, test_program, test_out, args)
if temp_ave > best_ave: if temp_ave > best_ave:
best_ave = temp_ave best_ave = temp_ave
...@@ -171,11 +169,11 @@ def build_program(program, startup, args, is_train=True): ...@@ -171,11 +169,11 @@ def build_program(program, startup, args, is_train=True):
name='image', shape=[-1, 3, 112, 96], dtype='float32') name='image', shape=[-1, 3, 112, 96], dtype='float32')
label = paddle.static.data( label = paddle.static.data(
name='label', shape=[-1, 1], dtype='int64') name='label', shape=[-1, 1], dtype='int64')
train_reader = fluid.io.batch( train_reader = paddle.batch(
train_dataset.reader, train_dataset.reader,
batch_size=args.train_batchsize // num_trainers, batch_size=args.train_batchsize // num_trainers,
drop_last=False) drop_last=False)
reader = fluid.io.DataLoader.from_generator( reader = paddle.io.DataLoader.from_generator(
feed_list=[image, label], feed_list=[image, label],
capacity=64, capacity=64,
iterable=True, iterable=True,
...@@ -192,7 +190,7 @@ def build_program(program, startup, args, is_train=True): ...@@ -192,7 +190,7 @@ def build_program(program, startup, args, is_train=True):
else: else:
nl, nr, flods, flags = parse_filelist(args.test_data_dir) nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr) test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch( test_reader = paddle.batch(
test_dataset.reader, test_dataset.reader,
batch_size=args.test_batchsize, batch_size=args.test_batchsize,
drop_last=False) drop_last=False)
...@@ -206,7 +204,7 @@ def build_program(program, startup, args, is_train=True): ...@@ -206,7 +204,7 @@ def build_program(program, startup, args, is_train=True):
name='image_test3', shape=[-1, 3, 112, 96], dtype='float32') name='image_test3', shape=[-1, 3, 112, 96], dtype='float32')
image_test4 = paddle.static.data( image_test4 = paddle.static.data(
name='image_test4', shape=[-1, 3, 112, 96], dtype='float32') name='image_test4', shape=[-1, 3, 112, 96], dtype='float32')
reader = fluid.io.DataLoader.from_generator( reader = paddle.io.DataLoader.from_generator(
feed_list=[ feed_list=[
image_test1, image_test2, image_test3, image_test4 image_test1, image_test2, image_test3, image_test4
], ],
...@@ -228,7 +226,7 @@ def build_program(program, startup, args, is_train=True): ...@@ -228,7 +226,7 @@ def build_program(program, startup, args, is_train=True):
def quant_val_reader_batch(): def quant_val_reader_batch():
nl, nr, flods, flags = parse_filelist(args.test_data_dir) nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr) test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch( test_reader = paddle.batch(
test_dataset.reader, batch_size=1, drop_last=False) test_dataset.reader, batch_size=1, drop_last=False)
shuffle_reader = fluid.io.shuffle(test_reader, 3) shuffle_reader = fluid.io.shuffle(test_reader, 3)
...@@ -296,7 +294,7 @@ def main(): ...@@ -296,7 +294,7 @@ def main():
args = parser.parse_args() args = parser.parse_args()
if args.use_gpu: if args.use_gpu:
num_trainers = paddle.fluid.core.get_cuda_device_count() num_trainers = paddle.framework.core.get_cuda_device_count()
else: else:
num_trainers = int(os.environ.get('CPU_NUM', 1)) num_trainers = int(os.environ.get('CPU_NUM', 1))
print(args) print(args)
...@@ -345,7 +343,7 @@ def main(): ...@@ -345,7 +343,7 @@ def main():
executor=exe) executor=exe)
nl, nr, flods, flags = parse_filelist(args.test_data_dir) nl, nr, flods, flags = parse_filelist(args.test_data_dir)
test_dataset = LFW(nl, nr) test_dataset = LFW(nl, nr)
test_reader = fluid.io.batch( test_reader = paddle.batch(
test_dataset.reader, test_dataset.reader,
batch_size=args.test_batchsize, batch_size=args.test_batchsize,
drop_last=False) drop_last=False)
...@@ -359,7 +357,7 @@ def main(): ...@@ -359,7 +357,7 @@ def main():
name='image_test3', shape=[-1, 3, 112, 96], dtype='float32') name='image_test3', shape=[-1, 3, 112, 96], dtype='float32')
image_test4 = paddle.static.data( image_test4 = paddle.static.data(
name='image_test4', shape=[-1, 3, 112, 96], dtype='float32') name='image_test4', shape=[-1, 3, 112, 96], dtype='float32')
reader = fluid.io.DataLoader.from_generator( reader = paddle.io.DataLoader.from_generator(
feed_list=[image_test1, image_test2, image_test3, image_test4], feed_list=[image_test1, image_test2, image_test3, image_test4],
capacity=64, capacity=64,
iterable=True, iterable=True,
......
...@@ -7,7 +7,6 @@ import functools ...@@ -7,7 +7,6 @@ import functools
import math import math
import time import time
import numpy as np import numpy as np
import paddle.fluid as fluid
sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir)) sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir))
from paddleslim.prune.unstructured_pruner import UnstructuredPruner from paddleslim.prune.unstructured_pruner import UnstructuredPruner
from paddleslim.common import get_logger from paddleslim.common import get_logger
...@@ -90,7 +89,7 @@ def compress(args): ...@@ -90,7 +89,7 @@ def compress(args):
return os.path.exists(os.path.join(args.pruned_model, var.name)) return os.path.exists(os.path.join(args.pruned_model, var.name))
_logger.info("Load pruned model from {}".format(args.pruned_model)) _logger.info("Load pruned model from {}".format(args.pruned_model))
paddle.fluid.io.load_vars(exe, args.pruned_model, predicate=if_exist) paddle.static.load_vars(exe, args.pruned_model, predicate=if_exist)
def test(epoch, program): def test(epoch, program):
acc_top1_ns = [] acc_top1_ns = []
......
...@@ -7,15 +7,14 @@ import functools ...@@ -7,15 +7,14 @@ import functools
import time import time
import random import random
import numpy as np import numpy as np
import paddle.fluid as fluid
from paddleslim.prune.unstructured_pruner import UnstructuredPruner, GMPUnstructuredPruner from paddleslim.prune.unstructured_pruner import UnstructuredPruner, GMPUnstructuredPruner
from paddleslim.common import get_logger from paddleslim.common import get_logger
sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir)) sys.path.append(os.path.join(os.path.dirname("__file__"), os.path.pardir))
import models import models
from utility import add_arguments, print_arguments from utility import add_arguments, print_arguments
import paddle.vision.transforms as T import paddle.vision.transforms as T
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy from paddle.distributed import fleet
from paddle.fluid.incubate.fleet.base import role_maker from paddle.distributed.fleet import DistributedStrategy
_logger = get_logger(__name__, level=logging.INFO) _logger = get_logger(__name__, level=logging.INFO)
...@@ -133,7 +132,7 @@ def compress(args): ...@@ -133,7 +132,7 @@ def compress(args):
if use_data_parallel: if use_data_parallel:
# Fleet step 1: initialize the distributed environment # Fleet step 1: initialize the distributed environment
role = role_maker.PaddleCloudRoleMaker(is_collective=True) role = fleet.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role) fleet.init(role)
train_reader = None train_reader = None
...@@ -225,7 +224,7 @@ def compress(args): ...@@ -225,7 +224,7 @@ def compress(args):
if use_data_parallel: if use_data_parallel:
dist_strategy = DistributedStrategy() dist_strategy = DistributedStrategy()
dist_strategy.sync_batch_norm = False dist_strategy.sync_batch_norm = False
dist_strategy.exec_strategy = paddle.static.ExecutionStrategy() dist_strategy.execution_strategy = paddle.static.ExecutionStrategy()
dist_strategy.fuse_all_reduce_ops = False dist_strategy.fuse_all_reduce_ops = False
train_program = paddle.static.default_main_program() train_program = paddle.static.default_main_program()
...@@ -256,8 +255,7 @@ def compress(args): ...@@ -256,8 +255,7 @@ def compress(args):
if args.last_epoch > -1: if args.last_epoch > -1:
assert args.checkpoint is not None and os.path.exists( assert args.checkpoint is not None and os.path.exists(
args.checkpoint), "Please specify a valid checkpoint path." args.checkpoint), "Please specify a valid checkpoint path."
paddle.fluid.io.load_persistables( paddle.static.load(train_program, args.checkpoint)
executor=exe, dirname=args.checkpoint, main_program=train_program)
elif args.pretrained_model: elif args.pretrained_model:
assert os.path.exists( assert os.path.exists(
...@@ -270,10 +268,9 @@ def compress(args): ...@@ -270,10 +268,9 @@ def compress(args):
_logger.info("Load pretrained model from {}".format( _logger.info("Load pretrained model from {}".format(
args.pretrained_model)) args.pretrained_model))
# NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API. # NOTE: We are using paddle.static.load_vars() because the pretrained model is from an older version which requires this API.
# Please consider using paddle.static.load(program, model_path) when possible # Please consider using paddle.static.load(program, model_path) when possible
paddle.fluid.io.load_vars( paddle.static.load_vars(exe, args.pretrained_model, predicate=if_exist)
exe, args.pretrained_model, predicate=if_exist)
def test(epoch, program): def test(epoch, program):
acc_top1_ns = [] acc_top1_ns = []
...@@ -336,12 +333,8 @@ def compress(args): ...@@ -336,12 +333,8 @@ def compress(args):
learning_rate.step() learning_rate.step()
reader_start = time.time() reader_start = time.time()
if use_data_parallel: compiled_train_program = paddle.static.CompiledProgram(
# Fleet step 4: get the compiled program from fleet paddle.static.default_main_program())
compiled_train_program = fleet.main_program
else:
compiled_train_program = paddle.static.CompiledProgram(
paddle.static.default_main_program())
for i in range(args.last_epoch + 1, args.num_epochs): for i in range(args.last_epoch + 1, args.num_epochs):
train(i, compiled_train_program) train(i, compiled_train_program)
...@@ -358,8 +351,8 @@ def compress(args): ...@@ -358,8 +351,8 @@ def compress(args):
if use_data_parallel: if use_data_parallel:
fleet.save_persistables(executor=exe, dirname=args.model_path) fleet.save_persistables(executor=exe, dirname=args.model_path)
else: else:
paddle.fluid.io.save_persistables( paddle.static.save(paddle.static.default_main_program(),
executor=exe, dirname=args.model_path) args.model_path)
def main(): def main():
......
...@@ -198,9 +198,9 @@ class TableLatencyPredictor(LatencyPredictor): ...@@ -198,9 +198,9 @@ class TableLatencyPredictor(LatencyPredictor):
paddle.enable_static() paddle.enable_static()
with open(pbmodel_file, "rb") as f: with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read()) _program = paddle.static.Program.parse_from_string(f.read())
graph = GraphWrapper(fluid_program) graph = GraphWrapper(_program)
if input_shape != None: if input_shape != None:
ori_shape = self._get_input_shape(graph) ori_shape = self._get_input_shape(graph)
......
...@@ -23,7 +23,7 @@ def model_size(program): ...@@ -23,7 +23,7 @@ def model_size(program):
Get total value numbers of all parameters. Get total value numbers of all parameters.
Args: Args:
program(fluid.Program): The program used to calculate model size. program(paddle.static.Program): The program used to calculate model size.
Returns: Returns:
int: The total count of all parameters. int: The total count of all parameters.
......
...@@ -432,8 +432,8 @@ class ProgramInfo: ...@@ -432,8 +432,8 @@ class ProgramInfo:
""" """
ProgramInfo Config. ProgramInfo Config.
Args: Args:
startup_program(paddle.static.Program): Startup program, the means of startup program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#cn-api-fluid-default-startup-program>`_. startup_program(paddle.static.Program): Startup program, the means of startup program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_startup_program_cn.html#default-startup-program>`_.
program(paddle.static.Program): main program, the means of main program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#cn-api-fluid-default-main-program>`_. program(paddle.static.Program): main program, the means of main program can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/static/default_main_program_cn.html#default-main-program>`_.
feed_target_names(list(str)): The name of feed tensor in the program. feed_target_names(list(str)): The name of feed tensor in the program.
fetch_targets(list(Variable)): The fetch variable in the program. fetch_targets(list(Variable)): The fetch variable in the program.
optimizer(Optimizer, optional): Optimizer in training. Default: None. optimizer(Optimizer, optional): Optimizer in training. Default: None.
......
...@@ -57,15 +57,12 @@ def _recover_param_attr(program): ...@@ -57,15 +57,12 @@ def _recover_param_attr(program):
Params in infermodel are stored in the form of variable, which can not be trained.""" Params in infermodel are stored in the form of variable, which can not be trained."""
all_weights = [param for param in program.list_vars() \ all_weights = [param for param in program.list_vars() \
if param.persistable is True and param.name != 'feed' and param.name != 'fetch'] if param.persistable is True and param.name != 'feed' and param.name != 'fetch']
for w in all_weights: with paddle.static.program_guard(program):
new_w = paddle.fluid.framework.Parameter( for w in all_weights:
block=program.block(0), new_w = paddle.create_parameter(
shape=w.shape, shape=w.shape, dtype=w.dtype, name=w.name)
dtype=w.dtype, new_w.set_value(w.get_value())
type=w.type, program.block(0).vars[w.name] = new_w
name=w.name)
new_w.set_value(w.get_value())
program.block(0).vars[w.name] = new_w
return program return program
......
...@@ -16,31 +16,35 @@ import math ...@@ -16,31 +16,35 @@ import math
import logging import logging
import numpy as np import numpy as np
import paddle import paddle
import paddle.fluid as fluid from paddle.nn import LSTMCell
from paddle.fluid import ParamAttr
from paddle.fluid.layers import RNNCell, LSTMCell, rnn
from paddle.fluid.contrib.layers import basic_lstm
from ...controller import RLBaseController from ...controller import RLBaseController
from ...log_helper import get_logger from ...log_helper import get_logger
from ..utils import RLCONTROLLER from ..utils import RLCONTROLLER
_logger = get_logger(__name__, level=logging.INFO) _logger = get_logger(__name__, level=logging.INFO)
uniform_initializer = lambda x: fluid.initializer.UniformInitializer(low=-x, high=x) uniform_initializer = lambda x: paddle.nn.initializer.Uniform(low=-x, high=x)
class lstm_cell(RNNCell): class lstm_cell(paddle.nn.RNNCellBase):
def __init__(self, num_layers, hidden_size): def __init__(self, num_layers, hidden_size):
self.num_layers = num_layers self.num_layers = num_layers
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.lstm_cells = [] self.lstm_cells = []
param_attr = ParamAttr(initializer=uniform_initializer( param_attr = paddle.ParamAttr(initializer=uniform_initializer(
1.0 / math.sqrt(hidden_size))) 1.0 / math.sqrt(hidden_size)))
bias_attr = ParamAttr(initializer=uniform_initializer( bias_attr = paddle.ParamAttr(initializer=uniform_initializer(
1.0 / math.sqrt(hidden_size))) 1.0 / math.sqrt(hidden_size)))
for i in range(num_layers): for i in range(num_layers):
self.lstm_cells.append(LSTMCell(hidden_size, param_attr, bias_attr)) self.lstm_cells.append(
LSTMCell(
hidden_size,
hidden_size,
weight_ih_attr=param_attr,
weight_hh_attr=param_attr,
bias_ih_attr=bias_attr,
bias_hh_attr=bias_attr))
def call(self, inputs, states): def call(self, inputs, states):
new_states = [] new_states = []
...@@ -100,7 +104,7 @@ class LSTM(RLBaseController): ...@@ -100,7 +104,7 @@ class LSTM(RLBaseController):
shape=(self.controller_batch_size, self.hidden_size), shape=(self.controller_batch_size, self.hidden_size),
dtype='float32', dtype='float32',
default_initializer=uniform_initializer(1.0)) default_initializer=uniform_initializer(1.0))
self.baseline = fluid.layers.create_global_var( self.baseline = paddle.static.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
...@@ -134,7 +138,10 @@ class LSTM(RLBaseController): ...@@ -134,7 +138,10 @@ class LSTM(RLBaseController):
action = paddle.squeeze(action, axis=[1]) action = paddle.squeeze(action, axis=[1])
action.stop_gradient = True action.stop_gradient = True
else: else:
action = fluid.layers.sampling_id(probs) multinomial = paddle.distribution.Multinomial(1, probs)
action = paddle.argmax(
multinomial.sample((1, )), axis=-1)
action = paddle.flatten(action)
actions.append(action) actions.append(action)
log_prob = paddle.nn.functional.softmax_with_cross_entropy( log_prob = paddle.nn.functional.softmax_with_cross_entropy(
logits, logits,
...@@ -171,22 +178,25 @@ class LSTM(RLBaseController): ...@@ -171,22 +178,25 @@ class LSTM(RLBaseController):
dtype='float32', dtype='float32',
default_initializer=uniform_initializer(1.0)) default_initializer=uniform_initializer(1.0))
paddle.assign( paddle.assign(paddle.uniform(shape=self.g_emb.shape), self.g_emb)
fluid.layers.uniform_random(shape=self.g_emb.shape), self.g_emb) hidden = paddle.static.data(
hidden = fluid.data(name='hidden', shape=[None, self.hidden_size]) name='hidden', shape=[None, self.hidden_size])
cell = fluid.data(name='cell', shape=[None, self.hidden_size]) cell = paddle.static.data(
name='cell', shape=[None, self.hidden_size])
self.tokens = self._network(hidden, cell, is_inference=is_inference) self.tokens = self._network(hidden, cell, is_inference=is_inference)
with paddle.static.program_guard(self.learn_program): with paddle.static.program_guard(self.learn_program):
hidden = fluid.data(name='hidden', shape=[None, self.hidden_size]) hidden = paddle.static.data(
cell = fluid.data(name='cell', shape=[None, self.hidden_size]) name='hidden', shape=[None, self.hidden_size])
init_actions = fluid.data( cell = paddle.static.data(
name='cell', shape=[None, self.hidden_size])
init_actions = paddle.static.data(
name='init_actions', name='init_actions',
shape=[None, len(self.range_tables)], shape=[None, len(self.range_tables)],
dtype='int64') dtype='int64')
self._network(hidden, cell, init_actions=init_actions) self._network(hidden, cell, init_actions=init_actions)
rewards = fluid.data(name='rewards', shape=[None]) rewards = paddle.static.data(name='rewards', shape=[None])
self.rewards = paddle.mean(rewards) self.rewards = paddle.mean(rewards)
if self.weight_entropy is not None: if self.weight_entropy is not None:
...@@ -197,7 +207,7 @@ class LSTM(RLBaseController): ...@@ -197,7 +207,7 @@ class LSTM(RLBaseController):
paddle.assign(self.baseline - (1.0 - self.decay) * paddle.assign(self.baseline - (1.0 - self.decay) *
(self.baseline - self.rewards), self.baseline) (self.baseline - self.rewards), self.baseline)
self.loss = self.sample_log_probs * (self.rewards - self.baseline) self.loss = self.sample_log_probs * (self.rewards - self.baseline)
clip = fluid.clip.GradientClipByNorm(clip_norm=5.0) clip = paddle.nn.ClipGradByNorm(clip_norm=5.0)
if self.decay_steps is not None: if self.decay_steps is not None:
lr = paddle.optimizer.lr.ExponentialDecay( lr = paddle.optimizer.lr.ExponentialDecay(
learning_rate=self.controller_lr, learning_rate=self.controller_lr,
...@@ -287,4 +297,4 @@ class LSTM(RLBaseController): ...@@ -287,4 +297,4 @@ class LSTM(RLBaseController):
_logger.info("Controller: current reward is {}, loss is {}".format( _logger.info("Controller: current reward is {}, loss is {}".format(
rewards, loss)) rewards, loss))
params_dict = self.get_params(self.learn_program) params_dict = self.get_params(self.learn_program)
return params_dict return params_dict
\ No newline at end of file
...@@ -94,7 +94,6 @@ def to_variables(inputs): ...@@ -94,7 +94,6 @@ def to_variables(inputs):
return ret return ret
@paddle.fluid.framework.dygraph_only
def dygraph2program(layer, inputs, dtypes=None): def dygraph2program(layer, inputs, dtypes=None):
assert isinstance(layer, paddle.nn.Layer) assert isinstance(layer, paddle.nn.Layer)
return _dy2prog(layer, inputs, dtypes) return _dy2prog(layer, inputs, dtypes)
......
...@@ -220,7 +220,7 @@ class OpWrapper(object): ...@@ -220,7 +220,7 @@ class OpWrapper(object):
class GraphWrapper(object): class GraphWrapper(object):
""" """
It is a wrapper of paddle.fluid.framework.IrGraph with some special functions It is a wrapper of paddle.framework.IrGraph with some special functions
for paddle slim framework. for paddle slim framework.
Args: Args:
......
...@@ -189,7 +189,7 @@ class DARTSearch(object): ...@@ -189,7 +189,7 @@ class DARTSearch(object):
learning_rate = paddle.optimizer.lr.CosineAnnealingDecay( learning_rate = paddle.optimizer.lr.CosineAnnealingDecay(
self.learning_rate, self.num_epochs // 2) self.learning_rate, self.num_epochs // 2)
clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) clip = paddle.nn.ClipGradByGlobalNorm(5.0)
optimizer = paddle.optimizer.Momentum( optimizer = paddle.optimizer.Momentum(
learning_rate, learning_rate,
0.9, 0.9,
......
...@@ -1024,7 +1024,7 @@ class SuperBatchNorm2D(paddle.nn.BatchNorm2D): ...@@ -1024,7 +1024,7 @@ class SuperBatchNorm2D(paddle.nn.BatchNorm2D):
return batch_norm_out return batch_norm_out
paddle.fluid.data_feeder.check_variable_and_dtype( paddle.common_ops_import.check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm') input, 'input', ['float16', 'float32', 'float64'], 'BatchNorm')
# for static need dict # for static need dict
...@@ -1111,7 +1111,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm): ...@@ -1111,7 +1111,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm):
"use_mkldnn", False, "fuse_with_relu", False, "use_mkldnn", False, "fuse_with_relu", False,
"use_global_stats", False, 'trainable_statistics', False) "use_global_stats", False, 'trainable_statistics', False)
if paddle.fluid.framework._non_static_mode(): if paddle.in_dynamic_mode():
if feature_dim != self._mean.shape[0]: if feature_dim != self._mean.shape[0]:
sync_batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.sync_batch_norm( sync_batch_norm_out, _, _, _, _, _ = paddle._legacy_C_ops.sync_batch_norm(
input, weight, bias, self._mean, self._variance, mean_out, input, weight, bias, self._mean, self._variance, mean_out,
...@@ -1128,10 +1128,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm): ...@@ -1128,10 +1128,7 @@ class SuperSyncBatchNorm(paddle.nn.SyncBatchNorm):
return sync_batch_norm_out return sync_batch_norm_out
print( paddle.common_ops_import.check_variable_and_dtype(
f"hit static check_variable_and_dtype in ofa-----------------------------------"
)
paddle.fluid.data_feeder.check_variable_and_dtype(
input, 'input', ['float16', 'float32', 'float64'], 'SyncBatchNorm') input, 'input', ['float16', 'float32', 'float64'], 'SyncBatchNorm')
attrs = { attrs = {
...@@ -1308,7 +1305,7 @@ class SuperLayerNorm(paddle.nn.LayerNorm): ...@@ -1308,7 +1305,7 @@ class SuperLayerNorm(paddle.nn.LayerNorm):
out, _, _ = paddle._C_ops.layer_norm( out, _, _ = paddle._C_ops.layer_norm(
input, weight, bias, self._epsilon, begin_norm_axis, False) input, weight, bias, self._epsilon, begin_norm_axis, False)
else: else:
paddle.fluid.data_feeder.check_variable_and_dtype( paddle.common_ops_import.check_variable_and_dtype(
input, 'input', ['float32', 'float64'], 'LayerNorm') input, 'input', ['float32', 'float64'], 'LayerNorm')
inputs = dict() inputs = dict()
......
...@@ -17,7 +17,7 @@ from .mobilenetv1 import MobileNetV1Space ...@@ -17,7 +17,7 @@ from .mobilenetv1 import MobileNetV1Space
from .resnet import ResNetSpace from .resnet import ResNetSpace
from .mobilenet_block import MobileNetV1BlockSpace, MobileNetV2BlockSpace from .mobilenet_block import MobileNetV1BlockSpace, MobileNetV2BlockSpace
from .resnet_block import ResNetBlockSpace from .resnet_block import ResNetBlockSpace
from .inception_block import InceptionABlockSpace, InceptionCBlockSpace from .inception_block import InceptionABlockSpace
from .darts_space import DartsSpace from .darts_space import DartsSpace
from .search_space_registry import SEARCHSPACE from .search_space_registry import SEARCHSPACE
from .search_space_factory import SearchSpaceFactory from .search_space_factory import SearchSpaceFactory
...@@ -25,6 +25,6 @@ from .search_space_base import SearchSpaceBase ...@@ -25,6 +25,6 @@ from .search_space_base import SearchSpaceBase
__all__ = [ __all__ = [
'MobileNetV1Space', 'MobileNetV2Space', 'ResNetSpace', 'DartsSpace', 'MobileNetV1Space', 'MobileNetV2Space', 'ResNetSpace', 'DartsSpace',
'MobileNetV1BlockSpace', 'MobileNetV2BlockSpace', 'ResNetBlockSpace', 'MobileNetV1BlockSpace', 'MobileNetV2BlockSpace', 'ResNetBlockSpace',
'InceptionABlockSpace', 'InceptionCBlockSpace', 'SearchSpaceBase', 'InceptionABlockSpace', 'SearchSpaceBase', 'SearchSpaceFactory',
'SearchSpaceFactory', 'SEARCHSPACE' 'SEARCHSPACE'
] ]
...@@ -107,8 +107,7 @@ class DartsSpace(SearchSpaceBase): ...@@ -107,8 +107,7 @@ class DartsSpace(SearchSpaceBase):
return net_arch return net_arch
def _classifier(self, x, num_classes, name): def _classifier(self, x, num_classes, name):
out = paddle.fluid.layers.pool2d( out = paddle.nn.functional.adaptive_avg_pool2d(x, 1)
x, pool_type='avg', global_pooling=True)
out = paddle.squeeze(x=out, axis=[2, 3]) out = paddle.squeeze(x=out, axis=[2, 3])
k = (1. / out.shape[1])**0.5 k = (1. / out.shape[1])**0.5
out = paddle.static.nn.fc(out, out = paddle.static.nn.fc(out,
...@@ -125,8 +124,7 @@ class DartsSpace(SearchSpaceBase): ...@@ -125,8 +124,7 @@ class DartsSpace(SearchSpaceBase):
def _auxiliary_cifar(self, x, num_classes, name): def _auxiliary_cifar(self, x, num_classes, name):
x = paddle.nn.functional.relu(x) x = paddle.nn.functional.relu(x)
pooled = paddle.fluid.layers.pool2d( pooled = paddle.nn.functional.avg_pool2d(x, 5, stride=3, padding=0)
x, pool_size=5, pool_stride=3, pool_padding=0, pool_type='avg')
conv1 = self._conv_bn( conv1 = self._conv_bn(
x=pooled, x=pooled,
c_out=128, c_out=128,
...@@ -309,13 +307,8 @@ class DartsSpace(SearchSpaceBase): ...@@ -309,13 +307,8 @@ class DartsSpace(SearchSpaceBase):
drop_path_cell, drop_path_cell,
is_train, is_train,
name=None): name=None):
hidden0_0 = paddle.fluid.layers.pool2d( hidden0_0 = paddle.nn.functional.max_pool2d(
input=s0, s0, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden0_0')
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden0_0')
hidden0_1 = self._factorized_reduce( hidden0_1 = self._factorized_reduce(
s1, s1,
filter_num, filter_num,
...@@ -328,14 +321,8 @@ class DartsSpace(SearchSpaceBase): ...@@ -328,14 +321,8 @@ class DartsSpace(SearchSpaceBase):
drop_path_cell[:, 0, 0], drop_path_cell[:, 0, 0],
name=name + '_reduction_cell_hidden0_0') name=name + '_reduction_cell_hidden0_0')
r0 = hidden0_0 + hidden0_1 r0 = hidden0_0 + hidden0_1
hidden1_0 = paddle.nn.functional.max_pool2d(
hidden1_0 = paddle.fluid.layers.pool2d( s1, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden1_0')
input=s1,
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden1_0')
hidden1_1 = r0 hidden1_1 = r0
if is_train: if is_train:
hidden1_0 = self._drop_path( hidden1_0 = self._drop_path(
...@@ -364,13 +351,8 @@ class DartsSpace(SearchSpaceBase): ...@@ -364,13 +351,8 @@ class DartsSpace(SearchSpaceBase):
r2 = hidden2_0 + hidden2_1 r2 = hidden2_0 + hidden2_1
hidden3_0 = r0 hidden3_0 = r0
hidden3_1 = paddle.fluid.layers.pool2d( hidden3_1 = paddle.nn.functional.max_pool2d(
input=s1, s1, 3, stride=2, padding=1, name=name + '_reduction_cell_hidden3_1')
pool_size=3,
pool_type="max",
pool_stride=2,
pool_padding=1,
name=name + '_reduction_cell_hidden3_1')
if is_train: if is_train:
hidden3_1 = self._drop_path( hidden3_1 = self._drop_path(
hidden3_1, hidden3_1,
......
...@@ -193,13 +193,9 @@ class InceptionABlockSpace(SearchSpaceBase): ...@@ -193,13 +193,9 @@ class InceptionABlockSpace(SearchSpaceBase):
stride, stride,
pool_type, pool_type,
name=None): name=None):
print(f"hit _inceptionA----------------------------") pool_op = paddle.nn.functional.avg_pool2d if pool_type == "avg" else paddle.nn.functional.max_pool2d
pool1 = paddle.fluid.layers.pool2d( pool1 = pool_op(
input=data, data, filter_size, padding='SAME', stride=1, name=name + '_pool2d')
pool_size=filter_size,
pool_padding='SAME',
pool_type=pool_type,
name=name + '_pool2d')
conv1 = conv_bn_layer( conv1 = conv_bn_layer(
input=pool1, input=pool1,
filter_size=1, filter_size=1,
...@@ -256,258 +252,3 @@ class InceptionABlockSpace(SearchSpaceBase): ...@@ -256,258 +252,3 @@ class InceptionABlockSpace(SearchSpaceBase):
concat = paddle.concat( concat = paddle.concat(
[conv1, conv2, conv3, conv4], axis=1, name=name + '_concat') [conv1, conv2, conv3, conv4], axis=1, name=name + '_concat')
return concat return concat
@SEARCHSPACE.register
class InceptionCBlockSpace(SearchSpaceBase):
def __init__(self, input_size, output_size, block_num, block_mask):
super(InceptionCBlockSpace, self).__init__(input_size, output_size,
block_num, block_mask)
if self.block_mask == None:
# use input_size and output_size to compute self.downsample_num
self.downsample_num = compute_downsample_num(self.input_size,
self.output_size)
if self.block_num != None:
assert self.downsample_num <= self.block_num, 'downsample numeber must be LESS THAN OR EQUAL TO block_num, but NOW: downsample numeber is {}, block_num is {}'.format(
self.downsample_num, self.block_num)
### self.filter_num means filter nums
self.filter_num = np.array([
3, 4, 8, 12, 16, 24, 32, 48, 64, 80, 96, 128, 144, 160, 192, 224,
256, 320, 384, 448, 480, 512, 1024
])
### self.k_size means kernel_size
self.k_size = np.array([3, 5])
### self.pool_type means pool type, 0 means avg, 1 means max
self.pool_type = np.array([0, 1])
### self.repeat means repeat of 1x1 conv in branch of inception
### self.repeat = np.array([0,1])
def init_tokens(self):
"""
The initial token.
"""
return get_random_tokens(self.range_table())
def range_table(self):
"""
Get range table of current search space, constrains the range of tokens.
"""
range_table_base = []
if self.block_mask != None:
range_table_length = len(self.block_mask)
else:
range_table_length = self.block_num
for i in range(range_table_length):
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.filter_num))
range_table_base.append(len(self.k_size))
range_table_base.append(len(self.pool_type))
return range_table_base
def token2arch(self, tokens=None):
"""
return net_arch function
"""
#assert self.block_num
if tokens is None:
tokens = self.init_tokens()
self.bottleneck_params_list = []
if self.block_mask != None:
for i in range(len(self.block_mask)):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 2 if self.block_mask == 1
else 1, self.pool_type[tokens[i * 11 + 10]]))
else:
repeat_num = int(self.block_num / self.downsample_num)
num_minus = self.block_num % self.downsample_num
### if block_num > downsample_num, add stride=1 block at last (block_num-downsample_num) layers
for i in range(self.downsample_num):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 2,
self.pool_type[tokens[i * 11 + 10]]))
### if block_num / downsample_num > 1, add (block_num / downsample_num) times stride=1 block
for k in range(repeat_num - 1):
kk = k * self.downsample_num + i
self.bottleneck_params_list.append(
(self.filter_num[tokens[kk * 11]],
self.filter_num[tokens[kk * 11 + 1]],
self.filter_num[tokens[kk * 11 + 2]],
self.filter_num[tokens[kk * 11 + 3]],
self.filter_num[tokens[kk * 11 + 4]],
self.filter_num[tokens[kk * 11 + 5]],
self.filter_num[tokens[kk * 11 + 6]],
self.filter_num[tokens[kk * 11 + 7]],
self.filter_num[tokens[kk * 11 + 8]],
self.k_size[tokens[kk * 11 + 9]], 1,
self.pool_type[tokens[kk * 11 + 10]]))
if self.downsample_num - i <= num_minus:
j = self.downsample_num * (repeat_num - 1) + i
self.bottleneck_params_list.append(
(self.filter_num[tokens[j * 11]],
self.filter_num[tokens[j * 11 + 1]],
self.filter_num[tokens[j * 11 + 2]],
self.filter_num[tokens[j * 11 + 3]],
self.filter_num[tokens[j * 11 + 4]],
self.filter_num[tokens[j * 11 + 5]],
self.filter_num[tokens[j * 11 + 6]],
self.filter_num[tokens[j * 11 + 7]],
self.filter_num[tokens[j * 11 + 8]],
self.k_size[tokens[j * 11 + 9]], 1,
self.pool_type[tokens[j * 11 + 10]]))
if self.downsample_num == 0 and self.block_num != 0:
for i in range(len(self.block_num)):
self.bottleneck_params_list.append(
(self.filter_num[tokens[i * 11]],
self.filter_num[tokens[i * 11 + 1]],
self.filter_num[tokens[i * 11 + 2]],
self.filter_num[tokens[i * 11 + 3]],
self.filter_num[tokens[i * 11 + 4]],
self.filter_num[tokens[i * 11 + 5]],
self.filter_num[tokens[i * 11 + 6]],
self.filter_num[tokens[i * 11 + 7]],
self.filter_num[tokens[i * 11 + 8]],
self.k_size[tokens[i * 11 + 9]], 1,
self.pool_type[tokens[i * 11 + 10]]))
def net_arch(input, return_mid_layer=False, return_block=None):
layer_count = 0
mid_layer = dict()
for i, layer_setting in enumerate(self.bottleneck_params_list):
filter_nums = layer_setting[0:9]
filter_size = layer_setting[9]
stride = layer_setting[10]
pool_type = 'avg' if layer_setting[11] == 0 else 'max'
if stride == 2:
layer_count += 1
if check_points((layer_count - 1), return_block):
mid_layer[layer_count - 1] = input
input = self._inceptionC(
input,
C_tokens=filter_nums,
filter_size=int(filter_size),
stride=stride,
pool_type=pool_type,
name='inceptionC_{}'.format(i + 1))
if return_mid_layer:
return input, mid_layer
else:
return input,
return net_arch
def _inceptionC(self,
data,
C_tokens,
filter_size,
stride,
pool_type,
name=None):
pool1 = paddle.fluid.layers.pool2d(
input=data,
pool_size=filter_size,
pool_padding='SAME',
pool_type=pool_type,
name=name + '_pool2d')
conv1 = conv_bn_layer(
input=pool1,
filter_size=1,
num_filters=C_tokens[0],
stride=stride,
act='relu',
name=name + '_conv1')
conv2 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[1],
stride=stride,
act='relu',
name=name + '_conv2')
conv3 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[2],
stride=1,
act='relu',
name=name + '_conv3_1')
conv3_1 = conv_bn_layer(
input=conv3,
filter_size=filter_size,
num_filters=C_tokens[3],
stride=stride,
act='relu',
name=name + '_conv3_2_1')
conv3_2 = conv_bn_layer(
input=conv3,
filter_size=filter_size,
num_filters=C_tokens[4],
stride=stride,
act='relu',
name=name + '_conv3_2_2')
conv4 = conv_bn_layer(
input=data,
filter_size=1,
num_filters=C_tokens[5],
stride=1,
act='relu',
name=name + '_conv4_1')
conv4 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[6],
stride=1,
act='relu',
name=name + '_conv4_2')
conv4_1 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[7],
stride=stride,
act='relu',
name=name + '_conv4_3_1')
conv4_2 = conv_bn_layer(
input=conv4,
filter_size=filter_size,
num_filters=C_tokens[8],
stride=stride,
act='relu',
name=name + '_conv4_3_2')
concat = paddle.concat(
[conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2],
axis=1,
name=name + '_concat')
return concat
...@@ -196,11 +196,8 @@ class MobileNetV1Space(SearchSpaceBase): ...@@ -196,11 +196,8 @@ class MobileNetV1Space(SearchSpaceBase):
if check_points(layer_count, end_points): if check_points(layer_count, end_points):
return input, decode_ends return input, decode_ends
input = paddle.fluid.layers.pool2d( input = paddle.nn.functional.adaptive_avg_pool2d(
input=input, input, 1, name='mobilenetv1_last_pool')
pool_type='avg',
global_pooling=True,
name='mobilenetv1_last_pool')
return input return input
......
...@@ -203,11 +203,8 @@ class MobileNetV2Space(SearchSpaceBase): ...@@ -203,11 +203,8 @@ class MobileNetV2Space(SearchSpaceBase):
act='relu6', act='relu6',
name='mobilenetv2_conv' + str(i + 1)) name='mobilenetv2_conv' + str(i + 1))
input = paddle.fluid.layers.pool2d( input = paddle.nn.functional.adaptive_avg_pool2d(
input=input, input, 1, name='mobilenetv2_last_pool')
pool_type='avg',
global_pooling=True,
name='mobilenetv2_last_pool')
return input return input
......
...@@ -796,13 +796,12 @@ def pact(x, name=None): ...@@ -796,13 +796,12 @@ def pact(x, name=None):
u_param_attr = paddle.ParamAttr( u_param_attr = paddle.ParamAttr(
name=x.name + '_pact', name=x.name + '_pact',
initializer=paddle.nn.initializer.Constant(value=init_thres), initializer=paddle.nn.initializer.Constant(value=init_thres),
regularizer=paddle.fluid.regularizer.L2Decay(0.0001), regularizer=paddle.regularizer.L2Decay(0.0001),
learning_rate=1) learning_rate=1)
u_param = helper.create_parameter(attr=u_param_attr, shape=[1], dtype=dtype) u_param = helper.create_parameter(attr=u_param_attr, shape=[1], dtype=dtype)
x = paddle.subtract(x, x = paddle.subtract(x,
paddle.nn.functional.relu(paddle.subtract(x, u_param))) paddle.nn.functional.relu(paddle.subtract(x, u_param)))
x = paddle.paddle.add( x = paddle.add(x, paddle.nn.functional.relu(paddle.subtract(-u_param, x)))
x, paddle.nn.functional.relu(paddle.subtract(-u_param, x)))
return x return x
......
...@@ -182,16 +182,18 @@ class TestPruningMul(unittest.TestCase): ...@@ -182,16 +182,18 @@ class TestPruningMul(unittest.TestCase):
for param in net.parameters(): for param in net.parameters():
if param.name not in shapes: if param.name not in shapes:
shapes[param.name] = param.shape shapes[param.name] = param.shape
print(
f"name {param.name}: {param.shape}, excepted: {shapes[param.name]}"
)
self.assertTrue(shapes[param.name] == param.shape) self.assertTrue(shapes[param.name] == param.shape)
pruner.restore() pruner.restore()
paddle.enable_static() paddle.enable_static()
def add_cases(suite): def add_cases(suite):
suite.addTest(TestStatus()) # suite.addTest(TestStatus())
suite.addTest(TestFilterPruner(param_names=["conv2d_0.w_0"])) # suite.addTest(TestFilterPruner(param_names=["conv2d_0.w_0"]))
suite.addTest(TestPruningGroupConv2d()) # suite.addTest(TestPruningGroupConv2d())
suite.addTest(TestPruningMul()) suite.addTest(TestPruningMul())
......
...@@ -19,10 +19,10 @@ import unittest ...@@ -19,10 +19,10 @@ import unittest
import logging import logging
import paddle import paddle
from paddleslim.common import get_logger
from paddleslim import PTQ from paddleslim import PTQ
_logger = paddle.fluid.log_helper.get_logger( _logger = get_logger(__name__, level=logging.INFO)
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
class ImperativeLenet(paddle.nn.Layer): class ImperativeLenet(paddle.nn.Layer):
......
...@@ -19,10 +19,10 @@ import unittest ...@@ -19,10 +19,10 @@ import unittest
import logging import logging
import paddle import paddle
from paddleslim.common import get_logger
from paddleslim.dygraph.quant import QAT from paddleslim.dygraph.quant import QAT
_logger = paddle.fluid.log_helper.get_logger( _logger = get_logger(__name__, level=logging.INFO)
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
class ImperativeLenet(paddle.nn.Layer): class ImperativeLenet(paddle.nn.Layer):
......
...@@ -113,7 +113,7 @@ class TestSensitivity(unittest.TestCase): ...@@ -113,7 +113,7 @@ class TestSensitivity(unittest.TestCase):
exe = paddle.static.Executor(place) exe = paddle.static.Executor(place)
exe.run(startup_program) exe.run(startup_program)
val_reader = paddle.fluid.io.batch(self.val_reader, batch_size=128) val_reader = paddle.batch(self.val_reader, batch_size=128)
def eval_func(program): def eval_func(program):
feeder = paddle.fluid.DataFeeder( feeder = paddle.fluid.DataFeeder(
......
...@@ -35,12 +35,14 @@ class AnalysisQATDemo(unittest.TestCase): ...@@ -35,12 +35,14 @@ class AnalysisQATDemo(unittest.TestCase):
super(AnalysisQATDemo, self).__init__(*args, **kwargs) super(AnalysisQATDemo, self).__init__(*args, **kwargs)
if not os.path.exists('MobileNetV1_infer'): if not os.path.exists('MobileNetV1_infer'):
os.system( os.system(
'wget -q https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' 'wget https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar'
) )
os.system('tar -xf MobileNetV1_infer.tar') os.system('tar -xf MobileNetV1_infer.tar')
if not os.path.exists('ILSVRC2012_data_demo'): if not os.path.exists(
os.path.join('.', 'ILSVRC2012_data_demo', 'ILSVRC2012',
'train')):
os.system( os.system(
'wget -q https://sys-p0.bj.bcebos.com/slim_ci/ILSVRC2012_data_demo.tar.gz' 'wget https://sys-p0.bj.bcebos.com/slim_ci/ILSVRC2012_data_demo.tar.gz'
) )
os.system('tar -xf ILSVRC2012_data_demo.tar.gz') os.system('tar -xf ILSVRC2012_data_demo.tar.gz')
......
...@@ -93,7 +93,7 @@ class ModelCase4(paddle.nn.Layer): ...@@ -93,7 +93,7 @@ class ModelCase4(paddle.nn.Layer):
x = paddle.stack([x, y], axis=3) x = paddle.stack([x, y], axis=3)
x = paddle.slice(x, axes=[0], starts=[0], ends=[1]) x = paddle.slice(x, axes=[0], starts=[0], ends=[1])
x = paddle.exp(x) x = paddle.exp(x)
y += paddle.fluid.layers.uniform_random(y.shape) y += paddle.uniform(y.shape)
y = paddle.mean(x=y, axis=1, keepdim=True) y = paddle.mean(x=y, axis=1, keepdim=True)
return paddle.greater_equal(x, y) return paddle.greater_equal(x, y)
...@@ -286,8 +286,8 @@ class TestCase2(unittest.TestCase): ...@@ -286,8 +286,8 @@ class TestCase2(unittest.TestCase):
pred = LatencyPredictor() pred = LatencyPredictor()
paddle.enable_static() paddle.enable_static()
with open(pbmodel_file, "rb") as f: with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read()) _program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program) graph = paddleslim.core.GraphWrapper(_program)
graph_keys = pred._get_key_info_from_graph(graph=graph) graph_keys = pred._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0 assert len(graph_keys) > 0
...@@ -381,8 +381,8 @@ class TestCase6(unittest.TestCase): ...@@ -381,8 +381,8 @@ class TestCase6(unittest.TestCase):
paddle.enable_static() paddle.enable_static()
with open(pbmodel_file, "rb") as f: with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read()) _program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program) graph = paddleslim.core.GraphWrapper(_program)
graph_keys = predictor._get_key_info_from_graph(graph=graph) graph_keys = predictor._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0 assert len(graph_keys) > 0
...@@ -404,8 +404,8 @@ class TestCase7(unittest.TestCase): ...@@ -404,8 +404,8 @@ class TestCase7(unittest.TestCase):
paddle.enable_static() paddle.enable_static()
with open(pbmodel_file, "rb") as f: with open(pbmodel_file, "rb") as f:
fluid_program = paddle.static.Program.parse_from_string(f.read()) _program = paddle.static.Program.parse_from_string(f.read())
graph = paddleslim.core.GraphWrapper(fluid_program) graph = paddleslim.core.GraphWrapper(_program)
graph_keys = predictor._get_key_info_from_graph(graph=graph) graph_keys = predictor._get_key_info_from_graph(graph=graph)
assert len(graph_keys) > 0 assert len(graph_keys) > 0
......
...@@ -51,7 +51,7 @@ class TestPrune(StaticCase): ...@@ -51,7 +51,7 @@ class TestPrune(StaticCase):
flag = paddle.full(shape=[1], fill_value=1, dtype='int32') flag = paddle.full(shape=[1], fill_value=1, dtype='int32')
rand_flag = paddle.randint(2, dtype='int32') rand_flag = paddle.randint(2, dtype='int32')
cond = paddle.less_than(x=flag, y=rand_flag) cond = paddle.less_than(x=flag, y=rand_flag)
cond_output = paddle.fluid.layers.create_global_var( cond_output = paddle.static.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
...@@ -355,7 +355,6 @@ class TestPruneWorker(unittest.TestCase): ...@@ -355,7 +355,6 @@ class TestPruneWorker(unittest.TestCase):
cls = PRUNE_WORKER.get(self.op.type()) cls = PRUNE_WORKER.get(self.op.type())
if cls is None: if cls is None:
cls = PRUNE_WORKER.get("default_worker") cls = PRUNE_WORKER.get("default_worker")
# pruning input of conv op # pruning input of conv op
for _var, _axis, _ret in self.cases: for _var, _axis, _ret in self.cases:
pruned_params = [] pruned_params = []
...@@ -370,6 +369,7 @@ class TestPruneWorker(unittest.TestCase): ...@@ -370,6 +369,7 @@ class TestPruneWorker(unittest.TestCase):
if var.name() not in ret: if var.name() not in ret:
ret[var.name()] = [] ret[var.name()] = []
ret[var.name()].append(axis) ret[var.name()].append(axis)
print(f"excepted: {_ret}; actual: {ret}")
self.assertTrue(ret == _ret) self.assertTrue(ret == _ret)
...@@ -444,12 +444,6 @@ class TestActivation(TestPruneWorker): ...@@ -444,12 +444,6 @@ class TestActivation(TestPruneWorker):
act_suite = unittest.TestSuite() act_suite = unittest.TestSuite()
act_suite.addTest(
TestActivation(
op=paddle.fluid.layers.resize_bilinear, scale=2.))
act_suite.addTest(
TestActivation(
op=paddle.fluid.layers.resize_nearest, scale=2.))
act_suite.addTest(TestActivation(op=paddle.floor)) act_suite.addTest(TestActivation(op=paddle.floor))
act_suite.addTest(TestActivation(op=paddle.scale)) act_suite.addTest(TestActivation(op=paddle.scale))
...@@ -774,8 +768,6 @@ class TestAverageAccumulates(TestPruneWorker): ...@@ -774,8 +768,6 @@ class TestAverageAccumulates(TestPruneWorker):
out = paddle.mean(conv1) out = paddle.mean(conv1)
opt = paddle.optimizer.Adam() opt = paddle.optimizer.Adam()
opt.minimize(out) opt.minimize(out)
model_average = paddle.fluid.optimizer.ModelAverage(
0.15, min_average_window=10000, max_average_window=12500)
def set_cases(self): def set_cases(self):
weight_var = self.graph.var('conv1.w_0') weight_var = self.graph.var('conv1.w_0')
...@@ -783,9 +775,6 @@ class TestAverageAccumulates(TestPruneWorker): ...@@ -783,9 +775,6 @@ class TestAverageAccumulates(TestPruneWorker):
'conv1.w_0': [0], 'conv1.w_0': [0],
'conv1.w_0_moment1_0': [0], 'conv1.w_0_moment1_0': [0],
'conv1.w_0_moment2_0': [0], 'conv1.w_0_moment2_0': [0],
'conv1.w_0_sum_1_0': [0],
'conv1.w_0_sum_2_0': [0],
'conv1.w_0_sum_3_0': [0]
})) }))
def test_prune(self): def test_prune(self):
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys import sys
import os
sys.path.append("../") sys.path.append("../")
import unittest import unittest
import tempfile import tempfile
...@@ -102,14 +103,12 @@ class ReconPTQ(unittest.TestCase): ...@@ -102,14 +103,12 @@ class ReconPTQ(unittest.TestCase):
format(iter, cost, top1, top5)) format(iter, cost, top1, top5))
train(main_program) train(main_program)
paddle.fluid.io.save_inference_model( paddle.static.save_inference_model(
dirname=self.tmpdir.name, os.path.join(self.tmpdir.name, "infer"),
feeded_var_names=[image.name], feed_vars=[image],
target_vars=[out], fetch_vars=[out],
main_program=val_program, program=val_program,
executor=exe, executor=exe)
model_filename='model.pdmodel',
params_filename='params.pdiparams')
print(f"saved infer model to [{self.tmpdir.name}]") print(f"saved infer model to [{self.tmpdir.name}]")
self.data_loader = sample_generator_creator() self.data_loader = sample_generator_creator()
...@@ -130,8 +129,8 @@ class TestReconRegion(ReconPTQ): ...@@ -130,8 +129,8 @@ class TestReconRegion(ReconPTQ):
self.tmpdir.name, self.tmpdir.name,
quantize_model_path='output_region', quantize_model_path='output_region',
sample_generator=self.data_loader, sample_generator=self.data_loader,
model_filename='model.pdmodel', model_filename='infer.pdmodel',
params_filename='params.pdiparams', params_filename='infer.pdiparams',
batch_nums=1, batch_nums=1,
epochs=1, epochs=1,
algo='abs_max', algo='abs_max',
...@@ -154,8 +153,8 @@ class TestReconLayer(ReconPTQ): ...@@ -154,8 +153,8 @@ class TestReconLayer(ReconPTQ):
self.tmpdir.name, self.tmpdir.name,
quantize_model_path='output_layer', quantize_model_path='output_layer',
sample_generator=self.data_loader, sample_generator=self.data_loader,
model_filename='model.pdmodel', model_filename='infer.pdmodel',
params_filename='params.pdiparams', params_filename='infer.pdiparams',
batch_nums=1, batch_nums=1,
epochs=1, epochs=1,
algo='KL', algo='KL',
......
...@@ -24,6 +24,9 @@ import numpy as np ...@@ -24,6 +24,9 @@ import numpy as np
class TestDartsSpace(StaticCase): class TestDartsSpace(StaticCase):
def __init__(self, methodNmae="test_search_space"):
super(TestDartsSpace, self).__init__(methodNmae)
def setUp(self): def setUp(self):
paddle.enable_static() paddle.enable_static()
self.init_test_case() self.init_test_case()
...@@ -89,6 +92,7 @@ search_space_suite.addTest( ...@@ -89,6 +92,7 @@ search_space_suite.addTest(
search_space_suite.addTest(TestSearchSpace(search_sapce_name="ResNetSpace")) search_space_suite.addTest(TestSearchSpace(search_sapce_name="ResNetSpace"))
search_space_suite.addTest( search_space_suite.addTest(
TestSearchSpace(search_sapce_name="ResNetBlockSpace")) TestSearchSpace(search_sapce_name="ResNetBlockSpace"))
search_space_suite.addTest(TestDartsSpace())
if __name__ == '__main__': if __name__ == '__main__':
runner = unittest.TextTestRunner(verbosity=2) runner = unittest.TextTestRunner(verbosity=2)
......
...@@ -45,8 +45,7 @@ class TestSensitivity(StaticCase): ...@@ -45,8 +45,7 @@ class TestSensitivity(StaticCase):
exe = paddle.static.Executor(place) exe = paddle.static.Executor(place)
exe.run(startup_program) exe.run(startup_program)
val_reader = paddle.fluid.io.batch( val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
paddle.dataset.mnist.test(), batch_size=128)
def eval_func(program): def eval_func(program):
feeder = paddle.fluid.DataFeeder( feeder = paddle.fluid.DataFeeder(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册